// Files
// ANSCORE/tests/ANSLPR-UnitTest/ANSLPR-UnitTest.cpp
//
// 2781 lines
// 119 KiB
// C++

#include <iostream>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/opencv.hpp>
#include <opencv2/dnn.hpp>
#include "boost/property_tree/ptree.hpp"
#include "boost/property_tree/json_parser.hpp"
#include "boost/foreach.hpp"
#include "boost/optional.hpp"
#include <fstream>
#include <sstream>
#include <iostream>
#include <opencv2/dnn.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/objdetect.hpp>
#include "ANSLPR.h"
#include "ANSLPR_CPU.h"
#include "ANSOpenCV.h"
#include "ANSRTSP.h"
#include "ANSVideoPlayer.h"
#include "ANSFilePlayer.h"
#include <filesystem>
#include <thread>
#include <mutex>
#include <atomic>
#include <chrono>
#include <deque>
#include <memory>
#include <set>
#include <map>
#include <cuda_runtime.h>
// Read `attribute` from the property tree, returning `defaultValue` when the
// key is absent or not convertible to T.
// Uses get_optional so (a) the tree is traversed once instead of twice
// (count + get) and (b) dotted path keys ("a.b") work, matching GetData below.
template<typename T>
T GetOptionalValue(const boost::property_tree::ptree& pt, std::string attribute, T defaultValue) {
	if (boost::optional<T> value = pt.get_optional<T>(attribute)) {
		return value.get();
	}
	return defaultValue;
}
// Read `key` from the property tree, returning a value-initialized T
// (0 / empty string / ...) when the key is missing.
// The original declared `T ret;` without an initializer, so for arithmetic
// types a missing key returned an indeterminate value (UB when read).
template <typename T>
T GetData(const boost::property_tree::ptree& pt, const std::string& key)
{
	T ret{}; // value-initialize so the "missing key" path is well defined
	if (boost::optional<T> data = pt.get_optional<T>(key))
	{
		ret = data.get();
	}
	return ret;
}
int ANSLPR_CPU_VideoTest() {
// Get the current working directory
std::filesystem::path currentPath = std::filesystem::current_path();
// Print the current working directory
std::cout << "Current working directory: " << currentPath << std::endl;
boost::property_tree::ptree root;
boost::property_tree::ptree detectionObjects;
boost::property_tree::ptree pt;
ANSCENTER::ANSALPR* infHandle = new ANSCENTER::ANSALPR_CPU();
std::string licenseKey = "";
std::string modelZipFile = currentPath.string() + "\\ANS_GenericALPR_v1.0.zip";
modelZipFile = "C:\\ProgramData\\ANSCENTER\\Shared\\ANS_GenericALPR_v1.0.zip";
std::string videoFilePath = "C:\\Programs\\DemoAssets\\Videos\\ALRP\\ALPR1.mp4";
std::string lpnResult;
bool result = infHandle->Initialize(licenseKey, modelZipFile, "",0.5, 0.5);
std::cout << "Loading ANSLRP:" << result << std::endl;
cv::VideoCapture capture(videoFilePath);
if (!capture.isOpened()) {
printf("could not read this video file...\n");
return -1;
}
while (true)
{
cv::Mat frame;
if (!capture.read(frame)) // if not success, break loop
{
std::cout << "\n Cannot read the video file. please check your video.\n";
break;
}
auto start = std::chrono::system_clock::now();
infHandle->Inference(frame, lpnResult);
std::string detectionResult = lpnResult;
std::cout << "Result:" << detectionResult;
if (!detectionResult.empty()) {
pt.clear();
std::stringstream ss;
ss.clear();
ss << detectionResult;
boost::property_tree::read_json(ss, pt);
BOOST_FOREACH(const boost::property_tree::ptree::value_type & child, pt.get_child("results"))
{
const boost::property_tree::ptree& result = child.second;
const auto class_id = GetData<int>(result, "class_id");
const auto class_name = GetData<std::string>(result, "class_name");
const auto x = GetData<float>(result, "x");
const auto y = GetData<float>(result, "y");
const auto width = GetData<float>(result, "width");
const auto height = GetData<float>(result, "height");
cv::rectangle(frame, cv::Rect(x, y, width, height), 123, 2);
cv::putText(frame, cv::format("%s", class_name), cv::Point(x, y - 5),
0, 0.6, cv::Scalar(0, 0, 255), 1, cv::LINE_AA);
}
}
auto end = std::chrono::system_clock::now();
auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
printf("Time = %lld ms\n", static_cast<long long int>(elapsed.count()));
cv::namedWindow("ANSLPR", cv::WINDOW_AUTOSIZE);
cv::imshow("ANSLPR", frame);
if (cv::waitKey(30) == 27) // Wait for 'esc' key press to exit
{
std::cout << "End of program faces.\n";
}
}
capture.release();
cv::destroyAllWindows();
}
int ANSLPR_BigSize_VideoTest() {
// Get the current working directory
std::filesystem::path currentPath = std::filesystem::current_path();
// Print the current working directory
std::cout << "Current working directory: " << currentPath << std::endl;
boost::property_tree::ptree root;
boost::property_tree::ptree detectionObjects;
boost::property_tree::ptree pt;
ANSCENTER::ANSALPR* infHandle = new ANSCENTER::ANSALPR_CPU();
std::string licenseKey = "";
std::string modelZipFile = currentPath.string() + "\\ANS_GenericALPR_v1.0.zip";
modelZipFile = "C:\\ProgramData\\ANSCENTER\\Shared\\ANS_GenericALPR_v1.0.zip";
std::string videoFilePath = "C:\\Programs\\DemoAssets\\Videos\\ALRP\\3725.mp4";
std::string lpnResult;
bool result = infHandle->Initialize(licenseKey, modelZipFile, "",0.5,0.5);
std::cout << "Loading ANSLRP:" << result << std::endl;
infHandle->LoadEngine();
cv::VideoCapture capture(videoFilePath);
if (!capture.isOpened()) {
printf("could not read this video file...\n");
return -1;
}
while (true)
{
cv::Mat frame;
if (!capture.read(frame)) // if not success, break loop
{
std::cout << "\n Cannot read the video file. please check your video.\n";
break;
}
auto start = std::chrono::system_clock::now();
infHandle->Inference(frame, lpnResult,"MyCam");
std::string detectionResult = lpnResult;
std::cout << "Result:" << detectionResult;
if (!detectionResult.empty()) {
pt.clear();
std::stringstream ss;
ss.clear();
ss << detectionResult;
boost::property_tree::read_json(ss, pt);
BOOST_FOREACH(const boost::property_tree::ptree::value_type & child, pt.get_child("results"))
{
const boost::property_tree::ptree& result = child.second;
const auto class_id = GetData<int>(result, "class_id");
const auto class_name = GetData<std::string>(result, "class_name");
const auto x = GetData<float>(result, "x");
const auto y = GetData<float>(result, "y");
const auto width = GetData<float>(result, "width");
const auto height = GetData<float>(result, "height");
cv::rectangle(frame, cv::Rect(x, y, width, height), 123, 2);
cv::putText(frame, cv::format("%s", class_name), cv::Point(x, y - 5),
0, 1.2, cv::Scalar(0, 0, 255), 1, cv::LINE_AA);
}
}
auto end = std::chrono::system_clock::now();
auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
printf("Time = %lld ms\n", static_cast<long long int>(elapsed.count()));
cv::resize(frame, frame, cv::Size(frame.cols / 2, frame.rows / 2)); // to half size or even smaller
cv::namedWindow("ANSLPR", cv::WINDOW_AUTOSIZE);
cv::imshow("ANSLPR", frame);
if (cv::waitKey(30) == 27) // Wait for 'esc' key press to exit
{
std::cout << "End of program faces.\n";
}
}
capture.release();
cv::destroyAllWindows();
}
std::string readJsonFile(const std::string& filePath) {
boost::property_tree::ptree pt;
boost::property_tree::read_json(filePath, pt);
std::ostringstream oss;
boost::property_tree::write_json(oss, pt, false);
return oss.str();
}
// Copy a cv::Mat's pixel data into a freshly allocated byte buffer.
// On return, `bufferLengh` holds the buffer size in bytes.
// The caller owns the returned buffer and must delete[] it.
unsigned char* CVMatToBytes(cv::Mat image, unsigned int& bufferLengh)
{
	// A non-continuous Mat (e.g. an ROI view) has row padding, so a single
	// memcpy from .data would copy garbage between rows; clone to get a
	// packed buffer first. The original assumed continuity unconditionally.
	cv::Mat packed = image.isContinuous() ? image : image.clone();
	const size_t size = packed.total() * packed.elemSize();
	std::cout << "size:" << size << std::endl;
	unsigned char* bytes = new unsigned char[size]; // caller must delete[]
	std::memcpy(bytes, packed.data, size); // sizeof(unsigned char) == 1, no multiplier needed
	bufferLengh = static_cast<unsigned int>(size);
	return bytes;
}
int ANSLPR_CPU_Inferences_FileTest() {
// Get the current working directory
std::filesystem::path currentPath = std::filesystem::current_path();
// Print the current working directory
std::cout << "Current working directory: " << currentPath << std::endl;
boost::property_tree::ptree root;
boost::property_tree::ptree detectionObjects;
boost::property_tree::ptree pt;
ANSCENTER::ANSALPR* infHandle;
std::string licenseKey = "";
std::string modelZipFile = "C:\\ProgramData\\ANSCENTER\\ANSVIS Server\\ANSALPR\\ANS_GenericALPR_v1.0.zip";
std::string imageFilePath = "C:\\Projects\\ANSVIS\\Documentation\\TestImages\\ALPR\\LP1.jpg";
std::string StrBox = readJsonFile("C:\\Projects\\ANLS\\Documents\\bboxStr.json");
int result = CreateANSALPRHandle(& infHandle, "", modelZipFile.c_str(), "",0,0.5,0.5,0);
std::cout << "Init Result:" << result << std::endl;
unsigned int bufferLength = 0;
cv::Mat input = cv::imread(imageFilePath, cv::IMREAD_COLOR);
cv::Mat frame = input;
unsigned char* jpeg_string = CVMatToBytes(frame, bufferLength);
int height = frame.rows;
int width = frame.cols;
auto start = std::chrono::system_clock::now();
std::string detectionResult = ANSALPR_RunInferenceBinaryInCroppedImages(&infHandle, jpeg_string, width, height, StrBox.c_str());
std::cout << "Result:" << detectionResult;
if (!detectionResult.empty()) {
pt.clear();
std::stringstream ss;
ss.clear();
ss << detectionResult;
boost::property_tree::read_json(ss, pt);
BOOST_FOREACH(const boost::property_tree::ptree::value_type & child, pt.get_child("results"))
{
const boost::property_tree::ptree& result = child.second;
const auto class_id = GetData<int>(result, "class_id");
const auto class_name = GetData<std::string>(result, "class_name");
const auto x = GetData<float>(result, "x");
const auto y = GetData<float>(result, "y");
const auto width = GetData<float>(result, "width");
const auto height = GetData<float>(result, "height");
cv::rectangle(frame, cv::Rect(x, y, width, height), 123, 2);
cv::putText(frame, cv::format("%s:%d", class_name, class_id), cv::Point(x, y - 5),
0, 0.6, cv::Scalar(0, 0, 255), 1, cv::LINE_AA);
}
}
auto end = std::chrono::system_clock::now();
auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
printf("Time = %lld ms\n", static_cast<long long int>(elapsed.count()));
//cv::namedWindow("ANSLPR", cv::WINDOW_AUTOSIZE);
//cv::imshow("ANSLPR", frame);
//cv::waitKey(0);
//cv::destroyAllWindows();
ReleaseANSALPRHandle(&infHandle);
}
// Drive the handle-based API (CreateANSALPRHandle / ANSALPR_RunInferenceComplete_CPP)
// over a local video, drawing detections on every frame.
// Returns 0 on normal completion, -1 if the video cannot be opened.
int ANSLPR_CV_VideoTest() {
	std::filesystem::path currentPath = std::filesystem::current_path();
	std::cout << "Current working directory: " << currentPath << std::endl;
	boost::property_tree::ptree pt;
	ANSCENTER::ANSALPR* infHandle;
	std::string licenseKey = "";
	std::string modelZipFile = currentPath.string() + "\\ANS_GenericALPR_v1.0.zip";
	modelZipFile = "C:\\Programs\\DemoAssets\\ModelsForANSVIS\\ANS_GenericALPR_v1.1.zip";
	std::string videoFilePath = "C:\\Programs\\DemoAssets\\Videos\\ALRP\\3725.mp4";
	std::string lpnResult;
	int result = CreateANSALPRHandle(&infHandle, licenseKey.c_str(), modelZipFile.c_str(), "", 0, 0.5, 0.5, 0);
	std::cout << "Loading ANSLRP:" << result << std::endl;
	cv::VideoCapture capture(videoFilePath);
	if (!capture.isOpened()) {
		printf("could not read this video file...\n");
		ReleaseANSALPRHandle(&infHandle); // the original leaked the handle on this path
		return -1;
	}
	while (true)
	{
		cv::Mat frame;
		if (!capture.read(frame)) // end of stream or decode failure
		{
			std::cout << "\n Cannot read the video file. please check your video.\n";
			break;
		}
		auto start = std::chrono::system_clock::now();
		std::string jpegImage;
		// The API takes cv::Mat**; ownership stays here, so release the
		// wrapper right after the call (frame itself is ref-counted).
		cv::Mat* image = new cv::Mat(frame);
		ANSALPR_RunInferenceComplete_CPP(&infHandle, &image, "MyCam", 0, 0, lpnResult, jpegImage);
		delete image;
		std::string detectionResult = lpnResult;
		std::cout << "Result:" << detectionResult;
		if (!detectionResult.empty()) {
			pt.clear();
			std::stringstream ss(detectionResult);
			boost::property_tree::read_json(ss, pt);
			BOOST_FOREACH(const boost::property_tree::ptree::value_type& child, pt.get_child("results"))
			{
				const boost::property_tree::ptree& det = child.second;
				const auto class_name = GetData<std::string>(det, "class_name");
				const auto x = GetData<float>(det, "x");
				const auto y = GetData<float>(det, "y");
				const auto w = GetData<float>(det, "width");
				const auto h = GetData<float>(det, "height");
				cv::rectangle(frame, cv::Rect(static_cast<int>(x), static_cast<int>(y),
					static_cast<int>(w), static_cast<int>(h)), 123, 2);
				// Draw the std::string directly: cv::format("%s", std::string)
				// is printf-varargs undefined behaviour.
				cv::putText(frame, class_name, cv::Point(static_cast<int>(x), static_cast<int>(y) - 5),
					0, 1.2, cv::Scalar(0, 0, 255), 1, cv::LINE_AA);
			}
		}
		auto end = std::chrono::system_clock::now();
		auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
		printf("Time = %lld ms\n", static_cast<long long int>(elapsed.count()));
		cv::resize(frame, frame, cv::Size(1920, 1080)); // fixed display size
		cv::namedWindow("ANSLPR", cv::WINDOW_AUTOSIZE);
		cv::imshow("ANSLPR", frame);
		if (cv::waitKey(30) == 27) // 'esc' key exits
		{
			std::cout << "End of program faces.\n";
			break; // the original printed the message but kept looping
		}
	}
	capture.release();
	cv::destroyAllWindows();
	ReleaseANSALPRHandle(&infHandle);
	return 0; // the original fell off the end of a non-void function (UB)
}
// End-to-end test of the optimised (engineType = 1) ALPR pipeline over a
// recorded video; the engine is loaded explicitly before the frame loop,
// and only the inference call itself is timed.
// Returns 0 on normal completion, -1 if the video cannot be opened.
int ANSLPR_OD_VideoTest() {
	std::filesystem::path currentPath = std::filesystem::current_path();
	std::cout << "Current working directory: " << currentPath << std::endl;
	boost::property_tree::ptree pt;
	ANSCENTER::ANSALPR* infHandle;
	std::string licenseKey = "";
	std::string modelZipFile = "C:\\ProgramData\\ANSCENTER\\ANSVIS Server\\ANSALPR\\ANS_ALPR_v1.2.zip";
	std::string videoFilePath = "C:\\ProgramData\\ANSCENTER\\Shared\\day.mp4";
	std::string lpnResult;
	int engineType = 1;
	double detectionThreshold = 0.5;
	double ocrThreshold = 0.5;
	double detectionColourThreshold = 0.5;
	int result = CreateANSALPRHandle(&infHandle, licenseKey.c_str(), modelZipFile.c_str(), "", engineType, detectionThreshold, ocrThreshold, detectionColourThreshold);
	std::cout << "Loading ANSLRP:" << result << std::endl;
	int loadEngine = LoadANSALPREngineHandle(&infHandle);
	std::cout << "Loading ANSLRP:" << loadEngine << std::endl;
	cv::VideoCapture capture(videoFilePath);
	if (!capture.isOpened()) {
		printf("could not read this video file...\n");
		ReleaseANSALPRHandle(&infHandle); // the original leaked the handle on this path
		return -1;
	}
	while (true)
	{
		cv::Mat frame;
		if (!capture.read(frame)) // end of stream or decode failure
		{
			std::cout << "\n Cannot read the video file. please check your video.\n";
			break;
		}
		auto start = std::chrono::system_clock::now();
		std::string jpegImage;
		// Ownership of the wrapper stays here; free it right after the call.
		cv::Mat* image = new cv::Mat(frame);
		ANSALPR_RunInferenceComplete_CPP(&infHandle, &image, "MyCam", 0, 0, lpnResult, jpegImage);
		delete image;
		auto end = std::chrono::system_clock::now();
		auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
		printf("Time = %lld ms\n", static_cast<long long int>(elapsed.count()));
		std::string detectionResult = lpnResult;
		if (!detectionResult.empty()) {
			pt.clear();
			std::stringstream ss(detectionResult);
			boost::property_tree::read_json(ss, pt);
			BOOST_FOREACH(const boost::property_tree::ptree::value_type& child, pt.get_child("results"))
			{
				const boost::property_tree::ptree& det = child.second;
				const auto class_name = GetData<std::string>(det, "class_name");
				const auto x = GetData<float>(det, "x");
				const auto y = GetData<float>(det, "y");
				const auto w = GetData<float>(det, "width");
				const auto h = GetData<float>(det, "height");
				cv::rectangle(frame, cv::Rect(static_cast<int>(x), static_cast<int>(y),
					static_cast<int>(w), static_cast<int>(h)), 123, 2);
				// Draw the std::string directly: cv::format("%s", std::string)
				// is printf-varargs undefined behaviour.
				cv::putText(frame, class_name, cv::Point(static_cast<int>(x), static_cast<int>(y) - 5),
					0, 1.2, cv::Scalar(0, 0, 255), 1, cv::LINE_AA);
			}
		}
		cv::resize(frame, frame, cv::Size(1920, 1080)); // fixed display size
		cv::namedWindow("ANSLPR", cv::WINDOW_AUTOSIZE);
		cv::imshow("ANSLPR", frame);
		if (cv::waitKey(30) == 27) // 'esc' key exits
		{
			std::cout << "End of program faces.\n";
			break; // the original printed the message but kept looping
		}
	}
	capture.release();
	cv::destroyAllWindows();
	ReleaseANSALPRHandle(&infHandle);
	return 0; // the original fell off the end of a non-void function (UB)
}
// Single-image test of the optimised engine, also measuring engine-load time.
// Returns 0 on success, -1 if the image cannot be read.
int ANSLPR_OD_Inferences_FileTest() {
	std::filesystem::path currentPath = std::filesystem::current_path();
	std::cout << "Current working directory: " << currentPath << std::endl;
	boost::property_tree::ptree pt;
	ANSCENTER::ANSALPR* infHandle;
	std::string licenseKey = "";
	std::string modelZipFile = "C:\\ProgramData\\ANSCENTER\\ANSVIS Server\\ANSALPR\\ServerOptimised\\ANS_ALPR_v1.2_NVIDIAGeForceRTX4070LaptopGPU.zip";
	std::string imageFilePath = "E:\\Programs\\DemoAssets\\Images\\ALPRTest\\WrongOrder\\1109.jpg";
	std::string lpnResult;
	int engineType = 1;
	double detectionThreshold = 0.3;
	double ocrThreshold = 0.5;
	double colourThreshold = 0.5;
	int result = CreateANSALPRHandle(&infHandle, licenseKey.c_str(), modelZipFile.c_str(), "", engineType, detectionThreshold, ocrThreshold, colourThreshold);
	std::cout << "Loading ANSLRP:" << result << std::endl;
	auto start = std::chrono::system_clock::now();
	int loadEngine = LoadANSALPREngineHandle(&infHandle);
	// The original logged `result` here, silently hiding engine-load failures.
	std::cout << "Init Result:" << loadEngine << std::endl;
	auto end = std::chrono::system_clock::now();
	auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
	printf("Time to load engine = %lld ms\n", static_cast<long long int>(elapsed.count()));
	std::string jpegImage;
	cv::Mat input = cv::imread(imageFilePath, cv::IMREAD_COLOR);
	if (input.empty()) { // imread returns an empty Mat on failure
		printf("could not read this image file...\n");
		ReleaseANSALPRHandle(&infHandle);
		return -1;
	}
	// Ownership of the wrapper stays here; free it right after the call.
	cv::Mat* image = new cv::Mat(input);
	ANSALPR_RunInferenceComplete_CPP(&infHandle, &image, "MyCam", 0, 0, lpnResult, jpegImage);
	delete image;
	std::string detectionResult = lpnResult;
	std::cout << "Result:" << detectionResult;
	if (!detectionResult.empty()) {
		pt.clear();
		std::stringstream ss(detectionResult);
		boost::property_tree::read_json(ss, pt);
		BOOST_FOREACH(const boost::property_tree::ptree::value_type& child, pt.get_child("results"))
		{
			const boost::property_tree::ptree& det = child.second;
			const auto class_name = GetData<std::string>(det, "class_name");
			const auto x = GetData<float>(det, "x");
			const auto y = GetData<float>(det, "y");
			const auto w = GetData<float>(det, "width");
			const auto h = GetData<float>(det, "height");
			cv::rectangle(input, cv::Rect(static_cast<int>(x), static_cast<int>(y),
				static_cast<int>(w), static_cast<int>(h)), 123, 2);
			// Draw the std::string directly: cv::format("%s", std::string)
			// is printf-varargs undefined behaviour.
			cv::putText(input, class_name, cv::Point(static_cast<int>(x), static_cast<int>(y) - 5),
				0, 1.2, cv::Scalar(0, 0, 255), 1, cv::LINE_AA);
		}
	}
	cv::resize(input, input, cv::Size(1920, 1080)); // fixed display size
	cv::namedWindow("ANSLPR", cv::WINDOW_AUTOSIZE);
	cv::imshow("ANSLPR", input);
	cv::waitKey(0);
	cv::destroyAllWindows();
	ReleaseANSALPRHandle(&infHandle);
	return 0;
}
// Single-image test of the Indonesian-plate model (ANS_ALPR_IND).
// Returns 0 on success, -1 if the image cannot be read.
int ANSLPR_OD_INDOInferences_FileTest() {
	std::filesystem::path currentPath = std::filesystem::current_path();
	std::cout << "Current working directory: " << currentPath << std::endl;
	boost::property_tree::ptree pt;
	ANSCENTER::ANSALPR* infHandle;
	std::string licenseKey = "";
	std::string modelZipFile = "E:\\Programs\\DemoAssets\\ModelsForANSVIS\\ANS_ALPR_IND_v1.1.zip";
	std::string imageFilePath = "E:\\Programs\\TrainingWorkingStation\\IndoALPR\\Indonesian License Plate Dataset\\data\\train075.jpg";
	std::string lpnResult;
	int engineType = 1;
	double detectionThreshold = 0.3;
	double ocrThreshold = 0.5;
	int result = CreateANSALPRHandle(&infHandle, licenseKey.c_str(), modelZipFile.c_str(), "", engineType, detectionThreshold, ocrThreshold, 0.5);
	std::cout << "Loading ANSLRP:" << result << std::endl;
	int loadEngine = LoadANSALPREngineHandle(&infHandle);
	// The original logged `result` here, silently hiding engine-load failures.
	std::cout << "Init Result:" << loadEngine << std::endl;
	std::string jpegImage;
	cv::Mat input = cv::imread(imageFilePath, cv::IMREAD_COLOR);
	if (input.empty()) { // imread returns an empty Mat on failure
		printf("could not read this image file...\n");
		ReleaseANSALPRHandle(&infHandle);
		return -1;
	}
	// Ownership of the wrapper stays here; free it right after the call.
	cv::Mat* image = new cv::Mat(input);
	ANSALPR_RunInferenceComplete_CPP(&infHandle, &image, "MyCam", 0, 0, lpnResult, jpegImage);
	delete image;
	std::string detectionResult = lpnResult;
	std::cout << "Result:" << detectionResult;
	if (!detectionResult.empty()) {
		pt.clear();
		std::stringstream ss(detectionResult);
		boost::property_tree::read_json(ss, pt);
		BOOST_FOREACH(const boost::property_tree::ptree::value_type& child, pt.get_child("results"))
		{
			const boost::property_tree::ptree& det = child.second;
			const auto class_name = GetData<std::string>(det, "class_name");
			const auto x = GetData<float>(det, "x");
			const auto y = GetData<float>(det, "y");
			const auto w = GetData<float>(det, "width");
			const auto h = GetData<float>(det, "height");
			cv::rectangle(input, cv::Rect(static_cast<int>(x), static_cast<int>(y),
				static_cast<int>(w), static_cast<int>(h)), 123, 2);
			// Draw the std::string directly: cv::format("%s", std::string)
			// is printf-varargs undefined behaviour.
			cv::putText(input, class_name, cv::Point(static_cast<int>(x), static_cast<int>(y) - 5),
				0, 1.2, cv::Scalar(0, 0, 255), 1, cv::LINE_AA);
		}
	}
	// (removed a stray, unused `auto end = std::chrono::system_clock::now();`)
	cv::resize(input, input, cv::Size(1920, 1080)); // fixed display size
	cv::namedWindow("ANSLPR", cv::WINDOW_AUTOSIZE);
	cv::imshow("ANSLPR", input);
	cv::waitKey(0);
	cv::destroyAllWindows();
	ReleaseANSALPRHandle(&infHandle);
	return 0;
}
// ============================================================================
// Multi-GPU ALPR Stress Test — 4 parallel RTSP→ALPR tasks
//
// Purpose: Diagnose why dual RTX 5080 performs worse than single RTX 3050.
// Each task has its own RTSP reader + ALPR engine. Tasks 0-1 read stream A,
// tasks 2-3 read stream B. All 4 run in parallel threads.
//
// The display composites all 4 views into a single resizable window with a
// log panel at the bottom showing per-task stats and GPU diagnostics.
// ============================================================================
// Thread-safe logger: collects timestamped messages for on-screen log + file
// Destination of the on-disk debug log; the file is truncated on every run
// (see ThreadSafeLog::init below).
static const char* LOG_FILE_PATH = "C:\\Temp\\ALPRdebug.txt";
// Thread-safe logger: keeps a bounded in-memory tail of recent lines for the
// on-screen log panel and mirrors every line (flushed immediately) to
// LOG_FILE_PATH so the file can be inspected while the test is running.
class ThreadSafeLog {
public:
	// Open/truncate the log file and write a banner with the start time.
	void init() {
		std::lock_guard<std::mutex> guard(m_mtx);
		m_file.open(LOG_FILE_PATH, std::ios::out | std::ios::trunc);
		if (!m_file.is_open()) return;
		const auto now = std::chrono::system_clock::now();
		const auto t = std::chrono::system_clock::to_time_t(now);
		struct tm lt;
		localtime_s(&lt, &t);
		char timeBuf[64];
		strftime(timeBuf, sizeof(timeBuf), "%Y-%m-%d %H:%M:%S", &lt);
		m_file << "================================================================\n";
		m_file << " ANSLPR Multi-GPU Stress Test Debug Log\n";
		m_file << " Started: " << timeBuf << "\n";
		m_file << " Log file: " << LOG_FILE_PATH << "\n";
		m_file << "================================================================\n\n";
		m_file.flush();
	}
	// Append a message stamped [HH:MM:SS.mmm] to both the in-memory tail
	// (capped at 200 lines) and the file (flushed so it is readable mid-run).
	void add(const std::string& msg) {
		std::lock_guard<std::mutex> guard(m_mtx);
		const auto now = std::chrono::system_clock::now();
		const auto t = std::chrono::system_clock::to_time_t(now);
		const auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(
			now.time_since_epoch()).count() % 1000;
		struct tm lt;
		localtime_s(&lt, &t);
		char stamp[32];
		snprintf(stamp, sizeof(stamp), "[%02d:%02d:%02d.%03lld] ",
			lt.tm_hour, lt.tm_min, lt.tm_sec, static_cast<long long>(ms));
		const std::string entry = std::string(stamp) + msg;
		m_lines.push_back(entry);
		if (m_lines.size() > 200) m_lines.pop_front();
		if (m_file.is_open()) {
			m_file << entry << "\n";
			m_file.flush();
		}
	}
	// Return a copy of the newest `n` lines (oldest first).
	std::deque<std::string> getRecent(size_t n) {
		std::lock_guard<std::mutex> guard(m_mtx);
		const size_t first = (m_lines.size() > n) ? m_lines.size() - n : 0;
		return std::deque<std::string>(m_lines.begin() + first, m_lines.end());
	}
	// Close the file; the in-memory tail is left intact.
	void close() {
		std::lock_guard<std::mutex> guard(m_mtx);
		if (m_file.is_open()) m_file.close();
	}
private:
	std::mutex m_mtx;                  // guards all members below
	std::deque<std::string> m_lines;   // bounded tail for the on-screen panel
	std::ofstream m_file;              // mirror of every line on disk
};
// Per-task shared state (written by worker thread, read by display thread)
// All fields other than `mtx` are updated by the worker under `mtx`
// (see ALPRWorkerThread), so readers should hold `mtx` as well.
struct TaskState {
std::mutex mtx;
cv::Mat displayFrame; // latest frame with detections drawn
double fps = 0.0; // sliding-window FPS computed by the worker
double inferenceMs = 0.0; // duration of the most recent inference call
int frameCount = 0; // total frames processed so far
int detectionCount= 0; // total detections accumulated so far
std::string lastPlate; // most recent non-empty plate text
bool engineLoaded = false;
bool streamOk = false;
std::string statusMsg = "Initializing...";
// GPU resource tracking (set during engine load)
int gpuDeviceId = -1; // which GPU this task's engine landed on
size_t vramUsedBytes = 0; // VRAM consumed by this task's engine
// Grab/Inference timing (updated by worker thread)
double lastGrabMs = 0.0; // time spent in the last frame grab
double lastInfMs = 0.0; // time spent in the last inference call
};
// Snapshot of GPU state for real-time monitoring
// All memory figures are in MiB (mebibytes), derived from cudaMemGetInfo.
struct GpuSnapshot {
int deviceId = 0; // CUDA device ordinal
std::string name; // device name from cudaGetDeviceProperties
size_t totalMiB = 0; // total VRAM
size_t freeMiB = 0; // free VRAM at snapshot time
size_t usedMiB = 0; // totalMiB - freeMiB
};
// Query current GPU VRAM usage for all CUDA devices.
// Returns one GpuSnapshot per device (empty if device enumeration fails).
// The caller's active CUDA device is restored before returning.
static std::vector<GpuSnapshot> QueryGpuVram() {
	std::vector<GpuSnapshot> snapshots;
	int deviceCount = 0;
	if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) return snapshots;
	// Save the active device once and restore it once at the end — the
	// original saved/restored it inside the loop on every iteration.
	int prevDevice = 0;
	cudaGetDevice(&prevDevice);
	for (int i = 0; i < deviceCount; i++) {
		cudaDeviceProp prop;
		// Skip a device whose properties cannot be read rather than
		// reporting an uninitialized name (the original ignored the status).
		if (cudaGetDeviceProperties(&prop, i) != cudaSuccess) continue;
		cudaSetDevice(i); // cudaMemGetInfo reports the *current* device
		size_t freeMem = 0, totalMem = 0;
		cudaMemGetInfo(&freeMem, &totalMem);
		GpuSnapshot s;
		s.deviceId = i;
		s.name = prop.name;
		s.totalMiB = totalMem / (1024 * 1024);
		s.freeMiB = freeMem / (1024 * 1024);
		s.usedMiB = s.totalMiB - s.freeMiB;
		snapshots.push_back(s);
	}
	cudaSetDevice(prevDevice);
	return snapshots;
}
// Measure per-GPU free VRAM in MiB, indexed by device id (empty if device
// enumeration fails). The caller's active device is restored on return.
static std::vector<size_t> GetPerGpuFreeMiB() {
	std::vector<size_t> freePerDevice;
	int deviceCount = 0;
	if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) return freePerDevice;
	int prevDevice;
	cudaGetDevice(&prevDevice);
	for (int dev = 0; dev < deviceCount; dev++) {
		cudaSetDevice(dev); // cudaMemGetInfo reports the *current* device
		size_t freeMem = 0, totalMem = 0;
		cudaMemGetInfo(&freeMem, &totalMem);
		freePerDevice.push_back(freeMem / (1024 * 1024));
	}
	cudaSetDevice(prevDevice);
	return freePerDevice;
}
// Global stop flag: set to false to ask all worker threads to exit their loops.
static std::atomic<bool> g_running{true};
// Shared logger used by all threads (see ThreadSafeLog above).
static ThreadSafeLog g_log;
// Log GPU info using CUDA runtime
static void LogGpuInfo() {
int deviceCount = 0;
cudaError_t err = cudaGetDeviceCount(&deviceCount);
if (err != cudaSuccess) {
g_log.add("CUDA ERROR: cudaGetDeviceCount failed: " + std::string(cudaGetErrorString(err)));
printf("[GPU] CUDA ERROR: %s\n", cudaGetErrorString(err));
return;
}
printf("============================================================\n");
printf(" GPU DEVICE REPORT — %d device(s) detected\n", deviceCount);
printf("============================================================\n");
g_log.add("GPU DEVICE REPORT: " + std::to_string(deviceCount) + " device(s)");
for (int i = 0; i < deviceCount; i++) {
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, i);
size_t freeMem = 0, totalMem = 0;
cudaSetDevice(i);
cudaMemGetInfo(&freeMem, &totalMem);
char buf[512];
snprintf(buf, sizeof(buf),
" GPU[%d] %s | SM %d.%d | VRAM: %.0f MiB total, %.0f MiB free",
i, prop.name, prop.major, prop.minor,
totalMem / 1048576.0, freeMem / 1048576.0);
printf("%s\n", buf);
g_log.add(buf);
snprintf(buf, sizeof(buf),
" GPU[%d] PCIe Bus %d, Device %d | Async Engines: %d | Concurrent Kernels: %d",
i, prop.pciBusID, prop.pciDeviceID,
prop.asyncEngineCount, prop.concurrentKernels);
printf("%s\n", buf);
g_log.add(buf);
}
printf("============================================================\n");
}
// Worker thread: reads RTSP frames and runs ALPR inference
// RTSP client and ALPR engine are pre-created on the main thread to avoid
// race conditions in CreateANSRTSPHandle / CreateANSALPRHandle.
//
// Loop per frame: grab via GetRTSPCVImage -> inference via
// ANSALPR_RunInferenceComplete_CPP -> parse/draw detections -> publish the
// annotated frame and stats into `state` (under state.mtx) -> delete the
// frame. Runs until g_running becomes false.
// NOTE(review): frames returned by GetRTSPCVImage are deleted here, so this
// code assumes the caller owns them — confirm against the ANSRTSP API.
static void ALPRWorkerThread(int taskId,
ANSCENTER::ANSRTSPClient* rtspClient,
ANSCENTER::ANSALPR* alprHandle,
TaskState& state) {
char tag[32];
snprintf(tag, sizeof(tag), "[Task%d]", taskId);
std::string prefix(tag);
g_log.add(prefix + " Worker thread started");
printf("%s Worker thread started\n", tag);
// --- Main inference loop ---
int width = 0, height = 0;
int64_t pts = 0;
int emptyFrames = 0; // consecutive empty grabs; triggers reconnect at 300
std::string cameraId = "Cam" + std::to_string(taskId);
// FPS tracking with sliding window
std::deque<std::chrono::steady_clock::time_point> fpsTimestamps;
// Timing accumulators for periodic benchmarking
double totalGrabMs = 0, totalInfMs = 0;
int grabCount = 0, infCount = 0;
double maxGrabMs = 0, maxInfMs = 0;
auto benchStart = std::chrono::steady_clock::now();
bool hwDecodeLogged = false; // log HW-decode status only once
while (g_running.load()) {
// Read frame from RTSP via ANSCV
auto grabStart = std::chrono::steady_clock::now();
cv::Mat* framePtr = nullptr;
GetRTSPCVImage(&rtspClient, width, height, pts, &framePtr);
auto grabEnd = std::chrono::steady_clock::now();
double grabMs = std::chrono::duration<double, std::milli>(grabEnd - grabStart).count();
if (framePtr == nullptr || framePtr->empty()) {
emptyFrames++;
// Throttled logging: every 100th empty frame.
if (emptyFrames % 100 == 1) {
g_log.add(prefix + " Empty frame (count=" + std::to_string(emptyFrames) + ")");
}
if (emptyFrames > 300) {
g_log.add(prefix + " Too many empty frames, attempting reconnect...");
ReconnectRTSP(&rtspClient);
emptyFrames = 0;
}
// framePtr may be a non-null but empty Mat; still owned here.
if (framePtr) delete framePtr;
std::this_thread::sleep_for(std::chrono::milliseconds(10));
continue;
}
emptyFrames = 0;
// Log HW decode status once after first successful frame
if (!hwDecodeLogged) {
hwDecodeLogged = true;
int hwActive = rtspClient->IsHWDecodingActive() ? 1 : 0;
bool isCuda = rtspClient->IsCudaHWAccel();
int hwGpu = rtspClient->GetHWDecodingGpuIndex();
char hwBuf[256];
const char* hwType = !hwActive ? "INACTIVE (software/CPU)"
: isCuda ? "ACTIVE (CUDA/NVDEC zero-copy)"
: "ACTIVE (D3D11VA/NVDEC cpu-nv12)";
snprintf(hwBuf, sizeof(hwBuf), "%s HW Decode: %s (GPU index: %d)",
tag, hwType, hwGpu);
g_log.add(hwBuf);
printf("%s\n", hwBuf);
}
totalGrabMs += grabMs;
grabCount++;
if (grabMs > maxGrabMs) maxGrabMs = grabMs;
// Run ALPR inference
auto infStart = std::chrono::steady_clock::now();
std::string lpnResult, jpegImage;
// Pass framePtr directly — NOT a copy. ANSGpuFrameRegistry::lookup()
// matches by cv::Mat* pointer, so `new cv::Mat(*framePtr)` would create
// a different pointer the registry doesn't know, breaking NV12 zero-copy.
ANSALPR_RunInferenceComplete_CPP(&alprHandle, &framePtr, cameraId.c_str(), 0, 0, lpnResult, jpegImage);
auto infEnd = std::chrono::steady_clock::now();
double infMs = std::chrono::duration<double, std::milli>(infEnd - infStart).count();
totalInfMs += infMs;
infCount++;
if (infMs > maxInfMs) maxInfMs = infMs;
// Parse detections and draw on frame
// (clone so drawing never touches the registry-tracked frame)
cv::Mat display = framePtr->clone();
int detCount = 0;
std::string lastPlateText;
if (!lpnResult.empty()) {
// Malformed JSON from the engine is ignored rather than fatal.
try {
boost::property_tree::ptree pt;
std::stringstream ss(lpnResult);
boost::property_tree::read_json(ss, pt);
BOOST_FOREACH(const boost::property_tree::ptree::value_type& child, pt.get_child("results")) {
const boost::property_tree::ptree& det = child.second;
const auto class_name = GetData<std::string>(det, "class_name");
const auto x = GetData<float>(det, "x");
const auto y = GetData<float>(det, "y");
const auto w = GetData<float>(det, "width");
const auto h = GetData<float>(det, "height");
cv::rectangle(display, cv::Rect((int)x, (int)y, (int)w, (int)h),
cv::Scalar(0, 255, 0), 2);
cv::putText(display, class_name, cv::Point((int)x, (int)y - 5),
cv::FONT_HERSHEY_SIMPLEX, 0.7, cv::Scalar(0, 255, 0), 2);
lastPlateText = class_name;
detCount++;
}
}
catch (...) {}
}
// Update FPS (sliding window over last 2 seconds)
auto now = std::chrono::steady_clock::now();
fpsTimestamps.push_back(now);
while (!fpsTimestamps.empty() &&
std::chrono::duration<double>(now - fpsTimestamps.front()).count() > 2.0) {
fpsTimestamps.pop_front();
}
double fps = fpsTimestamps.size() / 2.0; // frames in window / window seconds
// Draw OSD on frame
char osd[128];
snprintf(osd, sizeof(osd), "Task%d | %.1f FPS | Inf: %.0f ms | #%d",
taskId, fps, infMs, state.frameCount + 1);
cv::putText(display, osd, cv::Point(10, 30),
cv::FONT_HERSHEY_SIMPLEX, 0.7, cv::Scalar(0, 255, 255), 2);
// Update shared state
{
std::lock_guard<std::mutex> lk(state.mtx);
state.displayFrame = display;
state.fps = fps;
state.inferenceMs = infMs;
state.lastGrabMs = grabMs;
state.lastInfMs = infMs;
state.frameCount++;
state.detectionCount += detCount;
if (!lastPlateText.empty()) state.lastPlate = lastPlateText;
}
// Periodic logging (every 100 frames)
// state.frameCount is read here without the lock: this thread is its
// only writer, so the value cannot change underneath us.
if ((state.frameCount % 100) == 0) {
double avgGrab = grabCount > 0 ? totalGrabMs / grabCount : 0;
double avgInf = infCount > 0 ? totalInfMs / infCount : 0;
double elapsed = std::chrono::duration<double>(
std::chrono::steady_clock::now() - benchStart).count();
char buf[512];
snprintf(buf, sizeof(buf),
"%s Frame %d | FPS=%.1f | Grab: avg=%.1fms max=%.0fms | Inf: avg=%.1fms max=%.0fms | "
"GrabPct=%.0f%% InfPct=%.0f%% | Det=%d",
tag, state.frameCount, fps,
avgGrab, maxGrabMs,
avgInf, maxInfMs,
(totalGrabMs / (elapsed * 1000.0)) * 100.0,
(totalInfMs / (elapsed * 1000.0)) * 100.0,
state.detectionCount);
g_log.add(buf);
printf("%s\n", buf);
// Reset accumulators
totalGrabMs = totalInfMs = 0;
maxGrabMs = maxInfMs = 0;
grabCount = infCount = 0;
benchStart = std::chrono::steady_clock::now();
}
delete framePtr;
}
g_log.add(prefix + " Worker loop exited");
}
// -----------------------------------------------------------------------------
// ANSLPR_MultiGPU_StressTest
// Stress test: runs 4 parallel ALPR tasks, each fed by its own live RTSP
// stream. Phases, as implemented below:
//   1. Create and start 4 RTSP clients with CUDA/NVDEC hardware decoding.
//   2. Create/load 4 TensorRT ALPR engines sequentially, timing each load and
//      attributing the engine to a GPU via before/after free-VRAM deltas.
//   3. Align each stream's NVDEC decode GPU with the inference GPU(s) of the
//      tasks consuming it (majority vote) so the NV12 zero-copy path applies.
//   4. Launch one worker thread per task; the main thread renders a 2x2 video
//      grid plus a log panel and writes a periodic snapshot every 10 seconds.
//   5. On ESC: stop workers, join them, print/log a final summary and a
//      multi-GPU verdict, then release all ALPR and RTSP handles.
// Returns 0.
// -----------------------------------------------------------------------------
int ANSLPR_MultiGPU_StressTest() {
    ANSCENTER::ANSOPENCV::InitCameraNetwork();
    // --- Initialize log file ---
    g_log.init();
    printf("\n");
    printf("============================================================\n");
    printf(" ANSLPR Multi-GPU Stress Test — 4 Parallel ALPR Tasks\n");
    printf(" Press ESC to stop\n");
    printf(" Log file: %s\n", LOG_FILE_PATH);
    printf("============================================================\n\n");
    g_log.add("============================================================");
    g_log.add(" ANSLPR Multi-GPU Stress Test — 4 Parallel ALPR Tasks");
    g_log.add("============================================================");
    // --- Log GPU info for diagnostics ---
    LogGpuInfo();
    // --- RTSP URLs (4 independent streams, one per task) ---
    const std::string rtspUrl0 = "rtsp://admin:admin123@103.156.0.133:8010/cam/realmonitor?channel=1&subtype=0";
    const std::string rtspUrl1 = "rtsp://cafe2471.ddns.net:600/rtsp/streaming?channel=01&subtype=0";
    const std::string rtspUrl2 = "rtsp://nhathuocngoclinh.zapto.org:600/rtsp/streaming?channel=01&subtype=0";
    const std::string rtspUrl3 = "rtsp://bnunitttd.ddns.net:554/rtsp/streaming?channel=01&subtype=0";
    g_log.add("Stream 0: " + rtspUrl0);
    g_log.add("Stream 1: " + rtspUrl1);
    g_log.add("Stream 2: " + rtspUrl2);
    g_log.add("Stream 3: " + rtspUrl3);
    // --- Task states ---
    // Shared between this thread and the workers; each entry is guarded by
    // its own TaskState::mtx.
    TaskState taskStates[4];
    // =========================================================================
    // Create 4 INDEPENDENT RTSP readers — one per task, each with its own
    // camera stream. Each task gets a dedicated RTSP connection.
    // =========================================================================
    const int NUM_STREAMS = 4;
    ANSCENTER::ANSRTSPClient* rtspClients[NUM_STREAMS] = {};
    const std::string streamUrls[NUM_STREAMS] = { rtspUrl0, rtspUrl1, rtspUrl2, rtspUrl3 };
    // Map: task index -> stream index (1:1 mapping)
    const int taskStreamMap[4] = { 0, 1, 2, 3 };
    for (int s = 0; s < NUM_STREAMS; s++) {
        printf("[Stream%d] Creating RTSP handle for %s...\n", s, streamUrls[s].c_str());
        g_log.add("[Stream" + std::to_string(s) + "] Creating RTSP handle for " + streamUrls[s]);
        int rtspResult = CreateANSRTSPHandle(&rtspClients[s], "", "", "", streamUrls[s].c_str());
        if (rtspResult != 1 || rtspClients[s] == nullptr) {
            // A failed stream is left null; the matching task is skipped later.
            printf("[Stream%d] FAILED to create RTSP handle (result=%d)\n", s, rtspResult);
            g_log.add("[Stream" + std::to_string(s) + "] RTSP create FAILED");
            rtspClients[s] = nullptr;
            continue;
        }
        SetRTSPImageQuality(&rtspClients[s], 0);
        SetRTSPHWDecoding(&rtspClients[s], 7); // HW_DECODING_CUDA: force CUDA/NVDEC zero-copy path
        StartRTSP(&rtspClients[s]);
        g_log.add("[Stream" + std::to_string(s) + "] RTSP started");
    }
    // =========================================================================
    // Create 4 ALPR engines sequentially
    // =========================================================================
    ANSCENTER::ANSALPR* alprHandles[4] = {};
    std::string modelZipFile = "C:\\ProgramData\\ANSCENTER\\ANSVIS Server\\ANSALPR\\ANS_ALPR_v1.2.zip";
    int engineType = 1; // NVIDIA_GPU
    double detThresh = 0.5, ocrThresh = 0.5, colThresh = 0.5;
    for (int i = 0; i < 4; i++) {
        char tag[32];
        snprintf(tag, sizeof(tag), "[Task%d]", i);
        int streamIdx = taskStreamMap[i];
        if (rtspClients[streamIdx] == nullptr) {
            printf("%s Skipped — Stream%d not available\n", tag, streamIdx);
            std::lock_guard<std::mutex> lk(taskStates[i].mtx);
            taskStates[i].statusMsg = "Stream not available";
            continue;
        }
        {
            std::lock_guard<std::mutex> lk(taskStates[i].mtx);
            taskStates[i].streamOk = true;
            taskStates[i].statusMsg = "Loading ALPR engine...";
        }
        printf("%s Creating ALPR handle (engineType=%d)...\n", tag, engineType);
        g_log.add(std::string(tag) + " Creating ALPR handle...");
        auto engineStart = std::chrono::steady_clock::now();
        int createResult = CreateANSALPRHandle(&alprHandles[i], "", modelZipFile.c_str(), "",
            engineType, detThresh, ocrThresh, colThresh);
        if (createResult != 1 || alprHandles[i] == nullptr) {
            printf("%s FAILED to create ALPR handle (result=%d)\n", tag, createResult);
            g_log.add(std::string(tag) + " ALPR create FAILED");
            std::lock_guard<std::mutex> lk(taskStates[i].mtx);
            taskStates[i].statusMsg = "ALPR create failed";
            continue;
        }
        printf("%s Loading ALPR engine (TensorRT)...\n", tag);
        g_log.add(std::string(tag) + " Loading ALPR engine...");
        // Snapshot VRAM before engine load to measure consumption
        auto vramBefore = GetPerGpuFreeMiB();
        int loadResult = LoadANSALPREngineHandle(&alprHandles[i]);
        auto engineEnd = std::chrono::steady_clock::now();
        double loadMs = std::chrono::duration<double, std::milli>(engineEnd - engineStart).count();
        if (loadResult != 1) {
            printf("%s FAILED to load ALPR engine (result=%d)\n", tag, loadResult);
            g_log.add(std::string(tag) + " Engine load FAILED");
            ReleaseANSALPRHandle(&alprHandles[i]);
            alprHandles[i] = nullptr;
            std::lock_guard<std::mutex> lk(taskStates[i].mtx);
            taskStates[i].statusMsg = "Engine load failed";
            continue;
        }
        // Snapshot VRAM after engine load — find which GPU lost the most VRAM.
        // NOTE(review): this attributes the engine to the GPU with the largest
        // free-VRAM drop; other processes allocating concurrently could skew
        // the pick — confirm acceptable for a test harness.
        auto vramAfter = GetPerGpuFreeMiB();
        int bestGpu = 0;
        size_t maxDelta = 0;
        // Compare only the GPUs present in both snapshots.
        size_t gpuCount = vramBefore.size() < vramAfter.size() ? vramBefore.size() : vramAfter.size();
        for (size_t g = 0; g < gpuCount; g++) {
            size_t delta = (vramBefore[g] > vramAfter[g]) ? (vramBefore[g] - vramAfter[g]) : 0;
            if (delta > maxDelta) {
                maxDelta = delta;
                bestGpu = (int)g;
            }
        }
        char buf[512];
        snprintf(buf, sizeof(buf),
            "%s Engine loaded in %.0f ms | GPU[%d] | VRAM used: %zu MiB (Stream%d)",
            tag, loadMs, bestGpu, maxDelta, streamIdx);
        printf("%s\n", buf);
        g_log.add(buf);
        // Log per-GPU VRAM state after this engine load
        for (size_t g = 0; g < vramAfter.size(); g++) {
            size_t total = 0;
            if (g < vramBefore.size()) {
                // Compute total from free + used
                auto gpus = QueryGpuVram();
                if (g < gpus.size()) total = gpus[g].totalMiB;
            }
            char vbuf[256];
            snprintf(vbuf, sizeof(vbuf),
                " GPU[%zu] VRAM: %zu MiB free (of %zu MiB)",
                g, vramAfter[g], total);
            printf("%s\n", vbuf);
            g_log.add(vbuf);
        }
        {
            std::lock_guard<std::mutex> lk(taskStates[i].mtx);
            taskStates[i].engineLoaded = true;
            taskStates[i].statusMsg = "Running";
            taskStates[i].gpuDeviceId = bestGpu;
            taskStates[i].vramUsedBytes = maxDelta * 1024 * 1024;
        }
    }
    // --- Align NVDEC decode GPU with inference GPU for NV12 zero-copy ---
    // Each stream should decode on the same GPU as its inference engine to enable
    // direct NVDEC→TensorRT zero-copy (0.5ms vs 17ms preprocess per frame).
    //
    // Strategy: For each stream, count how many tasks run on each GPU (vote).
    // Pick the GPU with the most tasks → maximises the number of NV12 zero-copy hits.
    // If tied, prefer to keep the current decode GPU to avoid a reconnect.
    // Additional tie-breaker: distribute streams across GPUs for decode load balance.
    {
        int streamPreferredGpu[NUM_STREAMS];
        for (int s = 0; s < NUM_STREAMS; s++) streamPreferredGpu[s] = -1;
        // Track how many streams have already been assigned to each GPU (for tie-breaking)
        std::map<int, int> gpuStreamCount;
        for (int s = 0; s < NUM_STREAMS; s++) {
            if (!rtspClients[s]) continue;
            // Count votes: how many tasks on this stream use each GPU
            std::map<int, int> gpuVotes;
            for (int i = 0; i < 4; i++) {
                if (taskStreamMap[i] == s && alprHandles[i]) {
                    gpuVotes[taskStates[i].gpuDeviceId]++;
                }
            }
            if (gpuVotes.empty()) continue;
            // Find the GPU with the most votes
            int currentGpu = rtspClients[s]->GetHWDecodingGpuIndex();
            int bestGpu = -1;
            int bestVotes = 0;
            for (auto& [gpu, votes] : gpuVotes) {
                if (votes > bestVotes) {
                    bestVotes = votes;
                    bestGpu = gpu;
                } else if (votes == bestVotes) {
                    // Tie-break 1: prefer current decode GPU (avoids reconnect)
                    if (gpu == currentGpu && bestGpu != currentGpu) {
                        bestGpu = gpu;
                    }
                    // Tie-break 2: prefer GPU with fewer streams assigned (load balance)
                    else if (bestGpu != currentGpu && gpu != currentGpu) {
                        if (gpuStreamCount[gpu] < gpuStreamCount[bestGpu]) {
                            bestGpu = gpu;
                        }
                    }
                }
            }
            streamPreferredGpu[s] = bestGpu;
            gpuStreamCount[bestGpu]++;
            char buf[512];
            std::string voteStr;
            for (auto& [gpu, votes] : gpuVotes) {
                if (!voteStr.empty()) voteStr += ", ";
                voteStr += "GPU[" + std::to_string(gpu) + "]=" + std::to_string(votes);
            }
            snprintf(buf, sizeof(buf),
                "[Stream%d] GPU vote: {%s} -> preferred GPU[%d] (current: GPU[%d])",
                s, voteStr.c_str(), bestGpu, currentGpu);
            g_log.add(buf);
            printf("%s\n", buf);
        }
        // Apply alignment: reconnect streams whose NVDEC is on the wrong GPU.
        // IMPORTANT: If currentGpu == -1, the decoder hasn't initialized yet.
        // Do NOT reconnect — it disrupts the initial RTSP handshake and causes
        // 80+ seconds of empty frames. Just set preferredGpu; the decoder will
        // use it when it naturally initializes.
        for (int s = 0; s < NUM_STREAMS; s++) {
            if (rtspClients[s] && streamPreferredGpu[s] >= 0) {
                int currentGpu = rtspClients[s]->GetHWDecodingGpuIndex();
                if (currentGpu < 0) {
                    // Decoder not yet initialized — set preferred GPU without reconnect
                    SetRTSPHWDecoding(&rtspClients[s], 7, streamPreferredGpu[s]);
                    char buf[256];
                    snprintf(buf, sizeof(buf),
                        "[Stream%d] NVDEC not yet initialized (GPU[-1]) -- set preferred GPU[%d] (no reconnect)",
                        s, streamPreferredGpu[s]);
                    g_log.add(buf);
                    printf("%s\n", buf);
                } else if (currentGpu != streamPreferredGpu[s]) {
                    // Decoder is active on wrong GPU — reconnect to move it
                    SetRTSPHWDecoding(&rtspClients[s], 7, streamPreferredGpu[s]);
                    ReconnectRTSP(&rtspClients[s]);
                    char buf[256];
                    snprintf(buf, sizeof(buf),
                        "[Stream%d] NVDEC GPU realigned: GPU[%d] -> GPU[%d] (reconnected for zero-copy)",
                        s, currentGpu, streamPreferredGpu[s]);
                    g_log.add(buf);
                    printf("%s\n", buf);
                } else {
                    char buf[256];
                    snprintf(buf, sizeof(buf),
                        "[Stream%d] NVDEC GPU already on GPU[%d] (zero-copy OK)",
                        s, currentGpu);
                    g_log.add(buf);
                    printf("%s\n", buf);
                }
            }
        }
    }
    // --- Enable deep pipeline benchmarking on all ALPR handles ---
    for (int i = 0; i < 4; i++) {
        if (alprHandles[i]) {
            alprHandles[i]->ActivateDebugger(true);
        }
    }
    g_log.add("Debug benchmarking ENABLED on all ALPR handles");
    // --- Launch worker threads — tasks sharing a stream get the same RTSP client ---
    g_log.add("Launching worker threads...");
    std::thread workers[4];
    for (int i = 0; i < 4; i++) {
        int streamIdx = taskStreamMap[i];
        if (rtspClients[streamIdx] && alprHandles[i]) {
            workers[i] = std::thread(ALPRWorkerThread, i,
                rtspClients[streamIdx], alprHandles[i],
                std::ref(taskStates[i]));
        }
    }
    // --- Display loop (main thread) ---
    // 2x2 grid of per-task cells plus a log panel strip at the bottom.
    const int cellW = 640, cellH = 480;
    const int logPanelH = 200;
    cv::namedWindow("ANSLPR Multi-GPU Stress Test", cv::WINDOW_NORMAL);
    cv::resizeWindow("ANSLPR Multi-GPU Stress Test", cellW * 2, cellH * 2 + logPanelH);
    auto testStart = std::chrono::steady_clock::now();
    auto lastGpuSnapshot = std::chrono::steady_clock::now();
    int snapshotCount = 0;
    while (g_running.load()) {
        // --- Periodic GPU/perf snapshot every 10 seconds (written to log file) ---
        auto now2 = std::chrono::steady_clock::now();
        if (std::chrono::duration<double>(now2 - lastGpuSnapshot).count() >= 10.0) {
            lastGpuSnapshot = now2;
            snapshotCount++;
            double elapsedSec = std::chrono::duration<double>(now2 - testStart).count();
            g_log.add("---- PERIODIC SNAPSHOT #" + std::to_string(snapshotCount)
                + " (elapsed " + std::to_string((int)elapsedSec) + "s) ----");
            // GPU VRAM
            auto gpuSnap = QueryGpuVram();
            for (const auto& gs : gpuSnap) {
                char buf[256];
                snprintf(buf, sizeof(buf),
                    " GPU[%d] %s | Used: %zu/%zu MiB (%.1f%%)",
                    gs.deviceId, gs.name.c_str(), gs.usedMiB, gs.totalMiB,
                    gs.totalMiB > 0 ? 100.0 * gs.usedMiB / gs.totalMiB : 0.0);
                g_log.add(buf);
            }
            // Per-task stats
            double totalFpsSnap = 0;
            for (int t = 0; t < 4; t++) {
                std::lock_guard<std::mutex> lk(taskStates[t].mtx);
                char buf[256];
                snprintf(buf, sizeof(buf),
                    " T%d: GPU[%d] VRAM=%zuMiB FPS=%.1f GrabMs=%.0f InfMs=%.0f Frames=%d Det=%d",
                    t, taskStates[t].gpuDeviceId,
                    taskStates[t].vramUsedBytes / (1024 * 1024),
                    taskStates[t].fps, taskStates[t].lastGrabMs, taskStates[t].inferenceMs,
                    taskStates[t].frameCount, taskStates[t].detectionCount);
                g_log.add(buf);
                totalFpsSnap += taskStates[t].fps;
            }
            char buf[128];
            snprintf(buf, sizeof(buf), " Total throughput: %.1f FPS", totalFpsSnap);
            g_log.add(buf);
            // Multi-GPU check
            // NOTE(review): gpuDeviceId is read here without taskStates[t].mtx;
            // it is only written during the setup phase above, but the read is
            // strictly unsynchronized — confirm acceptable.
            std::set<int> gpusUsed;
            for (int t = 0; t < 4; t++) {
                if (taskStates[t].gpuDeviceId >= 0) gpusUsed.insert(taskStates[t].gpuDeviceId);
            }
            if (gpusUsed.size() > 1) {
                g_log.add(" MULTI-GPU: YES — tasks distributed across " + std::to_string(gpusUsed.size()) + " GPUs");
            } else if (!gpusUsed.empty()) {
                g_log.add(" MULTI-GPU: NO — all tasks on GPU[" + std::to_string(*gpusUsed.begin()) + "]");
            }
            g_log.add("---- END SNAPSHOT ----");
        }
        // Build 2x2 grid + log panel
        cv::Mat canvas(cellH * 2 + logPanelH, cellW * 2, CV_8UC3, cv::Scalar(30, 30, 30));
        // Place each task's frame in its quadrant
        for (int i = 0; i < 4; i++) {
            int row = i / 2, col = i % 2;
            cv::Rect roi(col * cellW, row * cellH, cellW, cellH);
            cv::Mat cell;
            double fps = 0, infMs = 0;
            int fCount = 0, dCount = 0;
            int gpuId = -1;
            size_t vramMiB = 0;
            std::string statusMsg, lastPlate;
            bool engineLoaded = false, streamOk = false;
            {
                // Copy everything needed for rendering under the lock, then
                // draw without holding it.
                std::lock_guard<std::mutex> lk(taskStates[i].mtx);
                if (!taskStates[i].displayFrame.empty()) {
                    cv::resize(taskStates[i].displayFrame, cell, cv::Size(cellW, cellH));
                }
                fps = taskStates[i].fps;
                infMs = taskStates[i].inferenceMs;
                fCount = taskStates[i].frameCount;
                dCount = taskStates[i].detectionCount;
                statusMsg = taskStates[i].statusMsg;
                lastPlate = taskStates[i].lastPlate;
                engineLoaded = taskStates[i].engineLoaded;
                streamOk = taskStates[i].streamOk;
                gpuId = taskStates[i].gpuDeviceId;
                vramMiB = taskStates[i].vramUsedBytes / (1024 * 1024);
            }
            if (cell.empty()) {
                // No frame yet — show the task's status message instead.
                cell = cv::Mat(cellH, cellW, CV_8UC3, cv::Scalar(40, 40, 40));
                cv::putText(cell, "Task " + std::to_string(i) + ": " + statusMsg,
                    cv::Point(20, cellH / 2),
                    cv::FONT_HERSHEY_SIMPLEX, 0.8, cv::Scalar(100, 100, 255), 2);
            }
            // Draw status bar at bottom of each cell (2 lines)
            cv::rectangle(cell, cv::Rect(0, cellH - 50, cellW, 50), cv::Scalar(0, 0, 0), cv::FILLED);
            char bar1[256], bar2[256];
            snprintf(bar1, sizeof(bar1), "T%d | %.1f FPS | %.0fms | Frames:%d | Det:%d | %s",
                i, fps, infMs, fCount, dCount,
                lastPlate.empty() ? "-" : lastPlate.c_str());
            if (gpuId >= 0) {
                snprintf(bar2, sizeof(bar2), "GPU[%d] | VRAM: %zu MiB", gpuId, vramMiB);
            } else {
                snprintf(bar2, sizeof(bar2), "GPU: N/A");
            }
            cv::Scalar barColor = engineLoaded ? cv::Scalar(0, 255, 0) : cv::Scalar(0, 100, 255);
            cv::putText(cell, bar1, cv::Point(5, cellH - 28),
                cv::FONT_HERSHEY_SIMPLEX, 0.45, barColor, 1);
            cv::putText(cell, bar2, cv::Point(5, cellH - 8),
                cv::FONT_HERSHEY_SIMPLEX, 0.45, cv::Scalar(0, 200, 255), 1);
            cell.copyTo(canvas(roi));
            // Draw grid lines
            cv::line(canvas, cv::Point(cellW, 0), cv::Point(cellW, cellH * 2),
                cv::Scalar(100, 100, 100), 1);
            cv::line(canvas, cv::Point(0, cellH), cv::Point(cellW * 2, cellH),
                cv::Scalar(100, 100, 100), 1);
        }
        // --- Log panel at bottom ---
        cv::Rect logRoi(0, cellH * 2, cellW * 2, logPanelH);
        cv::Mat logPanel = canvas(logRoi);
        logPanel.setTo(cv::Scalar(20, 20, 20));
        // Elapsed time header
        auto elapsed = std::chrono::duration<double>(std::chrono::steady_clock::now() - testStart).count();
        char header[128];
        snprintf(header, sizeof(header),
            "Elapsed: %.0fs | Press ESC to stop | Resize window freely", elapsed);
        cv::putText(logPanel, header, cv::Point(10, 18),
            cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(200, 200, 0), 1);
        // Aggregate stats + per-task GPU summary
        double totalFps = 0;
        for (int i = 0; i < 4; i++) {
            std::lock_guard<std::mutex> lk(taskStates[i].mtx);
            totalFps += taskStates[i].fps;
        }
        // NOTE(review): the gpuDeviceId reads below are outside the per-task
        // lock (written once during setup) — confirm acceptable.
        char aggLine[256];
        snprintf(aggLine, sizeof(aggLine), "Total throughput: %.1f FPS | T0:GPU%d T1:GPU%d T2:GPU%d T3:GPU%d",
            totalFps,
            taskStates[0].gpuDeviceId, taskStates[1].gpuDeviceId,
            taskStates[2].gpuDeviceId, taskStates[3].gpuDeviceId);
        cv::putText(logPanel, aggLine, cv::Point(10, 38),
            cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 255, 255), 1);
        // Real-time GPU VRAM monitor (query every frame — cheap call)
        auto gpuSnaps = QueryGpuVram();
        int gpuLineY = 58;
        for (const auto& gs : gpuSnaps) {
            // Count tasks on this GPU and their total VRAM
            int tasksOnGpu = 0;
            size_t taskVramMiB = 0;
            for (int i = 0; i < 4; i++) {
                std::lock_guard<std::mutex> lk(taskStates[i].mtx);
                if (taskStates[i].gpuDeviceId == gs.deviceId) {
                    tasksOnGpu++;
                    taskVramMiB += taskStates[i].vramUsedBytes / (1024 * 1024);
                }
            }
            char gpuLine[256];
            snprintf(gpuLine, sizeof(gpuLine),
                "GPU[%d] %s | Used: %zu/%zu MiB | Tasks: %d (engine VRAM: %zu MiB)",
                gs.deviceId, gs.name.c_str(), gs.usedMiB, gs.totalMiB,
                tasksOnGpu, taskVramMiB);
            cv::putText(logPanel, gpuLine, cv::Point(10, gpuLineY),
                cv::FONT_HERSHEY_SIMPLEX, 0.45, cv::Scalar(100, 255, 100), 1);
            gpuLineY += 18;
        }
        // Per-task resource line
        for (int i = 0; i < 4; i++) {
            std::lock_guard<std::mutex> lk(taskStates[i].mtx);
            char tLine[256];
            snprintf(tLine, sizeof(tLine),
                "T%d: GPU[%d] VRAM=%zuMiB FPS=%.1f Inf=%.0fms Frames=%d Det=%d",
                i, taskStates[i].gpuDeviceId,
                taskStates[i].vramUsedBytes / (1024 * 1024),
                taskStates[i].fps, taskStates[i].inferenceMs,
                taskStates[i].frameCount, taskStates[i].detectionCount);
            cv::putText(logPanel, tLine, cv::Point(10, gpuLineY),
                cv::FONT_HERSHEY_SIMPLEX, 0.4, cv::Scalar(200, 200, 200), 1);
            gpuLineY += 16;
        }
        // Recent log lines (remaining space)
        auto recentLogs = g_log.getRecent(4);
        for (const auto& line : recentLogs) {
            if (gpuLineY > logPanelH - 5) break;
            // Truncate long lines so they fit the panel width.
            std::string display = (line.size() > 130) ? line.substr(0, 127) + "..." : line;
            cv::putText(logPanel, display, cv::Point(10, gpuLineY),
                cv::FONT_HERSHEY_PLAIN, 1.0, cv::Scalar(140, 140, 140), 1);
            gpuLineY += 15;
        }
        cv::imshow("ANSLPR Multi-GPU Stress Test", canvas);
        int key = cv::waitKey(30);
        if (key == 27) { // ESC
            g_log.add("ESC pressed — stopping all tasks...");
            printf("\nESC pressed — stopping...\n");
            g_running.store(false);
        }
    }
    // --- Wait for all workers ---
    printf("Waiting for worker threads to finish...\n");
    for (int i = 0; i < 4; i++) {
        if (workers[i].joinable()) workers[i].join();
    }
    // --- Print final summary (console + log file) ---
    // Workers have been joined above, so taskStates can be read without locks.
    double totalElapsed = std::chrono::duration<double>(
        std::chrono::steady_clock::now() - testStart).count();
    g_log.add("================================================================");
    g_log.add(" FINAL PERFORMANCE SUMMARY");
    g_log.add(" Total runtime: " + std::to_string((int)totalElapsed) + " seconds");
    g_log.add("================================================================");
    printf("\n============================================================\n");
    printf(" FINAL PERFORMANCE SUMMARY (runtime: %.0fs)\n", totalElapsed);
    printf("============================================================\n");
    double totalFpsFinal = 0;
    for (int i = 0; i < 4; i++) {
        char buf[512];
        snprintf(buf, sizeof(buf),
            " Task %d: GPU[%d] | VRAM=%zuMiB | %d frames, %d detections, FPS=%.1f, InfMs=%.0f",
            i, taskStates[i].gpuDeviceId,
            taskStates[i].vramUsedBytes / (1024 * 1024),
            taskStates[i].frameCount, taskStates[i].detectionCount,
            taskStates[i].fps, taskStates[i].inferenceMs);
        printf("%s\n", buf);
        g_log.add(buf);
        totalFpsFinal += taskStates[i].fps;
    }
    auto finalGpu = QueryGpuVram();
    for (const auto& gs : finalGpu) {
        char buf[256];
        snprintf(buf, sizeof(buf), " GPU[%d] %s: %zu/%zu MiB used (%.1f%%)",
            gs.deviceId, gs.name.c_str(), gs.usedMiB, gs.totalMiB,
            gs.totalMiB > 0 ? 100.0 * gs.usedMiB / gs.totalMiB : 0.0);
        printf("%s\n", buf);
        g_log.add(buf);
    }
    // Multi-GPU verdict
    std::set<int> finalGpusUsed;
    for (int i = 0; i < 4; i++) {
        if (taskStates[i].gpuDeviceId >= 0) finalGpusUsed.insert(taskStates[i].gpuDeviceId);
    }
    {
        char buf[256];
        snprintf(buf, sizeof(buf), " Total throughput: %.1f FPS across 4 tasks", totalFpsFinal);
        printf("%s\n", buf);
        g_log.add(buf);
    }
    if (finalGpusUsed.size() > 1) {
        char buf[128];
        snprintf(buf, sizeof(buf), " MULTI-GPU: YES — tasks on %zu different GPUs", finalGpusUsed.size());
        printf("%s\n", buf);
        g_log.add(buf);
    } else if (!finalGpusUsed.empty()) {
        char buf[128];
        snprintf(buf, sizeof(buf), " MULTI-GPU: NO — all tasks on GPU[%d] only", *finalGpusUsed.begin());
        printf("%s\n", buf);
        g_log.add(buf);
        g_log.add(" DIAGNOSIS: Engine pool sees only 1 GPU. On dual-GPU systems, check:");
        g_log.add(" 1. Both GPUs visible to CUDA (nvidia-smi shows 2 devices)");
        g_log.add(" 2. TRT engine files are compatible with both GPU architectures");
        g_log.add(" 3. No CUDA_VISIBLE_DEVICES env var restricting GPU access");
    }
    printf("============================================================\n");
    g_log.add("================================================================");
    g_log.add(" Log saved to: " + std::string(LOG_FILE_PATH));
    g_log.add("================================================================");
    // --- Release all handles (sequentially on main thread) ---
    for (int i = 0; i < 4; i++) {
        if (alprHandles[i]) {
            ReleaseANSALPRHandle(&alprHandles[i]);
        }
    }
    for (int s = 0; s < NUM_STREAMS; s++) {
        if (rtspClients[s]) {
            StopRTSP(&rtspClients[s]);
            ReleaseANSRTSPHandle(&rtspClients[s]);
        }
    }
    g_log.close();
    cv::destroyAllWindows();
    ANSCENTER::ANSOPENCV::DeinitCameraNetwork();
    return 0;
}
// =============================================================================
// VideoPlayer-based worker thread for SimulatedCam stress test
// Same structure as ALPRWorkerThread but uses ANSVideoPlayer instead of ANSRTSP
// =============================================================================
// Per-task worker loop: grab a frame from the ANSVideoPlayer, run ALPR
// inference on it, draw detection boxes + an OSD line, publish the annotated
// frame and stats into `state` (under state.mtx), and emit a benchmark log
// line every 100 frames. Runs until the global g_running flag is cleared.
//
// taskId     — index used for the "[TaskN]" log prefix and the "CamN" id.
// vpClient   — video source handle; reconnected here after persistent empty
//              frames, but never released by this function.
// alprHandle — ALPR engine handle; not released by this function.
// state      — shared per-task state also read by the display thread;
//              mutable fields are written under state.mtx.
static void ALPRWorkerThread_VideoPlayer(int taskId,
    ANSCENTER::ANSVIDEOPLAYER* vpClient,
    ANSCENTER::ANSALPR* alprHandle,
    TaskState& state) {
    char tag[32];
    snprintf(tag, sizeof(tag), "[Task%d]", taskId);
    std::string prefix(tag);
    g_log.add(prefix + " Worker thread started");
    printf("%s Worker thread started\n", tag);
    int width = 0, height = 0;
    int64_t pts = 0;
    int emptyFrames = 0; // consecutive empty grabs; reset on a good frame
    std::string cameraId = "Cam" + std::to_string(taskId);
    // FPS tracking with sliding window
    std::deque<std::chrono::steady_clock::time_point> fpsTimestamps;
    // Timing accumulators for periodic benchmarking
    double totalGrabMs = 0, totalInfMs = 0;
    int grabCount = 0, infCount = 0;
    double maxGrabMs = 0, maxInfMs = 0;
    auto benchStart = std::chrono::steady_clock::now();
    while (g_running.load()) {
        // Read frame from VideoPlayer (heap-allocated cv::Mat; freed below)
        auto grabStart = std::chrono::steady_clock::now();
        cv::Mat* framePtr = nullptr;
        GetVideoPlayerCVImage(&vpClient, width, height, pts, &framePtr);
        auto grabEnd = std::chrono::steady_clock::now();
        double grabMs = std::chrono::duration<double, std::milli>(grabEnd - grabStart).count();
        if (framePtr == nullptr || framePtr->empty()) {
            // No frame available: log occasionally, reconnect after 300 in a
            // row, then back off briefly before retrying.
            emptyFrames++;
            if (emptyFrames % 100 == 1) {
                g_log.add(prefix + " Empty frame (count=" + std::to_string(emptyFrames) + ")");
            }
            if (emptyFrames > 300) {
                g_log.add(prefix + " Too many empty frames, attempting reconnect...");
                ReconnectVideoPlayer(&vpClient);
                emptyFrames = 0;
            }
            if (framePtr) delete framePtr;
            std::this_thread::sleep_for(std::chrono::milliseconds(10));
            continue;
        }
        emptyFrames = 0;
        totalGrabMs += grabMs;
        grabCount++;
        if (grabMs > maxGrabMs) maxGrabMs = grabMs;
        // Run ALPR inference
        auto infStart = std::chrono::steady_clock::now();
        std::string lpnResult, jpegImage;
        // Pass framePtr directly — NOT a copy. ANSGpuFrameRegistry::lookup()
        // matches by cv::Mat* pointer, so `new cv::Mat(*framePtr)` would create
        // a different pointer the registry doesn't know, breaking NV12 zero-copy.
        ANSALPR_RunInferenceComplete_CPP(&alprHandle, &framePtr, cameraId.c_str(), 0, 0, lpnResult, jpegImage);
        auto infEnd = std::chrono::steady_clock::now();
        double infMs = std::chrono::duration<double, std::milli>(infEnd - infStart).count();
        totalInfMs += infMs;
        infCount++;
        if (infMs > maxInfMs) maxInfMs = infMs;
        // Parse detections and draw on frame.
        // lpnResult is JSON with a "results" array whose entries carry
        // class_name/x/y/width/height; parse failures are ignored.
        cv::Mat display = framePtr->clone();
        int detCount = 0;
        std::string lastPlateText;
        if (!lpnResult.empty()) {
            try {
                boost::property_tree::ptree pt;
                std::stringstream ss(lpnResult);
                boost::property_tree::read_json(ss, pt);
                BOOST_FOREACH(const boost::property_tree::ptree::value_type& child, pt.get_child("results")) {
                    const boost::property_tree::ptree& det = child.second;
                    const auto class_name = GetData<std::string>(det, "class_name");
                    const auto x = GetData<float>(det, "x");
                    const auto y = GetData<float>(det, "y");
                    const auto w = GetData<float>(det, "width");
                    const auto h = GetData<float>(det, "height");
                    cv::rectangle(display, cv::Rect((int)x, (int)y, (int)w, (int)h),
                        cv::Scalar(0, 255, 0), 2);
                    cv::putText(display, class_name, cv::Point((int)x, (int)y - 5),
                        cv::FONT_HERSHEY_SIMPLEX, 0.7, cv::Scalar(0, 255, 0), 2);
                    lastPlateText = class_name;
                    detCount++;
                }
            }
            catch (...) {} // best-effort: malformed JSON leaves the frame unannotated
        }
        // Update FPS (sliding window over last 2 seconds)
        auto now = std::chrono::steady_clock::now();
        fpsTimestamps.push_back(now);
        while (!fpsTimestamps.empty() &&
            std::chrono::duration<double>(now - fpsTimestamps.front()).count() > 2.0) {
            fpsTimestamps.pop_front();
        }
        // frames observed in the window / 2-second window length
        double fps = fpsTimestamps.size() / 2.0;
        // Draw OSD on frame
        char osd[128];
        snprintf(osd, sizeof(osd), "Task%d | %.1f FPS | Inf: %.0f ms | #%d",
            taskId, fps, infMs, state.frameCount + 1);
        cv::putText(display, osd, cv::Point(10, 30),
            cv::FONT_HERSHEY_SIMPLEX, 0.7, cv::Scalar(0, 255, 255), 2);
        // Update shared state
        {
            std::lock_guard<std::mutex> lk(state.mtx);
            state.displayFrame = display;
            state.fps = fps;
            state.inferenceMs = infMs;
            state.lastGrabMs = grabMs;
            state.lastInfMs = infMs;
            state.frameCount++;
            state.detectionCount += detCount;
            if (!lastPlateText.empty()) state.lastPlate = lastPlateText;
        }
        // Periodic logging (every 100 frames)
        // NOTE(review): frameCount is read outside state.mtx here; this thread
        // is its only writer, but the unlocked read is strictly a race with
        // the display thread's locked reads — confirm acceptable.
        if ((state.frameCount % 100) == 0) {
            double avgGrab = grabCount > 0 ? totalGrabMs / grabCount : 0;
            double avgInf = infCount > 0 ? totalInfMs / infCount : 0;
            double elapsed = std::chrono::duration<double>(
                std::chrono::steady_clock::now() - benchStart).count();
            char buf[512];
            snprintf(buf, sizeof(buf),
                "%s Frame %d | FPS=%.1f | Grab: avg=%.1fms max=%.0fms | Inf: avg=%.1fms max=%.0fms | "
                "GrabPct=%.0f%% InfPct=%.0f%% | Det=%d",
                tag, state.frameCount, fps,
                avgGrab, maxGrabMs,
                avgInf, maxInfMs,
                (totalGrabMs / (elapsed * 1000.0)) * 100.0,
                (totalInfMs / (elapsed * 1000.0)) * 100.0,
                state.detectionCount);
            g_log.add(buf);
            printf("%s\n", buf);
            // Reset accumulators
            totalGrabMs = totalInfMs = 0;
            maxGrabMs = maxInfMs = 0;
            grabCount = infCount = 0;
            benchStart = std::chrono::steady_clock::now();
        }
        delete framePtr; // frame is owned by this loop iteration
    }
    g_log.add(prefix + " Worker loop exited");
}
// =============================================================================
// ANSLPR_MultiGPU_StressTest_SimulatedCam
// Same structure as ANSLPR_MultiGPU_StressTest but uses local video files
// via ANSVideoPlayer instead of live RTSP streams.
// =============================================================================
int ANSLPR_MultiGPU_StressTest_SimulatedCam() {
ANSCENTER::ANSOPENCV::InitCameraNetwork();
// --- Initialize log file ---
g_log.init();
printf("\n");
printf("============================================================\n");
printf(" ANSLPR Multi-GPU Stress Test (Simulated Cam)\n");
printf(" Using local video files via ANSVideoPlayer\n");
printf(" Press ESC to stop\n");
printf(" Log file: %s\n", LOG_FILE_PATH);
printf("============================================================\n\n");
g_log.add("============================================================");
g_log.add(" ANSLPR Multi-GPU Stress Test (Simulated Cam)");
g_log.add(" Using ANSVideoPlayer with local video files");
g_log.add("============================================================");
// --- Log GPU info for diagnostics ---
LogGpuInfo();
// --- Video file paths (4 files, one per task) ---
const std::string videoFile0 = "E:\\Programs\\DemoAssets\\Videos\\ALRP\\PMH\\Day\\day.mp4";
const std::string videoFile1 = "E:\\Programs\\DemoAssets\\Videos\\ALRP\\PMH\\Day\\day_1.mp4";
const std::string videoFile2 = "E:\\Programs\\DemoAssets\\Videos\\ALRP\\PMH\\Day\\day_2.mp4";
const std::string videoFile3 = "E:\\Programs\\DemoAssets\\Videos\\ALRP\\PMH\\Day\\day_3.mp4";
g_log.add("Video 0: " + videoFile0);
g_log.add("Video 1: " + videoFile1);
g_log.add("Video 2: " + videoFile2);
g_log.add("Video 3: " + videoFile3);
// --- Task states ---
TaskState taskStates[4];
// =========================================================================
// Create 4 VideoPlayer readers — one per task
// =========================================================================
const int NUM_STREAMS = 4;
ANSCENTER::ANSVIDEOPLAYER* vpClients[NUM_STREAMS] = {};
const std::string videoFiles[NUM_STREAMS] = { videoFile0, videoFile1, videoFile2, videoFile3 };
const int taskStreamMap[4] = { 0, 1, 2, 3 };
for (int s = 0; s < NUM_STREAMS; s++) {
printf("[Stream%d] Creating VideoPlayer for %s\n", s, videoFiles[s].c_str());
g_log.add("[Stream" + std::to_string(s) + "] Creating VideoPlayer for " + videoFiles[s]);
int result = CreateANSVideoPlayerHandle(&vpClients[s], "", videoFiles[s].c_str());
if (result != 1 || vpClients[s] == nullptr) {
printf("[Stream%d] FAILED to create VideoPlayer (result=%d)\n", s, result);
g_log.add("[Stream" + std::to_string(s) + "] VideoPlayer create FAILED");
vpClients[s] = nullptr;
continue;
}
// Don't call StartVideoPlayer here — play() will be called just before worker threads
// launch, so the video doesn't play to completion during the ~16s engine loading phase.
SetVideoPlayerDisplayResolution(&vpClients[s], 1920, 1080);
g_log.add("[Stream" + std::to_string(s) + "] VideoPlayer created (display: 1920x1080)");
}
// =========================================================================
// Create 4 ALPR engines sequentially
// =========================================================================
ANSCENTER::ANSALPR* alprHandles[4] = {};
std::string modelZipFile = "C:\\ProgramData\\ANSCENTER\\ANSVIS Server\\ANSALPR\\ANS_ALPR_v1.2.zip";
int engineType = 1; // NVIDIA_GPU
double detThresh = 0.5, ocrThresh = 0.5, colThresh = 0.5;
for (int i = 0; i < 4; i++) {
char tag[32];
snprintf(tag, sizeof(tag), "[Task%d]", i);
int streamIdx = taskStreamMap[i];
if (vpClients[streamIdx] == nullptr) {
printf("%s Skipped — Stream%d not available\n", tag, streamIdx);
std::lock_guard<std::mutex> lk(taskStates[i].mtx);
taskStates[i].statusMsg = "Stream not available";
continue;
}
{
std::lock_guard<std::mutex> lk(taskStates[i].mtx);
taskStates[i].streamOk = true;
taskStates[i].statusMsg = "Loading ALPR engine...";
}
printf("%s Creating ALPR handle (engineType=%d)...\n", tag, engineType);
g_log.add(std::string(tag) + " Creating ALPR handle...");
auto engineStart = std::chrono::steady_clock::now();
int createResult = CreateANSALPRHandle(&alprHandles[i], "", modelZipFile.c_str(), "",
engineType, detThresh, ocrThresh, colThresh);
if (createResult != 1 || alprHandles[i] == nullptr) {
printf("%s FAILED to create ALPR handle (result=%d)\n", tag, createResult);
g_log.add(std::string(tag) + " ALPR create FAILED");
std::lock_guard<std::mutex> lk(taskStates[i].mtx);
taskStates[i].statusMsg = "ALPR create failed";
continue;
}
printf("%s Loading ALPR engine (TensorRT)...\n", tag);
g_log.add(std::string(tag) + " Loading ALPR engine...");
// Snapshot VRAM before engine load to measure consumption
auto vramBefore = GetPerGpuFreeMiB();
int loadResult = LoadANSALPREngineHandle(&alprHandles[i]);
auto engineEnd = std::chrono::steady_clock::now();
double loadMs = std::chrono::duration<double, std::milli>(engineEnd - engineStart).count();
if (loadResult != 1) {
printf("%s FAILED to load ALPR engine (result=%d)\n", tag, loadResult);
g_log.add(std::string(tag) + " Engine load FAILED");
ReleaseANSALPRHandle(&alprHandles[i]);
alprHandles[i] = nullptr;
std::lock_guard<std::mutex> lk(taskStates[i].mtx);
taskStates[i].statusMsg = "Engine load failed";
continue;
}
// Snapshot VRAM after engine load — find which GPU lost the most VRAM
auto vramAfter = GetPerGpuFreeMiB();
int bestGpu = 0;
size_t maxDelta = 0;
size_t gpuCount = vramBefore.size() < vramAfter.size() ? vramBefore.size() : vramAfter.size();
for (size_t g = 0; g < gpuCount; g++) {
size_t delta = (vramBefore[g] > vramAfter[g]) ? (vramBefore[g] - vramAfter[g]) : 0;
if (delta > maxDelta) {
maxDelta = delta;
bestGpu = (int)g;
}
}
char buf[512];
snprintf(buf, sizeof(buf),
"%s Engine loaded in %.0f ms | GPU[%d] | VRAM used: %zu MiB (Video%d)",
tag, loadMs, bestGpu, maxDelta, streamIdx);
printf("%s\n", buf);
g_log.add(buf);
// Log per-GPU VRAM state after this engine load
for (size_t g = 0; g < vramAfter.size(); g++) {
size_t total = 0;
if (g < vramBefore.size()) {
auto gpus = QueryGpuVram();
if (g < gpus.size()) total = gpus[g].totalMiB;
}
char vbuf[256];
snprintf(vbuf, sizeof(vbuf),
" GPU[%zu] VRAM: %zu MiB free (of %zu MiB)",
g, vramAfter[g], total);
printf("%s\n", vbuf);
g_log.add(vbuf);
}
{
std::lock_guard<std::mutex> lk(taskStates[i].mtx);
taskStates[i].engineLoaded = true;
taskStates[i].statusMsg = "Running";
taskStates[i].gpuDeviceId = bestGpu;
taskStates[i].vramUsedBytes = maxDelta * 1024 * 1024;
}
}
// --- No NVDEC realignment needed — ANSVideoPlayer uses cv::VideoCapture (CPU decode) ---
// --- Enable deep pipeline benchmarking on all ALPR handles ---
for (int i = 0; i < 4; i++) {
if (alprHandles[i]) {
alprHandles[i]->ActivateDebugger(true);
}
}
g_log.add("Debug benchmarking ENABLED on all ALPR handles");
// --- Start video playback NOW (just before workers need frames) ---
// This avoids the video playing to completion during the ~16s engine loading phase.
for (int s = 0; s < NUM_STREAMS; s++) {
if (vpClients[s]) {
StartVideoPlayer(&vpClients[s]);
g_log.add("[Stream" + std::to_string(s) + "] VideoPlayer play() started");
}
}
// --- Launch worker threads ---
g_log.add("Launching worker threads...");
std::thread workers[4];
for (int i = 0; i < 4; i++) {
int streamIdx = taskStreamMap[i];
if (vpClients[streamIdx] && alprHandles[i]) {
workers[i] = std::thread(ALPRWorkerThread_VideoPlayer, i,
vpClients[streamIdx], alprHandles[i],
std::ref(taskStates[i]));
}
}
// --- Display loop (main thread) ---
const int cellW = 640, cellH = 480;
const int logPanelH = 200;
const char* windowName = "ANSLPR Multi-GPU Stress Test (Simulated Cam)";
cv::namedWindow(windowName, cv::WINDOW_NORMAL);
cv::resizeWindow(windowName, cellW * 2, cellH * 2 + logPanelH);
auto testStart = std::chrono::steady_clock::now();
auto lastGpuSnapshot = std::chrono::steady_clock::now();
int snapshotCount = 0;
while (g_running.load()) {
// --- Periodic GPU/perf snapshot every 10 seconds ---
auto now2 = std::chrono::steady_clock::now();
if (std::chrono::duration<double>(now2 - lastGpuSnapshot).count() >= 10.0) {
lastGpuSnapshot = now2;
snapshotCount++;
double elapsedSec = std::chrono::duration<double>(now2 - testStart).count();
g_log.add("---- PERIODIC SNAPSHOT #" + std::to_string(snapshotCount)
+ " (elapsed " + std::to_string((int)elapsedSec) + "s) ----");
auto gpuSnap = QueryGpuVram();
for (const auto& gs : gpuSnap) {
char buf[256];
snprintf(buf, sizeof(buf),
" GPU[%d] %s | Used: %zu/%zu MiB (%.1f%%)",
gs.deviceId, gs.name.c_str(), gs.usedMiB, gs.totalMiB,
gs.totalMiB > 0 ? 100.0 * gs.usedMiB / gs.totalMiB : 0.0);
g_log.add(buf);
}
double totalFpsSnap = 0;
for (int t = 0; t < 4; t++) {
std::lock_guard<std::mutex> lk(taskStates[t].mtx);
char buf[256];
snprintf(buf, sizeof(buf),
" T%d: GPU[%d] VRAM=%zuMiB FPS=%.1f GrabMs=%.0f InfMs=%.0f Frames=%d Det=%d",
t, taskStates[t].gpuDeviceId,
taskStates[t].vramUsedBytes / (1024 * 1024),
taskStates[t].fps, taskStates[t].lastGrabMs, taskStates[t].inferenceMs,
taskStates[t].frameCount, taskStates[t].detectionCount);
g_log.add(buf);
totalFpsSnap += taskStates[t].fps;
}
char buf[128];
snprintf(buf, sizeof(buf), " Total throughput: %.1f FPS", totalFpsSnap);
g_log.add(buf);
std::set<int> gpusUsed;
for (int t = 0; t < 4; t++) {
if (taskStates[t].gpuDeviceId >= 0) gpusUsed.insert(taskStates[t].gpuDeviceId);
}
if (gpusUsed.size() > 1) {
g_log.add(" MULTI-GPU: YES — tasks distributed across " + std::to_string(gpusUsed.size()) + " GPUs");
} else if (!gpusUsed.empty()) {
g_log.add(" MULTI-GPU: NO — all tasks on GPU[" + std::to_string(*gpusUsed.begin()) + "]");
}
g_log.add("---- END SNAPSHOT ----");
}
// Build 2x2 grid + log panel
cv::Mat canvas(cellH * 2 + logPanelH, cellW * 2, CV_8UC3, cv::Scalar(30, 30, 30));
for (int i = 0; i < 4; i++) {
int row = i / 2, col = i % 2;
cv::Rect roi(col * cellW, row * cellH, cellW, cellH);
cv::Mat cell;
double fps = 0, infMs = 0;
int fCount = 0, dCount = 0;
int gpuId = -1;
size_t vramMiB = 0;
std::string statusMsg, lastPlate;
bool engineLoaded = false, streamOk = false;
{
std::lock_guard<std::mutex> lk(taskStates[i].mtx);
if (!taskStates[i].displayFrame.empty()) {
cv::resize(taskStates[i].displayFrame, cell, cv::Size(cellW, cellH));
}
fps = taskStates[i].fps;
infMs = taskStates[i].inferenceMs;
fCount = taskStates[i].frameCount;
dCount = taskStates[i].detectionCount;
statusMsg = taskStates[i].statusMsg;
lastPlate = taskStates[i].lastPlate;
engineLoaded = taskStates[i].engineLoaded;
streamOk = taskStates[i].streamOk;
gpuId = taskStates[i].gpuDeviceId;
vramMiB = taskStates[i].vramUsedBytes / (1024 * 1024);
}
if (cell.empty()) {
cell = cv::Mat(cellH, cellW, CV_8UC3, cv::Scalar(40, 40, 40));
cv::putText(cell, "Task " + std::to_string(i) + ": " + statusMsg,
cv::Point(20, cellH / 2),
cv::FONT_HERSHEY_SIMPLEX, 0.8, cv::Scalar(100, 100, 255), 2);
}
cv::rectangle(cell, cv::Rect(0, cellH - 50, cellW, 50), cv::Scalar(0, 0, 0), cv::FILLED);
char bar1[256], bar2[256];
snprintf(bar1, sizeof(bar1), "T%d | %.1f FPS | %.0fms | Frames:%d | Det:%d | %s",
i, fps, infMs, fCount, dCount,
lastPlate.empty() ? "-" : lastPlate.c_str());
if (gpuId >= 0) {
snprintf(bar2, sizeof(bar2), "GPU[%d] | VRAM: %zu MiB", gpuId, vramMiB);
} else {
snprintf(bar2, sizeof(bar2), "GPU: N/A");
}
cv::Scalar barColor = engineLoaded ? cv::Scalar(0, 255, 0) : cv::Scalar(0, 100, 255);
cv::putText(cell, bar1, cv::Point(5, cellH - 28),
cv::FONT_HERSHEY_SIMPLEX, 0.45, barColor, 1);
cv::putText(cell, bar2, cv::Point(5, cellH - 8),
cv::FONT_HERSHEY_SIMPLEX, 0.45, cv::Scalar(0, 200, 255), 1);
cell.copyTo(canvas(roi));
cv::line(canvas, cv::Point(cellW, 0), cv::Point(cellW, cellH * 2),
cv::Scalar(100, 100, 100), 1);
cv::line(canvas, cv::Point(0, cellH), cv::Point(cellW * 2, cellH),
cv::Scalar(100, 100, 100), 1);
}
// --- Log panel at bottom ---
cv::Rect logRoi(0, cellH * 2, cellW * 2, logPanelH);
cv::Mat logPanel = canvas(logRoi);
logPanel.setTo(cv::Scalar(20, 20, 20));
auto elapsed = std::chrono::duration<double>(std::chrono::steady_clock::now() - testStart).count();
char header[128];
snprintf(header, sizeof(header),
"Elapsed: %.0fs | Simulated Cam (VideoPlayer) | Press ESC to stop", elapsed);
cv::putText(logPanel, header, cv::Point(10, 18),
cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(200, 200, 0), 1);
double totalFps = 0;
for (int i = 0; i < 4; i++) {
std::lock_guard<std::mutex> lk(taskStates[i].mtx);
totalFps += taskStates[i].fps;
}
char aggLine[256];
snprintf(aggLine, sizeof(aggLine), "Total throughput: %.1f FPS | T0:GPU%d T1:GPU%d T2:GPU%d T3:GPU%d",
totalFps,
taskStates[0].gpuDeviceId, taskStates[1].gpuDeviceId,
taskStates[2].gpuDeviceId, taskStates[3].gpuDeviceId);
cv::putText(logPanel, aggLine, cv::Point(10, 38),
cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 255, 255), 1);
auto gpuSnaps = QueryGpuVram();
int gpuLineY = 58;
for (const auto& gs : gpuSnaps) {
int tasksOnGpu = 0;
size_t taskVramMiB = 0;
for (int i = 0; i < 4; i++) {
std::lock_guard<std::mutex> lk(taskStates[i].mtx);
if (taskStates[i].gpuDeviceId == gs.deviceId) {
tasksOnGpu++;
taskVramMiB += taskStates[i].vramUsedBytes / (1024 * 1024);
}
}
char gpuLine[256];
snprintf(gpuLine, sizeof(gpuLine),
"GPU[%d] %s | Used: %zu/%zu MiB | Tasks: %d (engine VRAM: %zu MiB)",
gs.deviceId, gs.name.c_str(), gs.usedMiB, gs.totalMiB,
tasksOnGpu, taskVramMiB);
cv::putText(logPanel, gpuLine, cv::Point(10, gpuLineY),
cv::FONT_HERSHEY_SIMPLEX, 0.45, cv::Scalar(100, 255, 100), 1);
gpuLineY += 18;
}
for (int i = 0; i < 4; i++) {
std::lock_guard<std::mutex> lk(taskStates[i].mtx);
char tLine[256];
snprintf(tLine, sizeof(tLine),
"T%d: GPU[%d] VRAM=%zuMiB FPS=%.1f Inf=%.0fms Frames=%d Det=%d",
i, taskStates[i].gpuDeviceId,
taskStates[i].vramUsedBytes / (1024 * 1024),
taskStates[i].fps, taskStates[i].inferenceMs,
taskStates[i].frameCount, taskStates[i].detectionCount);
cv::putText(logPanel, tLine, cv::Point(10, gpuLineY),
cv::FONT_HERSHEY_SIMPLEX, 0.4, cv::Scalar(200, 200, 200), 1);
gpuLineY += 16;
}
auto recentLogs = g_log.getRecent(4);
for (const auto& line : recentLogs) {
if (gpuLineY > logPanelH - 5) break;
std::string display = (line.size() > 130) ? line.substr(0, 127) + "..." : line;
cv::putText(logPanel, display, cv::Point(10, gpuLineY),
cv::FONT_HERSHEY_PLAIN, 1.0, cv::Scalar(140, 140, 140), 1);
gpuLineY += 15;
}
cv::imshow(windowName, canvas);
int key = cv::waitKey(30);
if (key == 27) { // ESC
g_log.add("ESC pressed — stopping all tasks...");
printf("\nESC pressed — stopping...\n");
g_running.store(false);
}
}
// --- Wait for all workers ---
printf("Waiting for worker threads to finish...\n");
for (int i = 0; i < 4; i++) {
if (workers[i].joinable()) workers[i].join();
}
// --- Print final summary ---
double totalElapsed = std::chrono::duration<double>(
std::chrono::steady_clock::now() - testStart).count();
g_log.add("================================================================");
g_log.add(" FINAL PERFORMANCE SUMMARY (Simulated Cam)");
g_log.add(" Total runtime: " + std::to_string((int)totalElapsed) + " seconds");
g_log.add("================================================================");
printf("\n============================================================\n");
printf(" FINAL PERFORMANCE SUMMARY — Simulated Cam (runtime: %.0fs)\n", totalElapsed);
printf("============================================================\n");
double totalFpsFinal = 0;
for (int i = 0; i < 4; i++) {
char buf[512];
snprintf(buf, sizeof(buf),
" Task %d: GPU[%d] | VRAM=%zuMiB | %d frames, %d detections, FPS=%.1f, InfMs=%.0f",
i, taskStates[i].gpuDeviceId,
taskStates[i].vramUsedBytes / (1024 * 1024),
taskStates[i].frameCount, taskStates[i].detectionCount,
taskStates[i].fps, taskStates[i].inferenceMs);
printf("%s\n", buf);
g_log.add(buf);
totalFpsFinal += taskStates[i].fps;
}
auto finalGpu = QueryGpuVram();
for (const auto& gs : finalGpu) {
char buf[256];
snprintf(buf, sizeof(buf), " GPU[%d] %s: %zu/%zu MiB used (%.1f%%)",
gs.deviceId, gs.name.c_str(), gs.usedMiB, gs.totalMiB,
gs.totalMiB > 0 ? 100.0 * gs.usedMiB / gs.totalMiB : 0.0);
printf("%s\n", buf);
g_log.add(buf);
}
std::set<int> finalGpusUsed;
for (int i = 0; i < 4; i++) {
if (taskStates[i].gpuDeviceId >= 0) finalGpusUsed.insert(taskStates[i].gpuDeviceId);
}
{
char buf[256];
snprintf(buf, sizeof(buf), " Total throughput: %.1f FPS across 4 tasks", totalFpsFinal);
printf("%s\n", buf);
g_log.add(buf);
}
if (finalGpusUsed.size() > 1) {
char buf[128];
snprintf(buf, sizeof(buf), " MULTI-GPU: YES — tasks on %zu different GPUs", finalGpusUsed.size());
printf("%s\n", buf);
g_log.add(buf);
} else if (!finalGpusUsed.empty()) {
char buf[128];
snprintf(buf, sizeof(buf), " MULTI-GPU: NO — all tasks on GPU[%d] only", *finalGpusUsed.begin());
printf("%s\n", buf);
g_log.add(buf);
}
printf("============================================================\n");
g_log.add("================================================================");
g_log.add(" Log saved to: " + std::string(LOG_FILE_PATH));
g_log.add("================================================================");
// --- Release all handles ---
for (int i = 0; i < 4; i++) {
if (alprHandles[i]) {
ReleaseANSALPRHandle(&alprHandles[i]);
}
}
for (int s = 0; s < NUM_STREAMS; s++) {
if (vpClients[s]) {
StopVideoPlayer(&vpClients[s]);
ReleaseANSVideoPlayerHandle(&vpClients[s]);
}
}
g_log.close();
cv::destroyAllWindows();
ANSCENTER::ANSOPENCV::DeinitCameraNetwork();
return 0;
}
// =============================================================================
// Worker thread for FilePlayer-based stress test (uses ANSFILEPLAYER)
// Key difference from VideoPlayer worker: uses GetFilePlayerCVImage/ReconnectFilePlayer
// =============================================================================
static void ALPRWorkerThread_FilePlayer(int taskId,
    ANSCENTER::ANSFILEPLAYER* fpClient,
    ANSCENTER::ANSALPR* alprHandle,
    TaskState& state) {
    // Per-task loop: grab frame -> run ALPR inference -> draw detections ->
    // publish display frame + stats into `state`. Runs until the global
    // g_running flag is cleared (ESC in the main display loop).
    // Lock discipline: this thread is the only WRITER of `state`; all writes
    // happen under state.mtx. A few reads below are done without the lock —
    // benign for the sole writer, noted where they occur.
    char tag[32];
    snprintf(tag, sizeof(tag), "[Task%d]", taskId);
    std::string prefix(tag);
    g_log.add(prefix + " Worker thread started");
    printf("%s Worker thread started\n", tag);
    int width = 0, height = 0;
    int64_t pts = 0;
    int emptyFrames = 0;  // consecutive empty grabs; reset on any good frame
    std::string cameraId = "Cam" + std::to_string(taskId);
    // Sliding-window FPS: timestamps of frames completed in the last 2 seconds.
    std::deque<std::chrono::steady_clock::time_point> fpsTimestamps;
    // Rolling benchmark accumulators; logged and reset every 100 frames below.
    double totalGrabMs = 0, totalInfMs = 0;
    int grabCount = 0, infCount = 0;
    double maxGrabMs = 0, maxInfMs = 0;
    auto benchStart = std::chrono::steady_clock::now();
    while (g_running.load()) {
        auto grabStart = std::chrono::steady_clock::now();
        cv::Mat* framePtr = nullptr;
        // GetFilePlayerCVImage hands back a heap-allocated cv::Mat*; this
        // thread owns it and must delete it every iteration (both on the
        // empty-frame path and at the bottom of the loop).
        GetFilePlayerCVImage(&fpClient, width, height, pts, &framePtr);
        auto grabEnd = std::chrono::steady_clock::now();
        double grabMs = std::chrono::duration<double, std::milli>(grabEnd - grabStart).count();
        if (framePtr == nullptr || framePtr->empty()) {
            emptyFrames++;
            // Log sparsely (every 100th empty grab) to avoid flooding the log.
            if (emptyFrames % 100 == 1) {
                g_log.add(prefix + " Empty frame (count=" + std::to_string(emptyFrames) + ")");
            }
            // ~3s of continuous empty frames (300 x 10ms sleep) => reconnect.
            if (emptyFrames > 300) {
                g_log.add(prefix + " Too many empty frames, attempting reconnect...");
                ReconnectFilePlayer(&fpClient);
                emptyFrames = 0;
            }
            // A non-null-but-empty Mat still needs to be freed.
            if (framePtr) delete framePtr;
            std::this_thread::sleep_for(std::chrono::milliseconds(10));
            continue;
        }
        emptyFrames = 0;
        totalGrabMs += grabMs;
        grabCount++;
        if (grabMs > maxGrabMs) maxGrabMs = grabMs;
        auto infStart = std::chrono::steady_clock::now();
        std::string lpnResult, jpegImage;
        // Pass framePtr directly — NOT a copy. ANSGpuFrameRegistry::lookup()
        // matches by cv::Mat* pointer, so `new cv::Mat(*framePtr)` would create
        // a different pointer the registry doesn't know, breaking NV12 zero-copy.
        ANSALPR_RunInferenceComplete_CPP(&alprHandle, &framePtr, cameraId.c_str(), 0, 0, lpnResult, jpegImage);
        auto infEnd = std::chrono::steady_clock::now();
        double infMs = std::chrono::duration<double, std::milli>(infEnd - infStart).count();
        totalInfMs += infMs;
        infCount++;
        if (infMs > maxInfMs) maxInfMs = infMs;
        // Clone for annotation so the original frame buffer stays untouched.
        cv::Mat display = framePtr->clone();
        int detCount = 0;
        std::string lastPlateText;
        if (!lpnResult.empty()) {
            // lpnResult is a JSON document with a "results" array; each entry
            // carries class_name (plate text) and an x/y/width/height box.
            try {
                boost::property_tree::ptree pt;
                std::stringstream ss(lpnResult);
                boost::property_tree::read_json(ss, pt);
                BOOST_FOREACH(const boost::property_tree::ptree::value_type& child, pt.get_child("results")) {
                    const boost::property_tree::ptree& det = child.second;
                    const auto class_name = GetData<std::string>(det, "class_name");
                    const auto x = GetData<float>(det, "x");
                    const auto y = GetData<float>(det, "y");
                    const auto w = GetData<float>(det, "width");
                    const auto h = GetData<float>(det, "height");
                    cv::rectangle(display, cv::Rect((int)x, (int)y, (int)w, (int)h),
                        cv::Scalar(0, 255, 0), 2);
                    cv::putText(display, class_name, cv::Point((int)x, (int)y - 5),
                        cv::FONT_HERSHEY_SIMPLEX, 0.7, cv::Scalar(0, 255, 0), 2);
                    lastPlateText = class_name;
                    detCount++;
                }
            }
            catch (...) {}  // best-effort: malformed JSON just skips annotation
        }
        auto now = std::chrono::steady_clock::now();
        fpsTimestamps.push_back(now);
        // Drop timestamps older than the 2-second window.
        while (!fpsTimestamps.empty() &&
            std::chrono::duration<double>(now - fpsTimestamps.front()).count() > 2.0) {
            fpsTimestamps.pop_front();
        }
        double fps = fpsTimestamps.size() / 2.0;
        char osd[128];
        // NOTE: state.frameCount read without the lock — safe, this thread is
        // the only writer of it.
        snprintf(osd, sizeof(osd), "Task%d | %.1f FPS | Inf: %.0f ms | #%d",
            taskId, fps, infMs, state.frameCount + 1);
        cv::putText(display, osd, cv::Point(10, 30),
            cv::FONT_HERSHEY_SIMPLEX, 0.7, cv::Scalar(0, 255, 255), 2);
        {
            // Publish the latest frame + stats for the main display loop.
            std::lock_guard<std::mutex> lk(state.mtx);
            state.displayFrame = display;
            state.fps = fps;
            state.inferenceMs = infMs;
            state.lastGrabMs = grabMs;
            state.lastInfMs = infMs;
            state.frameCount++;
            state.detectionCount += detCount;
            if (!lastPlateText.empty()) state.lastPlate = lastPlateText;
        }
        // Every 100 frames: dump rolling benchmark stats, then reset the
        // accumulators. (frameCount/detectionCount read lock-free — sole writer.)
        if ((state.frameCount % 100) == 0) {
            double avgGrab = grabCount > 0 ? totalGrabMs / grabCount : 0;
            double avgInf = infCount > 0 ? totalInfMs / infCount : 0;
            double elapsed = std::chrono::duration<double>(
                std::chrono::steady_clock::now() - benchStart).count();
            char buf[512];
            snprintf(buf, sizeof(buf),
                "%s Frame %d | FPS=%.1f | Grab: avg=%.1fms max=%.0fms | Inf: avg=%.1fms max=%.0fms | "
                "GrabPct=%.0f%% InfPct=%.0f%% | Det=%d",
                tag, state.frameCount, fps,
                avgGrab, maxGrabMs,
                avgInf, maxInfMs,
                (totalGrabMs / (elapsed * 1000.0)) * 100.0,
                (totalInfMs / (elapsed * 1000.0)) * 100.0,
                state.detectionCount);
            g_log.add(buf);
            printf("%s\n", buf);
            totalGrabMs = totalInfMs = 0;
            maxGrabMs = maxInfMs = 0;
            grabCount = infCount = 0;
            benchStart = std::chrono::steady_clock::now();
        }
        // Frame ownership ends here — free the grabbed frame.
        delete framePtr;
    }
    g_log.add(prefix + " Worker loop exited");
}
// =============================================================================
// ANSLPR_MultiGPU_StressTest_FilePlayer
// Same as SimulatedCam but uses ANSFILEPLAYER (loops video continuously).
// =============================================================================
int ANSLPR_MultiGPU_StressTest_FilePlayer() {
    // Multi-GPU ALPR stress test driven by 4 looping local video files.
    // Phases:
    //   1) Create 4 FilePlayer readers (playback deliberately NOT started yet).
    //   2) Create + load 4 ALPR engines sequentially, measuring per-engine load
    //      time and VRAM delta to infer which GPU hosts each engine.
    //   3) Start playback, launch one worker thread per task.
    //   4) Main-thread display loop (2x2 grid + log panel) until ESC.
    //   5) Join workers, print final summary, release all handles.
    // Returns 0 unconditionally.
    ANSCENTER::ANSOPENCV::InitCameraNetwork();
    g_log.init();
    printf("\n");
    printf("============================================================\n");
    printf(" ANSLPR Multi-GPU Stress Test (FilePlayer — looping)\n");
    printf(" Using local video files via ANSFilePlayer (HW decode)\n");
    printf(" Press ESC to stop\n");
    printf(" Log file: %s\n", LOG_FILE_PATH);
    printf("============================================================\n\n");
    g_log.add("============================================================");
    g_log.add(" ANSLPR Multi-GPU Stress Test (FilePlayer — looping)");
    g_log.add(" Using ANSFilePlayer with HW decode + NV12 zero-copy");
    g_log.add("============================================================");
    LogGpuInfo();
    // Hard-coded test assets; one file per simulated stream.
    const std::string videoFile0 = "E:\\Programs\\DemoAssets\\Videos\\ALRP\\PMH\\Day\\day.mp4";
    const std::string videoFile1 = "E:\\Programs\\DemoAssets\\Videos\\ALRP\\PMH\\Day\\day_1.mp4";
    const std::string videoFile2 = "E:\\Programs\\DemoAssets\\Videos\\ALRP\\PMH\\Day\\day_2.mp4";
    const std::string videoFile3 = "E:\\Programs\\DemoAssets\\Videos\\ALRP\\PMH\\Day\\day_3.mp4";
    g_log.add("Video 0: " + videoFile0);
    g_log.add("Video 1: " + videoFile1);
    g_log.add("Video 2: " + videoFile2);
    g_log.add("Video 3: " + videoFile3);
    TaskState taskStates[4];
    // =========================================================================
    // Create 4 FilePlayer readers
    // =========================================================================
    const int NUM_STREAMS = 4;
    ANSCENTER::ANSFILEPLAYER* fpClients[NUM_STREAMS] = {};
    const std::string videoFiles[NUM_STREAMS] = { videoFile0, videoFile1, videoFile2, videoFile3 };
    // Identity mapping task -> stream; kept as a table so it can be remapped.
    const int taskStreamMap[4] = { 0, 1, 2, 3 };
    for (int s = 0; s < NUM_STREAMS; s++) {
        printf("[Stream%d] Creating FilePlayer for %s\n", s, videoFiles[s].c_str());
        g_log.add("[Stream" + std::to_string(s) + "] Creating FilePlayer for " + videoFiles[s]);
        int result = CreateANSFilePlayerHandle(&fpClients[s], "", videoFiles[s].c_str());
        if (result != 1 || fpClients[s] == nullptr) {
            printf("[Stream%d] FAILED to create FilePlayer (result=%d)\n", s, result);
            g_log.add("[Stream" + std::to_string(s) + "] FilePlayer create FAILED");
            fpClients[s] = nullptr;
            continue;
        }
        // Don't start yet — start after engines are loaded
        SetFilePlayerDisplayResolution(&fpClients[s], 1920, 1080);
        g_log.add("[Stream" + std::to_string(s) + "] FilePlayer created (display: 1920x1080)");
    }
    // =========================================================================
    // Create 4 ALPR engines sequentially
    // =========================================================================
    ANSCENTER::ANSALPR* alprHandles[4] = {};
    std::string modelZipFile = "C:\\ProgramData\\ANSCENTER\\ANSVIS Server\\ANSALPR\\ANS_ALPR_v1.2.zip";
    int engineType = 1;  // same value the SimulatedCam test labels NVIDIA_GPU
    double detThresh = 0.5, ocrThresh = 0.5, colThresh = 0.5;
    for (int i = 0; i < 4; i++) {
        char tag[32];
        snprintf(tag, sizeof(tag), "[Task%d]", i);
        int streamIdx = taskStreamMap[i];
        if (fpClients[streamIdx] == nullptr) {
            printf("%s Skipped — Stream%d not available\n", tag, streamIdx);
            std::lock_guard<std::mutex> lk(taskStates[i].mtx);
            taskStates[i].statusMsg = "Stream not available";
            continue;
        }
        {
            std::lock_guard<std::mutex> lk(taskStates[i].mtx);
            taskStates[i].streamOk = true;
            taskStates[i].statusMsg = "Loading ALPR engine...";
        }
        printf("%s Creating ALPR handle (engineType=%d)...\n", tag, engineType);
        g_log.add(std::string(tag) + " Creating ALPR handle...");
        // NOTE(review): engineStart is taken BEFORE handle creation, so the
        // "Engine loaded in N ms" figure below includes create + load time.
        auto engineStart = std::chrono::steady_clock::now();
        int createResult = CreateANSALPRHandle(&alprHandles[i], "", modelZipFile.c_str(), "",
            engineType, detThresh, ocrThresh, colThresh);
        if (createResult != 1 || alprHandles[i] == nullptr) {
            printf("%s FAILED to create ALPR handle (result=%d)\n", tag, createResult);
            g_log.add(std::string(tag) + " ALPR create FAILED");
            std::lock_guard<std::mutex> lk(taskStates[i].mtx);
            taskStates[i].statusMsg = "ALPR create failed";
            continue;
        }
        printf("%s Loading ALPR engine (TensorRT)...\n", tag);
        g_log.add(std::string(tag) + " Loading ALPR engine...");
        // Snapshot free VRAM per GPU before the load, to measure consumption.
        auto vramBefore = GetPerGpuFreeMiB();
        int loadResult = LoadANSALPREngineHandle(&alprHandles[i]);
        auto engineEnd = std::chrono::steady_clock::now();
        double loadMs = std::chrono::duration<double, std::milli>(engineEnd - engineStart).count();
        if (loadResult != 1) {
            printf("%s FAILED to load ALPR engine (result=%d)\n", tag, loadResult);
            g_log.add(std::string(tag) + " Engine load FAILED");
            ReleaseANSALPRHandle(&alprHandles[i]);
            alprHandles[i] = nullptr;
            std::lock_guard<std::mutex> lk(taskStates[i].mtx);
            taskStates[i].statusMsg = "Engine load failed";
            continue;
        }
        // Heuristic: the GPU whose free VRAM dropped the most during the load
        // is assumed to be the one hosting this engine.
        auto vramAfter = GetPerGpuFreeMiB();
        int bestGpu = 0;
        size_t maxDelta = 0;
        for (size_t g = 0; g < vramBefore.size() && g < vramAfter.size(); g++) {
            size_t delta = (vramBefore[g] > vramAfter[g]) ? vramBefore[g] - vramAfter[g] : 0;
            if (delta > maxDelta) { maxDelta = delta; bestGpu = (int)g; }
        }
        char ebuf[256];
        snprintf(ebuf, sizeof(ebuf), "%s Engine loaded in %d ms | GPU[%d] | VRAM used: %zu MiB (Video%d)",
            tag, (int)loadMs, bestGpu, maxDelta, i);
        printf("%s\n", ebuf);
        g_log.add(ebuf);
        // Log the per-GPU free/total VRAM state after this engine load.
        for (size_t g = 0; g < vramAfter.size(); g++) {
            size_t total = 0;
            cudaDeviceProp prop;
            if (cudaGetDeviceProperties(&prop, (int)g) == cudaSuccess) {
                total = prop.totalGlobalMem / (1024 * 1024);
            }
            char vbuf[128];
            snprintf(vbuf, sizeof(vbuf), " GPU[%zu] VRAM: %zu MiB free (of %zu MiB)", g, vramAfter[g], total);
            printf("%s\n", vbuf);
            g_log.add(vbuf);
        }
        {
            // gpuDeviceId/vramUsedBytes are written only here, before any
            // worker thread exists; later reads treat them as immutable.
            std::lock_guard<std::mutex> lk(taskStates[i].mtx);
            taskStates[i].engineLoaded = true;
            taskStates[i].statusMsg = "Running";
            taskStates[i].gpuDeviceId = bestGpu;
            taskStates[i].vramUsedBytes = maxDelta * 1024 * 1024;
        }
    }
    // --- Enable debug benchmarking ---
    for (int i = 0; i < 4; i++) {
        if (alprHandles[i]) {
            alprHandles[i]->ActivateDebugger(true);
        }
    }
    g_log.add("Debug benchmarking ENABLED on all ALPR handles");
    // --- Start video playback NOW (just before workers need frames) ---
    for (int s = 0; s < NUM_STREAMS; s++) {
        if (fpClients[s]) {
            StartFilePlayer(&fpClients[s]);
            g_log.add("[Stream" + std::to_string(s) + "] FilePlayer play() started");
        }
    }
    // --- Launch worker threads ---
    g_log.add("Launching worker threads...");
    std::thread workers[4];
    for (int i = 0; i < 4; i++) {
        int streamIdx = taskStreamMap[i];
        // A task only runs if BOTH its stream and its engine came up.
        if (fpClients[streamIdx] && alprHandles[i]) {
            workers[i] = std::thread(ALPRWorkerThread_FilePlayer, i,
                fpClients[streamIdx], alprHandles[i],
                std::ref(taskStates[i]));
        }
    }
    // --- Display loop (main thread) ---
    const int cellW = 640, cellH = 480;
    const int logPanelH = 200;
    const char* windowName = "ANSLPR Stress Test (FilePlayer — looping)";
    cv::namedWindow(windowName, cv::WINDOW_NORMAL);
    cv::resizeWindow(windowName, cellW * 2, cellH * 2 + logPanelH);
    auto testStart = std::chrono::steady_clock::now();
    auto lastGpuSnapshot = std::chrono::steady_clock::now();
    int snapshotCount = 0;
    while (g_running.load()) {
        // --- Periodic GPU/perf snapshot every 10 seconds ---
        auto now2 = std::chrono::steady_clock::now();
        if (std::chrono::duration<double>(now2 - lastGpuSnapshot).count() >= 10.0) {
            lastGpuSnapshot = now2;
            snapshotCount++;
            double elapsedSec = std::chrono::duration<double>(now2 - testStart).count();
            g_log.add("---- PERIODIC SNAPSHOT #" + std::to_string(snapshotCount)
                + " (elapsed " + std::to_string((int)elapsedSec) + "s) ----");
            auto gpuSnap = QueryGpuVram();
            for (const auto& gs : gpuSnap) {
                char buf[256];
                snprintf(buf, sizeof(buf),
                    " GPU[%d] %s | Used: %zu/%zu MiB (%.1f%%)",
                    gs.deviceId, gs.name.c_str(), gs.usedMiB, gs.totalMiB,
                    gs.totalMiB > 0 ? 100.0 * gs.usedMiB / gs.totalMiB : 0.0);
                g_log.add(buf);
            }
            double totalFpsSnap = 0;
            for (int t = 0; t < 4; t++) {
                std::lock_guard<std::mutex> lk(taskStates[t].mtx);
                char buf[256];
                snprintf(buf, sizeof(buf),
                    " T%d: GPU[%d] VRAM=%zuMiB FPS=%.1f GrabMs=%.0f InfMs=%.0f Frames=%d Det=%d",
                    t, taskStates[t].gpuDeviceId,
                    taskStates[t].vramUsedBytes / (1024 * 1024),
                    taskStates[t].fps, taskStates[t].lastGrabMs, taskStates[t].inferenceMs,
                    taskStates[t].frameCount, taskStates[t].detectionCount);
                g_log.add(buf);
                totalFpsSnap += taskStates[t].fps;
            }
            char buf[128];
            snprintf(buf, sizeof(buf), " Total throughput: %.1f FPS", totalFpsSnap);
            g_log.add(buf);
            // Check whether the engines actually landed on different GPUs.
            std::set<int> gpusUsed;
            for (int t = 0; t < 4; t++) {
                if (taskStates[t].gpuDeviceId >= 0) gpusUsed.insert(taskStates[t].gpuDeviceId);
            }
            if (gpusUsed.size() > 1) {
                g_log.add(" MULTI-GPU: YES — tasks distributed across " + std::to_string(gpusUsed.size()) + " GPUs");
            } else if (!gpusUsed.empty()) {
                g_log.add(" MULTI-GPU: NO — all tasks on GPU[" + std::to_string(*gpusUsed.begin()) + "]");
            }
            g_log.add("---- END SNAPSHOT ----");
        }
        // Build 2x2 grid + log panel
        cv::Mat canvas(cellH * 2 + logPanelH, cellW * 2, CV_8UC3, cv::Scalar(30, 30, 30));
        for (int i = 0; i < 4; i++) {
            int row = i / 2, col = i % 2;
            cv::Rect roi(col * cellW, row * cellH, cellW, cellH);
            cv::Mat cell;
            double fps = 0, infMs = 0;
            int fCount = 0, dCount = 0;
            int gpuId = -1;
            size_t vramMiB = 0;
            std::string statusMsg, lastPlate;
            bool engineLoaded = false, streamOk = false;
            {
                // Copy everything needed out under the lock, then render
                // without holding it.
                std::lock_guard<std::mutex> lk(taskStates[i].mtx);
                if (!taskStates[i].displayFrame.empty()) {
                    cv::resize(taskStates[i].displayFrame, cell, cv::Size(cellW, cellH));
                }
                fps = taskStates[i].fps;
                infMs = taskStates[i].inferenceMs;
                fCount = taskStates[i].frameCount;
                dCount = taskStates[i].detectionCount;
                statusMsg = taskStates[i].statusMsg;
                lastPlate = taskStates[i].lastPlate;
                engineLoaded = taskStates[i].engineLoaded;
                streamOk = taskStates[i].streamOk;
                gpuId = taskStates[i].gpuDeviceId;
                vramMiB = taskStates[i].vramUsedBytes / (1024 * 1024);
            }
            // No frame yet (stream/engine failed or still warming up):
            // draw a placeholder cell with the status message.
            if (cell.empty()) {
                cell = cv::Mat(cellH, cellW, CV_8UC3, cv::Scalar(40, 40, 40));
                cv::putText(cell, "Task " + std::to_string(i) + ": " + statusMsg,
                    cv::Point(20, cellH / 2),
                    cv::FONT_HERSHEY_SIMPLEX, 0.8, cv::Scalar(100, 100, 255), 2);
            }
            // Bottom 50px: black status bar with two lines of per-task stats.
            cv::rectangle(cell, cv::Rect(0, cellH - 50, cellW, 50), cv::Scalar(0, 0, 0), cv::FILLED);
            char bar1[256], bar2[256];
            snprintf(bar1, sizeof(bar1), "T%d | %.1f FPS | %.0fms | Frames:%d | Det:%d | %s",
                i, fps, infMs, fCount, dCount,
                lastPlate.empty() ? "-" : lastPlate.c_str());
            if (gpuId >= 0) {
                snprintf(bar2, sizeof(bar2), "GPU[%d] | VRAM: %zu MiB", gpuId, vramMiB);
            } else {
                snprintf(bar2, sizeof(bar2), "GPU: N/A");
            }
            cv::Scalar barColor = engineLoaded ? cv::Scalar(0, 255, 0) : cv::Scalar(0, 100, 255);
            cv::putText(cell, bar1, cv::Point(5, cellH - 28),
                cv::FONT_HERSHEY_SIMPLEX, 0.45, barColor, 1);
            cv::putText(cell, bar2, cv::Point(5, cellH - 8),
                cv::FONT_HERSHEY_SIMPLEX, 0.45, cv::Scalar(0, 200, 255), 1);
            cell.copyTo(canvas(roi));
            // Grid separator lines between the 4 cells.
            cv::line(canvas, cv::Point(cellW, 0), cv::Point(cellW, cellH * 2),
                cv::Scalar(100, 100, 100), 1);
            cv::line(canvas, cv::Point(0, cellH), cv::Point(cellW * 2, cellH),
                cv::Scalar(100, 100, 100), 1);
        }
        // Log panel
        cv::Rect logRoi(0, cellH * 2, cellW * 2, logPanelH);
        cv::Mat logPanel = canvas(logRoi);
        logPanel.setTo(cv::Scalar(20, 20, 20));
        auto elapsed = std::chrono::duration<double>(std::chrono::steady_clock::now() - testStart).count();
        char header[128];
        snprintf(header, sizeof(header),
            "Elapsed: %.0fs | FilePlayer (looping, HW decode) | Press ESC to stop", elapsed);
        cv::putText(logPanel, header, cv::Point(10, 18),
            cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(200, 200, 0), 1);
        double totalFps = 0;
        for (int i = 0; i < 4; i++) {
            std::lock_guard<std::mutex> lk(taskStates[i].mtx);
            totalFps += taskStates[i].fps;
        }
        char aggLine[256];
        // NOTE(review): gpuDeviceId read here without the per-task lock; it is
        // only written during engine setup (before workers start), so this is
        // reading an effectively-immutable value.
        snprintf(aggLine, sizeof(aggLine), "Total throughput: %.1f FPS | T0:GPU%d T1:GPU%d T2:GPU%d T3:GPU%d",
            totalFps,
            taskStates[0].gpuDeviceId, taskStates[1].gpuDeviceId,
            taskStates[2].gpuDeviceId, taskStates[3].gpuDeviceId);
        cv::putText(logPanel, aggLine, cv::Point(10, 38),
            cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 255, 255), 1);
        // Per-GPU line: how many tasks landed on this GPU and their VRAM sum.
        auto gpuSnaps = QueryGpuVram();
        int gpuLineY = 58;
        for (const auto& gs : gpuSnaps) {
            int tasksOnGpu = 0;
            size_t taskVramMiB = 0;
            for (int i = 0; i < 4; i++) {
                std::lock_guard<std::mutex> lk(taskStates[i].mtx);
                if (taskStates[i].gpuDeviceId == gs.deviceId) {
                    tasksOnGpu++;
                    taskVramMiB += taskStates[i].vramUsedBytes / (1024 * 1024);
                }
            }
            char gpuLine[256];
            snprintf(gpuLine, sizeof(gpuLine),
                "GPU[%d] %s | Used: %zu/%zu MiB | Tasks: %d (engine VRAM: %zu MiB)",
                gs.deviceId, gs.name.c_str(), gs.usedMiB, gs.totalMiB,
                tasksOnGpu, taskVramMiB);
            cv::putText(logPanel, gpuLine, cv::Point(10, gpuLineY),
                cv::FONT_HERSHEY_SIMPLEX, 0.45, cv::Scalar(100, 255, 100), 1);
            gpuLineY += 18;
        }
        // Per-task stat lines.
        for (int i = 0; i < 4; i++) {
            std::lock_guard<std::mutex> lk(taskStates[i].mtx);
            char tLine[256];
            snprintf(tLine, sizeof(tLine),
                "T%d: GPU[%d] VRAM=%zuMiB FPS=%.1f Inf=%.0fms Frames=%d Det=%d",
                i, taskStates[i].gpuDeviceId,
                taskStates[i].vramUsedBytes / (1024 * 1024),
                taskStates[i].fps, taskStates[i].inferenceMs,
                taskStates[i].frameCount, taskStates[i].detectionCount);
            cv::putText(logPanel, tLine, cv::Point(10, gpuLineY),
                cv::FONT_HERSHEY_SIMPLEX, 0.4, cv::Scalar(200, 200, 200), 1);
            gpuLineY += 16;
        }
        // Tail of the log (last 4 lines), truncated to fit the panel width.
        auto recentLogs = g_log.getRecent(4);
        for (const auto& line : recentLogs) {
            if (gpuLineY > logPanelH - 5) break;
            std::string display = (line.size() > 130) ? line.substr(0, 127) + "..." : line;
            cv::putText(logPanel, display, cv::Point(10, gpuLineY),
                cv::FONT_HERSHEY_PLAIN, 1.0, cv::Scalar(140, 140, 140), 1);
            gpuLineY += 15;
        }
        cv::imshow(windowName, canvas);
        int key = cv::waitKey(30);
        if (key == 27) {
            g_log.add("ESC pressed — stopping all tasks...");
            printf("\nESC pressed — stopping...\n");
            g_running.store(false);
        }
    }
    // --- Wait for all workers ---
    printf("Waiting for worker threads to finish...\n");
    for (int i = 0; i < 4; i++) {
        if (workers[i].joinable()) workers[i].join();
    }
    // --- Final summary ---
    // Workers are joined above, so taskStates can be read without locks here.
    double totalElapsed = std::chrono::duration<double>(
        std::chrono::steady_clock::now() - testStart).count();
    g_log.add("================================================================");
    g_log.add(" FINAL PERFORMANCE SUMMARY (FilePlayer — looping)");
    g_log.add(" Total runtime: " + std::to_string((int)totalElapsed) + " seconds");
    g_log.add("================================================================");
    printf("\n============================================================\n");
    printf(" FINAL PERFORMANCE SUMMARY — FilePlayer (runtime: %.0fs)\n", totalElapsed);
    printf("============================================================\n");
    double totalFpsFinal = 0;
    for (int i = 0; i < 4; i++) {
        char buf[512];
        snprintf(buf, sizeof(buf),
            " Task %d: GPU[%d] | VRAM=%zuMiB | %d frames, %d detections, FPS=%.1f, InfMs=%.0f",
            i, taskStates[i].gpuDeviceId,
            taskStates[i].vramUsedBytes / (1024 * 1024),
            taskStates[i].frameCount, taskStates[i].detectionCount,
            taskStates[i].fps, taskStates[i].inferenceMs);
        printf("%s\n", buf);
        g_log.add(buf);
        totalFpsFinal += taskStates[i].fps;
    }
    auto finalGpu = QueryGpuVram();
    for (const auto& gs : finalGpu) {
        char buf[256];
        snprintf(buf, sizeof(buf), " GPU[%d] %s: %zu/%zu MiB used (%.1f%%)",
            gs.deviceId, gs.name.c_str(), gs.usedMiB, gs.totalMiB,
            gs.totalMiB > 0 ? 100.0 * gs.usedMiB / gs.totalMiB : 0.0);
        printf("%s\n", buf);
        g_log.add(buf);
    }
    {
        char buf[256];
        snprintf(buf, sizeof(buf), " Total throughput: %.1f FPS across 4 tasks", totalFpsFinal);
        printf("%s\n", buf);
        g_log.add(buf);
    }
    printf("============================================================\n");
    g_log.add("================================================================");
    // --- Release all handles ---
    for (int i = 0; i < 4; i++) {
        if (alprHandles[i]) {
            ReleaseANSALPRHandle(&alprHandles[i]);
        }
    }
    for (int s = 0; s < NUM_STREAMS; s++) {
        if (fpClients[s]) {
            StopFilePlayer(&fpClients[s]);
            ReleaseANSFilePlayerHandle(&fpClients[s]);
        }
    }
    g_log.close();
    cv::destroyAllWindows();
    ANSCENTER::ANSOPENCV::DeinitCameraNetwork();
    return 0;
}
/// Test-harness entry point: runs exactly one scenario per build.
/// Switch scenarios by editing the call below. Available (currently disabled)
/// scenarios in this file include:
///   ANSLPR_OD_INDOInferences_FileTest, ANSLPR_OD_Inferences_FileTest,
///   ANSLPR_OD_VideoTest, ANSLPR_BigSize_VideoTest, ANSLPR_CPU_VideoTest,
///   ANSLPR_CPU_Inferences_FileTest (optionally looped 100x),
///   ANSLPR_MultiGPU_StressTest, ANSLPR_MultiGPU_StressTest_SimulatedCam.
int main()
{
    // Active scenario: multi-GPU stress test driven by looping FilePlayer streams.
    ANSLPR_MultiGPU_StressTest_FilePlayer();
    return 0;
}