Fix NV12 crash issue when recreating the camera object

This commit is contained in:
2026-04-02 22:07:27 +11:00
parent 4bedf3a3a2
commit 958cab6ae3
25 changed files with 1459 additions and 393 deletions

View File

@@ -23,6 +23,7 @@
#include <thread>
#include <mutex>
#include <atomic>
#include <random>
#include <chrono>
#include <deque>
#include <set>
@@ -751,8 +752,11 @@ static void LogGpuInfo() {
// Worker thread: reads RTSP frames and runs ALPR inference
// RTSP client and ALPR engine are pre-created on the main thread to avoid
// race conditions in CreateANSRTSPHandle / CreateANSALPRHandle.
// Takes rtspClientPtr (pointer to array slot) + streamGuard mutex so the
// CHAOS thread can safely destroy+recreate the stream without use-after-free.
static void ALPRWorkerThread(int taskId,
ANSCENTER::ANSRTSPClient* rtspClient,
ANSCENTER::ANSRTSPClient** rtspClientPtr,
std::mutex* streamGuard,
ANSCENTER::ANSALPR* alprHandle,
TaskState& state) {
char tag[32];
@@ -780,6 +784,23 @@ static void ALPRWorkerThread(int taskId,
bool hwDecodeLogged = false;
while (g_running.load()) {
// Lock the stream guard to prevent CHAOS from destroying the client
// while we're mid-frame-grab or mid-inference.
std::unique_lock<std::mutex> streamLock(*streamGuard);
// Re-read the client pointer each iteration — CHAOS may have
// destroyed+recreated it, so our old pointer could be dangling.
ANSCENTER::ANSRTSPClient* rtspClient = *rtspClientPtr;
if (rtspClient == nullptr) {
streamLock.unlock();
emptyFrames++;
if (emptyFrames % 100 == 1) {
g_log.add(prefix + " Stream destroyed by CHAOS, waiting... (count=" + std::to_string(emptyFrames) + ")");
}
std::this_thread::sleep_for(std::chrono::milliseconds(50));
continue;
}
// Read frame from RTSP via ANSCV
auto grabStart = std::chrono::steady_clock::now();
cv::Mat* framePtr = nullptr;
@@ -797,6 +818,7 @@ static void ALPRWorkerThread(int taskId,
ReconnectRTSP(&rtspClient);
emptyFrames = 0;
}
streamLock.unlock();
if (framePtr) delete framePtr;
std::this_thread::sleep_for(std::chrono::milliseconds(10));
continue;
@@ -829,6 +851,9 @@ static void ALPRWorkerThread(int taskId,
// matches by cv::Mat* pointer, so `new cv::Mat(*framePtr)` would create
// a different pointer the registry doesn't know, breaking NV12 zero-copy.
ANSALPR_RunInferenceComplete_CPP(&alprHandle, &framePtr, cameraId.c_str(), 0, 0, lpnResult, jpegImage);
// Release stream lock — inference is done, CHAOS can now safely destroy.
streamLock.unlock();
auto infEnd = std::chrono::steady_clock::now();
double infMs = std::chrono::duration<double, std::milli>(infEnd - infStart).count();
totalInfMs += infMs;
@@ -933,19 +958,20 @@ int ANSLPR_MultiGPU_StressTest() {
printf("\n");
printf("============================================================\n");
printf(" ANSLPR Multi-GPU Stress Test — 4 Parallel ALPR Tasks\n");
printf(" ANSLPR Multi-GPU Stress Test — 5 Parallel ALPR Tasks\n");
printf(" (4 cameras, 5 AI tasks — Task 4 shares Stream 2)\n");
printf(" Press ESC to stop\n");
printf(" Log file: %s\n", LOG_FILE_PATH);
printf("============================================================\n\n");
g_log.add("============================================================");
g_log.add(" ANSLPR Multi-GPU Stress Test — 4 Parallel ALPR Tasks");
g_log.add(" ANSLPR Multi-GPU Stress Test — 5 Parallel ALPR Tasks");
g_log.add("============================================================");
// --- Log GPU info for diagnostics ---
LogGpuInfo();
// --- RTSP URLs (4 independent streams, one per task) ---
// --- RTSP URLs (4 independent camera streams) ---
const std::string rtspUrl0 = "rtsp://admin:admin123@103.156.0.133:8010/cam/realmonitor?channel=1&subtype=0";
const std::string rtspUrl1 = "rtsp://cafe2471.ddns.net:600/rtsp/streaming?channel=01&subtype=0";
const std::string rtspUrl2 = "rtsp://nhathuocngoclinh.zapto.org:600/rtsp/streaming?channel=01&subtype=0";
@@ -956,18 +982,39 @@ int ANSLPR_MultiGPU_StressTest() {
g_log.add("Stream 2: " + rtspUrl2);
g_log.add("Stream 3: " + rtspUrl3);
// =========================================================================
// Architecture: Camera Process + AI Task Process (mimics LabVIEW)
// -----------------------------------------------------------------------
// Camera Process: 4 independent RTSP streams acquire frames from cameras.
// AI Task Process: 5 AI tasks subscribe to camera streams and run inference
// in parallel. Multiple tasks can share one camera stream.
// Task 4 subscribes to Stream 2 (nhathuocngoclinh) to demonstrate the
// shared-camera subscription model used in LabVIEW.
// =========================================================================
const int NUM_STREAMS = 4;
const int NUM_TASKS = 5;
// --- Task states ---
TaskState taskStates[4];
TaskState taskStates[NUM_TASKS];
// =========================================================================
// Create 4 INDEPENDENT RTSP readers one per task, each with its own
// camera stream. Each task gets a dedicated RTSP connection.
// CAMERA PROCESS: Create 4 independent RTSP readers (one per camera).
// These form the camera acquisition layer that AI tasks subscribe to.
// =========================================================================
const int NUM_STREAMS = 4;
ANSCENTER::ANSRTSPClient* rtspClients[NUM_STREAMS] = {};
const std::string streamUrls[NUM_STREAMS] = { rtspUrl0, rtspUrl1, rtspUrl2, rtspUrl3 };
// Map: task index -> stream index (1:1 mapping)
const int taskStreamMap[4] = { 0, 1, 2, 3 };
// Map: task index -> stream index
// Tasks 0-3 map 1:1 to streams 0-3.
// Task 4 subscribes to Stream 2 (nhathuocngoclinh) — shared camera.
const int taskStreamMap[NUM_TASKS] = { 0, 1, 2, 3, 2 };
// Log task-to-stream subscription mapping
g_log.add("--- AI Task -> Camera Stream subscription ---");
for (int i = 0; i < NUM_TASKS; i++) {
g_log.add(" Task " + std::to_string(i) + " -> Stream " + std::to_string(taskStreamMap[i])
+ " (" + streamUrls[taskStreamMap[i]] + ")");
}
for (int s = 0; s < NUM_STREAMS; s++) {
printf("[Stream%d] Creating RTSP handle for %s...\n", s, streamUrls[s].c_str());
@@ -986,14 +1033,17 @@ int ANSLPR_MultiGPU_StressTest() {
}
// =========================================================================
// Create 4 ALPR engines sequentially
// AI TASK PROCESS: Create 5 ALPR engines sequentially.
// Each AI task gets its own engine and subscribes to a camera stream.
// Task 4 shares Stream 2 (nhathuocngoclinh) with Task 2 — demonstrating
// the LabVIEW pattern where multiple AI tasks subscribe to one camera.
// =========================================================================
ANSCENTER::ANSALPR* alprHandles[4] = {};
ANSCENTER::ANSALPR* alprHandles[NUM_TASKS] = {};
std::string modelZipFile = "C:\\ProgramData\\ANSCENTER\\ANSVIS Server\\ANSALPR\\ANS_ALPR_v1.2.zip";
int engineType = 1; // NVIDIA_GPU
double detThresh = 0.5, ocrThresh = 0.5, colThresh = 0.5;
for (int i = 0; i < 4; i++) {
for (int i = 0; i < NUM_TASKS; i++) {
char tag[32];
snprintf(tag, sizeof(tag), "[Task%d]", i);
@@ -1109,7 +1159,7 @@ int ANSLPR_MultiGPU_StressTest() {
// Count votes: how many tasks on this stream use each GPU
std::map<int, int> gpuVotes;
for (int i = 0; i < 4; i++) {
for (int i = 0; i < NUM_TASKS; i++) {
if (taskStreamMap[i] == s && alprHandles[i]) {
gpuVotes[taskStates[i].gpuDeviceId]++;
}
@@ -1194,30 +1244,132 @@ int ANSLPR_MultiGPU_StressTest() {
}
// --- Enable deep pipeline benchmarking on all ALPR handles ---
for (int i = 0; i < 4; i++) {
for (int i = 0; i < NUM_TASKS; i++) {
if (alprHandles[i]) {
alprHandles[i]->ActivateDebugger(true);
}
}
g_log.add("Debug benchmarking ENABLED on all ALPR handles");
// --- Launch worker threads — tasks sharing a stream get the same RTSP client ---
g_log.add("Launching worker threads...");
std::thread workers[4];
for (int i = 0; i < 4; i++) {
// --- Per-stream mutex: prevents CHAOS from destroying a stream while a
// worker is mid-frame-grab or mid-inference (use-after-free fix). ---
std::mutex streamGuards[NUM_STREAMS];
// --- Launch worker threads ---
// Each AI task subscribes to its camera stream via taskStreamMap.
// Tasks sharing a stream (e.g. Task 2 & Task 4 on Stream 2) both get
// the same RTSP client pointer and share the stream's mutex guard.
g_log.add("Launching " + std::to_string(NUM_TASKS) + " worker threads...");
std::thread workers[NUM_TASKS];
for (int i = 0; i < NUM_TASKS; i++) {
int streamIdx = taskStreamMap[i];
if (rtspClients[streamIdx] && alprHandles[i]) {
workers[i] = std::thread(ALPRWorkerThread, i,
rtspClients[streamIdx], alprHandles[i],
&rtspClients[streamIdx],
&streamGuards[streamIdx],
alprHandles[i],
std::ref(taskStates[i]));
}
}
// =========================================================================
// Camera Chaos Thread — simulates camera errors / reconnects
// Mimics LabVIEW behavior: cameras randomly go into Error/Recovering
// state, triggering Stop/Reconnect/Destroy+Recreate cycles that cause
// CUDA cleanup (cuArrayDestroy, cuMemFree) while inference is running.
// This is the exact scenario that triggers the nvcuda64 SRW lock deadlock.
// =========================================================================
std::atomic<bool> chaosEnabled{true};
std::thread chaosThread([&]() {
std::mt19937 rng(std::random_device{}());
// Wait 10 seconds for system to stabilize before starting chaos
for (int i = 0; i < 100 && g_running.load(); i++) {
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
g_log.add("[CHAOS] Camera chaos thread started — every 10s, stop/destroy/recreate one camera (round-robin)");
printf("[CHAOS] Camera chaos thread started — 10s interval, round-robin across %d streams\n", NUM_STREAMS);
int chaosCount = 0;
int nextStream = 0; // Round-robin: cycle through streams 0,1,2,3,0,1,...
while (g_running.load() && chaosEnabled.load()) {
// Fixed 10-second interval between chaos events
for (int s = 0; s < 100 && g_running.load(); s++) {
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
if (!g_running.load()) break;
int streamIdx = nextStream;
nextStream = (nextStream + 1) % NUM_STREAMS;
chaosCount++;
char buf[512];
auto chaosStart = std::chrono::steady_clock::now();
// Lock stream guard: wait for any in-flight inference to finish
// before touching the RTSP client. This prevents use-after-free
// when CHAOS destroys a stream while a worker is mid-inference.
std::unique_lock<std::mutex> chaosLock(streamGuards[streamIdx]);
// Always use full DESTROY + RECREATE cycle.
// Reconnect() reuses internal player state which can leave stale
// CUDA resources and cause freezes. A clean destroy + recreate
// guarantees a fresh decoder/player with no leftover state.
{
bool wasAlive = (rtspClients[streamIdx] != nullptr);
snprintf(buf, sizeof(buf), "[CHAOS #%d] Stream%d: DESTROY + RECREATE (%s)",
chaosCount, streamIdx,
wasAlive ? "camera was running" : "camera was already offline");
g_log.add(buf);
printf("%s\n", buf);
// Stop and release old handle if it exists
if (rtspClients[streamIdx]) {
StopRTSP(&rtspClients[streamIdx]);
ReleaseANSRTSPHandle(&rtspClients[streamIdx]);
rtspClients[streamIdx] = nullptr;
}
// Release lock during offline sleep — worker sees nullptr and skips
int offlineMs = 500 + (rng() % 2500); // 0.5 - 3 seconds offline
chaosLock.unlock();
std::this_thread::sleep_for(std::chrono::milliseconds(offlineMs));
chaosLock.lock();
// Recreate the RTSP handle (under lock again)
int result = CreateANSRTSPHandle(&rtspClients[streamIdx], "", "", "",
streamUrls[streamIdx].c_str());
if (result == 1 && rtspClients[streamIdx]) {
SetRTSPImageQuality(&rtspClients[streamIdx], 0);
SetRTSPHWDecoding(&rtspClients[streamIdx], 7);
StartRTSP(&rtspClients[streamIdx]);
auto chaosEnd = std::chrono::steady_clock::now();
double chaosMs = std::chrono::duration<double, std::milli>(chaosEnd - chaosStart).count();
snprintf(buf, sizeof(buf), "[CHAOS #%d] Stream%d: RECREATED in %.0f ms (offline %d ms)",
chaosCount, streamIdx, chaosMs, offlineMs);
} else {
snprintf(buf, sizeof(buf), "[CHAOS #%d] Stream%d: RECREATE FAILED (result=%d)",
chaosCount, streamIdx, result);
}
g_log.add(buf);
printf("%s\n", buf);
}
}
g_log.add("[CHAOS] Camera chaos thread stopped (total events: " + std::to_string(chaosCount) + ")");
printf("[CHAOS] Camera chaos thread stopped (total events: %d)\n", chaosCount);
});
// --- Display loop (main thread) ---
const int cellW = 640, cellH = 480;
const int logPanelH = 200;
// 3x2 grid layout: 5 tasks displayed in 3 columns x 2 rows
const int cellW = 480, cellH = 360; // Smaller cells for 3-column layout
const int logPanelH = 220;
const int gridCols = 3, gridRows = 2;
cv::namedWindow("ANSLPR Multi-GPU Stress Test", cv::WINDOW_NORMAL);
cv::resizeWindow("ANSLPR Multi-GPU Stress Test", cellW * 2, cellH * 2 + logPanelH);
cv::resizeWindow("ANSLPR Multi-GPU Stress Test", cellW * gridCols, cellH * gridRows + logPanelH);
auto testStart = std::chrono::steady_clock::now();
auto lastGpuSnapshot = std::chrono::steady_clock::now();
@@ -1244,12 +1396,12 @@ int ANSLPR_MultiGPU_StressTest() {
}
// Per-task stats
double totalFpsSnap = 0;
for (int t = 0; t < 4; t++) {
for (int t = 0; t < NUM_TASKS; t++) {
std::lock_guard<std::mutex> lk(taskStates[t].mtx);
char buf[256];
snprintf(buf, sizeof(buf),
" T%d: GPU[%d] VRAM=%zuMiB FPS=%.1f GrabMs=%.0f InfMs=%.0f Frames=%d Det=%d",
t, taskStates[t].gpuDeviceId,
" T%d(S%d): GPU[%d] VRAM=%zuMiB FPS=%.1f GrabMs=%.0f InfMs=%.0f Frames=%d Det=%d",
t, taskStreamMap[t], taskStates[t].gpuDeviceId,
taskStates[t].vramUsedBytes / (1024 * 1024),
taskStates[t].fps, taskStates[t].lastGrabMs, taskStates[t].inferenceMs,
taskStates[t].frameCount, taskStates[t].detectionCount);
@@ -1261,7 +1413,7 @@ int ANSLPR_MultiGPU_StressTest() {
g_log.add(buf);
// Multi-GPU check
std::set<int> gpusUsed;
for (int t = 0; t < 4; t++) {
for (int t = 0; t < NUM_TASKS; t++) {
if (taskStates[t].gpuDeviceId >= 0) gpusUsed.insert(taskStates[t].gpuDeviceId);
}
if (gpusUsed.size() > 1) {
@@ -1271,12 +1423,12 @@ int ANSLPR_MultiGPU_StressTest() {
}
g_log.add("---- END SNAPSHOT ----");
}
// Build 2x2 grid + log panel
cv::Mat canvas(cellH * 2 + logPanelH, cellW * 2, CV_8UC3, cv::Scalar(30, 30, 30));
// Build 3x2 grid + log panel (5 tasks: 3 cols x 2 rows, cell [1][2] empty)
cv::Mat canvas(cellH * gridRows + logPanelH, cellW * gridCols, CV_8UC3, cv::Scalar(30, 30, 30));
// Place each task's frame in its quadrant
for (int i = 0; i < 4; i++) {
int row = i / 2, col = i % 2;
// Place each task's frame in its cell
for (int i = 0; i < NUM_TASKS; i++) {
int row = i / gridCols, col = i % gridCols;
cv::Rect roi(col * cellW, row * cellH, cellW, cellH);
cv::Mat cell;
@@ -1313,8 +1465,8 @@ int ANSLPR_MultiGPU_StressTest() {
// Draw status bar at bottom of each cell (2 lines)
cv::rectangle(cell, cv::Rect(0, cellH - 50, cellW, 50), cv::Scalar(0, 0, 0), cv::FILLED);
char bar1[256], bar2[256];
snprintf(bar1, sizeof(bar1), "T%d | %.1f FPS | %.0fms | Frames:%d | Det:%d | %s",
i, fps, infMs, fCount, dCount,
snprintf(bar1, sizeof(bar1), "T%d(S%d) | %.1f FPS | %.0fms | F:%d | D:%d | %s",
i, taskStreamMap[i], fps, infMs, fCount, dCount,
lastPlate.empty() ? "-" : lastPlate.c_str());
if (gpuId >= 0) {
snprintf(bar2, sizeof(bar2), "GPU[%d] | VRAM: %zu MiB", gpuId, vramMiB);
@@ -1323,45 +1475,53 @@ int ANSLPR_MultiGPU_StressTest() {
}
cv::Scalar barColor = engineLoaded ? cv::Scalar(0, 255, 0) : cv::Scalar(0, 100, 255);
cv::putText(cell, bar1, cv::Point(5, cellH - 28),
cv::FONT_HERSHEY_SIMPLEX, 0.45, barColor, 1);
cv::FONT_HERSHEY_SIMPLEX, 0.4, barColor, 1);
cv::putText(cell, bar2, cv::Point(5, cellH - 8),
cv::FONT_HERSHEY_SIMPLEX, 0.45, cv::Scalar(0, 200, 255), 1);
cv::FONT_HERSHEY_SIMPLEX, 0.4, cv::Scalar(0, 200, 255), 1);
cell.copyTo(canvas(roi));
// Draw grid lines
cv::line(canvas, cv::Point(cellW, 0), cv::Point(cellW, cellH * 2),
cv::Scalar(100, 100, 100), 1);
cv::line(canvas, cv::Point(0, cellH), cv::Point(cellW * 2, cellH),
cv::Scalar(100, 100, 100), 1);
}
// Draw grid lines
for (int c = 1; c < gridCols; c++)
cv::line(canvas, cv::Point(c * cellW, 0), cv::Point(c * cellW, cellH * gridRows),
cv::Scalar(100, 100, 100), 1);
for (int r = 1; r < gridRows; r++)
cv::line(canvas, cv::Point(0, r * cellH), cv::Point(cellW * gridCols, r * cellH),
cv::Scalar(100, 100, 100), 1);
// --- Log panel at bottom ---
cv::Rect logRoi(0, cellH * 2, cellW * 2, logPanelH);
cv::Rect logRoi(0, cellH * gridRows, cellW * gridCols, logPanelH);
cv::Mat logPanel = canvas(logRoi);
logPanel.setTo(cv::Scalar(20, 20, 20));
// Elapsed time header
auto elapsed = std::chrono::duration<double>(std::chrono::steady_clock::now() - testStart).count();
char header[128];
char header[256];
snprintf(header, sizeof(header),
"Elapsed: %.0fs | Press ESC to stop | Resize window freely", elapsed);
"Elapsed: %.0fs | %d cameras, %d AI tasks | Press ESC to stop",
elapsed, NUM_STREAMS, NUM_TASKS);
cv::putText(logPanel, header, cv::Point(10, 18),
cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(200, 200, 0), 1);
// Aggregate stats + per-task GPU summary
double totalFps = 0;
for (int i = 0; i < 4; i++) {
for (int i = 0; i < NUM_TASKS; i++) {
std::lock_guard<std::mutex> lk(taskStates[i].mtx);
totalFps += taskStates[i].fps;
}
char aggLine[256];
snprintf(aggLine, sizeof(aggLine), "Total throughput: %.1f FPS | T0:GPU%d T1:GPU%d T2:GPU%d T3:GPU%d",
totalFps,
taskStates[0].gpuDeviceId, taskStates[1].gpuDeviceId,
taskStates[2].gpuDeviceId, taskStates[3].gpuDeviceId);
// Build dynamic task-GPU summary string
std::string taskGpuStr;
for (int i = 0; i < NUM_TASKS; i++) {
if (i > 0) taskGpuStr += " ";
taskGpuStr += "T" + std::to_string(i) + "(S" + std::to_string(taskStreamMap[i])
+ "):GPU" + std::to_string(taskStates[i].gpuDeviceId);
}
char aggLine[512];
snprintf(aggLine, sizeof(aggLine), "Total: %.1f FPS | %s",
totalFps, taskGpuStr.c_str());
cv::putText(logPanel, aggLine, cv::Point(10, 38),
cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 255, 255), 1);
cv::FONT_HERSHEY_SIMPLEX, 0.45, cv::Scalar(0, 255, 255), 1);
// Real-time GPU VRAM monitor (query every frame — cheap call)
auto gpuSnaps = QueryGpuVram();
@@ -1370,7 +1530,7 @@ int ANSLPR_MultiGPU_StressTest() {
// Count tasks on this GPU and their total VRAM
int tasksOnGpu = 0;
size_t taskVramMiB = 0;
for (int i = 0; i < 4; i++) {
for (int i = 0; i < NUM_TASKS; i++) {
std::lock_guard<std::mutex> lk(taskStates[i].mtx);
if (taskStates[i].gpuDeviceId == gs.deviceId) {
tasksOnGpu++;
@@ -1387,13 +1547,13 @@ int ANSLPR_MultiGPU_StressTest() {
gpuLineY += 18;
}
// Per-task resource line
for (int i = 0; i < 4; i++) {
// Per-task resource line (shows which stream each task subscribes to)
for (int i = 0; i < NUM_TASKS; i++) {
std::lock_guard<std::mutex> lk(taskStates[i].mtx);
char tLine[256];
snprintf(tLine, sizeof(tLine),
"T%d: GPU[%d] VRAM=%zuMiB FPS=%.1f Inf=%.0fms Frames=%d Det=%d",
i, taskStates[i].gpuDeviceId,
"T%d(S%d): GPU[%d] VRAM=%zuMiB FPS=%.1f Inf=%.0fms Frames=%d Det=%d",
i, taskStreamMap[i], taskStates[i].gpuDeviceId,
taskStates[i].vramUsedBytes / (1024 * 1024),
taskStates[i].fps, taskStates[i].inferenceMs,
taskStates[i].frameCount, taskStates[i].detectionCount);
@@ -1421,9 +1581,13 @@ int ANSLPR_MultiGPU_StressTest() {
}
}
// --- Stop chaos thread ---
chaosEnabled.store(false);
if (chaosThread.joinable()) chaosThread.join();
// --- Wait for all workers ---
printf("Waiting for worker threads to finish...\n");
for (int i = 0; i < 4; i++) {
printf("Waiting for %d worker threads to finish...\n", NUM_TASKS);
for (int i = 0; i < NUM_TASKS; i++) {
if (workers[i].joinable()) workers[i].join();
}
@@ -1433,19 +1597,21 @@ int ANSLPR_MultiGPU_StressTest() {
g_log.add("================================================================");
g_log.add(" FINAL PERFORMANCE SUMMARY");
g_log.add(" " + std::to_string(NUM_STREAMS) + " cameras, " + std::to_string(NUM_TASKS) + " AI tasks");
g_log.add(" Total runtime: " + std::to_string((int)totalElapsed) + " seconds");
g_log.add("================================================================");
printf("\n============================================================\n");
printf(" FINAL PERFORMANCE SUMMARY (runtime: %.0fs)\n", totalElapsed);
printf(" %d cameras, %d AI tasks\n", NUM_STREAMS, NUM_TASKS);
printf("============================================================\n");
double totalFpsFinal = 0;
for (int i = 0; i < 4; i++) {
for (int i = 0; i < NUM_TASKS; i++) {
char buf[512];
snprintf(buf, sizeof(buf),
" Task %d: GPU[%d] | VRAM=%zuMiB | %d frames, %d detections, FPS=%.1f, InfMs=%.0f",
i, taskStates[i].gpuDeviceId,
" Task %d (Stream %d): GPU[%d] | VRAM=%zuMiB | %d frames, %d detections, FPS=%.1f, InfMs=%.0f",
i, taskStreamMap[i], taskStates[i].gpuDeviceId,
taskStates[i].vramUsedBytes / (1024 * 1024),
taskStates[i].frameCount, taskStates[i].detectionCount,
taskStates[i].fps, taskStates[i].inferenceMs);
@@ -1466,12 +1632,13 @@ int ANSLPR_MultiGPU_StressTest() {
// Multi-GPU verdict
std::set<int> finalGpusUsed;
for (int i = 0; i < 4; i++) {
for (int i = 0; i < NUM_TASKS; i++) {
if (taskStates[i].gpuDeviceId >= 0) finalGpusUsed.insert(taskStates[i].gpuDeviceId);
}
{
char buf[256];
snprintf(buf, sizeof(buf), " Total throughput: %.1f FPS across 4 tasks", totalFpsFinal);
snprintf(buf, sizeof(buf), " Total throughput: %.1f FPS across %d tasks (%d cameras)",
totalFpsFinal, NUM_TASKS, NUM_STREAMS);
printf("%s\n", buf);
g_log.add(buf);
}
@@ -1491,13 +1658,16 @@ int ANSLPR_MultiGPU_StressTest() {
g_log.add(" 3. No CUDA_VISIBLE_DEVICES env var restricting GPU access");
}
// Log shared-camera subscription info
g_log.add(" Camera subscription: Task 2 and Task 4 both subscribe to Stream 2 (nhathuocngoclinh)");
printf("============================================================\n");
g_log.add("================================================================");
g_log.add(" Log saved to: " + std::string(LOG_FILE_PATH));
g_log.add("================================================================");
// --- Release all handles (sequentially on main thread) ---
for (int i = 0; i < 4; i++) {
for (int i = 0; i < NUM_TASKS; i++) {
if (alprHandles[i]) {
ReleaseANSALPRHandle(&alprHandles[i]);
}
@@ -2770,9 +2940,9 @@ int main()
//for (int i = 0; i < 100; i++) {
// ANSLPR_CPU_Inferences_FileTest();
//}
//ANSLPR_MultiGPU_StressTest();
ANSLPR_MultiGPU_StressTest();
//ANSLPR_MultiGPU_StressTest_SimulatedCam();
ANSLPR_MultiGPU_StressTest_FilePlayer();
// ANSLPR_MultiGPU_StressTest_FilePlayer();
return 0;
}