Fix NV12 crash issue when recreate camera object

This commit is contained in:
2026-04-02 22:07:27 +11:00
parent 4bedf3a3a2
commit 958cab6ae3
25 changed files with 1459 additions and 393 deletions

View File

@@ -12,7 +12,29 @@
"Bash(grep -n \"struct Object\\\\|class Object\" /c/Projects/CLionProjects/ANSCORE/modules/ANSLPR/*.h /c/Projects/CLionProjects/ANSCORE/modules/ANSLPR/include/*.h)",
"Bash(grep -n \"cudaStream\\\\|cudaMalloc\\\\|cudaFree\\\\|queue\\\\|Task\\\\|mutex\" /c/Projects/CLionProjects/ANSCORE/engines/TensorRTAPI/include/engine/*.inl)",
"Bash(grep -n \"~Engine\\\\|destructor\\\\|cleanup\\\\|~\" /c/Projects/CLionProjects/ANSCORE/engines/TensorRTAPI/include/engine/*.inl)",
"Bash(grep -n \"for.*cudaFree\\\\|m_buffers\\\\[\" /c/Projects/CLionProjects/ANSCORE/engines/TensorRTAPI/include/engine/*.inl)"
"Bash(grep -n \"for.*cudaFree\\\\|m_buffers\\\\[\" /c/Projects/CLionProjects/ANSCORE/engines/TensorRTAPI/include/engine/*.inl)",
"Bash(find /c/Projects/CLionProjects/ANSCORE -name ANSGpuFrameRegistry* -type f)",
"Bash(ls -la /c/Projects/CLionProjects/ANSCORE/modules/ANSLPR/*.h)",
"Bash(\"C:\\\\Users\\\\nghia\\\\AppData\\\\Local\\\\Programs\\\\CLion 2026.1\\\\bin\\\\cmake\\\\win\\\\x64\\\\bin\\\\cmake.exe\" --build cmake-build-release --target all -j 30)",
"Bash(cmake --build build --target ANSLPR-UnitTest --config Release)",
"Bash(ls -d C:/Projects/CLionProjects/ANSCORE/cmake-build-*)",
"Bash(ls -d C:/Projects/CLionProjects/ANSCORE/out/*)",
"Bash(cmake --build C:/Projects/CLionProjects/ANSCORE/cmake-build-release --target ANSLPR-UnitTest --config Release)",
"Bash(cmake --build C:/Projects/CLionProjects/ANSCORE/cmake-build-release --target ANSLPR-UnitTest)",
"Bash('C:/Program Files/Microsoft Visual Studio/2022/Community/Common7/Tools/VsDevCmd.bat' -arch=amd64)",
"Bash(cmake -B C:/Projects/CLionProjects/ANSCORE/cmake-build-release -S C:/Projects/CLionProjects/ANSCORE -G Ninja -DCMAKE_BUILD_TYPE=Release)",
"Bash(cmd //C \"call \"\"C:\\\\Program Files\\\\Microsoft Visual Studio\\\\2022\\\\Community\\\\VC\\\\Auxiliary\\\\Build\\\\vcvarsall.bat\"\" amd64 >nul 2>&1 && cmake --build C:\\\\Projects\\\\CLionProjects\\\\ANSCORE\\\\cmake-build-release --target ANSLPR-UnitTest\")",
"Bash(1 EOF cmd /C C:tmpbuild.bat)",
"Read(//tmp/**)",
"Bash(\"C:/Program Files/Microsoft Visual Studio/2022/Community/VC/Auxiliary/Build/vcvarsall.bat\" amd64)",
"Bash(export INCLUDE=\"C:/Program Files/Microsoft Visual Studio/2022/Community/VC/Tools/MSVC/14.44.35207/include;C:/Program Files \\(x86\\)/Windows Kits/10/Include/10.0.26100.0/ucrt;C:/Program Files \\(x86\\)/Windows Kits/10/Include/10.0.26100.0/um;C:/Program Files \\(x86\\)/Windows Kits/10/Include/10.0.26100.0/shared;C:/Program Files \\(x86\\)/Windows Kits/10/Include/10.0.26100.0/winrt;C:/Program Files \\(x86\\)/Windows Kits/10/Include/10.0.26100.0/cppwinrt\" export LIB=\"C:/Program Files/Microsoft Visual Studio/2022/Community/VC/Tools/MSVC/14.44.35207/lib/x64;C:/Program Files \\(x86\\)/Windows Kits/10/Lib/10.0.26100.0/ucrt/x64;C:/Program Files \\(x86\\)/Windows Kits/10/Lib/10.0.26100.0/um/x64\" cmake --build \"C:/Projects/CLionProjects/ANSCORE/cmake-build-release\" --target ANSLPR-UnitTest)",
"Bash(tasklist)",
"Bash(taskkill /F /IM ANSLPR-UnitTest.exe)",
"Bash(export \"INCLUDE=C:/Program Files/Microsoft Visual Studio/2022/Community/VC/Tools/MSVC/14.44.35207/include;C:/Program Files \\(x86\\)/Windows Kits/10/Include/10.0.26100.0/ucrt;C:/Program Files \\(x86\\)/Windows Kits/10/Include/10.0.26100.0/um;C:/Program Files \\(x86\\)/Windows Kits/10/Include/10.0.26100.0/shared;C:/Program Files \\(x86\\)/Windows Kits/10/Include/10.0.26100.0/winrt;C:/Program Files \\(x86\\)/Windows Kits/10/Include/10.0.26100.0/cppwinrt\")",
"Bash(export \"LIB=C:/Program Files/Microsoft Visual Studio/2022/Community/VC/Tools/MSVC/14.44.35207/lib/x64;C:/Program Files \\(x86\\)/Windows Kits/10/Lib/10.0.26100.0/ucrt/x64;C:/Program Files \\(x86\\)/Windows Kits/10/Lib/10.0.26100.0/um/x64\")",
"Bash(grep -E \"\\\\.\\(cpp|h|hpp\\)$\")",
"Bash(find /c/Projects/CLionProjects/ANSCORE -name *Logger* -type f)",
"Bash(find /c/Projects/CLionProjects/ANSCORE -name *SPDLogger* -o -name *ANSLogger*)"
]
}
}

View File

@@ -575,8 +575,13 @@ void CVideoDecoder::Start() {
}
void CVideoDecoder::Stop() {
std::lock_guard<std::recursive_mutex> lock(_mutex);
m_bRunning = FALSE;
// Atomically signal the decoder to stop WITHOUT acquiring _mutex.
// decode() holds _mutex while inside avcodec_send_packet / CUDA calls
// that can block on the nvcuda64 SRW lock for a long time.
// If we waited for _mutex here, Stop() would deadlock whenever a
// concurrent decode() is stuck waiting for a CUDA operation held by
// an inference thread.
m_bRunning.store(FALSE, std::memory_order_release);
log_print(HT_LOG_INFO, "%s, Video decoder stopped\r\n", __FUNCTION__);
}

View File

@@ -3,6 +3,7 @@
#include "sys_inc.h"
#include "media_format.h"
#include <string>
#include <atomic>
#include <mutex>
#include <vector>
extern "C"
@@ -152,7 +153,7 @@ private:
int hwDecoderInit(AVCodecContext* ctx, int hwMode, int preferredGpu = -1);
private:
BOOL m_bInited;
BOOL m_bRunning;
std::atomic<BOOL> m_bRunning;
BOOL m_bHardwareDecoderEnabled; // Track if hardware decoder is enabled
bool m_bCudaHWAccel; // true when using AV_HWDEVICE_TYPE_CUDA
int m_hwGpuIndex; // GPU index assigned by HWDecoderPool (-1 = legacy)

View File

@@ -1,6 +1,7 @@
#pragma once
#include <cstring>
#include <filesystem>
#include <semaphore>
#include "TRTCompat.h"
// Per-device mutex for CUDA graph capture.
@@ -15,6 +16,95 @@ static std::mutex& graphCaptureMutex() {
return m;
}
// ============================================================================
// GPU INFERENCE THROTTLE
// ============================================================================
// Global counting semaphore that limits how many Engine instances can execute
// CUDA inference simultaneously. Without this, N separate Engine instances
// (one per camera) all submit GPU work at once, causing:
// 1. SM 100% saturation → each inference takes 5-10x longer
// 2. GPU thermal throttling at 85°C → further slowdown
// 3. cudaStreamSynchronize blocking indefinitely → system freeze
//
// Auto-computed from GPU VRAM:
// ≤ 4 GB → 2 concurrent 8 GB → 4 concurrent
// 6 GB → 3 concurrent 12+ GB → 6 concurrent
// Multi-GPU: sum across all GPUs
//
// Excess threads wait on CPU (nearly zero cost) while the bounded set
// runs efficiently on the GPU without thermal throttling.
static std::counting_semaphore<64>& gpuInferenceSemaphore() {
static int maxConcurrent = []() {
int totalSlots = 0;
int gpuCount = 0;
cudaGetDeviceCount(&gpuCount);
if (gpuCount <= 0) return 4; // fallback
for (int i = 0; i < gpuCount; ++i) {
size_t freeMem = 0, totalMem = 0;
cudaSetDevice(i);
cudaMemGetInfo(&freeMem, &totalMem);
int gbTotal = static_cast<int>(totalMem / (1024ULL * 1024ULL * 1024ULL));
// Scale concurrency with VRAM: ~1 slot per 2 GB, min 2, max 6 per GPU
int slotsThisGpu = std::clamp(gbTotal / 2, 2, 6);
totalSlots += slotsThisGpu;
}
totalSlots = std::clamp(totalSlots, 2, 64);
std::cout << "Info [GPU Throttle]: max concurrent inferences = "
<< totalSlots << " (across " << gpuCount << " GPU(s))" << std::endl;
return totalSlots;
}();
static std::counting_semaphore<64> sem(maxConcurrent);
return sem;
}
// RAII guard for the GPU inference semaphore
struct GpuInferenceGuard {
GpuInferenceGuard() { gpuInferenceSemaphore().acquire(); }
~GpuInferenceGuard() { gpuInferenceSemaphore().release(); }
GpuInferenceGuard(const GpuInferenceGuard&) = delete;
GpuInferenceGuard& operator=(const GpuInferenceGuard&) = delete;
};
// ============================================================================
// WDDM-SAFE STREAM SYNCHRONIZATION
// ============================================================================
// Under Windows WDDM, cudaStreamSynchronize calls cuStreamQuery in a tight
// loop with SwitchToThread, holding nvcuda64's internal SRW lock the entire
// time. When the GPU is busy with inference, this spin blocks ALL other CUDA
// operations — including HW video decode (nvcuvid), cuMemAlloc, cuArrayDestroy.
// If a camera Reconnect or decode buffer allocation needs an exclusive SRW lock
// while inference is spinning, the entire system deadlocks.
//
// This function replaces cudaStreamSynchronize with a polling loop that
// explicitly releases the SRW lock between queries by sleeping briefly.
// This allows other CUDA operations to interleave with the sync wait.
static inline cudaError_t cudaStreamSynchronize_Safe(cudaStream_t stream) {
// Fast path: check if already done (no sleep overhead for quick kernels)
cudaError_t err = cudaStreamQuery(stream);
if (err != cudaErrorNotReady) return err;
// Short Sleep(0) fast path (~10 iterations) catches sub-ms kernel completions.
// Then switch to Sleep(1) to give cleanup operations (cuArrayDestroy, cuMemFree)
// a window to acquire the exclusive nvcuda64 SRW lock.
// Previously used 1000 Sleep(0) iterations which hogged the SRW lock and
// caused ~20-second stalls when concurrent cleanup needed exclusive access.
for (int i = 0; i < 10; ++i) {
Sleep(0);
err = cudaStreamQuery(stream);
if (err != cudaErrorNotReady) return err;
}
// 1ms sleeps — adds negligible latency at 30 FPS but prevents SRW lock starvation.
while (true) {
Sleep(1);
err = cudaStreamQuery(stream);
if (err != cudaErrorNotReady) return err;
}
}
template <typename T>
void Engine<T>::warmUp(int iterations) {
if (m_verbose) {
@@ -163,6 +253,16 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
return runInferenceFromPool(inputs, featureVectors);
}
// ============================================================================
// GPU INFERENCE THROTTLE
// ============================================================================
// Limit how many Engine instances can run CUDA inference simultaneously.
// Without this, 12 cameras each with their own Engine all submit GPU work
// at once → SM 100% → thermal throttle → cudaStreamSynchronize hangs.
// The semaphore lets excess threads wait on CPU (nearly zero cost) while
// a bounded number use the GPU efficiently.
GpuInferenceGuard gpuThrottle;
// ============================================================================
// SINGLE-ENGINE SERIALISATION
// ============================================================================
@@ -376,7 +476,7 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
std::cout << "Error: Failed to set optimization profile 0" << std::endl;
return false;
}
cudaError_t syncErr = cudaStreamSynchronize(m_inferenceStream);
cudaError_t syncErr = cudaStreamSynchronize_Safe(m_inferenceStream);
if (syncErr != cudaSuccess) {
std::cout << "Error: Failed to sync after profile change: "
<< cudaGetErrorString(syncErr) << std::endl;
@@ -642,7 +742,7 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
if (graphExec) {
// Launch the pre-captured graph (single API call replaces many).
cudaGraphLaunch(graphExec, m_inferenceStream);
cudaStreamSynchronize(m_inferenceStream);
cudaStreamSynchronize_Safe(m_inferenceStream);
// CPU memcpy: pinned buffers -> featureVectors (interleaved by batch).
for (int batch = 0; batch < batchSize; ++batch) {
@@ -705,7 +805,7 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
}
}
cudaError_t syncErr = cudaStreamSynchronize(m_inferenceStream);
cudaError_t syncErr = cudaStreamSynchronize_Safe(m_inferenceStream);
if (syncErr != cudaSuccess) {
std::string errMsg = "[Engine] runInference FAIL: cudaStreamSynchronize: "
+ std::string(cudaGetErrorString(syncErr));

View File

@@ -34,15 +34,40 @@
#include <atomic>
#include <cstdint>
#include <cstdlib>
#include <cstdio>
#include <chrono>
#include <opencv2/core/mat.hpp>
#ifdef _WIN32
#include <windows.h>
#endif
// Debug logging for registry operations — both stderr and OutputDebugString.
#ifndef REG_DBG
#ifdef _WIN32
#define REG_DBG(fmt, ...) do { \
char _reg_buf[512]; \
snprintf(_reg_buf, sizeof(_reg_buf), "[Registry] " fmt "\n", ##__VA_ARGS__); \
OutputDebugStringA(_reg_buf); \
fprintf(stderr, "%s", _reg_buf); \
} while(0)
#else
#define REG_DBG(fmt, ...) fprintf(stderr, "[Registry] " fmt "\n", ##__VA_ARGS__)
#endif
#endif
// Safety constants
static constexpr int MAX_FRAME_REFCOUNT = 64;
static constexpr int FRAME_TTL_SECONDS = 3;
static constexpr size_t GPU_CACHE_BUDGET_DEFAULT = 1ULL * 1024 * 1024 * 1024; // 1GB
static constexpr int EVICT_CHECK_INTERVAL_MS = 500;
// Entry for deferred GPU memory deallocation (tracks device index for cudaSetDevice)
struct GpuPendingFreeEntry {
void* ptr = nullptr;
int deviceIdx = -1;
};
struct GpuFrameData {
// --- CPU NV12 snapshot (OWNED malloc'd buffers, independent of decoder) ---
uint8_t* cpuYPlane = nullptr; // malloc'd Y plane copy
@@ -83,6 +108,14 @@ struct GpuFrameData {
std::atomic<int> refcount{1};
std::chrono::steady_clock::time_point createdAt;
// --- Owner callback (for per-client inference guard) ---
// When the last reference to this frame drops, onReleaseFn is called
// with ownerClient to decrement the RTSP client's in-flight counter.
// This lets Destroy() wait for in-flight inference to finish before
// freeing NVDEC surfaces (fixes LabVIEW crash).
void* ownerClient = nullptr;
void (*onReleaseFn)(void*) = nullptr;
// Default constructor
GpuFrameData() = default;
@@ -100,6 +133,7 @@ struct GpuFrameData {
, yPlane(o.yPlane), uvPlane(o.uvPlane)
, yLinesize(o.yLinesize), uvLinesize(o.uvLinesize)
, refcount(o.refcount.load()), createdAt(o.createdAt)
, ownerClient(o.ownerClient), onReleaseFn(o.onReleaseFn)
{
// Null out source to prevent double-free of owned pointers
o.cpuYPlane = nullptr;
@@ -111,6 +145,8 @@ struct GpuFrameData {
o.yPlane = nullptr;
o.uvPlane = nullptr;
o.gpuCacheBytes = 0;
o.ownerClient = nullptr;
o.onReleaseFn = nullptr;
}
// No copy
@@ -140,11 +176,21 @@ public:
if (!mat) return nullptr;
void* oldAvframe = nullptr;
// Capture old frame's owner callback to invoke OUTSIDE m_mutex
void* oldOwner = nullptr;
void (*oldReleaseFn)(void*) = nullptr;
data.createdAt = std::chrono::steady_clock::now();
data.refcount.store(1);
auto* heapData = new GpuFrameData(std::move(data));
REG_DBG("attach mat=%p new frame=%p yPlane=%p gpuCacheY=%p isCuda=%d %dx%d",
(void*)mat, (void*)heapData,
(void*)heapData->yPlane, heapData->gpuCacheY,
(int)heapData->isCudaDevicePtr,
heapData->width, heapData->height);
{
std::lock_guard<std::mutex> lock(m_mutex);
// If this Mat* already has an entry, release the old one
@@ -153,6 +199,8 @@ public:
auto* oldFrame = it->second;
int oldRef = oldFrame->refcount.fetch_sub(1);
if (oldRef <= 1) {
oldOwner = oldFrame->ownerClient;
oldReleaseFn = oldFrame->onReleaseFn;
oldAvframe = oldFrame->avframe;
if (oldFrame->cpuAvframe)
m_pendingFree.push_back(oldFrame->cpuAvframe);
@@ -166,6 +214,12 @@ public:
m_map[mat] = heapData;
m_frameSet.insert(heapData);
}
// Notify old frame's owner OUTSIDE m_mutex
if (oldReleaseFn && oldOwner) {
oldReleaseFn(oldOwner);
}
return oldAvframe; // Caller must av_frame_free if non-null
}
@@ -197,6 +251,11 @@ public:
void release(cv::Mat* mat) {
if (!mat) return;
// Capture owner callback to invoke OUTSIDE m_mutex (deadlock safety)
void* owner = nullptr;
void (*releaseFn)(void*) = nullptr;
{
std::lock_guard<std::mutex> lock(m_mutex);
auto it = m_map.find(mat);
@@ -206,7 +265,16 @@ public:
m_map.erase(it);
int oldRef = frame->refcount.fetch_sub(1);
REG_DBG("release mat=%p refcount %d->%d yPlane=%p gpuCacheY=%p owner=%p",
(void*)mat, oldRef, oldRef - 1,
(void*)frame->yPlane, frame->gpuCacheY, frame->ownerClient);
if (oldRef <= 1) {
// Capture owner callback before deleting frame
owner = frame->ownerClient;
releaseFn = frame->onReleaseFn;
REG_DBG("LAST REF — freeing frame=%p cpuY=%p gpuCacheY=%p gpuCacheUV=%p bytes=%zu",
(void*)frame, (void*)frame->cpuYPlane,
frame->gpuCacheY, frame->gpuCacheUV, frame->gpuCacheBytes);
// Last reference — free everything
if (frame->avframe)
m_pendingFree.push_back(frame->avframe);
@@ -218,6 +286,14 @@ public:
}
}
// Notify owner OUTSIDE m_mutex — prevents lock-ordering deadlock
// with ANSRTSPClient::_mutex (used by Destroy's condition_variable wait)
if (releaseFn && owner) {
REG_DBG("calling onReleaseFn owner=%p", owner);
releaseFn(owner);
}
}
// --- lookup: find GpuFrameData by cv::Mat* (locking) ---
GpuFrameData* lookup(cv::Mat* mat) {
std::lock_guard<std::mutex> lock(m_mutex);
@@ -267,9 +343,10 @@ public:
}
// --- Drain pending GPU device pointers for caller to cudaFree ---
std::vector<void*> drain_gpu_pending() {
// Each entry includes the device index for cudaSetDevice before cudaFree.
std::vector<GpuPendingFreeEntry> drain_gpu_pending() {
std::lock_guard<std::mutex> lock(m_mutex);
std::vector<void*> result;
std::vector<GpuPendingFreeEntry> result;
result.swap(m_pendingGpuFree);
return result;
}
@@ -287,12 +364,21 @@ public:
m_lastEvictCheck = now;
}
// Collect owner callbacks to invoke OUTSIDE m_mutex
struct OwnerCallback { void* client; void (*fn)(void*); };
std::vector<OwnerCallback> callbacks;
{
std::lock_guard<std::mutex> lock(m_mutex);
for (auto it = m_frameSet.begin(); it != m_frameSet.end(); ) {
auto* frame = *it;
auto age_s = std::chrono::duration_cast<std::chrono::seconds>(
now - frame->createdAt).count();
if (age_s > FRAME_TTL_SECONDS && frame->refcount.load() > 0) {
// Capture owner callback before deleting
if (frame->onReleaseFn && frame->ownerClient) {
callbacks.push_back({frame->ownerClient, frame->onReleaseFn});
}
// Force cleanup — remove all Mat* keys pointing to this frame
for (auto jt = m_map.begin(); jt != m_map.end(); ) {
if (jt->second == frame)
@@ -314,6 +400,12 @@ public:
}
}
// Notify owners OUTSIDE m_mutex
for (auto& cb : callbacks) {
cb.fn(cb.client);
}
}
// --- VRAM budget management ---
bool canAllocateGpuCache(size_t bytes) const {
return m_totalGpuCacheBytes.load(std::memory_order_relaxed) + bytes <= m_gpuCacheBudget;
@@ -340,6 +432,70 @@ public:
void setGpuCacheBudget(size_t bytes) { m_gpuCacheBudget = bytes; }
size_t gpuCacheBudget() const { return m_gpuCacheBudget; }
// --- Invalidate owner: nullify all callbacks for a client being destroyed ---
// Called by Destroy() on timeout to prevent callbacks into a deleted object.
void invalidateOwner(void* client) {
if (!client) return;
std::lock_guard<std::mutex> lock(m_mutex);
for (auto* frame : m_frameSet) {
if (frame->ownerClient == client) {
frame->ownerClient = nullptr;
frame->onReleaseFn = nullptr;
}
}
}
// --- Force-release all frames owned by a client ---
// Called by Destroy() BEFORE close() to free GPU buffers while the CUDA
// context is still alive. Without this, unreleased clones (e.g. 70 cloned
// images held by LabVIEW AI tasks that haven't finished) keep gpuCacheY/UV
// allocated. When close() destroys the CUDA context, those buffers become
// orphaned and later cudaFree calls crash.
//
// This force-frees ALL owned buffers for frames belonging to this client,
// removes all Mat* keys pointing to them, and deletes the GpuFrameData.
// Returns the number of frames force-released.
int forceReleaseByOwner(void* client) {
if (!client) return 0;
int count = 0;
std::lock_guard<std::mutex> lock(m_mutex);
for (auto it = m_frameSet.begin(); it != m_frameSet.end(); ) {
auto* frame = *it;
if (frame->ownerClient == client) {
REG_DBG("forceReleaseByOwner: frame=%p refcount=%d gpuCacheY=%p gpuCacheUV=%p bytes=%zu",
(void*)frame, frame->refcount.load(),
frame->gpuCacheY, frame->gpuCacheUV, frame->gpuCacheBytes);
// Remove all Mat* keys pointing to this frame
for (auto jt = m_map.begin(); jt != m_map.end(); ) {
if (jt->second == frame)
jt = m_map.erase(jt);
else
++jt;
}
// Free owned buffers (CPU + GPU pending)
if (frame->avframe)
m_pendingFree.push_back(frame->avframe);
if (frame->cpuAvframe)
m_pendingFree.push_back(frame->cpuAvframe);
freeOwnedBuffers_locked(frame);
it = m_frameSet.erase(it);
delete frame;
++count;
} else {
++it;
}
}
if (count > 0) {
REG_DBG("forceReleaseByOwner: force-released %d frames for client=%p", count, client);
}
return count;
}
private:
ANSGpuFrameRegistry() = default;
@@ -350,6 +506,10 @@ private:
// Free malloc'd CPU NV12 buffers and GPU cache (but NOT avframe/cpuAvframe —
// those go to pendingFree for the caller to av_frame_free).
void freeOwnedBuffers_locked(GpuFrameData* frame) {
REG_DBG("freeOwnedBuffers: frame=%p cpuY=%p cpuUV=%p gpuCacheY=%p gpuCacheUV=%p bytes=%zu dev=%d",
(void*)frame, (void*)frame->cpuYPlane, (void*)frame->cpuUvPlane,
frame->gpuCacheY, frame->gpuCacheUV,
frame->gpuCacheBytes, frame->gpuCacheDeviceIdx);
if (frame->cpuYPlane) {
std::free(frame->cpuYPlane);
frame->cpuYPlane = nullptr;
@@ -358,23 +518,17 @@ private:
std::free(frame->cpuUvPlane);
frame->cpuUvPlane = nullptr;
}
// GPU cache freed via CUDA — caller (ANSODEngine) must handle this
// since we can't call cudaFree from this FFmpeg-free header.
// The gpuCacheBytes are tracked; actual deallocation happens in
// NV12PreprocessHelper or a GPU-aware cleanup path.
// GPU cache freed via CUDA — push to deferred list with device index
// so the caller (ANSGpuFrameOps.h) can cudaSetDevice + cudaFree.
if (frame->gpuCacheBytes > 0) {
onGpuCacheFreed(frame->gpuCacheBytes);
// Mark as invalid so no one reads stale pointers
frame->gpuCacheValid = false;
frame->gpuCacheBytes = 0;
// NOTE: gpuCacheY/gpuCacheUV device pointers are leaked here
// unless the caller handles GPU cleanup. This is addressed in
// Step 8 (NV12PreprocessHelper) where cudaFree is available.
// For now, push to a separate GPU-free list.
int devIdx = frame->gpuCacheDeviceIdx;
if (frame->gpuCacheY)
m_pendingGpuFree.push_back(frame->gpuCacheY);
m_pendingGpuFree.push_back({frame->gpuCacheY, devIdx});
if (frame->gpuCacheUV)
m_pendingGpuFree.push_back(frame->gpuCacheUV);
m_pendingGpuFree.push_back({frame->gpuCacheUV, devIdx});
frame->gpuCacheY = nullptr;
frame->gpuCacheUV = nullptr;
}
@@ -384,7 +538,7 @@ private:
std::unordered_map<cv::Mat*, GpuFrameData*> m_map;
std::unordered_set<GpuFrameData*> m_frameSet; // All unique frames (for TTL scan)
std::vector<void*> m_pendingFree; // AVFrame* pointers to av_frame_free
std::vector<void*> m_pendingGpuFree; // CUDA device pointers to cudaFree
std::vector<GpuPendingFreeEntry> m_pendingGpuFree; // CUDA device pointers to cudaFree
std::atomic<size_t> m_totalGpuCacheBytes{0};
size_t m_gpuCacheBudget = GPU_CACHE_BUDGET_DEFAULT;
std::chrono::steady_clock::time_point m_lastEvictCheck;
@@ -408,7 +562,7 @@ inline bool gpu_frame_addref(cv::Mat* src, cv::Mat* dst) {
}
// Drain GPU device pointers that need cudaFree.
// Caller must cudaFree each returned pointer.
inline std::vector<void*> gpu_frame_drain_gpu_pending() {
// Caller must cudaSetDevice(entry.deviceIdx) + cudaFree(entry.ptr) for each.
inline std::vector<GpuPendingFreeEntry> gpu_frame_drain_gpu_pending() {
return ANSGpuFrameRegistry::instance().drain_gpu_pending();
}

View File

@@ -46,13 +46,22 @@ namespace ANSCENTER {
Destroy();
}
void ANSFLVClient::Destroy() {
// Move player out of lock scope — close() does CUDA cleanup
// (cuArrayDestroy/cuMemFree) which must not run under _mutex
// to avoid deadlocking with nvcuda64 SRW lock held by inference.
decltype(_playerClient) clientToClose;
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (_playerClient) {
if (_isPlaying) {
_playerClient->stop();
_isPlaying = false;
}
_playerClient->close();
}
clientToClose = std::move(_playerClient);
}
if (clientToClose) {
clientToClose->close();
}
}
static void VerifyGlobalANSFLVLicense(const std::string& licenseKey) {
@@ -129,8 +138,12 @@ namespace ANSCENTER {
}
}
bool ANSFLVClient::Reconnect() {
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
_isPlaying = false;
}
_playerClient->close();
std::lock_guard<std::recursive_mutex> lock(_mutex);
Setup();
_isPlaying = _playerClient->play();
return _isPlaying;
@@ -143,10 +156,16 @@ namespace ANSCENTER {
return _isPlaying;
}
bool ANSFLVClient::Stop() {
decltype(_playerClient.get()) player = nullptr;
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (_isPlaying) {
_playerClient->stop();
_isPlaying = false;
player = _playerClient.get();
}
}
if (player) {
player->stop();
}
return true;
}

View File

@@ -39,6 +39,8 @@ namespace ANSCENTER {
catch (...) {}
}
void ANSFILEPLAYER::Destroy() {
decltype(_playerClient) clientToClose;
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
try {
_url = "";
@@ -46,9 +48,7 @@ namespace ANSCENTER {
_isPlaying = false;
_lastJpegImage = "";
_pLastFrame.release();
if (_playerClient) {
_playerClient->close();
}
clientToClose = std::move(_playerClient);
}
catch (const std::exception& e) {
_logger.LogError("ANSFILEPLAYER::Destroy. Exception:", e.what(), __FILE__, __LINE__);
@@ -57,6 +57,10 @@ namespace ANSCENTER {
_logger.LogError("ANSFILEPLAYER::Destroy.", "Unknown exception", __FILE__, __LINE__);
}
}
if (clientToClose) {
clientToClose->close();
}
}
void ANSFILEPLAYER::CheckLicense() {
std::lock_guard<std::recursive_mutex> lock(_mutex);
try {
@@ -94,8 +98,12 @@ namespace ANSCENTER {
return _playerClient->open(_url);
}
bool ANSFILEPLAYER::Reconnect() {
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
_isPlaying = false;
}
_playerClient->close();
std::lock_guard<std::recursive_mutex> lock(_mutex);
Setup();
return Start();
}
@@ -105,15 +113,18 @@ namespace ANSCENTER {
return _isPlaying;
}
bool ANSFILEPLAYER::Stop() {
decltype(_playerClient.get()) player = nullptr;
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
player = _playerClient.get();
}
if (player && player->pause()) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (_playerClient->pause()) {
_isPlaying = false;
return true;
}
else {
return false;
}
}
bool ANSFILEPLAYER::IsPaused() {
std::lock_guard<std::recursive_mutex> lock(_mutex);
return _playerClient->isPaused();

View File

@@ -19,8 +19,31 @@ extern "C" {
#include "libavutil/frame.h"
}
#include <cuda_runtime.h>
#include <cstring>
#include <cstdlib>
#include <cstdio>
#ifdef _WIN32
#include <windows.h>
#endif
// Debug logging macro for GPU frame operations.
// Output goes to stderr (console) AND OutputDebugString (DebugView / VS debugger).
// Use Sysinternals DebugView (dbgview64.exe) to capture these after a crash.
#ifndef GPU_FRAME_DBG
#ifdef _WIN32
#define GPU_FRAME_DBG(fmt, ...) do { \
char _gpu_dbg_buf[512]; \
snprintf(_gpu_dbg_buf, sizeof(_gpu_dbg_buf), "[GpuFrameOps] " fmt "\n", ##__VA_ARGS__); \
OutputDebugStringA(_gpu_dbg_buf); \
fprintf(stderr, "%s", _gpu_dbg_buf); \
} while(0)
#else
#define GPU_FRAME_DBG(fmt, ...) \
fprintf(stderr, "[GpuFrameOps] " fmt "\n", ##__VA_ARGS__)
#endif
#endif
namespace anscv_gpu_ops {
namespace detail {
@@ -71,6 +94,42 @@ inline bool snapshotNV12Planes(const AVFrame* nv12,
return true;
}
// Drain pending GPU device pointers and actually cudaFree them.
// Must be called from a thread with CUDA context available.
inline void drainAndFreeGpuPending() {
auto gpuPending = ANSGpuFrameRegistry::instance().drain_gpu_pending();
if (gpuPending.empty()) return;
GPU_FRAME_DBG("drainGpuPending: freeing %zu GPU ptrs", gpuPending.size());
int prevDev = -1;
cudaGetDevice(&prevDev);
// Group by device to minimize cudaSetDevice calls and synchronize once per device.
// cudaDeviceSynchronize() is CRITICAL: NV12 kernels run on cv::cuda::Stream
// (not the default stream). cudaFree on stream 0 doesn't wait for other
// streams, so without this sync, cudaFree can free a buffer while a kernel
// on another stream is still reading from it → cudaErrorIllegalAddress (700)
// which permanently corrupts the CUDA context.
int lastSyncDev = -1;
for (auto& entry : gpuPending) {
if (entry.ptr) {
if (entry.deviceIdx >= 0)
cudaSetDevice(entry.deviceIdx);
if (entry.deviceIdx != lastSyncDev) {
cudaDeviceSynchronize();
lastSyncDev = entry.deviceIdx;
}
GPU_FRAME_DBG("drainGpuPending: cudaFree(%p) dev=%d", entry.ptr, entry.deviceIdx);
cudaError_t err = cudaFree(entry.ptr);
if (err != cudaSuccess) {
GPU_FRAME_DBG("drainGpuPending: cudaFree FAILED err=%d (%s)",
(int)err, cudaGetErrorString(err));
}
}
}
if (prevDev >= 0)
cudaSetDevice(prevDev);
}
} // namespace detail
} // namespace anscv_gpu_ops
@@ -117,36 +176,44 @@ inline void gpu_frame_attach(cv::Mat* mat, AVFrame* nv12, int gpuIdx, int64_t pt
}
}
// Attach CUDA HW frame — keeps CUDA device pointers for zero-copy inference.
// Attach CUDA HW frame — copies NV12 from NVDEC surfaces to owned GPU memory.
// TAKES OWNERSHIP of cudaFrame AND cpuNV12 — caller must NOT av_frame_free after.
//
// Primary path: yPlane/uvPlane point to CUDA device pointers from the cloned
// AVFrame (data[0]/data[1]). The cloned AVFrame keeps the NVDEC surface alive
// until gpu_frame_remove() is called after inference. With 4 cameras each
// holding ~1 surface, this uses 4 of NVDEC's 25-32 surface pool — safe.
// D2D copy path: cudaMemcpy2D from NVDEC surfaces to cudaMalloc'd buffers on the
// same GPU. This decouples the NV12 data lifetime from the NVDEC decoder, so
// player->close() can safely destroy the decoder at any time without invalidating
// pointers that inference engines may be reading. The NVDEC surface is freed
// immediately (av_frame_free), returning it to the decoder's surface pool.
//
// The owned GPU pointers are stored as both yPlane/uvPlane (for zero-copy reads)
// and gpuCacheY/gpuCacheUV (for lifecycle management / cudaFree on cleanup).
//
// VRAM budget: if the global GPU cache budget is exceeded, falls back to CPU-only
// NV12 snapshot (no zero-copy, but safe).
//
// Fallback: cpuYPlane/cpuUvPlane hold CPU-side NV12 snapshot for cross-GPU
// inference (when decode GPU != inference GPU, CUDA device ptrs aren't
// accessible from another GPU context).
// inference (when decode GPU != inference GPU).
inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx, int64_t pts,
AVFrame* cpuNV12 = nullptr) {
if (!mat || !cudaFrame) return;
if (!mat || !cudaFrame) {
GPU_FRAME_DBG("attach_cuda: SKIP mat=%p cudaFrame=%p", (void*)mat, (void*)cudaFrame);
return;
}
const int w = cudaFrame->width;
const int h = cudaFrame->height;
GPU_FRAME_DBG("attach_cuda: START mat=%p %dx%d gpu=%d nvdecY=%p nvdecUV=%p cpuNV12=%p",
(void*)mat, w, h, gpuIdx,
(void*)cudaFrame->data[0], (void*)cudaFrame->data[1], (void*)cpuNV12);
GpuFrameData data{};
data.gpuIndex = gpuIdx;
data.pts = pts;
data.width = cudaFrame->width;
data.height = cudaFrame->height;
data.pixelFormat = 23; // AV_PIX_FMT_NV12 — the underlying sw_format
data.width = w;
data.height = h;
data.pixelFormat = 23; // AV_PIX_FMT_NV12
// Primary: CUDA device pointers from NVDEC (zero-copy on same GPU)
data.isCudaDevicePtr = true;
data.yPlane = cudaFrame->data[0]; // CUDA device ptr: Y plane
data.uvPlane = cudaFrame->data[1]; // CUDA device ptr: UV plane
data.yLinesize = cudaFrame->linesize[0];
data.uvLinesize = cudaFrame->linesize[1];
// Fallback: snapshot CPU NV12 for cross-GPU inference
// Snapshot CPU NV12 for cross-GPU fallback (must do before freeing cpuNV12)
if (cpuNV12) {
anscv_gpu_ops::detail::snapshotNV12Planes(
cpuNV12,
@@ -155,9 +222,98 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
data.width, data.height);
}
// Store AVFrames for cleanup (cudaFrame keeps NVDEC surface alive)
data.avframe = cudaFrame;
data.cpuAvframe = cpuNV12;
// --- D2D copy: NVDEC surface → owned GPU memory ---
// Estimate VRAM needed for the owned NV12 copy
const size_t yBytes = static_cast<size_t>(w) * h;
const size_t uvBytes = static_cast<size_t>(w) * (h / 2);
const size_t totalBytes = yBytes + uvBytes;
bool d2dOk = false;
if (ANSGpuFrameRegistry::instance().canAllocateGpuCache(totalBytes)) {
int prevDev = -1;
cudaGetDevice(&prevDev);
if (gpuIdx >= 0)
cudaSetDevice(gpuIdx);
void* ownedY = nullptr;
void* ownedUV = nullptr;
size_t yPitch = 0;
size_t uvPitch = 0;
cudaError_t e1 = cudaMallocPitch(&ownedY, &yPitch, w, h);
cudaError_t e2 = cudaMallocPitch(&ownedUV, &uvPitch, w, h / 2);
if (e1 == cudaSuccess && e2 == cudaSuccess) {
cudaError_t e3 = cudaMemcpy2D(ownedY, yPitch,
cudaFrame->data[0], cudaFrame->linesize[0],
w, h, cudaMemcpyDeviceToDevice);
cudaError_t e4 = cudaMemcpy2D(ownedUV, uvPitch,
cudaFrame->data[1], cudaFrame->linesize[1],
w, h / 2, cudaMemcpyDeviceToDevice);
if (e3 == cudaSuccess && e4 == cudaSuccess) {
// Store owned GPU pointers as primary NV12 source
data.isCudaDevicePtr = true;
data.yPlane = static_cast<uint8_t*>(ownedY);
data.uvPlane = static_cast<uint8_t*>(ownedUV);
data.yLinesize = static_cast<int>(yPitch);
data.uvLinesize = static_cast<int>(uvPitch);
// Track in gpuCache for lifecycle management (cudaFree on cleanup)
data.gpuCacheY = ownedY;
data.gpuCacheUV = ownedUV;
data.gpuCacheYPitch = yPitch;
data.gpuCacheUVPitch = uvPitch;
data.gpuCacheDeviceIdx = gpuIdx;
data.gpuCacheValid = true;
data.gpuCacheBytes = yPitch * h + uvPitch * (h / 2);
ANSGpuFrameRegistry::instance().onGpuCacheCreated(data.gpuCacheBytes);
d2dOk = true;
GPU_FRAME_DBG("attach_cuda: D2D OK ownedY=%p ownedUV=%p yPitch=%zu uvPitch=%zu bytes=%zu",
ownedY, ownedUV, yPitch, uvPitch, data.gpuCacheBytes);
} else {
// D2D copy failed — free allocated memory and fall back
GPU_FRAME_DBG("attach_cuda: D2D COPY FAILED e3=%d e4=%d — fallback CPU",
(int)e3, (int)e4);
cudaFree(ownedY);
cudaFree(ownedUV);
}
} else {
// Allocation failed — free any partial allocation and fall back
GPU_FRAME_DBG("attach_cuda: cudaMallocPitch FAILED e1=%d e2=%d — fallback CPU",
(int)e1, (int)e2);
if (e1 == cudaSuccess) cudaFree(ownedY);
if (e2 == cudaSuccess) cudaFree(ownedUV);
}
if (prevDev >= 0)
cudaSetDevice(prevDev);
}
if (!d2dOk) {
// Fall back to CPU NV12 snapshot only (no zero-copy)
GPU_FRAME_DBG("attach_cuda: FALLBACK CPU-only cpuY=%p cpuUV=%p",
(void*)data.cpuYPlane, (void*)data.cpuUvPlane);
data.isCudaDevicePtr = false;
data.yPlane = data.cpuYPlane;
data.uvPlane = data.cpuUvPlane;
data.yLinesize = data.cpuYLinesize;
data.uvLinesize = data.cpuUvLinesize;
}
// Release AVFrames immediately — NVDEC surfaces returned to pool.
// No longer stored in GpuFrameData (owned GPU copy is independent).
GPU_FRAME_DBG("attach_cuda: freeing AVFrames cudaFrame=%p cpuNV12=%p",
(void*)cudaFrame, (void*)cpuNV12);
av_frame_free(&cudaFrame);
if (cpuNV12) av_frame_free(&cpuNV12);
data.avframe = nullptr;
data.cpuAvframe = nullptr;
GPU_FRAME_DBG("attach_cuda: FINAL yPlane=%p uvPlane=%p isCuda=%d gpuCacheY=%p gpuCacheUV=%p",
(void*)data.yPlane, (void*)data.uvPlane, (int)data.isCudaDevicePtr,
data.gpuCacheY, data.gpuCacheUV);
void* old = ANSGpuFrameRegistry::instance().attach(mat, std::move(data));
if (old) {
@@ -165,17 +321,23 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
av_frame_free(&oldFrame);
}
// Free stale AVFrames evicted by TTL or previous attach
auto pending = ANSGpuFrameRegistry::instance().drain_pending();
for (void* p : pending) {
AVFrame* stale = static_cast<AVFrame*>(p);
av_frame_free(&stale);
}
// Free stale GPU device pointers
anscv_gpu_ops::detail::drainAndFreeGpuPending();
}
// Release entry by cv::Mat* and free any returned AVFrames. Safe if not in map (no-op).
// Release entry by cv::Mat* and free any returned AVFrames + GPU pointers.
// Safe if not in map (no-op).
inline void gpu_frame_remove(cv::Mat* mat) {
if (!mat) return;
GPU_FRAME_DBG("gpu_frame_remove: mat=%p", (void*)mat);
ANSGpuFrameRegistry::instance().release(mat);
// Free any AVFrames that became pending from this release or prior eviction
@@ -186,13 +348,7 @@ inline void gpu_frame_remove(cv::Mat* mat) {
}
// Free any GPU device pointers that became pending
auto gpuPending = gpu_frame_drain_gpu_pending();
// NOTE: cudaFree requires CUDA context — caller must be on a CUDA-capable thread.
// If not, these will leak. In practice, gpu_frame_remove is called from ANSCV
// camera threads which do have CUDA context.
// For safety, we skip cudaFree here and let NV12PreprocessHelper handle it.
// The GPU pointers are tracked in the budget and will be accounted for.
(void)gpuPending;
anscv_gpu_ops::detail::drainAndFreeGpuPending();
}
// Alias for remove — used in ANSCV mutating functions to drop stale GPU data.
@@ -209,4 +365,7 @@ inline void gpu_frame_evict_stale() {
AVFrame* stale = static_cast<AVFrame*>(p);
av_frame_free(&stale);
}
// Free any GPU device pointers from evicted frames
anscv_gpu_ops::detail::drainAndFreeGpuPending();
}

View File

@@ -46,13 +46,19 @@ namespace ANSCENTER {
Destroy();
}
void ANSMJPEGClient::Destroy() {
decltype(_playerClient) clientToClose;
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (_playerClient) {
if (_isPlaying) {
_playerClient->stop();
_isPlaying = false;
}
_playerClient->close();
}
clientToClose = std::move(_playerClient);
}
if (clientToClose) {
clientToClose->close();
}
}
static void VerifyGlobalANSMJPEGLicense(const std::string& licenseKey) {
@@ -129,8 +135,12 @@ namespace ANSCENTER {
}
}
bool ANSMJPEGClient::Reconnect() {
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
_isPlaying = false;
}
_playerClient->close();
std::lock_guard<std::recursive_mutex> lock(_mutex);
Setup();
_isPlaying = _playerClient->play();
return _isPlaying;
@@ -143,10 +153,16 @@ namespace ANSCENTER {
return _isPlaying;
}
bool ANSMJPEGClient::Stop() {
decltype(_playerClient.get()) player = nullptr;
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (_isPlaying) {
_playerClient->stop();
_isPlaying = false;
player = _playerClient.get();
}
}
if (player) {
player->stop();
}
return true;
}

View File

@@ -48,13 +48,19 @@ namespace ANSCENTER {
Destroy();
}
void ANSRTMPClient::Destroy() {
decltype(_playerClient) clientToClose;
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (_playerClient) {
if (_isPlaying) {
_playerClient->stop();
_isPlaying = false;
}
_playerClient->close();
}
clientToClose = std::move(_playerClient);
}
if (clientToClose) {
clientToClose->close();
}
}
static void VerifyGlobalANSRTMPLicense(const std::string& licenseKey) {
@@ -126,8 +132,12 @@ namespace ANSCENTER {
}
bool ANSRTMPClient::Reconnect() {
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
_isPlaying = false;
}
_playerClient->close();
std::lock_guard<std::recursive_mutex> lock(_mutex);
Setup();
_isPlaying = _playerClient->play();
return _isPlaying;
@@ -140,10 +150,16 @@ namespace ANSCENTER {
return _isPlaying;
}
bool ANSRTMPClient::Stop() {
decltype(_playerClient.get()) player = nullptr;
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (_isPlaying) {
_playerClient->stop();
_isPlaying = false;
player = _playerClient.get();
}
}
if (player) {
player->stop();
}
return true;
}

View File

@@ -2,6 +2,7 @@
#include "ANSMatRegistry.h"
#include "ANSGpuFrameOps.h"
#include <memory>
#include <format>
#include "media_codec.h"
#include <cstdint>
#include <cuda_runtime.h>
@@ -21,6 +22,20 @@ extern "C"
}
// Note: per-instance thread safety is handled by ANSRTSPClient::_mutex
// Mat registry thread safety is handled by anscv_mat_replace's internal registry_mutex
// Debug logging — goes to both stderr AND OutputDebugString (DebugView).
#ifndef RTSP_DBG
#ifdef _WIN32
#define RTSP_DBG(fmt, ...) do { \
char _rtsp_buf[512]; \
snprintf(_rtsp_buf, sizeof(_rtsp_buf), fmt "\n", ##__VA_ARGS__); \
OutputDebugStringA(_rtsp_buf); \
fprintf(stderr, "%s", _rtsp_buf); \
} while(0)
#else
#define RTSP_DBG(fmt, ...) fprintf(stderr, fmt "\n", ##__VA_ARGS__)
#endif
#endif
static bool ansrtspLicenceValid = false;
// Global once_flag to protect license checking
static std::once_flag ansrtspLicenseOnceFlag;
@@ -48,19 +63,88 @@ namespace ANSCENTER {
Destroy();
}
void ANSRTSPClient::Destroy() {
std::lock_guard<std::recursive_mutex> lock(_mutex);
// Move the player client pointer out of the lock scope, then
// close it OUTSIDE the mutex. close() calls cuArrayDestroy /
// cuMemFree which acquire an EXCLUSIVE SRW lock inside nvcuda64.
// If we hold _mutex during close(), and another thread holds
// the nvcuda64 SRW lock (e.g. cuStreamSynchronize during
// inference), we get a deadlock: Stop() → _mutex → nvcuda64
// vs inference → nvcuda64 → (blocked by exclusive waiter).
decltype(_playerClient) clientToClose;
{
std::unique_lock<std::recursive_mutex> lock(_mutex);
if (_playerClient) {
// Stop the stream first so the video decoder is flushed and
// the RTSP callback thread is no longer feeding frames into
// decode(). Without this, rtsp_close() can block waiting for
// CRtspClient::m_pMutex (held by the callback mid-decode),
// and the hardware decoder flush during destruction can hang
// on the GPU.
if (_isPlaying) {
_playerClient->stop();
_isPlaying = false;
}
_playerClient->close();
}
// --- Inference guard: wait for in-flight frames to finish ---
// GetRTSPCVImage increments _inFlightFrames when it hands out
// a GPU frame; the registry decrements it when the frame is
// released after inference completes. We wait here so that
// close() doesn't free NVDEC surfaces while TensorRT is
// still reading from them (the LabVIEW crash root cause).
int inFlight = _inFlightFrames.load(std::memory_order_acquire);
if (inFlight > 0) {
_logger.LogInfo("ANSRTSPClient::Destroy",
std::format("waiting for {} in-flight inference frame(s)...", inFlight),
__FILE__, __LINE__);
bool done = _inFlightDone.wait_for(lock, std::chrono::seconds(5), [this] {
return _inFlightFrames.load(std::memory_order_acquire) <= 0;
});
if (!done) {
_logger.LogWarn("ANSRTSPClient::Destroy",
std::format("timed out waiting for in-flight frames "
"(still {} in-flight) — force-releasing GPU frames",
_inFlightFrames.load()),
__FILE__, __LINE__);
}
}
// Force-release ALL GPU frames owned by this client BEFORE close().
// Unreleased clones (e.g. LabVIEW AI tasks still holding cloned
// cv::Mat*) keep gpuCacheY/gpuCacheUV allocated. We must cudaFree
// them NOW while the CUDA context is still alive. After close()
// destroys the context, cudaFree would crash.
int forceReleased = ANSGpuFrameRegistry::instance().forceReleaseByOwner(this);
if (forceReleased > 0) {
_logger.LogWarn("ANSRTSPClient::Destroy",
std::format("force-released {} GPU frame(s) with unreleased clones", forceReleased),
__FILE__, __LINE__);
// Drain and cudaFree the GPU buffers while CUDA context is alive
// Sync all GPU streams before freeing to avoid illegal access
cudaDeviceSynchronize();
auto gpuPending = ANSGpuFrameRegistry::instance().drain_gpu_pending();
if (!gpuPending.empty()) {
RTSP_DBG("[Destroy] cudaFree %zu GPU ptrs before close()", gpuPending.size());
int prevDev = -1;
cudaGetDevice(&prevDev);
for (auto& entry : gpuPending) {
if (entry.ptr) {
if (entry.deviceIdx >= 0) cudaSetDevice(entry.deviceIdx);
cudaFree(entry.ptr);
}
}
if (prevDev >= 0) cudaSetDevice(prevDev);
}
// Also drain any pending AVFrames
auto avPending = ANSGpuFrameRegistry::instance().drain_pending();
for (void* p : avPending) {
AVFrame* f = static_cast<AVFrame*>(p);
av_frame_free(&f);
}
}
ANSGpuFrameRegistry::instance().invalidateOwner(this);
_inFlightFrames.store(0, std::memory_order_release);
clientToClose = std::move(_playerClient);
}
// CUDA cleanup happens here, outside the mutex — now safe.
// All GPU frames owned by this client have been force-freed above.
if (clientToClose) {
clientToClose->close();
}
}
static void VerifyGlobalANSRTSPLicense(const std::string& licenseKey) {
@@ -146,10 +230,81 @@ namespace ANSCENTER {
_playerClient->setCrop(crop);
}
bool ANSRTSPClient::Reconnect() {
std::lock_guard<std::recursive_mutex> lock(_mutex);
// 1. Mark as not-playing under the mutex FIRST. This makes GetImage()
// return the cached _pLastFrame instead of calling into the player,
// preventing use-after-free when close() destroys CUDA resources.
{
std::unique_lock<std::recursive_mutex> lock(_mutex);
_isPlaying = false;
// --- Inference guard: wait for in-flight frames to finish ---
// Same guard as Destroy(): close() will free NVDEC surfaces, so
// we must wait for any inference engines still reading NV12 data
// via zero-copy CUDA device pointers.
int inFlight = _inFlightFrames.load(std::memory_order_acquire);
if (inFlight > 0) {
_logger.LogInfo("ANSRTSPClient::Reconnect",
std::format("waiting for {} in-flight inference frame(s)...", inFlight),
__FILE__, __LINE__);
bool done = _inFlightDone.wait_for(lock, std::chrono::seconds(5), [this] {
return _inFlightFrames.load(std::memory_order_acquire) <= 0;
});
if (!done) {
_logger.LogWarn("ANSRTSPClient::Reconnect",
std::format("timed out waiting for in-flight frames "
"(still {} in-flight) — force-releasing GPU frames",
_inFlightFrames.load()),
__FILE__, __LINE__);
}
}
// Force-release GPU frames before close() — same as Destroy().
int forceReleased = ANSGpuFrameRegistry::instance().forceReleaseByOwner(this);
if (forceReleased > 0) {
_logger.LogWarn("ANSRTSPClient::Reconnect",
std::format("force-released {} GPU frame(s) with unreleased clones", forceReleased),
__FILE__, __LINE__);
// Sync all GPU streams before freeing
cudaDeviceSynchronize();
auto gpuPending = ANSGpuFrameRegistry::instance().drain_gpu_pending();
if (!gpuPending.empty()) {
int prevDev = -1;
cudaGetDevice(&prevDev);
for (auto& entry : gpuPending) {
if (entry.ptr) {
if (entry.deviceIdx >= 0) cudaSetDevice(entry.deviceIdx);
cudaFree(entry.ptr);
}
}
if (prevDev >= 0) cudaSetDevice(prevDev);
}
auto avPending = ANSGpuFrameRegistry::instance().drain_pending();
for (void* p : avPending) {
AVFrame* f = static_cast<AVFrame*>(p);
av_frame_free(&f);
}
}
ANSGpuFrameRegistry::instance().invalidateOwner(this);
_inFlightFrames.store(0, std::memory_order_release);
}
// 2. close() does CUDA cleanup (cuArrayDestroy/cuMemFree) — run outside
// _mutex to avoid deadlocking with nvcuda64 SRW lock held by inference.
// Safe now because GetImage()/GetNV12Frame() won't touch the player
// while _isPlaying == false, and all in-flight frames have been released.
_logger.LogInfo("ANSRTSPClient::Reconnect",
"calling close() — NVDEC decoder will be destroyed", __FILE__, __LINE__);
RTSP_DBG("[Reconnect] BEFORE close() this=%p", (void*)this);
_playerClient->close();
RTSP_DBG("[Reconnect] AFTER close() this=%p", (void*)this);
// 3. Re-setup and play under the mutex.
std::lock_guard<std::recursive_mutex> lock(_mutex);
_logger.LogInfo("ANSRTSPClient::Reconnect",
"calling Setup() + play()", __FILE__, __LINE__);
Setup();
_isPlaying = _playerClient->play();
RTSP_DBG("[Reconnect] DONE isPlaying=%d this=%p", (int)_isPlaying, (void*)this);
return _isPlaying;
}
void ANSRTSPClient::EnableAudio(bool status) {
@@ -169,10 +324,22 @@ namespace ANSCENTER {
}
bool ANSRTSPClient::Stop() {
// Grab the player pointer and clear _isPlaying under the lock,
// then call stop() OUTSIDE the mutex. stop() internally calls
// StopVideoDecoder -> decoder->flush() which does CUDA calls
// that can block on the nvcuda64 SRW lock. Holding _mutex
// during that time blocks all other operations on this client
// and contributes to the convoy when many clients stop at once.
CRtspPlayer* player = nullptr;
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (_isPlaying) {
_playerClient->stop();
_isPlaying = false;
player = _playerClient.get();
}
}
if (player) {
player->stop();
}
return true;
}
@@ -759,10 +926,12 @@ namespace ANSCENTER {
}
AVFrame* ANSRTSPClient::GetNV12Frame() {
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (!_isPlaying) return nullptr; // Player may be mid-reconnect (CUDA resources freed)
return _playerClient->getNV12Frame(); // Returns clone, caller must av_frame_free
}
AVFrame* ANSRTSPClient::GetCudaHWFrame() {
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (!_isPlaying) return nullptr; // Player may be mid-reconnect (CUDA resources freed)
return _playerClient->getCudaHWFrame();
}
bool ANSRTSPClient::IsCudaHWAccel() {
@@ -810,6 +979,11 @@ extern "C" __declspec(dllexport) int CreateANSRTSPHandle(ANSCENTER::ANSRTSPClien
if (_username.empty() && _password.empty()) result = ptr->Init(licenseKey, url);
else result = ptr->Init(licenseKey, username, password, url);
if (result) {
// Default to CUDA/NVDEC HW decoding (mode 7) for NV12 zero-copy
// fast path. LabVIEW may not call SetRTSPHWDecoding after
// destroy+recreate cycles, so this ensures the new handle always
// uses the GPU decode path instead of falling back to D3D11VA/CPU.
ptr->SetHWDecoding(7); // HW_DECODING_CUDA
*Handle = ptr.release();
extern void anscv_unregister_handle(void*);
extern void anscv_register_handle(void*, void(*)(void*));
@@ -830,9 +1004,37 @@ extern "C" __declspec(dllexport) int ReleaseANSRTSPHandle(ANSCENTER::ANSRTSPClie
try {
extern void anscv_unregister_handle(void*);
anscv_unregister_handle(*Handle);
// unique_ptr destructor calls ~ANSRTSPClient which calls Destroy() — no need to call Destroy() separately
std::unique_ptr<ANSCENTER::ANSRTSPClient> ptr(*Handle);
// Grab the raw pointer and NULL the caller's handle immediately.
// This prevents the caller (LabVIEW) from issuing new calls.
ANSCENTER::ANSRTSPClient* raw = *Handle;
*Handle = nullptr;
// Mark as not-playing under _mutex ONLY. This makes
// GetImage()/GetNV12Frame()/GetCudaHWFrame() return empty/null
// on any subsequent call, and prevents NEW NV12 GPU surface
// pointers from being handed out.
//
// Do NOT call Destroy()/close() here — close() frees the
// NVDEC GPU surfaces (cuArrayDestroy/cuMemFree) which may
// still be in use by a CUDA inference kernel that received
// the NV12 pointer from a GetRTSPCVImage call that already
// completed before this Release was called.
{
// Use the client's _mutex to safely set _isPlaying = false.
// This is the same lock GetImage/GetNV12Frame acquire.
raw->Stop(); // sets _isPlaying = false, stops playback
}
// Defer the full cleanup (Destroy + delete) to a background thread
// so LabVIEW's UI thread is not blocked. Destroy() now waits
// precisely for in-flight inference to finish (via _inFlightFrames
// counter + condition variable) instead of the old 500ms sleep hack.
std::thread([raw]() {
try { raw->Destroy(); } catch (...) {}
try { delete raw; } catch (...) {}
}).detach();
return 0;
} catch (...) {
if (Handle) *Handle = nullptr;
@@ -882,21 +1084,58 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(
// Attach NV12 frame for GPU fast-path inference (side-table registry)
// attach() takes ownership — do NOT av_frame_free here
//
// CRITICAL: TryIncrementInFlight() MUST be called BEFORE GetCudaHWFrame().
// It atomically checks _isPlaying and increments _inFlightFrames under
// the same mutex, so Reconnect() cannot call close() while we're doing
// the D2D copy from NVDEC surfaces inside gpu_frame_attach_cuda().
int gpuIdx = (*Handle)->GetHWDecodingGpuIndex();
bool inFlightGuardHeld = (*Handle)->TryIncrementInFlight();
RTSP_DBG("[GetRTSPCVImage] mat=%p gpuIdx=%d inFlightGuard=%d",
(void*)*image, gpuIdx, (int)inFlightGuardHeld);
if (inFlightGuardHeld) {
AVFrame* cudaHW = (*Handle)->GetCudaHWFrame();
if (cudaHW) {
// CUDA zero-copy: frame data[0]/data[1] are CUDA device pointers.
// Also attach CPU NV12 as fallback for cross-GPU inference
// (when decode GPU != inference GPU, CUDA ptrs aren't accessible).
RTSP_DBG("[GetRTSPCVImage] cudaHW: %dx%d data[0]=%p data[1]=%p",
cudaHW->width, cudaHW->height,
(void*)cudaHW->data[0], (void*)cudaHW->data[1]);
AVFrame* cpuNV12 = (*Handle)->GetNV12Frame();
gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12);
} else {
// HW decode not active — try CPU NV12
AVFrame* nv12 = (*Handle)->GetNV12Frame();
if (nv12) {
gpu_frame_attach(*image, nv12, gpuIdx, timeStamp);
}
}
// Wire up the registry callback to release the in-flight guard.
// TryIncrementInFlight already incremented; DecrementInFlight fires
// when the last clone of this frame is released after inference.
auto* gpuData = ANSGpuFrameRegistry::instance().lookup(*image);
RTSP_DBG("[GetRTSPCVImage] after attach: gpuData=%p yPlane=%p isCuda=%d gpuCacheY=%p",
(void*)gpuData,
gpuData ? (void*)gpuData->yPlane : nullptr,
gpuData ? (int)gpuData->isCudaDevicePtr : -1,
gpuData ? gpuData->gpuCacheY : nullptr);
if (gpuData) {
gpuData->ownerClient = *Handle;
gpuData->onReleaseFn = [](void* client) {
static_cast<ANSCENTER::ANSRTSPClient*>(client)->DecrementInFlight();
};
// NOTE: Do NOT call IncrementInFlight() again here —
// TryIncrementInFlight() already did it above.
} else {
// No gpuData registered (attach failed?) — release the guard
(*Handle)->DecrementInFlight();
}
} else {
// Player is stopping/reconnecting — skip CUDA path entirely.
// GetImage() already returned a cached BGR frame, which is safe.
RTSP_DBG("[GetRTSPCVImage] SKIP CUDA — player not playing (reconnecting?)");
}
return 1; // Success
}
catch (const cv::Exception& e) {

View File

@@ -16,6 +16,8 @@
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/opencv.hpp>
#include <atomic>
#include <condition_variable>
namespace ANSCENTER
{
@@ -37,7 +39,36 @@ namespace ANSCENTER
int64_t _pts;
bool _isPlaying;
std::recursive_mutex _mutex;
// --- Per-client inference guard ---
// Tracks how many GPU frames from this client are currently in-flight
// (grabbed by GetRTSPCVImage but not yet released after inference).
// Destroy() waits for this to reach 0 before freeing NVDEC surfaces,
// preventing the use-after-free crash when LabVIEW stops a camera
// while AI inference is still reading CUDA device pointers.
std::atomic<int> _inFlightFrames{0};
std::condition_variable_any _inFlightDone;
public:
void IncrementInFlight() { _inFlightFrames.fetch_add(1, std::memory_order_acq_rel); }
void DecrementInFlight() {
if (_inFlightFrames.fetch_sub(1, std::memory_order_acq_rel) <= 1) {
_inFlightDone.notify_all();
}
}
// Atomically check _isPlaying AND increment _inFlightFrames under the
// same mutex. Returns true if the caller may proceed to access CUDA
// resources (GetCudaHWFrame + D2D copy). Returns false if the player
// is stopping/reconnecting — caller must NOT touch CUDA resources.
//
// This closes the race window where Reconnect() sets _isPlaying=false
// and calls close() while GetRTSPCVImage is between GetCudaHWFrame()
// and the D2D copy in gpu_frame_attach_cuda().
bool TryIncrementInFlight() {
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (!_isPlaying) return false;
_inFlightFrames.fetch_add(1, std::memory_order_acq_rel);
return true;
}
ANSRTSPClient();
~ANSRTSPClient() noexcept;
[[nodiscard]] bool Init(std::string licenseKey, std::string url);

View File

@@ -48,13 +48,19 @@ namespace ANSCENTER {
Destroy();
}
void ANSSRTClient::Destroy() {
decltype(_playerClient) clientToClose;
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (_playerClient) {
if (_isPlaying) {
_playerClient->stop();
_isPlaying = false;
}
_playerClient->close();
}
clientToClose = std::move(_playerClient);
}
if (clientToClose) {
clientToClose->close();
}
}
static void VerifyGlobalANSSRTLicense(const std::string& licenseKey) {
@@ -124,8 +130,12 @@ namespace ANSCENTER {
}
}
bool ANSSRTClient::Reconnect() {
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
_isPlaying = false;
}
_playerClient->close();
std::lock_guard<std::recursive_mutex> lock(_mutex);
Setup();
_isPlaying = _playerClient->play();
return _isPlaying;
@@ -155,10 +165,16 @@ namespace ANSCENTER {
}
bool ANSSRTClient::Stop() {
decltype(_playerClient.get()) player = nullptr;
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (_isPlaying) {
_playerClient->stop();
_isPlaying = false;
player = _playerClient.get();
}
}
if (player) {
player->stop();
}
return true;
}

View File

@@ -40,16 +40,17 @@ namespace ANSCENTER {
catch (...) {}
}
void ANSVIDEOPLAYER::Destroy() {
// Move HW player out of lock scope — close() does CUDA cleanup
// (cuArrayDestroy/cuMemFree) which must not run under _mutex
// to avoid deadlocking with nvcuda64 SRW lock held by inference.
decltype(_hwPlayer) hwPlayerToClose;
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
try {
// --- HW decode cleanup ---
if (_hwPlayer) {
try {
_hwPlayer->stop();
_hwPlayer->close();
} catch (...) {}
_hwPlayer.reset(); // releases CFilePlayer + HWDecoderPool slot
try { _hwPlayer->stop(); } catch (...) {}
}
hwPlayerToClose = std::move(_hwPlayer);
_hwDecodeActive = false;
_hwGpuIndex = -1;
_hwCudaAccel = false;
@@ -77,6 +78,13 @@ namespace ANSCENTER {
catch (...) {
_logger.LogError("ANSVIDEOPLAYER::Destroy.", "Unknown exception", __FILE__, __LINE__);
}
} // end lock scope
// CUDA cleanup happens here, outside the mutex
if (hwPlayerToClose) {
try { hwPlayerToClose->close(); } catch (...) {}
hwPlayerToClose.reset();
}
}
static void VerifyGlobalANSVPLicense(const std::string& licenseKey) {
@@ -187,15 +195,25 @@ namespace ANSCENTER {
}
bool ANSVIDEOPLAYER::Reconnect() {
// HW decoder close() does CUDA cleanup — run outside _mutex
// to avoid deadlocking with nvcuda64 SRW lock held by inference.
decltype(_hwPlayer) hwPlayerToClose;
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
_isPlaying = false; // GetImage() returns cached frame while we reconnect
if (_hwPlayer) {
try { _hwPlayer->stop(); } catch (...) {}
hwPlayerToClose = std::move(_hwPlayer);
}
}
if (hwPlayerToClose) {
try { hwPlayerToClose->close(); } catch (...) {}
hwPlayerToClose.reset();
}
std::lock_guard<std::recursive_mutex> lock(_mutex);
try {
_currentFrame = 0;
// --- HW decode: destroy and re-setup ---
if (_hwPlayer) {
try { _hwPlayer->stop(); _hwPlayer->close(); } catch (...) {}
_hwPlayer.reset();
}
_hwDecodeActive = false;
_hwGpuIndex = -1;
_hwCudaAccel = false;
@@ -266,15 +284,17 @@ namespace ANSCENTER {
}
}
bool ANSVIDEOPLAYER::Stop() {
decltype(_hwPlayer.get()) hwPlayer = nullptr;
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
try {
// --- HW decode path ---
if (_hwDecodeActive && _hwPlayer) {
_hwPlayer->stop();
_isPlaying = false;
return true;
hwPlayer = _hwPlayer.get();
// stop() called outside the lock below; skip cap path
}
else {
// --- cv::VideoCapture fallback ---
if (cap.isOpened()) {
try {
@@ -296,11 +316,16 @@ namespace ANSCENTER {
_isPlaying = false;
return true;
}
}
catch (const std::exception& e) {
this->_logger.LogError("ANSVIDEOPLAYER::Stop. Exception occurred:", e.what(), __FILE__, __LINE__);
return false;
}
}
if (hwPlayer) {
hwPlayer->stop();
}
return true;
}
void ANSVIDEOPLAYER::SetBBox(cv::Rect bbox) {
std::lock_guard<std::recursive_mutex> lock(_mutex);

View File

@@ -378,7 +378,7 @@ namespace ANSCENTER {
}
}
std::vector<Object> ANSALPR_CPU::RunInference(const cv::Mat& input, const std::string &cameraId) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
// No coarse _mutex — sub-components have their own fine-grained locks.
std::vector<Object> output;
output.clear();
// Initial validation
@@ -419,17 +419,18 @@ namespace ANSCENTER {
#ifdef FNS_DEBUG // Corrected preprocessor directive
cv::Mat draw = input.clone();
#endif
_detectedArea = cv::Rect(0, 0, frame.cols, frame.rows);
if ((_detectedArea.width > 50) && (_detectedArea.height > 50)) {
// Use local variable instead of shared _detectedArea for thread safety
cv::Rect detectedArea(0, 0, frame.cols, frame.rows);
if ((detectedArea.width > 50) && (detectedArea.height > 50)) {
#ifdef FNS_DEBUG // Corrected preprocessor directive
cv::rectangle(draw, _detectedArea, cv::Scalar(0, 0, 255), 2); // RED for detectedArea
cv::rectangle(draw, detectedArea, cv::Scalar(0, 0, 255), 2); // RED for detectedArea
#endif
// Ensure _lprDetector is valid
if (!_lprDetector) {
this->_logger.LogFatal("ANSALPR_CPU::Inference", "_lprDetector is null", __FILE__, __LINE__);
return output;
}
cv::Mat activeFrame = frame(_detectedArea).clone();
cv::Mat activeFrame = frame(detectedArea).clone();
//std::vector<Object> lprOutputRaw = _lpDetector->RunInference(activeFrame, cameraId);
//std::vector<Object> lprOutput = AdjustLicensePlateBoundingBoxes(lprOutputRaw, _detectedArea, frame.size(), 3.0);
@@ -471,8 +472,12 @@ namespace ANSCENTER {
lprObject.cameraId = cameraId;
lprObject.polygon = RectToNormalizedPolygon(lprObject.box, input.cols, input.rows);
// OCR inference
std::vector<PaddleOCR::OCRPredictResult> res_ocr = ppocr->ocr(alignedLPR);
// OCR inference (ppocr is not thread-safe, use fine-grained lock)
std::vector<PaddleOCR::OCRPredictResult> res_ocr;
{
std::lock_guard<std::mutex> ocrLock(_ocrMutex);
res_ocr = ppocr->ocr(alignedLPR);
}
std::string ocrText;
if (!res_ocr.empty() && res_ocr.size() < 3) {
@@ -515,13 +520,13 @@ namespace ANSCENTER {
return output;
}
bool ANSALPR_CPU::Inference(const cv::Mat& input, std::string& lprResult) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
// No coarse _mutex — delegates to Inference(input, lprResult, cameraId)
if (input.empty()) return false;
if ((input.cols < 5) || (input.rows < 5)) return false;
return Inference(input, lprResult, "CustomCam");
}
bool ANSALPR_CPU::Inference(const cv::Mat& input, std::string& lprResult, const std::string & cameraId) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
// No coarse _mutex — sub-components have fine-grained locks.
std::vector<Object> output;
output.clear();
if (!_licenseValid) {
@@ -587,7 +592,12 @@ namespace ANSCENTER {
cv::Mat lprImage = frame(lprPos).clone();
lprObject.cameraId = cameraId;
lprObject.polygon = RectToNormalizedPolygon(lprObject.box, input.cols, input.rows);
std::vector<PaddleOCR::OCRPredictResult> res_ocr = ppocr->ocr(lprImage);
// ppocr is not thread-safe, use fine-grained lock
std::vector<PaddleOCR::OCRPredictResult> res_ocr;
{
std::lock_guard<std::mutex> ocrLock(_ocrMutex);
res_ocr = ppocr->ocr(lprImage);
}
int detectionSize = res_ocr.size();
if ((detectionSize > 0) && (detectionSize < 3)) {
for (int n = 0; n < res_ocr.size(); n++) { // number of detections
@@ -613,7 +623,7 @@ namespace ANSCENTER {
}
}
bool ANSALPR_CPU::Inference(const cv::Mat& input, const std::vector<cv::Rect> & Bbox, std::string& lprResult) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
// No coarse _mutex — delegates to Inference(input, Bbox, lprResult, cameraId)
if (input.empty()) return false;
if ((input.cols < 5) || (input.rows < 5)) return false;
return Inference(input, Bbox, lprResult, "CustomCam");
@@ -622,7 +632,7 @@ namespace ANSCENTER {
bool ANSALPR_CPU::Inference(const cv::Mat& input, const std::vector<cv::Rect>& Bbox,
std::string& lprResult, const std::string& cameraId)
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
// No coarse _mutex — sub-components have fine-grained locks.
// Early validation
if (!_licenseValid) {
@@ -668,16 +678,12 @@ namespace ANSCENTER {
}
try {
// Convert grayscale to BGR if necessary
const cv::Mat* framePtr;
// Convert grayscale to BGR if necessary (use local buffer for thread safety)
cv::Mat localFrame;
if (input.channels() == 1) {
cv::cvtColor(input, this->_frameBuffer, cv::COLOR_GRAY2BGR);
framePtr = &this->_frameBuffer;
cv::cvtColor(input, localFrame, cv::COLOR_GRAY2BGR);
}
else {
framePtr = &input;
}
const cv::Mat& frame = *framePtr;
const cv::Mat& frame = (input.channels() == 1) ? localFrame : input;
const int frameWidth = frame.cols;
const int frameHeight = frame.rows;
@@ -794,7 +800,12 @@ namespace ANSCENTER {
cv::Mat lprImage = frame(plateRect);
cv::Mat alignedLPR = enhanceForOCR(lprImage);
std::vector<PaddleOCR::OCRPredictResult> res_ocr = ppocr->ocr(alignedLPR);
// ppocr is not thread-safe, use fine-grained lock
std::vector<PaddleOCR::OCRPredictResult> res_ocr;
{
std::lock_guard<std::mutex> ocrLock(_ocrMutex);
res_ocr = ppocr->ocr(alignedLPR);
}
const size_t detectionSize = res_ocr.size();
if (detectionSize == 0 || detectionSize >= 3) {

View File

@@ -5,6 +5,7 @@
#include <list>
#include <map>
#include <string>
#include <mutex>
#include <utility>
#include <vector>
#include <include/paddleocr.h>
@@ -157,6 +158,7 @@ namespace ANSCENTER
"43B1", "68L1", "70G1", "36M1", "81N1", "90K1", "17B1", "64E1", "99D1", "60B2", "74L1", "60C1", "68M1", "63B7", "34B1", "69M1", "24B1", "15M1", "83Y1", "48C1", "95H1", "79X1", "17B6", "36E1", "38K1", "25N1", "25U1", "61B1", "36C1", "36B3", "38F1", "99G1", "69N1", "97D1", "92T1", "92B1", "88B1", "97G1", "14U1", "63A1", "26N1", "19D1", "93C1", "73B1", "84B1", "81K1", "18L1", "64D1", "35M1", "61N1", "83P1", "15S1", "82B1", "92U1", "43D1", "22L1", "63B5", "64G1", "27N1", "14X1", "62C1", "81D1", "38G1", "19F1", "34K1", "49P1", "89H1", "14T1", "19M1", "78D1", "76A1", "66K1", "66C1", "71C1", "37K1", "19G1", "15F1", "85C1", "49B1", "21B1", "89F1", "23M1", "66L1", "90B5", "93M1", "14P1", "77N1", "36B8", "86B1", "12U1", "63B3", "21L1", "36G5", "65G1", "82E1", "61H1", "65H1", "84A1", "23F1", "95C1", "99K1", "49G1", "92D1", "36K3", "92N1", "82X1", "83M1", "11N1", "14K1", "19H1", "93H1", "60A1", "79A1", "20D1", "90D1", "81C1", "66P1", "36K1", "92V1", "18B1", "37P1", "22Y1", "23H1", "26D1", "66G1", "78F1", "49C1", "26H1", "38P1", "47T1", "74H1", "63P1", "47D1", "15D1", "23D1", "68E1", "20B1", "49F1", "43K1", "65K1", "27Z1", "92S1", "79H1", "21E1", "35Y1", "14S1", "75E1", "24Y1", "12T1", "27P1", "77B1", "88H1", "60B3", "23P1", "61F1", "99H1", "23K1", "59A3", "26C1", "81B1", "74E1", "66B1", "22S1", "92P1", "93B1", "69B1", "81P1", "12H1", "62K1", "35A1", "77C1", "27V1", "68N1", "12D1", "64K1", "41A1", "12Z1", "76C1", "38B1", "78G1", "74K1", "69H1", "94A1", "61K1", "86B7", "82G1", "14N1", "82M1", "76E1", "18E1", "61C1", "15N1", "90A1", "77F1", "34D1", "47B1", "62S1", "43E1", "81M1", "92X1", "75B1", "34F1", "70H1", "62B1", "26B1", "60B4", "61A1", "12B1", "90T1", "92E1", "34C1", "47G1", "97B1", "25S1", "70E1", "93Y1", "47S1", "37F1", "28N1", "11K1", "38E1", "78M1", "74C1", "12S1", "75S1", "37A1", "28D1", "65L1", "22B1", "99B1", "74G1", "79K1", "76K1", "76H1", "23B1", "15R1", "36B1", "74D1", "62L1", "37E1", "78E1", "89K1", "26M1", "25F1", "48H1", "79D1", "43H1", "76F1", "36L1", "43L1", "21K1", "88L1", "27S1", "92K1", "77D1", "19N1", "66H1", "36H5", "62N1", "18G1", "75D1", "37L1", "68K1", "28C1", "26E1", "35N1", "85H1", "62D1", "27U1", "19E1", "99E1", "14Y1", "49L1", "66M1", "73F1", "70K1", "36F5", "97H1", "93E1", "68P1", "43F1", "48G1", "75K1", "62U1", "86B9", "65F1", "27L1", "70L1", "63B8", "78L1", "11Z1", "68C1", "18D1", "15L1", "99C1", "49E1", "84E1", "69E1", "38A1", "48D1", "68S1", "81E1", "84K1", "63B6", "24T1", "95A1", "86B4", "34M1", "84L1", "24V1", "14M1", "36H1", "15B1", "69F1", "47E1", "38H1", "88D1", "28E1", "60C2", "63B9", "75Y1", "21D1", "35H1", "68F1", "86B5", "15H1", "36B5", "83X1", "17B7", "12V1", "86B8", "95E1", "63B2", "74F1", "86C1", "48K1", "89M1", "85D1", "71C4", "34E1", "97C1", "88E1", "81F1", "60B5", "84M1", "92H1", "28L1", "34H1", "38X1", "82L1", "61E1", "82F1", "62P1", "93F1", "65B1", "93L1", "95B1", "15P1", "77G1", "28M1", "35B1", "68G1", "36C2", "68D1", "69K1", "14L1", "36M3", "24X1", "24Z1", "86A1", "88C1", "15E1", "77E1", "83E1", "47L1", "25T1", "89C1", "71C3", "49D1", "36L6", "48F1", "36B6", "34P1", "84D1", "15C1", "38M1", "85F1", "77K1", "86B3", "74B1", "78H1", "89G1", "64A2", "15K1", "85B1", "49K1", "21H1", "73C1", "47U1", "65E1", "18C1", "69D1", "63B1", "95G1", "19L1", "20G1", "76D1", "29A1", "68T1", "75L1", "12L1", "89L1", "37C1", "27B1", "19C1", "11H1", "81X1", "70B1", "11V1", "43G1", "22A1", "83C1", "75C1", "79C1", "22F1", "92F1", "81G1", "81T1", "28H1", "66N1", "71B1", "18H1", "76P1", "26F1", "81U1", "34N1", "64F1", "76N1", "24S1", "26P1", "63B4", "35T1", "36N1", "47F1", "81L1", "61G1", "77M1", "34G1", "26G1", "97F1", "62H1", "28F1", "62T1", "93G1", "73D1", "65A1", "47P1", "74P1", "82N1", "20E1", "36D1", "60B1", "49M1", "37H1", "37M1", "38D1", "84F1", "88F1", "36B2", "65C1", "92M1", "86B6", "75H1", "38L1", "20C1", "97E1", "85E1", "38N1", "26K1", "89B1", "99F1", "28B1", "34L1", "86B2", "66F1", "77L1", "27Y1", "68H1", "37D1", "92L1", "82K1", "99A1", "69L1", "76M1", "90B4", "48B1", "95D1", "20H1", "64H1", "79Z1", "92G1", "23G1", "21G1", "37G1", "35K1", "81H1", "83Z1", "76T1", "36F1", "36B4", "14B9", "47K1", "20K1", "62M1", "84H1", "62F1", "74A1", "18A1", "73H1", "37N1", "79N1", "61D1", "11P1", "15G1", "47N1", "19K1", "71C2", "81S1", "11M1", "60B7", "60B8", "62G1", "71A1", "24P1", "69A1", "38C1", "49N1", "21C1", "84G1", "37B1", "72A1", "88K1", "88G1", "83V1", "78C1", "73K1", "78K1", "73E189D1", "67A1", "27X1", "62A1", "18K1", "70F1", "36K5", "19B1", "49H1", "66S1", "12P1"};
ALPRChecker alprChecker;
std::vector<std::string> ValidVNCarList = { "94H", "49F", "93A", "20F", "81H", "95R", "38R", "29F", "81F", "28G", "19A", "85B", "2", "43H", "51L", "28C", "21A", "51D", "50F", "24H", "93R", "92H", "71G", "75H", "86G", "30L", "79A", "82B", "79H", "78C", "61E", "70A", "90C", "72G", "34B", "17E", "18E", "78A", "37F", "51E", "71A", "28F", "47E", "83D", "81B", "84C", "71H", "76G", "92E", "36A", "69R", "30M", "27R", "71D", "19B", "34E", "38K", "88G", "68G", "30E", "68E", "25F", "74D", "98K", "89H", "36R", "84D", "61F", "49G", "25H", "17F", "14R", "36H", "47G", "90A", "68A", "83C", "26B", "15B", "61C", "15K", "47H", "78E", "75D", "15C", "63E", "34C", "36F", "38G", "15E", "93F", "22G", "60B", "94D", "62R", "24D", "11R", "12A", "76A", "94C", "97R", "24E", "26A", "15F", "72A", "49H", "62D", "98C", "71B", "61A", "12C", "27A", "78R", "51M", "69E", "76D", "78F", "49R", "81A", "64F", "29D", "18A", "19F", "21E", "92A", "65G", "86E", "62G", "61K", "47A", "23R", "14F", "95D", "36B", "74R", "11H", "24C", "11G", "66D", "63A", "43R", "70F", "86B", "61G", "47M", "67C", "37D", "43G", "14H", "90F", "51G", "86A", "11E", "29K", "85C", "83F", "24B", "98R", "19E", "61B", "90D", "82G", "14K", "74G", "72D", "85A", "19C", "37G", "98E", "74F", "28H", "90E", "89D", "35R", "97H", "83H", "95A", "20C", "65E", "15R", "73C", "37A", "38E", "77G", "94B", "17A", "75R", "98F", "65R", "76R", "20B", "24G", "25B", "73G", "62F", "29G", "77C", "22H", "14D", "23F", "93C", "19R", "15D", "47R", "79D", "60G", "77A", "82C", "63G", "21H", "81E", "25D", "12D", "37R", "36K", "84F", "98G", "28B", "51N", "18F", "50R", "74C", "35C", "30G", "64A", "95F", "18C", "99G", "99B", "37C", "76H", "60K", "67R", "75A", "83R", "28E", "65F", "17D", "92G", "23C", "60R", "90R", "38A", "43D", "50H", "43C", "77H", "47B", "89F", "82F", "65H", "89E", "62C", "24R", "26G", "84E", "17C", "65B", "34A", "12B", "64R", "29H", "71C", "88D", "79F", "76C", "98A", "69H", "22B", "29A", "72R", "67H", "48C", "22D", "60C", "35H", "38H", "63P", "70D", "49D", "18H", "89A", "72E", "92D", "26H", "73R", "85G", "20E", "98H", "69C", "18B", "73B", "22E", "34G", "30K", "20D", "50A", "34D", "15H", "34H", "71E", "62E", "64C", "51R", "82D", "99E", "70R", "18D", "92F", "94R", "24A", "85H", "11C", "73E", "95E", "86C", "94F", "86R", "37K", "23B", "20H", "73D", "95H", "35A", "89B", "82H", "67F", "70H", "97F", "29E", "97A", "51K", "68D", "37B", "82E", "18R", "86H", "35B", "43E", "35F", "95B", "70E", "21D", "27F", "36E", "63D", "68C", "50E", "36G", "75F", "21G", "29B", "93B", "22A", "18G", "43F", "93G", "62A", "83B", "28D", "75C", "22C", "21R", "25E", "23G", "97C", "75E", "79E", "19H", "47K", "65C", "35E", "20R", "68B", "89R", "67A", "75G", "81R", "78B", "77D", "78G", "20K", "36D", "66C", "38F", "27G", "19D", "67B", "84G", "22F", "61D", "20G", "48A", "76F", "48H", "92B", "85R", "26C", "65A", "70B", "38D", "14C", "66A", "73A", "49C", "74E", "68R", "66B", "74A", "49E", "17B", "69D", "51C", "85F", "21F", "99C", "17G", "72H", "94E", "51F", "92R", "60H", "21B", "93D", "19G", "86F", "51A", "66R", "72B", "26D", "64E", "93H", "12H", "97E", "60E", "82A", "60A", "83E", "27D", "64B", "11B", "11D", "76B", "95G", "14A", "61R", "21C", "30F", "23H", "89C", "97G", "62B", "63R", "88B", "98B", "90B", "67G", "69F", "73H", "20A", "72C", "65D", "68H", "51H", "79G", "70C", "90G", "66G", "83A", "77F", "63B", "64G", "25A", "88E", "68F", "99D", "26E", "94A", "48F", "34R", "61H", "90H", "74B", "14G", "12F", "15A", "27E", "69A", "35D", "12E", "85E", "25C", "29M", "89G", "17R", "78D", "84R", "95C", "15G", "28R", "99A", "69G", "48D", "97D", "27C", "78H", "14E", "79R", "73F", "88A", "48E", "48B", "64H", "99R", "14B", "77R", "75B", "88F", "84B", "11A", "67E", "12R", "50M", "11F", "79C", "49A", "43A", "88R", "77E", "48G", "51B", "81D", "74H", "93E", "37H", "88C", "71F", "94G", "38C", "29C", "43B", "30H", "81G", "28A", "26R", "66H", "66E", "17H", "79B", "49B", "63C", "98D", "81C", "69B", "63H", "85D", "26F", "22R", "83G", "37E", "12G", "77B", "35G", "62H", "60D", "60F", "99H", "70G", "76E", "84A", "72F", "25R", "27B", "30A", "47F", "34F", "97B", "23E", "36C", "66F", "48R", "92C", "71R", "23A", "50G", "47C", "82R", "63F", "84H", "38B", "47D", "67D", "25G", "86D", "88H", "64D", "24F", "23D", "99F" };
std::mutex _ocrMutex; // Fine-grained lock for PaddleOCR (not thread-safe)
std::unique_ptr<PaddleOCR::PPOCR> ppocr = std::make_unique<PaddleOCR::PPOCR>();
[[nodiscard]] std::string AnalyseLicensePlateText(const std::string& ocrText);
[[nodiscard]] char convertDigitToLetter(char c);

View File

@@ -863,7 +863,8 @@ namespace ANSCENTER {
}
}
std::vector<Object> ANSALPR_OD::RunInferenceSingleFrame(const cv::Mat& input, const std::string& cameraId) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
// No coarse _mutex here — sub-components (detectors, alprChecker) have their own locks.
// LabVIEW semaphore controls concurrency at the caller level.
// Early validation
if (!_licenseValid) {
@@ -916,18 +917,19 @@ namespace ANSCENTER {
cv::Mat draw = input.clone();
#endif
_detectedArea = cv::Rect(0, 0, frameWidth, frameHeight);
// Use local variable instead of shared _detectedArea for thread safety
cv::Rect detectedArea(0, 0, frameWidth, frameHeight);
if (_detectedArea.width <= 50 || _detectedArea.height <= 50) {
if (detectedArea.width <= 50 || detectedArea.height <= 50) {
return {};
}
#ifdef FNS_DEBUG
cv::rectangle(draw, _detectedArea, cv::Scalar(0, 0, 255), 2);
cv::rectangle(draw, detectedArea, cv::Scalar(0, 0, 255), 2);
#endif
// Run license plate detection
cv::Mat activeFrame = frame(_detectedArea);
cv::Mat activeFrame = frame(detectedArea);
std::vector<Object> lprOutput = _lpDetector->RunInference(activeFrame, cameraId);
if (lprOutput.empty()) {
@@ -1010,7 +1012,7 @@ namespace ANSCENTER {
return {};
}
std::string ANSALPR_OD::DetectLicensePlateString(const cv::Mat& lprROI, const std::string& cameraId) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
// No coarse _mutex — _ocrDetector has its own m_inferenceMutex
try {
// convert lprROI to greyscale if it is not already
if (lprROI.empty()) {
@@ -1277,8 +1279,7 @@ namespace ANSCENTER {
return {};
}
std::lock_guard<std::recursive_mutex> lock(_mutex);
// No coarse _mutex — _lpColourDetector has its own m_inferenceMutex
try {
std::vector<Object> colourOutputs = _lpColourDetector->RunInference(lprROI, cameraId);
@@ -1310,8 +1311,9 @@ namespace ANSCENTER {
return DetectLPColourDetector(lprROI, cameraId);
}
// Check cache first (no GPU work needed)
// Check cache first (fine-grained lock, no GPU work)
{
std::lock_guard<std::mutex> cacheLock(_colourCacheMutex);
auto it = _colourCache.find(plateText);
if (it != _colourCache.end()) {
it->second.hitCount++;
@@ -1319,11 +1321,12 @@ namespace ANSCENTER {
}
}
// Cache miss — run the actual classifier
// Cache miss — run the actual classifier (no lock held during GPU inference)
std::string colour = DetectLPColourDetector(lprROI, cameraId);
// Store in cache
// Store in cache (fine-grained lock)
if (!colour.empty()) {
std::lock_guard<std::mutex> cacheLock(_colourCacheMutex);
if (_colourCache.size() >= COLOUR_CACHE_MAX_SIZE) {
_colourCache.clear();
}
@@ -1334,13 +1337,14 @@ namespace ANSCENTER {
}
bool ANSALPR_OD::Inference(const cv::Mat& input, std::string& lprResult) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
// No coarse _mutex — delegates to Inference(input, lprResult, cameraId) which is also lock-free
if (input.empty()) return false;
if ((input.cols < 5) || (input.rows < 5)) return false;
return Inference(input, lprResult, "CustomCam");
}
bool ANSALPR_OD::Inference(const cv::Mat& input, std::string& lprResult, const std::string& cameraId) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
// No coarse _mutex — sub-components have their own fine-grained locks.
// LabVIEW semaphore controls concurrency at the caller level.
// Early validation
if (!_licenseValid) {
@@ -1518,14 +1522,14 @@ namespace ANSCENTER {
}
}
bool ANSALPR_OD::Inference(const cv::Mat& input, const std::vector<cv::Rect> & Bbox, std::string& lprResult) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
// No coarse _mutex — delegates to Inference(input, Bbox, lprResult, cameraId)
if (input.empty()) return false;
if ((input.cols < 5) || (input.rows < 5)) return false;
return Inference(input, Bbox, lprResult, "CustomCam");
}
bool ANSALPR_OD::Inference(const cv::Mat& input, const std::vector<cv::Rect>& Bbox,std::string& lprResult, const std::string& cameraId)
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
// No coarse _mutex — sub-components have their own fine-grained locks.
// Early validation
if (!_licenseValid) {
@@ -2177,12 +2181,10 @@ namespace ANSCENTER {
cv::Mat unsharp;
cv::addWeighted(denoised, 1.8, blurred, -0.8, 0, unsharp);
// Step 5: CLAHE contrast enhancement
if (!_clahe) {
_clahe = cv::createCLAHE(4.0, cv::Size(8, 8));
}
// Step 5: CLAHE contrast enhancement (thread-local for thread safety)
thread_local cv::Ptr<cv::CLAHE> tl_clahe = cv::createCLAHE(4.0, cv::Size(8, 8));
cv::Mat contrastEnhanced;
_clahe->apply(unsharp, contrastEnhanced);
tl_clahe->apply(unsharp, contrastEnhanced);
// Step 6: Laplacian edge sharpening
cv::Mat lap;
@@ -2718,6 +2720,7 @@ namespace ANSCENTER {
void ANSALPR_OD::ensureUniquePlateText(std::vector<Object>& results, const std::string& cameraId)
{
std::lock_guard<std::mutex> plateLock(_plateIdentitiesMutex);
auto& identities = _plateIdentities[cameraId];
// Option B: Auto-detect mode by counting detections.

View File

@@ -24,7 +24,7 @@ namespace ANSCENTER
ANSCENTER::ModelConfig _lpdmodelConfig;
ANSCENTER::ModelConfig _ocrModelConfig;
ANSCENTER::ModelConfig _lpColourModelConfig;
cv::Ptr<cv::CLAHE> _clahe; // Reusable CLAHE instance
// _clahe moved to thread-local in enhanceForOCR() for thread safety
ANSCENTER::NV12PreprocessHelper _nv12Helper; // NV12 crop for high-res plate OCR
std::string _lpdLabels;
@@ -147,6 +147,7 @@ namespace ANSCENTER
int framesSinceLastSeen = 0;
};
// cameraId → list of tracked plate identities
std::mutex _plateIdentitiesMutex; // Fine-grained lock for plate identity tracking
std::unordered_map<std::string, std::vector<SpatialPlateIdentity>> _plateIdentities;
static constexpr float PLATE_SPATIAL_MATCH_THRESHOLD = 0.3f; // IoU threshold for same plate
void ensureUniquePlateText(std::vector<Object>& results, const std::string& cameraId);
@@ -176,6 +177,7 @@ namespace ANSCENTER
std::string colour;
int hitCount = 0;
};
std::mutex _colourCacheMutex; // Fine-grained lock for colour cache only
std::unordered_map<std::string, ColourCacheEntry> _colourCache;
static constexpr size_t COLOUR_CACHE_MAX_SIZE = 200;

View File

@@ -118,7 +118,7 @@ namespace ANSCENTER {
}
std::vector<ANSCENTER::OCRObject> ANSOCR::RunInference(const cv::Mat& input, const std::string& cameraId) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
// No coarse _mutex — ppOCR->Predict() / engine has its own internal lock
std::vector<ANSCENTER::OCRObject> OCRObjects;
OCRObjects.clear();
if (!_licenseValid) {
@@ -177,7 +177,7 @@ namespace ANSCENTER {
std::vector<ANSCENTER::OCRObject> ANSOCR::RunInference(const cv::Mat& input, const std::vector<cv::Rect>& Bbox) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
// No coarse _mutex — ppOCR->Predict() / engine has its own internal lock
std::vector<ANSCENTER::OCRObject> OCRObjects;
OCRObjects.clear();
if (!_licenseValid) {
@@ -271,7 +271,7 @@ namespace ANSCENTER {
std::vector<ANSCENTER::OCRObject> ANSOCR::RunInference(const cv::Mat& input, const std::vector<cv::Rect>& Bbox, const std::string& cameraId) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
// No coarse _mutex — ppOCR->Predict() / engine has its own internal lock
std::vector<ANSCENTER::OCRObject> OCRObjects;
OCRObjects.clear();
if (!_licenseValid) {

View File

@@ -80,7 +80,7 @@ std::vector<ANSCENTER::OCRObject> ANSONNXOCR::RunInference(const cv::Mat& input)
}
std::vector<ANSCENTER::OCRObject> ANSONNXOCR::RunInference(const cv::Mat& input, const std::string& cameraId) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
// No coarse _mutex — _engine->ocr() has its own internal lock
std::vector<ANSCENTER::OCRObject> OCRObjects;
if (!_licenseValid) {
@@ -164,7 +164,7 @@ std::vector<ANSCENTER::OCRObject> ANSONNXOCR::RunInference(const cv::Mat& input,
}
std::vector<ANSCENTER::OCRObject> ANSONNXOCR::RunInference(const cv::Mat& input, const std::vector<cv::Rect>& Bbox) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
// No coarse _mutex — _engine->ocr() has its own internal lock
std::vector<ANSCENTER::OCRObject> OCRObjects;
if (!_licenseValid) {
@@ -268,7 +268,7 @@ std::vector<ANSCENTER::OCRObject> ANSONNXOCR::RunInference(const cv::Mat& input,
}
std::vector<ANSCENTER::OCRObject> ANSONNXOCR::RunInference(const cv::Mat& input, const std::vector<cv::Rect>& Bbox, const std::string& cameraId) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
// No coarse _mutex — _engine->ocr() has its own internal lock
std::vector<ANSCENTER::OCRObject> OCRObjects;
if (!_licenseValid) {
@@ -385,7 +385,7 @@ bool ANSONNXOCR::Destroy() {
}
std::pair<std::string, float> ANSONNXOCR::RecognizeText(const cv::Mat& croppedImage) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
// No coarse _mutex — _engine->recognizeOnly() has its own internal lock
if (!_isInitialized || !_engine || croppedImage.empty()) return {"", 0.0f};
auto result = _engine->recognizeOnly(croppedImage);
return {result.text, result.score};

View File

@@ -90,7 +90,7 @@ std::vector<ANSCENTER::OCRObject> ANSRTOCR::RunInference(const cv::Mat& input) {
}
std::vector<ANSCENTER::OCRObject> ANSRTOCR::RunInference(const cv::Mat& input, const std::string& cameraId) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
// No coarse _mutex — _engine->ocr() has its own internal lock
std::vector<ANSCENTER::OCRObject> OCRObjects;
if (!_licenseValid) {
@@ -178,7 +178,7 @@ std::vector<ANSCENTER::OCRObject> ANSRTOCR::RunInference(const cv::Mat& input, c
}
std::vector<ANSCENTER::OCRObject> ANSRTOCR::RunInference(const cv::Mat& input, const std::vector<cv::Rect>& Bbox) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
// No coarse _mutex — _engine->ocr() has its own internal lock
std::vector<ANSCENTER::OCRObject> OCRObjects;
if (!_licenseValid) {
@@ -282,7 +282,7 @@ std::vector<ANSCENTER::OCRObject> ANSRTOCR::RunInference(const cv::Mat& input, c
}
std::vector<ANSCENTER::OCRObject> ANSRTOCR::RunInference(const cv::Mat& input, const std::vector<cv::Rect>& Bbox, const std::string& cameraId) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
// No coarse _mutex — _engine->ocr() has its own internal lock
std::vector<ANSCENTER::OCRObject> OCRObjects;
if (!_licenseValid) {
@@ -379,7 +379,7 @@ std::vector<ANSCENTER::OCRObject> ANSRTOCR::RunInference(const cv::Mat& input, c
}
std::pair<std::string, float> ANSRTOCR::RecognizeText(const cv::Mat& croppedImage) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
// No coarse _mutex — _engine->recognizeOnly() has its own internal lock
if (!_isInitialized || !_engine || croppedImage.empty()) return {"", 0.0f};
auto result = _engine->recognizeOnly(croppedImage);
return {result.text, result.score};

View File

@@ -1455,7 +1455,7 @@ namespace ANSCENTER
}
}
std::vector<Object> ANSODBase::RunStaticInference(const cv::Mat& input, cv::Rect Bbox, const std::string& camera_id) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
// No coarse _mutex — only uses local variables and virtual RunInference() which has its own engine lock
std::vector<Object> output;
output.clear();
try {
@@ -2100,7 +2100,8 @@ namespace ANSCENTER
}
}
std::vector<Object> ANSODBase::RunInferenceWithOption(const cv::Mat& input, const std::string& camera_id, const std::string activeROIMode) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
// No coarse _mutex — sub-components (engines, trackers) have their own locks.
// LabVIEW semaphore controls concurrency at the caller level.
try {
int mode = 0;
double confidenceThreshold = 0.35;
@@ -2116,8 +2117,11 @@ namespace ANSCENTER
if (confidenceThreshold <= 0) confidenceThreshold = 0;
if (confidenceThreshold > 1) confidenceThreshold = 1;
// Update model configuration with the new parameters
if(confidenceThreshold>0)_modelConfig.detectionScoreThreshold = confidenceThreshold;
// Update model configuration with the new parameters (brief lock for config)
if (confidenceThreshold > 0) {
std::lock_guard<std::recursive_mutex> cfgLock(_mutex);
_modelConfig.detectionScoreThreshold = confidenceThreshold;
}
switch (mode) {
case 0: // Normal mode
return RunInference(input, camera_id); //RunInference

View File

@@ -275,6 +275,26 @@ namespace ANSCENTER {
gpuData->gpuIndex == inferenceGpu;
const bool useZeroCopy = isCudaDevice && gpuMatch;
// --- Debug: log pointer state before reading ---
{
char _nv12_dbg[512];
snprintf(_nv12_dbg, sizeof(_nv12_dbg),
"[NV12Helper] tryNV12: gpuData=%p yPlane=%p uvPlane=%p isCuda=%d "
"gpuIdx=%d infGpu=%d gpuMatch=%d zeroCopy=%d "
"gpuCacheY=%p gpuCacheUV=%p gpuCacheValid=%d refcount=%d %dx%d\n",
(void*)gpuData, (void*)gpuData->yPlane, (void*)gpuData->uvPlane,
(int)isCudaDevice, gpuData->gpuIndex, inferenceGpu,
(int)gpuMatch, (int)useZeroCopy,
gpuData->gpuCacheY, gpuData->gpuCacheUV,
(int)gpuData->gpuCacheValid,
gpuData->refcount.load(),
frameW, frameH);
#ifdef _WIN32
OutputDebugStringA(_nv12_dbg);
#endif
fprintf(stderr, "%s", _nv12_dbg);
}
// Effective plane pointers — for zero-copy, use CUDA device ptrs;
// for CPU upload, use the CPU snapshot buffers.
uint8_t* effYPlane;
@@ -283,7 +303,7 @@ namespace ANSCENTER {
int effUvLinesize;
if (useZeroCopy) {
// Same GPU: wrap NVDEC device pointers directly
// Same GPU: wrap owned CUDA device pointers directly
effYPlane = gpuData->yPlane;
effUvPlane = gpuData->uvPlane;
effYLinesize = gpuData->yLinesize;
@@ -435,6 +455,18 @@ namespace ANSCENTER {
gpuResized.create(inputH, inputW, CV_8UC3);
cudaStream_t rawStream = cv::cuda::StreamAccessor::getStream(stream);
{
char _nv12_dbg2[256];
snprintf(_nv12_dbg2, sizeof(_nv12_dbg2),
"[NV12Helper] KERNEL LAUNCH: gpuY=%p(%dx%d) gpuUV=%p(%dx%d) -> %dx%d zeroCopy=%d\n",
(void*)gpuY.data, gpuY.cols, gpuY.rows,
(void*)gpuUV.data, gpuUV.cols, gpuUV.rows,
inputW, inputH, (int)useZeroCopy);
#ifdef _WIN32
OutputDebugStringA(_nv12_dbg2);
#endif
fprintf(stderr, "%s", _nv12_dbg2);
}
launcher(gpuY, gpuUV, gpuResized, frameW, frameH, inputW, inputH, rawStream);
stream.waitForCompletion();
@@ -945,7 +977,15 @@ namespace ANSCENTER {
inputW, inputH, frameW, frameH, stream);
}
cudaStreamSynchronize(stream);
// Use polling sync instead of cudaStreamSynchronize to avoid
// holding nvcuda64 SRW lock continuously (WDDM deadlock prevention).
{
cudaError_t err = cudaStreamQuery(stream);
while (err == cudaErrorNotReady) {
Sleep(0);
err = cudaStreamQuery(stream);
}
}
// (No registry lock to release — data kept alive by refcount)

View File

@@ -8,6 +8,9 @@
#include <cuda_runtime.h>
#include <cstdint>
#ifdef _WIN32
#include <windows.h> // Sleep()
#endif
#include <cstdio>
// ── Shared YUV→RGB computation ───────────────────────────────────────────
@@ -651,7 +654,24 @@ int ANSGpuNV12ToBGR(
width * 3, height,
cudaMemcpyDeviceToHost, t_bufs.stream);
cudaStreamSynchronize(t_bufs.stream);
// Use polling sync instead of cudaStreamSynchronize to avoid
// holding nvcuda64 SRW lock continuously (WDDM deadlock prevention).
// Short Sleep(0) fast path for sub-ms kernels, then Sleep(1) to give
// cleanup operations (cuArrayDestroy, cuMemFree) a window to acquire
// the exclusive SRW lock.
{
cudaError_t qerr = cudaStreamQuery(t_bufs.stream);
if (qerr == cudaErrorNotReady) {
for (int i = 0; i < 10 && qerr == cudaErrorNotReady; ++i) {
Sleep(0);
qerr = cudaStreamQuery(t_bufs.stream);
}
while (qerr == cudaErrorNotReady) {
Sleep(1);
qerr = cudaStreamQuery(t_bufs.stream);
}
}
}
// Check for errors
cudaError_t err = cudaGetLastError();

View File

@@ -23,6 +23,7 @@
#include <thread>
#include <mutex>
#include <atomic>
#include <random>
#include <chrono>
#include <deque>
#include <set>
@@ -751,8 +752,11 @@ static void LogGpuInfo() {
// Worker thread: reads RTSP frames and runs ALPR inference
// RTSP client and ALPR engine are pre-created on the main thread to avoid
// race conditions in CreateANSRTSPHandle / CreateANSALPRHandle.
// Takes rtspClientPtr (pointer to array slot) + streamGuard mutex so the
// CHAOS thread can safely destroy+recreate the stream without use-after-free.
static void ALPRWorkerThread(int taskId,
ANSCENTER::ANSRTSPClient* rtspClient,
ANSCENTER::ANSRTSPClient** rtspClientPtr,
std::mutex* streamGuard,
ANSCENTER::ANSALPR* alprHandle,
TaskState& state) {
char tag[32];
@@ -780,6 +784,23 @@ static void ALPRWorkerThread(int taskId,
bool hwDecodeLogged = false;
while (g_running.load()) {
// Lock the stream guard to prevent CHAOS from destroying the client
// while we're mid-frame-grab or mid-inference.
std::unique_lock<std::mutex> streamLock(*streamGuard);
// Re-read the client pointer each iteration — CHAOS may have
// destroyed+recreated it, so our old pointer could be dangling.
ANSCENTER::ANSRTSPClient* rtspClient = *rtspClientPtr;
if (rtspClient == nullptr) {
streamLock.unlock();
emptyFrames++;
if (emptyFrames % 100 == 1) {
g_log.add(prefix + " Stream destroyed by CHAOS, waiting... (count=" + std::to_string(emptyFrames) + ")");
}
std::this_thread::sleep_for(std::chrono::milliseconds(50));
continue;
}
// Read frame from RTSP via ANSCV
auto grabStart = std::chrono::steady_clock::now();
cv::Mat* framePtr = nullptr;
@@ -797,6 +818,7 @@ static void ALPRWorkerThread(int taskId,
ReconnectRTSP(&rtspClient);
emptyFrames = 0;
}
streamLock.unlock();
if (framePtr) delete framePtr;
std::this_thread::sleep_for(std::chrono::milliseconds(10));
continue;
@@ -829,6 +851,9 @@ static void ALPRWorkerThread(int taskId,
// matches by cv::Mat* pointer, so `new cv::Mat(*framePtr)` would create
// a different pointer the registry doesn't know, breaking NV12 zero-copy.
ANSALPR_RunInferenceComplete_CPP(&alprHandle, &framePtr, cameraId.c_str(), 0, 0, lpnResult, jpegImage);
// Release stream lock — inference is done, CHAOS can now safely destroy.
streamLock.unlock();
auto infEnd = std::chrono::steady_clock::now();
double infMs = std::chrono::duration<double, std::milli>(infEnd - infStart).count();
totalInfMs += infMs;
@@ -933,19 +958,20 @@ int ANSLPR_MultiGPU_StressTest() {
printf("\n");
printf("============================================================\n");
printf(" ANSLPR Multi-GPU Stress Test — 4 Parallel ALPR Tasks\n");
printf(" ANSLPR Multi-GPU Stress Test — 5 Parallel ALPR Tasks\n");
printf(" (4 cameras, 5 AI tasks — Task 4 shares Stream 2)\n");
printf(" Press ESC to stop\n");
printf(" Log file: %s\n", LOG_FILE_PATH);
printf("============================================================\n\n");
g_log.add("============================================================");
g_log.add(" ANSLPR Multi-GPU Stress Test — 4 Parallel ALPR Tasks");
g_log.add(" ANSLPR Multi-GPU Stress Test — 5 Parallel ALPR Tasks");
g_log.add("============================================================");
// --- Log GPU info for diagnostics ---
LogGpuInfo();
// --- RTSP URLs (4 independent streams, one per task) ---
// --- RTSP URLs (4 independent camera streams) ---
const std::string rtspUrl0 = "rtsp://admin:admin123@103.156.0.133:8010/cam/realmonitor?channel=1&subtype=0";
const std::string rtspUrl1 = "rtsp://cafe2471.ddns.net:600/rtsp/streaming?channel=01&subtype=0";
const std::string rtspUrl2 = "rtsp://nhathuocngoclinh.zapto.org:600/rtsp/streaming?channel=01&subtype=0";
@@ -956,18 +982,39 @@ int ANSLPR_MultiGPU_StressTest() {
g_log.add("Stream 2: " + rtspUrl2);
g_log.add("Stream 3: " + rtspUrl3);
// =========================================================================
// Architecture: Camera Process + AI Task Process (mimics LabVIEW)
// -----------------------------------------------------------------------
// Camera Process: 4 independent RTSP streams acquire frames from cameras.
// AI Task Process: 5 AI tasks subscribe to camera streams and run inference
// in parallel. Multiple tasks can share one camera stream.
// Task 4 subscribes to Stream 2 (nhathuocngoclinh) to demonstrate the
// shared-camera subscription model used in LabVIEW.
// =========================================================================
const int NUM_STREAMS = 4;
const int NUM_TASKS = 5;
// --- Task states ---
TaskState taskStates[4];
TaskState taskStates[NUM_TASKS];
// =========================================================================
// Create 4 INDEPENDENT RTSP readers one per task, each with its own
// camera stream. Each task gets a dedicated RTSP connection.
// CAMERA PROCESS: Create 4 independent RTSP readers (one per camera).
// These form the camera acquisition layer that AI tasks subscribe to.
// =========================================================================
const int NUM_STREAMS = 4;
ANSCENTER::ANSRTSPClient* rtspClients[NUM_STREAMS] = {};
const std::string streamUrls[NUM_STREAMS] = { rtspUrl0, rtspUrl1, rtspUrl2, rtspUrl3 };
// Map: task index -> stream index (1:1 mapping)
const int taskStreamMap[4] = { 0, 1, 2, 3 };
// Map: task index -> stream index
// Tasks 0-3 map 1:1 to streams 0-3.
// Task 4 subscribes to Stream 2 (nhathuocngoclinh) — shared camera.
const int taskStreamMap[NUM_TASKS] = { 0, 1, 2, 3, 2 };
// Log task-to-stream subscription mapping
g_log.add("--- AI Task -> Camera Stream subscription ---");
for (int i = 0; i < NUM_TASKS; i++) {
g_log.add(" Task " + std::to_string(i) + " -> Stream " + std::to_string(taskStreamMap[i])
+ " (" + streamUrls[taskStreamMap[i]] + ")");
}
for (int s = 0; s < NUM_STREAMS; s++) {
printf("[Stream%d] Creating RTSP handle for %s...\n", s, streamUrls[s].c_str());
@@ -986,14 +1033,17 @@ int ANSLPR_MultiGPU_StressTest() {
}
// =========================================================================
// Create 4 ALPR engines sequentially
// AI TASK PROCESS: Create 5 ALPR engines sequentially.
// Each AI task gets its own engine and subscribes to a camera stream.
// Task 4 shares Stream 2 (nhathuocngoclinh) with Task 2 — demonstrating
// the LabVIEW pattern where multiple AI tasks subscribe to one camera.
// =========================================================================
ANSCENTER::ANSALPR* alprHandles[4] = {};
ANSCENTER::ANSALPR* alprHandles[NUM_TASKS] = {};
std::string modelZipFile = "C:\\ProgramData\\ANSCENTER\\ANSVIS Server\\ANSALPR\\ANS_ALPR_v1.2.zip";
int engineType = 1; // NVIDIA_GPU
double detThresh = 0.5, ocrThresh = 0.5, colThresh = 0.5;
for (int i = 0; i < 4; i++) {
for (int i = 0; i < NUM_TASKS; i++) {
char tag[32];
snprintf(tag, sizeof(tag), "[Task%d]", i);
@@ -1109,7 +1159,7 @@ int ANSLPR_MultiGPU_StressTest() {
// Count votes: how many tasks on this stream use each GPU
std::map<int, int> gpuVotes;
for (int i = 0; i < 4; i++) {
for (int i = 0; i < NUM_TASKS; i++) {
if (taskStreamMap[i] == s && alprHandles[i]) {
gpuVotes[taskStates[i].gpuDeviceId]++;
}
@@ -1194,30 +1244,132 @@ int ANSLPR_MultiGPU_StressTest() {
}
// --- Enable deep pipeline benchmarking on all ALPR handles ---
for (int i = 0; i < 4; i++) {
for (int i = 0; i < NUM_TASKS; i++) {
if (alprHandles[i]) {
alprHandles[i]->ActivateDebugger(true);
}
}
g_log.add("Debug benchmarking ENABLED on all ALPR handles");
// --- Launch worker threads — tasks sharing a stream get the same RTSP client ---
g_log.add("Launching worker threads...");
std::thread workers[4];
for (int i = 0; i < 4; i++) {
// --- Per-stream mutex: prevents CHAOS from destroying a stream while a
// worker is mid-frame-grab or mid-inference (use-after-free fix). ---
std::mutex streamGuards[NUM_STREAMS];
// --- Launch worker threads ---
// Each AI task subscribes to its camera stream via taskStreamMap.
// Tasks sharing a stream (e.g. Task 2 & Task 4 on Stream 2) both get
// the same RTSP client pointer and share the stream's mutex guard.
g_log.add("Launching " + std::to_string(NUM_TASKS) + " worker threads...");
std::thread workers[NUM_TASKS];
for (int i = 0; i < NUM_TASKS; i++) {
int streamIdx = taskStreamMap[i];
if (rtspClients[streamIdx] && alprHandles[i]) {
workers[i] = std::thread(ALPRWorkerThread, i,
rtspClients[streamIdx], alprHandles[i],
&rtspClients[streamIdx],
&streamGuards[streamIdx],
alprHandles[i],
std::ref(taskStates[i]));
}
}
// =========================================================================
// Camera Chaos Thread — simulates camera errors / reconnects
// Mimics LabVIEW behavior: cameras randomly go into Error/Recovering
// state, triggering Stop/Reconnect/Destroy+Recreate cycles that cause
// CUDA cleanup (cuArrayDestroy, cuMemFree) while inference is running.
// This is the exact scenario that triggers the nvcuda64 SRW lock deadlock.
// =========================================================================
std::atomic<bool> chaosEnabled{true};
std::thread chaosThread([&]() {
std::mt19937 rng(std::random_device{}());
// Wait 10 seconds for system to stabilize before starting chaos
for (int i = 0; i < 100 && g_running.load(); i++) {
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
g_log.add("[CHAOS] Camera chaos thread started — every 10s, stop/destroy/recreate one camera (round-robin)");
printf("[CHAOS] Camera chaos thread started — 10s interval, round-robin across %d streams\n", NUM_STREAMS);
int chaosCount = 0;
int nextStream = 0; // Round-robin: cycle through streams 0,1,2,3,0,1,...
while (g_running.load() && chaosEnabled.load()) {
// Fixed 10-second interval between chaos events
for (int s = 0; s < 100 && g_running.load(); s++) {
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
if (!g_running.load()) break;
int streamIdx = nextStream;
nextStream = (nextStream + 1) % NUM_STREAMS;
chaosCount++;
char buf[512];
auto chaosStart = std::chrono::steady_clock::now();
// Lock stream guard: wait for any in-flight inference to finish
// before touching the RTSP client. This prevents use-after-free
// when CHAOS destroys a stream while a worker is mid-inference.
std::unique_lock<std::mutex> chaosLock(streamGuards[streamIdx]);
// Always use full DESTROY + RECREATE cycle.
// Reconnect() reuses internal player state which can leave stale
// CUDA resources and cause freezes. A clean destroy + recreate
// guarantees a fresh decoder/player with no leftover state.
{
bool wasAlive = (rtspClients[streamIdx] != nullptr);
snprintf(buf, sizeof(buf), "[CHAOS #%d] Stream%d: DESTROY + RECREATE (%s)",
chaosCount, streamIdx,
wasAlive ? "camera was running" : "camera was already offline");
g_log.add(buf);
printf("%s\n", buf);
// Stop and release old handle if it exists
if (rtspClients[streamIdx]) {
StopRTSP(&rtspClients[streamIdx]);
ReleaseANSRTSPHandle(&rtspClients[streamIdx]);
rtspClients[streamIdx] = nullptr;
}
// Release lock during offline sleep — worker sees nullptr and skips
int offlineMs = 500 + (rng() % 2500); // 0.5 - 3 seconds offline
chaosLock.unlock();
std::this_thread::sleep_for(std::chrono::milliseconds(offlineMs));
chaosLock.lock();
// Recreate the RTSP handle (under lock again)
int result = CreateANSRTSPHandle(&rtspClients[streamIdx], "", "", "",
streamUrls[streamIdx].c_str());
if (result == 1 && rtspClients[streamIdx]) {
SetRTSPImageQuality(&rtspClients[streamIdx], 0);
SetRTSPHWDecoding(&rtspClients[streamIdx], 7);
StartRTSP(&rtspClients[streamIdx]);
auto chaosEnd = std::chrono::steady_clock::now();
double chaosMs = std::chrono::duration<double, std::milli>(chaosEnd - chaosStart).count();
snprintf(buf, sizeof(buf), "[CHAOS #%d] Stream%d: RECREATED in %.0f ms (offline %d ms)",
chaosCount, streamIdx, chaosMs, offlineMs);
} else {
snprintf(buf, sizeof(buf), "[CHAOS #%d] Stream%d: RECREATE FAILED (result=%d)",
chaosCount, streamIdx, result);
}
g_log.add(buf);
printf("%s\n", buf);
}
}
g_log.add("[CHAOS] Camera chaos thread stopped (total events: " + std::to_string(chaosCount) + ")");
printf("[CHAOS] Camera chaos thread stopped (total events: %d)\n", chaosCount);
});
// --- Display loop (main thread) ---
const int cellW = 640, cellH = 480;
const int logPanelH = 200;
// 3x2 grid layout: 5 tasks displayed in 3 columns x 2 rows
const int cellW = 480, cellH = 360; // Smaller cells for 3-column layout
const int logPanelH = 220;
const int gridCols = 3, gridRows = 2;
cv::namedWindow("ANSLPR Multi-GPU Stress Test", cv::WINDOW_NORMAL);
cv::resizeWindow("ANSLPR Multi-GPU Stress Test", cellW * 2, cellH * 2 + logPanelH);
cv::resizeWindow("ANSLPR Multi-GPU Stress Test", cellW * gridCols, cellH * gridRows + logPanelH);
auto testStart = std::chrono::steady_clock::now();
auto lastGpuSnapshot = std::chrono::steady_clock::now();
@@ -1244,12 +1396,12 @@ int ANSLPR_MultiGPU_StressTest() {
}
// Per-task stats
double totalFpsSnap = 0;
for (int t = 0; t < 4; t++) {
for (int t = 0; t < NUM_TASKS; t++) {
std::lock_guard<std::mutex> lk(taskStates[t].mtx);
char buf[256];
snprintf(buf, sizeof(buf),
" T%d: GPU[%d] VRAM=%zuMiB FPS=%.1f GrabMs=%.0f InfMs=%.0f Frames=%d Det=%d",
t, taskStates[t].gpuDeviceId,
" T%d(S%d): GPU[%d] VRAM=%zuMiB FPS=%.1f GrabMs=%.0f InfMs=%.0f Frames=%d Det=%d",
t, taskStreamMap[t], taskStates[t].gpuDeviceId,
taskStates[t].vramUsedBytes / (1024 * 1024),
taskStates[t].fps, taskStates[t].lastGrabMs, taskStates[t].inferenceMs,
taskStates[t].frameCount, taskStates[t].detectionCount);
@@ -1261,7 +1413,7 @@ int ANSLPR_MultiGPU_StressTest() {
g_log.add(buf);
// Multi-GPU check
std::set<int> gpusUsed;
for (int t = 0; t < 4; t++) {
for (int t = 0; t < NUM_TASKS; t++) {
if (taskStates[t].gpuDeviceId >= 0) gpusUsed.insert(taskStates[t].gpuDeviceId);
}
if (gpusUsed.size() > 1) {
@@ -1271,12 +1423,12 @@ int ANSLPR_MultiGPU_StressTest() {
}
g_log.add("---- END SNAPSHOT ----");
}
// Build 2x2 grid + log panel
cv::Mat canvas(cellH * 2 + logPanelH, cellW * 2, CV_8UC3, cv::Scalar(30, 30, 30));
// Build 3x2 grid + log panel (5 tasks: 3 cols x 2 rows, cell [1][2] empty)
cv::Mat canvas(cellH * gridRows + logPanelH, cellW * gridCols, CV_8UC3, cv::Scalar(30, 30, 30));
// Place each task's frame in its quadrant
for (int i = 0; i < 4; i++) {
int row = i / 2, col = i % 2;
// Place each task's frame in its cell
for (int i = 0; i < NUM_TASKS; i++) {
int row = i / gridCols, col = i % gridCols;
cv::Rect roi(col * cellW, row * cellH, cellW, cellH);
cv::Mat cell;
@@ -1313,8 +1465,8 @@ int ANSLPR_MultiGPU_StressTest() {
// Draw status bar at bottom of each cell (2 lines)
cv::rectangle(cell, cv::Rect(0, cellH - 50, cellW, 50), cv::Scalar(0, 0, 0), cv::FILLED);
char bar1[256], bar2[256];
snprintf(bar1, sizeof(bar1), "T%d | %.1f FPS | %.0fms | Frames:%d | Det:%d | %s",
i, fps, infMs, fCount, dCount,
snprintf(bar1, sizeof(bar1), "T%d(S%d) | %.1f FPS | %.0fms | F:%d | D:%d | %s",
i, taskStreamMap[i], fps, infMs, fCount, dCount,
lastPlate.empty() ? "-" : lastPlate.c_str());
if (gpuId >= 0) {
snprintf(bar2, sizeof(bar2), "GPU[%d] | VRAM: %zu MiB", gpuId, vramMiB);
@@ -1323,45 +1475,53 @@ int ANSLPR_MultiGPU_StressTest() {
}
cv::Scalar barColor = engineLoaded ? cv::Scalar(0, 255, 0) : cv::Scalar(0, 100, 255);
cv::putText(cell, bar1, cv::Point(5, cellH - 28),
cv::FONT_HERSHEY_SIMPLEX, 0.45, barColor, 1);
cv::FONT_HERSHEY_SIMPLEX, 0.4, barColor, 1);
cv::putText(cell, bar2, cv::Point(5, cellH - 8),
cv::FONT_HERSHEY_SIMPLEX, 0.45, cv::Scalar(0, 200, 255), 1);
cv::FONT_HERSHEY_SIMPLEX, 0.4, cv::Scalar(0, 200, 255), 1);
cell.copyTo(canvas(roi));
// Draw grid lines
cv::line(canvas, cv::Point(cellW, 0), cv::Point(cellW, cellH * 2),
cv::Scalar(100, 100, 100), 1);
cv::line(canvas, cv::Point(0, cellH), cv::Point(cellW * 2, cellH),
cv::Scalar(100, 100, 100), 1);
}
// Draw grid lines
for (int c = 1; c < gridCols; c++)
cv::line(canvas, cv::Point(c * cellW, 0), cv::Point(c * cellW, cellH * gridRows),
cv::Scalar(100, 100, 100), 1);
for (int r = 1; r < gridRows; r++)
cv::line(canvas, cv::Point(0, r * cellH), cv::Point(cellW * gridCols, r * cellH),
cv::Scalar(100, 100, 100), 1);
// --- Log panel at bottom ---
cv::Rect logRoi(0, cellH * 2, cellW * 2, logPanelH);
cv::Rect logRoi(0, cellH * gridRows, cellW * gridCols, logPanelH);
cv::Mat logPanel = canvas(logRoi);
logPanel.setTo(cv::Scalar(20, 20, 20));
// Elapsed time header
auto elapsed = std::chrono::duration<double>(std::chrono::steady_clock::now() - testStart).count();
char header[128];
char header[256];
snprintf(header, sizeof(header),
"Elapsed: %.0fs | Press ESC to stop | Resize window freely", elapsed);
"Elapsed: %.0fs | %d cameras, %d AI tasks | Press ESC to stop",
elapsed, NUM_STREAMS, NUM_TASKS);
cv::putText(logPanel, header, cv::Point(10, 18),
cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(200, 200, 0), 1);
// Aggregate stats + per-task GPU summary
double totalFps = 0;
for (int i = 0; i < 4; i++) {
for (int i = 0; i < NUM_TASKS; i++) {
std::lock_guard<std::mutex> lk(taskStates[i].mtx);
totalFps += taskStates[i].fps;
}
char aggLine[256];
snprintf(aggLine, sizeof(aggLine), "Total throughput: %.1f FPS | T0:GPU%d T1:GPU%d T2:GPU%d T3:GPU%d",
totalFps,
taskStates[0].gpuDeviceId, taskStates[1].gpuDeviceId,
taskStates[2].gpuDeviceId, taskStates[3].gpuDeviceId);
// Build dynamic task-GPU summary string
std::string taskGpuStr;
for (int i = 0; i < NUM_TASKS; i++) {
if (i > 0) taskGpuStr += " ";
taskGpuStr += "T" + std::to_string(i) + "(S" + std::to_string(taskStreamMap[i])
+ "):GPU" + std::to_string(taskStates[i].gpuDeviceId);
}
char aggLine[512];
snprintf(aggLine, sizeof(aggLine), "Total: %.1f FPS | %s",
totalFps, taskGpuStr.c_str());
cv::putText(logPanel, aggLine, cv::Point(10, 38),
cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 255, 255), 1);
cv::FONT_HERSHEY_SIMPLEX, 0.45, cv::Scalar(0, 255, 255), 1);
// Real-time GPU VRAM monitor (query every frame — cheap call)
auto gpuSnaps = QueryGpuVram();
@@ -1370,7 +1530,7 @@ int ANSLPR_MultiGPU_StressTest() {
// Count tasks on this GPU and their total VRAM
int tasksOnGpu = 0;
size_t taskVramMiB = 0;
for (int i = 0; i < 4; i++) {
for (int i = 0; i < NUM_TASKS; i++) {
std::lock_guard<std::mutex> lk(taskStates[i].mtx);
if (taskStates[i].gpuDeviceId == gs.deviceId) {
tasksOnGpu++;
@@ -1387,13 +1547,13 @@ int ANSLPR_MultiGPU_StressTest() {
gpuLineY += 18;
}
// Per-task resource line
for (int i = 0; i < 4; i++) {
// Per-task resource line (shows which stream each task subscribes to)
for (int i = 0; i < NUM_TASKS; i++) {
std::lock_guard<std::mutex> lk(taskStates[i].mtx);
char tLine[256];
snprintf(tLine, sizeof(tLine),
"T%d: GPU[%d] VRAM=%zuMiB FPS=%.1f Inf=%.0fms Frames=%d Det=%d",
i, taskStates[i].gpuDeviceId,
"T%d(S%d): GPU[%d] VRAM=%zuMiB FPS=%.1f Inf=%.0fms Frames=%d Det=%d",
i, taskStreamMap[i], taskStates[i].gpuDeviceId,
taskStates[i].vramUsedBytes / (1024 * 1024),
taskStates[i].fps, taskStates[i].inferenceMs,
taskStates[i].frameCount, taskStates[i].detectionCount);
@@ -1421,9 +1581,13 @@ int ANSLPR_MultiGPU_StressTest() {
}
}
// --- Stop chaos thread ---
chaosEnabled.store(false);
if (chaosThread.joinable()) chaosThread.join();
// --- Wait for all workers ---
printf("Waiting for worker threads to finish...\n");
for (int i = 0; i < 4; i++) {
printf("Waiting for %d worker threads to finish...\n", NUM_TASKS);
for (int i = 0; i < NUM_TASKS; i++) {
if (workers[i].joinable()) workers[i].join();
}
@@ -1433,19 +1597,21 @@ int ANSLPR_MultiGPU_StressTest() {
g_log.add("================================================================");
g_log.add(" FINAL PERFORMANCE SUMMARY");
g_log.add(" " + std::to_string(NUM_STREAMS) + " cameras, " + std::to_string(NUM_TASKS) + " AI tasks");
g_log.add(" Total runtime: " + std::to_string((int)totalElapsed) + " seconds");
g_log.add("================================================================");
printf("\n============================================================\n");
printf(" FINAL PERFORMANCE SUMMARY (runtime: %.0fs)\n", totalElapsed);
printf(" %d cameras, %d AI tasks\n", NUM_STREAMS, NUM_TASKS);
printf("============================================================\n");
double totalFpsFinal = 0;
for (int i = 0; i < 4; i++) {
for (int i = 0; i < NUM_TASKS; i++) {
char buf[512];
snprintf(buf, sizeof(buf),
" Task %d: GPU[%d] | VRAM=%zuMiB | %d frames, %d detections, FPS=%.1f, InfMs=%.0f",
i, taskStates[i].gpuDeviceId,
" Task %d (Stream %d): GPU[%d] | VRAM=%zuMiB | %d frames, %d detections, FPS=%.1f, InfMs=%.0f",
i, taskStreamMap[i], taskStates[i].gpuDeviceId,
taskStates[i].vramUsedBytes / (1024 * 1024),
taskStates[i].frameCount, taskStates[i].detectionCount,
taskStates[i].fps, taskStates[i].inferenceMs);
@@ -1466,12 +1632,13 @@ int ANSLPR_MultiGPU_StressTest() {
// Multi-GPU verdict
std::set<int> finalGpusUsed;
for (int i = 0; i < 4; i++) {
for (int i = 0; i < NUM_TASKS; i++) {
if (taskStates[i].gpuDeviceId >= 0) finalGpusUsed.insert(taskStates[i].gpuDeviceId);
}
{
char buf[256];
snprintf(buf, sizeof(buf), " Total throughput: %.1f FPS across 4 tasks", totalFpsFinal);
snprintf(buf, sizeof(buf), " Total throughput: %.1f FPS across %d tasks (%d cameras)",
totalFpsFinal, NUM_TASKS, NUM_STREAMS);
printf("%s\n", buf);
g_log.add(buf);
}
@@ -1491,13 +1658,16 @@ int ANSLPR_MultiGPU_StressTest() {
g_log.add(" 3. No CUDA_VISIBLE_DEVICES env var restricting GPU access");
}
// Log shared-camera subscription info
g_log.add(" Camera subscription: Task 2 and Task 4 both subscribe to Stream 2 (nhathuocngoclinh)");
printf("============================================================\n");
g_log.add("================================================================");
g_log.add(" Log saved to: " + std::string(LOG_FILE_PATH));
g_log.add("================================================================");
// --- Release all handles (sequentially on main thread) ---
for (int i = 0; i < 4; i++) {
for (int i = 0; i < NUM_TASKS; i++) {
if (alprHandles[i]) {
ReleaseANSALPRHandle(&alprHandles[i]);
}
@@ -2770,9 +2940,9 @@ int main()
//for (int i = 0; i < 100; i++) {
// ANSLPR_CPU_Inferences_FileTest();
//}
//ANSLPR_MultiGPU_StressTest();
ANSLPR_MultiGPU_StressTest();
//ANSLPR_MultiGPU_StressTest_SimulatedCam();
ANSLPR_MultiGPU_StressTest_FilePlayer();
// ANSLPR_MultiGPU_StressTest_FilePlayer();
return 0;
}