From e134ebdf150b4515048fef751862d2c0a1ec703e Mon Sep 17 00:00:00 2001 From: Tuan Nghia Nguyen Date: Sat, 4 Apr 2026 20:19:54 +1100 Subject: [PATCH] Use software decoder by default --- .claude/settings.local.json | 20 +- MediaClient/media/rtsp_player.cpp | 10 +- MediaClient/media/video_player.cpp | 108 ++++++++++- MediaClient/media/video_player.h | 8 + core/ANSLicensingSystem/ANSLicense.h | 27 +++ .../include/engine/EngineBuildLoadNetwork.inl | 70 +++++++ .../include/engine/EngineMultiGpu.inl | 24 +++ .../include/engine/EngineRunInference.inl | 178 +++++++++++++++++- .../include/engine/EngineUtilities.inl | 71 ++++--- modules/ANSCV/ANSFLV.cpp | 39 +--- modules/ANSCV/ANSMJPEG.cpp | 37 +--- modules/ANSCV/ANSRTMP.cpp | 37 +--- modules/ANSCV/ANSRTSP.cpp | 168 ++++++++++++----- modules/ANSCV/ANSRTSP.h | 2 + modules/ANSCV/ANSSRT.cpp | 37 +--- modules/ANSCV/ANSVideoPlayer.cpp | 2 +- modules/ANSCV/VideoPlayer.cpp | 2 +- modules/ANSODEngine/ANSODEngine.cpp | 16 ++ modules/ANSODEngine/ANSRTYOLO.cpp | 19 +- modules/ANSODEngine/ANSRTYOLO.h | 1 + modules/ANSODEngine/NV12PreprocessHelper.cpp | 1 + modules/ANSODEngine/dllmain.cpp | 16 ++ modules/ANSODEngine/engine.h | 5 +- tests/ANSLPR-UnitTest/ANSLPR-UnitTest.cpp | 10 +- 24 files changed, 693 insertions(+), 215 deletions(-) diff --git a/.claude/settings.local.json b/.claude/settings.local.json index 9ec122e..ac63ccb 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -51,7 +51,25 @@ "Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\logdebug1.txt''\\).Count\")", "Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION20.log''\\).Count\")", "Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION21.log''\\).Count\")", - "Bash(powershell -Command \"Select-String ''NEW slot'' ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION22.log'' | ForEach-Object { if \\($_-match ''\\(\\\\d+x\\\\d+\\)''\\) { $matches[1] } } | Group-Object | Sort-Object Count -Descending | Format-Table Name, Count\")" + "Bash(powershell -Command \"Select-String ''NEW slot'' ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION22.log'' | ForEach-Object { if \\($_-match ''\\(\\\\d+x\\\\d+\\)''\\) { $matches[1] } } | Group-Object | Sort-Object Count -Descending | Format-Table Name, Count\")", + "Bash(ls -la /c/Projects/CLionProjects/ANSCORE/modules/ANSODEngine/*.cpp /c/Projects/CLionProjects/ANSCORE/modules/ANSODEngine/*.h)", + "Bash(grep -r \"cudaMalloc\\\\|cudaFree\\\\|cudaStreamCreate\\\\|cudaStreamDestroy\" /c/Projects/CLionProjects/ANSCORE/modules/ANSODEngine/*.cpp)", + "Bash(grep -n \"cudaStreamCreate\\\\|cudaEventCreate\\\\|cudaEventDestroy\\\\|cudaStreamDestroy\\\\|cudaStreamSynchronize\" /c/Projects/CLionProjects/ANSCORE/engines/TensorRTAPI/include/engine/*.inl)", + "Bash(dir \"C:\\\\Projects\\\\CLionProjects\\\\ANSCORE\\\\engines\\\\TensorRTAPI\\\\include\\\\engine\\\\*.h\" /b)", + "Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION26.log''\\).Count\")", + "Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION27.log''\\).Count\")", + "Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION28.log''\\).Count\")", + "Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION29.log''\\).Count\")", + "Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION30.log''\\).Count\")", + "Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\logging4.txt''\\).Count\")", + "Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION31.log''\\).Count\")", + "Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\loging5.txt''\\).Count\")", + "Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION32.log''\\).Count\")", + "Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION33.log''\\).Count\")", + "Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION34.log''\\).Count\")", + "Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION35.log''\\).Count\")", + "Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION36.log''\\).Count\")", + "Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION37.log''\\).Count\")" ] } } diff --git a/MediaClient/media/rtsp_player.cpp b/MediaClient/media/rtsp_player.cpp index e572856..8eef47d 100644 --- a/MediaClient/media/rtsp_player.cpp +++ b/MediaClient/media/rtsp_player.cpp @@ -258,7 +258,15 @@ void CRtspPlayer::stop() // Set flags BEFORE stopping decoder so TCP rx thread stops calling decode() m_bPlaying = FALSE; m_bPaused = FALSE; - CVideoPlayer::StopVideoDecoder(); // Stop the video decoder + CVideoPlayer::StopVideoDecoder(); // Stop the video decoder + uninit (free VRAM) + + // Close RTSP connection and shut down RX threads. + // Without this, stopped cameras keep TCP/UDP threads running, + // sockets open, and receiving network data — wasting CPU and + // network resources. With 100 cameras and only 5 running, + // 95 idle threads would consume CPU for no purpose. + // Start() → Setup() → open() will reconnect when needed. + m_rtsp.rtsp_close(); } BOOL CRtspPlayer::pause() diff --git a/MediaClient/media/video_player.cpp b/MediaClient/media/video_player.cpp index 071a30a..2560c47 100644 --- a/MediaClient/media/video_player.cpp +++ b/MediaClient/media/video_player.cpp @@ -1275,6 +1275,90 @@ cv::Mat CVideoPlayer::avframeNV12ToCvMat(const AVFrame* frame) return cv::Mat(); } } +cv::Mat CVideoPlayer::avframeYUV420PToCvMat(const AVFrame* frame) { + try { + if (!frame || frame->width <= 0 || frame->height <= 0) { + return cv::Mat(); + } + + const int width = frame->width; + const int height = frame->height; + + // YUV420P has 3 separate planes: Y (full res), U (half), V (half). + // OpenCV's cvtColor(COLOR_YUV2BGR_I420) expects a single contiguous buffer + // with Y on top (H rows) and U,V stacked below (H/2 rows total). + // Layout: [Y: W×H] [U: W/2 × H/2] [V: W/2 × H/2] + // Total height = H * 3/2, width = W, single channel. + + // If all planes are contiguous with matching strides, wrap directly + const int yStride = frame->linesize[0]; + const int uStride = frame->linesize[1]; + const int vStride = frame->linesize[2]; + + // Fast path: planes are packed contiguously with stride == width + if (yStride == width && uStride == width / 2 && vStride == width / 2 && + frame->data[1] == frame->data[0] + width * height && + frame->data[2] == frame->data[1] + (width / 2) * (height / 2)) { + // Contiguous I420 — wrap directly, zero copy + cv::Mat yuv(height * 3 / 2, width, CV_8UC1, frame->data[0]); + cv::Mat bgrImage; + cv::cvtColor(yuv, bgrImage, cv::COLOR_YUV2BGR_I420); + if (m_nImageQuality == 1) { + bgrImage.convertTo(bgrImage, -1, 255.0 / 219.0, -16.0 * 255.0 / 219.0); + } + return bgrImage; + } + + // Slow path: planes have padding (linesize > width) — copy to contiguous buffer + const int uvWidth = width / 2; + const int uvHeight = height / 2; + const int totalSize = width * height + uvWidth * uvHeight * 2; + + cv::Mat yuv(height * 3 / 2, width, CV_8UC1); + uint8_t* dst = yuv.data; + + // Copy Y plane (line by line if stride != width) + if (yStride == width) { + std::memcpy(dst, frame->data[0], width * height); + } else { + for (int row = 0; row < height; ++row) { + std::memcpy(dst + row * width, frame->data[0] + row * yStride, width); + } + } + dst += width * height; + + // Copy U plane + if (uStride == uvWidth) { + std::memcpy(dst, frame->data[1], uvWidth * uvHeight); + } else { + for (int row = 0; row < uvHeight; ++row) { + std::memcpy(dst + row * uvWidth, frame->data[1] + row * uStride, uvWidth); + } + } + dst += uvWidth * uvHeight; + + // Copy V plane + if (vStride == uvWidth) { + std::memcpy(dst, frame->data[2], uvWidth * uvHeight); + } else { + for (int row = 0; row < uvHeight; ++row) { + std::memcpy(dst + row * uvWidth, frame->data[2] + row * vStride, uvWidth); + } + } + + cv::Mat bgrImage; + cv::cvtColor(yuv, bgrImage, cv::COLOR_YUV2BGR_I420); + if (m_nImageQuality == 1) { + bgrImage.convertTo(bgrImage, -1, 255.0 / 219.0, -16.0 * 255.0 / 219.0); + } + return bgrImage; + } + catch (const std::exception& e) { + std::cerr << "Exception in avframeYUV420PToCvMat: " << e.what() << std::endl; + return cv::Mat(); + } +} + cv::Mat CVideoPlayer::avframeToCVMat(const AVFrame* pFrame) { std::lock_guard lock(_mutex); try { @@ -1287,8 +1371,9 @@ cv::Mat CVideoPlayer::avframeToCVMat(const AVFrame* pFrame) { switch (pFrame->format) { case AV_PIX_FMT_NV12: return avframeNV12ToCvMat(pFrame); + case AV_PIX_FMT_YUV420P: case AV_PIX_FMT_YUVJ420P: - return avframeAnyToCvmat(pFrame); + return avframeYUV420PToCvMat(pFrame); default: return avframeAnyToCvmat(pFrame); @@ -1305,7 +1390,7 @@ CVideoPlayer::CVideoPlayer() : , m_bAudioInited(FALSE) , m_bPlaying(FALSE) , m_bPaused(FALSE) - , m_nHWDecoding(HW_DECODING_AUTO)//(HW_DECODING_AUTO)// HW_DECODING_D3D11 //HW_DECODING_DISABLE + , m_nHWDecoding(HW_DECODING_DISABLE)// Software decode by default — saves VRAM (no NVDEC DPB surfaces) , m_bUpdown(FALSE) , m_bSnapshot(FALSE) , m_nSnapVideoFmt(AV_PIX_FMT_YUVJ420P) @@ -1740,6 +1825,13 @@ void CVideoPlayer::StopVideoDecoder() { // Flush decoder to drain and discard any buffered frames, // so stale reference frames don't corrupt the next session decoder->flush(); + // Free NVDEC decoder context and all GPU surfaces (DPB buffers). + // Stopped cameras should not hold VRAM — with 100 cameras created + // but only 5 running, the 95 idle decoders would consume ~5-10 GB. + // The decoder will be re-initialized automatically when the next + // video packet arrives after Start() is called. + decoder->uninit(); + m_bVideoInited = FALSE; } // Clear queue but KEEP m_currentImage and m_lastJpegImage — // getImage()/getJpegImage() will return the last good frame while decoder stabilizes @@ -1842,6 +1934,13 @@ void CVideoPlayer::setTargetFPS(double intervalMs) m_targetIntervalMs = intervalMs; m_targetFPSInitialized = false; // reset timing on change } +double CVideoPlayer::getLastFrameAgeMs() +{ + std::lock_guard lock(_mutex); + if (!m_lastDecoderFrameTimeSet) return 0.0; + auto now = std::chrono::steady_clock::now(); + return std::chrono::duration(now - m_lastDecoderFrameTime).count(); +} void CVideoPlayer::playVideo(uint8* data, int len, uint32 ts, uint16 seq) { if (m_bRecording) @@ -2061,6 +2160,11 @@ void CVideoPlayer::onVideoFrame(AVFrame* frame) } } + // Record wall-clock time of every decoded frame (even rate-limited ones). + // Used by getLastFrameAgeMs() to detect truly stale cameras. + m_lastDecoderFrameTime = std::chrono::steady_clock::now(); + m_lastDecoderFrameTimeSet = true; + // --- Frame rate limiting --- // Skip post-decode processing (clone, queue push, CUDA clone) if not enough // time has elapsed since the last processed frame. The decode itself still diff --git a/MediaClient/media/video_player.h b/MediaClient/media/video_player.h index c8c5220..6fc430c 100644 --- a/MediaClient/media/video_player.h +++ b/MediaClient/media/video_player.h @@ -148,6 +148,7 @@ public: // Image quality mode: 0=fast (OpenCV BT.601, ~2ms), 1=quality (sws BT.709+range, ~12ms) virtual void setImageQuality(int mode) { m_nImageQuality = mode; } void setTargetFPS(double intervalMs); // Set minimum interval between processed frames in ms (0 = no limit, 100 = ~10 FPS) + double getLastFrameAgeMs(); // Milliseconds since last frame arrived from decoder (0 if no frame yet) virtual void setRtpMulticast(BOOL flag) {} virtual void setRtpOverUdp(BOOL flag) {} @@ -223,6 +224,7 @@ protected: cv::Mat avframeAnyToCvmat(const AVFrame* frame); cv::Mat avframeNV12ToCvMat(const AVFrame* frame); + cv::Mat avframeYUV420PToCvMat(const AVFrame* frame); // YUV420P/YUVJ420P → BGR (OpenCV, no sws_scale) cv::Mat avframeYUVJ420PToCvmat(const AVFrame* frame); cv::Mat avframeToCVMat(const AVFrame* frame); @@ -273,6 +275,12 @@ protected: std::chrono::steady_clock::time_point m_lastProcessedTime; // timestamp of last processed frame bool m_targetFPSInitialized = false; // first-frame flag + // Wall-clock timestamp of last frame received from the decoder (NOT from getImage). + // Updated in onVideoFrame() for EVERY decoded frame, even rate-limited ones. + // Used by LabVIEW to detect truly stale cameras vs rate-limited ones. + std::chrono::steady_clock::time_point m_lastDecoderFrameTime; + bool m_lastDecoderFrameTimeSet = false; + BOOL m_bPlaying; BOOL m_bPaused; diff --git a/core/ANSLicensingSystem/ANSLicense.h b/core/ANSLicensingSystem/ANSLicense.h index d965579..b55fbf1 100644 --- a/core/ANSLicensingSystem/ANSLicense.h +++ b/core/ANSLicensingSystem/ANSLicense.h @@ -1,6 +1,33 @@ #ifndef ANSLICENSE_H #define ANSLICENSE_H +// ============================================================================ +// Global debug toggle for DebugView (DbgView) logging. +// Define ANSCORE_DEBUGVIEW=1 to enable verbose OutputDebugStringA logging +// across all ANSCORE modules (ANSCV, ANSODEngine, TensorRT engine, etc.). +// Set to 0 for production builds to eliminate all debug output overhead. +// ============================================================================ +#ifndef ANSCORE_DEBUGVIEW +#define ANSCORE_DEBUGVIEW 1 // 1 = enabled (debug), 0 = disabled (production) +#endif + +// ANS_DBG: Debug logging macro for DebugView (OutputDebugStringA on Windows). +// Usage: ANS_DBG("MyModule", "value=%d ptr=%p", val, ptr); +// Output: [MyModule] value=42 ptr=0x1234 +// When ANSCORE_DEBUGVIEW=0, compiles to nothing (zero overhead). +// NOTE: We avoid #include here to prevent winsock.h/winsock2.h +// conflicts. Instead, forward-declare OutputDebugStringA directly. +#if ANSCORE_DEBUGVIEW && defined(_WIN32) +extern "C" __declspec(dllimport) void __stdcall OutputDebugStringA(const char* lpOutputString); +#define ANS_DBG(tag, fmt, ...) do { \ + char _ans_dbg_buf[1024]; \ + snprintf(_ans_dbg_buf, sizeof(_ans_dbg_buf), "[" tag "] " fmt "\n", ##__VA_ARGS__); \ + OutputDebugStringA(_ans_dbg_buf); \ +} while(0) +#else +#define ANS_DBG(tag, fmt, ...) ((void)0) +#endif + #ifdef ANSLICENSE_EXPORTS #define ANSLICENSE_API __declspec(dllexport) #else diff --git a/engines/TensorRTAPI/include/engine/EngineBuildLoadNetwork.inl b/engines/TensorRTAPI/include/engine/EngineBuildLoadNetwork.inl index 323ca14..f594c63 100644 --- a/engines/TensorRTAPI/include/engine/EngineBuildLoadNetwork.inl +++ b/engines/TensorRTAPI/include/engine/EngineBuildLoadNetwork.inl @@ -623,6 +623,65 @@ bool Engine::buildLoadNetwork(std::string onnxModelPath, const std::array bool Engine::loadNetwork(std::string trtModelPath, const std::array& subVals, const std::array& divVals, bool normalize) { + // Install a custom OpenCV CUDA allocator that uses cudaMallocAsync/cudaFreeAsync + // instead of the default cudaMalloc/cudaFree. The stream-ordered allocator + // respects the cudaMemPool release threshold (set to 0), so freed memory is + // returned to the GPU immediately instead of being cached forever. + // + // The default cudaMalloc/cudaFree allocator caches all freed blocks permanently + // (no API to force release), causing VRAM to grow monotonically when GpuMat + // objects of varying sizes are allocated and freed repeatedly (different batch + // sizes, different image resolutions across cameras). + { + static std::once_flag s_allocatorFlag; + std::call_once(s_allocatorFlag, []() { + // Set release threshold to 0 on all GPUs + int deviceCount = 0; + cudaGetDeviceCount(&deviceCount); + for (int d = 0; d < deviceCount; ++d) { + cudaMemPool_t pool = nullptr; + if (cudaDeviceGetDefaultMemPool(&pool, d) == cudaSuccess && pool) { + uint64_t threshold = 0; + cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold); + } + } + + // Custom allocator: uses cudaMallocAsync on stream 0 (behaves like + // synchronous cudaMalloc but goes through the stream-ordered pool). + struct AsyncAllocator : cv::cuda::GpuMat::Allocator { + bool allocate(cv::cuda::GpuMat* mat, int rows, int cols, size_t elemSize) override { + // Same logic as OpenCV's default allocator, but using cudaMallocAsync + size_t step = elemSize * cols; + // Align step to 256 bytes (same as default allocator) + step = (step + 255) & ~size_t(255); + void* ptr = nullptr; + cudaError_t err = cudaMallocAsync(&ptr, step * rows, nullptr); // stream 0 + if (err != cudaSuccess || !ptr) { + // Fallback to regular cudaMalloc if async not supported + err = cudaMalloc(&ptr, step * rows); + if (err != cudaSuccess) return false; + } + mat->data = static_cast(ptr); + mat->step = step; + mat->refcount = static_cast(cv::fastMalloc(sizeof(int))); + *mat->refcount = 1; + return true; + } + void free(cv::cuda::GpuMat* mat) override { + cudaFreeAsync(mat->data, nullptr); // stream 0 — goes through pool with threshold=0 + cv::fastFree(mat->refcount); + mat->data = nullptr; + mat->datastart = nullptr; + mat->dataend = nullptr; + mat->refcount = nullptr; + } + }; + static AsyncAllocator s_allocator; + cv::cuda::GpuMat::setDefaultAllocator(&s_allocator); + ANS_DBG("TRT_Load", "Custom CUDA async allocator installed — VRAM freed immediately on GpuMat release"); + }); + } + m_lastLoadFailedVRAM = false; // reset on each load attempt m_subVals = subVals; m_divVals = divVals; @@ -958,11 +1017,13 @@ trt_cache_create_context: m_context = std::unique_ptr(m_engine->createExecutionContext()); if (!m_context) { + ANS_DBG("TRT_Load", "ERROR: createExecutionContext returned null"); logEngineEvent("[Engine] loadNetwork FAIL: createExecutionContext returned null for " + trtModelPath, true); return false; } + ANS_DBG("TRT_Load", "Execution context created OK for %s", trtModelPath.c_str()); if (m_verbose) std::cout << "Info: Execution context created successfully" << std::endl; // ============================================================================ @@ -1135,6 +1196,15 @@ trt_cache_create_context: } } + { + size_t vramFree = 0, vramTotal = 0; + cudaMemGetInfo(&vramFree, &vramTotal); + ANS_DBG("TRT_Load", "Buffers allocated: %zuMB, VRAM: %zuMB used / %zuMB free / %zuMB total", + totalAllocated / (1024*1024), + (vramTotal - vramFree) / (1024*1024), + vramFree / (1024*1024), + vramTotal / (1024*1024)); + } if (m_verbose) std::cout << "\nInfo: Total GPU memory allocated: " << totalAllocated / (1024 * 1024) << " MiB" << std::endl; // -- Pinned output buffers (CUDA graph prerequisite) ----------------------- diff --git a/engines/TensorRTAPI/include/engine/EngineMultiGpu.inl b/engines/TensorRTAPI/include/engine/EngineMultiGpu.inl index 3769d2f..99b607d 100644 --- a/engines/TensorRTAPI/include/engine/EngineMultiGpu.inl +++ b/engines/TensorRTAPI/include/engine/EngineMultiGpu.inl @@ -607,6 +607,7 @@ bool Engine::runInferenceFromPool( // harmless — the second one finds a fresh slot immediately. InferenceSlot* slot = nullptr; bool kickedGrowth = false; + auto _poolAcquireStart = std::chrono::steady_clock::now(); { std::unique_lock lock(m_slotMutex); @@ -630,6 +631,8 @@ bool Engine::runInferenceFromPool( } if (!slot) { + ANS_DBG("TRT_Pool", "ALL SLOTS BUSY: %zu slots, active=%d — waiting for free slot", + n, m_activeCount.load()); // All slots busy. In elastic mode, proactively grow the // pool in the background so the next request has a slot // on a different GPU. We only kick once per wait cycle. @@ -672,7 +675,17 @@ bool Engine::runInferenceFromPool( } // -- 3. Still no slot => reject --------------------------------------- + { + double _acquireMs = std::chrono::duration( + std::chrono::steady_clock::now() - _poolAcquireStart).count(); + if (_acquireMs > 100.0) { + ANS_DBG("TRT_Pool", "SLOW slot acquire: %.1fms slot=%p gpu=%d active=%d/%zu", + _acquireMs, (void*)slot, slot ? slot->deviceIndex : -1, + m_activeCount.load(), m_slots.size()); + } + } if (!slot) { + ANS_DBG("TRT_Pool", "ERROR: No slot available — all %zu slots busy, timeout", m_slots.size()); std::string errMsg = "[Engine] runInferenceFromPool FAIL: Capacity reached -- all " + std::to_string(m_activeCount.load()) + "/" + std::to_string(m_totalCapacity) + " slot(s) busy" @@ -699,12 +712,23 @@ bool Engine::runInferenceFromPool( if (currentDev != slot->deviceIndex) { cudaSetDevice(slot->deviceIndex); } + ANS_DBG("TRT_Pool", "Slot dispatch: gpu=%d active=%d/%zu", + slot->deviceIndex, m_activeCount.load(), m_slots.size()); + auto _slotStart = std::chrono::steady_clock::now(); result = slot->engine->runInference(inputs, featureVectors); + auto _slotEnd = std::chrono::steady_clock::now(); + double _slotMs = std::chrono::duration(_slotEnd - _slotStart).count(); + if (_slotMs > 500.0) { + ANS_DBG("TRT_Pool", "SLOW slot inference: %.1fms gpu=%d active=%d/%zu", + _slotMs, slot->deviceIndex, m_activeCount.load(), m_slots.size()); + } } catch (const std::exception& ex) { + ANS_DBG("TRT_Pool", "ERROR: runInference threw: %s", ex.what()); std::cout << "Error [Pool]: runInference threw: " << ex.what() << std::endl; } catch (...) { + ANS_DBG("TRT_Pool", "ERROR: runInference threw unknown exception"); std::cout << "Error [Pool]: runInference threw unknown exception" << std::endl; } diff --git a/engines/TensorRTAPI/include/engine/EngineRunInference.inl b/engines/TensorRTAPI/include/engine/EngineRunInference.inl index 00cf74f..4a7a950 100644 --- a/engines/TensorRTAPI/include/engine/EngineRunInference.inl +++ b/engines/TensorRTAPI/include/engine/EngineRunInference.inl @@ -1,8 +1,10 @@ #pragma once #include +#include #include #include #include "TRTCompat.h" +#include "ANSLicense.h" // ANS_DBG macro for DebugView logging // Per-device mutex for CUDA graph capture. // TRT's enqueueV3 uses shared internal resources (workspace, memory pools) @@ -86,11 +88,9 @@ static inline cudaError_t cudaStreamSynchronize_Safe(cudaStream_t stream) { cudaError_t err = cudaStreamQuery(stream); if (err != cudaErrorNotReady) return err; + auto syncStart = std::chrono::steady_clock::now(); + // Short Sleep(0) fast path (~10 iterations) catches sub-ms kernel completions. - // Then switch to Sleep(1) to give cleanup operations (cuArrayDestroy, cuMemFree) - // a window to acquire the exclusive nvcuda64 SRW lock. - // Previously used 1000 Sleep(0) iterations which hogged the SRW lock and - // caused ~20-second stalls when concurrent cleanup needed exclusive access. for (int i = 0; i < 10; ++i) { Sleep(0); err = cudaStreamQuery(stream); @@ -98,10 +98,21 @@ static inline cudaError_t cudaStreamSynchronize_Safe(cudaStream_t stream) { } // 1ms sleeps — adds negligible latency at 30 FPS but prevents SRW lock starvation. + int sleepCount = 0; while (true) { Sleep(1); + sleepCount++; err = cudaStreamQuery(stream); - if (err != cudaErrorNotReady) return err; + if (err != cudaErrorNotReady) { + // Log if sync took too long (>500ms indicates GPU stall) + auto elapsed = std::chrono::duration( + std::chrono::steady_clock::now() - syncStart).count(); + if (elapsed > 500.0) { + ANS_DBG("TRT_Engine", "SLOW SYNC: %.1fms (%d sleeps) stream=%p err=%d", + elapsed, sleepCount, (void*)stream, (int)err); + } + return err; + } } } @@ -368,6 +379,71 @@ bool Engine::runInference(const std::vector>& i return false; } + // ============================================================================ + // SM=100% DETECTOR — tracks inference timing trends to catch the exact + // moment GPU becomes saturated. Logs every 50 inferences with rolling + // average, and immediately when degradation is detected. + // ============================================================================ + // Global (process-wide) counters shared across all engine instances/threads + static std::atomic s_globalInfCount{0}; + static std::atomic s_globalActiveInf{0}; // currently in-flight inferences + static std::atomic s_globalLastAvgMs{0.0}; // last known avg inference time + + const int64_t myInfNum = s_globalInfCount.fetch_add(1) + 1; + s_globalActiveInf.fetch_add(1); + + // Per-thread tracking + { + static thread_local int64_t s_infCount = 0; + static thread_local std::chrono::steady_clock::time_point s_lastLog; + static thread_local double s_rollingAvgMs = 0.0; + static thread_local double s_baselineMs = 0.0; // avg during first 100 inferences + static thread_local double s_maxMs = 0.0; + static thread_local bool s_degradationLogged = false; + s_infCount++; + + if (s_infCount == 1) { + s_lastLog = std::chrono::steady_clock::now(); + ANS_DBG("TRT_SM100", "FIRST inference — engine alive, globalInf=%lld", myInfNum); + } + + // Log every 50 inferences (more frequent than 500 to catch transitions) + if (s_infCount % 50 == 0) { + auto now = std::chrono::steady_clock::now(); + double elapsed = std::chrono::duration(now - s_lastLog).count(); + double fps = (elapsed > 0) ? (50.0 / elapsed) : 0; + s_lastLog = now; + + size_t vramFree = 0, vramTotal = 0; + cudaMemGetInfo(&vramFree, &vramTotal); + size_t vramUsedMB = (vramTotal - vramFree) / (1024 * 1024); + size_t vramFreeMB = vramFree / (1024 * 1024); + + ANS_DBG("TRT_SM100", "#%lld [global=%lld active=%d] %.1f inf/sec avgMs=%.1f maxMs=%.1f batch=%d graphs=%zu VRAM=%zuMB/%zuMB", + s_infCount, myInfNum, s_globalActiveInf.load(), + fps, s_rollingAvgMs, s_maxMs, + (int)inputs[0].size(), m_graphExecs.size(), + vramUsedMB, vramFreeMB); + + // Capture baseline from first 100 inferences + if (s_infCount == 100) { + s_baselineMs = s_rollingAvgMs; + ANS_DBG("TRT_SM100", "BASELINE established: %.1fms/inference", s_baselineMs); + } + + // Detect degradation: avg >3x baseline AND baseline is set + if (s_baselineMs > 0 && s_rollingAvgMs > s_baselineMs * 3.0 && !s_degradationLogged) { + s_degradationLogged = true; + ANS_DBG("TRT_SM100", "*** DEGRADATION DETECTED *** avg=%.1fms baseline=%.1fms (%.1fx) VRAM=%zuMB/%zuMB active=%d", + s_rollingAvgMs, s_baselineMs, s_rollingAvgMs / s_baselineMs, + vramUsedMB, vramFreeMB, s_globalActiveInf.load()); + } + + // Reset max for next window + s_maxMs = 0.0; + } + } + const auto numInputs = m_inputDims.size(); if (inputs.size() != numInputs) { std::cout << "Error: Wrong number of inputs. Expected: " << numInputs @@ -457,6 +533,9 @@ bool Engine::runInference(const std::vector>& i } if (anyDimChanged) { + ANS_DBG("TRT_Engine", "Shape change detected: batch %d -> %d (graphsCached=%zu)", + m_lastBatchSize, batchSize, m_graphExecs.size()); + // === First-time diagnostics (verbose, once) === const bool firstTime = !m_batchShapeChangeLogged; @@ -536,7 +615,10 @@ bool Engine::runInference(const std::vector>& i << newDims.d[3] << "]" << std::endl; } + ANS_DBG("TRT_Engine", "setInputShape('%s') [%d,%d,%d,%d]", + tensorName, newDims.d[0], newDims.d[1], newDims.d[2], newDims.d[3]); if (!m_context->setInputShape(tensorName, newDims)) { + ANS_DBG("TRT_Engine", "ERROR: setInputShape FAILED for '%s'", tensorName); std::cout << "Error: Failed to set input shape for '" << tensorName << "'" << std::endl; return false; } @@ -576,6 +658,25 @@ bool Engine::runInference(const std::vector>& i m_lastBatchSize = batchSize; m_batchShapeChangeLogged = true; + + // CRITICAL: Invalidate all cached CUDA graphs after shape change. + // Graphs were captured with the OLD context state (old tensor shapes). + // Launching them after setInputShape() produces undefined GPU behavior + // (invalid kernel sequences, SM lockup at 100%, inference hang). + if (!m_graphExecs.empty()) { + size_t destroyed = m_graphExecs.size(); + for (auto& [bs, ge] : m_graphExecs) { + if (ge) cudaGraphExecDestroy(ge); + } + m_graphExecs.clear(); + ANS_DBG("TRT_Engine", "INVALIDATED %zu cached CUDA graphs after shape change (batch=%d)", + destroyed, batchSize); + if (m_verbose || firstTime) { + std::cout << "Info: Invalidated " << destroyed + << " cached CUDA graphs after shape change" << std::endl; + } + } + if (m_verbose && firstTime) { std::cout << "\nInfo: Input shapes updated successfully for batch size " << batchSize << " ✓\n" << std::endl; @@ -619,6 +720,7 @@ bool Engine::runInference(const std::vector>& i // // GpuMat-lifetime: preprocessedBuffers keeps GpuMats alive past the final // cudaStreamSynchronize, so cudaFree() doesn't stall the pipeline. + auto _prepStart = std::chrono::steady_clock::now(); cv::cuda::Stream cvInferStream = cv::cuda::StreamAccessor::wrapStream(m_inferenceStream); std::vector preprocessedBuffers; preprocessedBuffers.reserve(numInputs); @@ -647,6 +749,14 @@ bool Engine::runInference(const std::vector>& i } } + { + double _prepMs = std::chrono::duration( + std::chrono::steady_clock::now() - _prepStart).count(); + if (_prepMs > 100.0) { + ANS_DBG("TRT_SM100", "SLOW PREPROCESS: %.1fms batch=%d (blobFromGpuMats+D2D)", _prepMs, batchSize); + } + } + // ============================================================================ // PRE-ALLOCATE OUTPUT STRUCTURE // ============================================================================ @@ -690,6 +800,8 @@ bool Engine::runInference(const std::vector>& i if (canGraph) { auto& graphExec = m_graphExecs[batchSize]; // inserts nullptr on first access if (!graphExec) { + ANS_DBG("TRT_Engine", "CUDA graph CAPTURE starting for batch=%d (cached=%zu)", + batchSize, m_graphExecs.size()); // First call for this batchSize -- capture a new graph. // Serialise captures across all Engine instances on this device to // prevent TRT's shared workspace from creating cross-stream @@ -727,9 +839,13 @@ bool Engine::runInference(const std::vector>& i if (cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0) == cudaSuccess) graphExec = exec; cudaGraphDestroy(graph); + ANS_DBG("TRT_Engine", "CUDA graph CAPTURED OK for batch=%d exec=%p", + batchSize, (void*)graphExec); } if (!graphExec) { + ANS_DBG("TRT_Engine", "CUDA graph capture FAILED for batch=%d — falling back to direct path", + batchSize); std::cout << "Warning: CUDA graph capture failed for batchSize=" << batchSize << " -- falling back to direct inference path." << std::endl; // Disable graph acceleration for this Engine instance. @@ -740,9 +856,17 @@ bool Engine::runInference(const std::vector>& i } if (graphExec) { + ANS_DBG("TRT_Engine", "CUDA graph LAUNCH batch=%d exec=%p", batchSize, (void*)graphExec); // Launch the pre-captured graph (single API call replaces many). + auto _graphStart = std::chrono::steady_clock::now(); cudaGraphLaunch(graphExec, m_inferenceStream); cudaStreamSynchronize_Safe(m_inferenceStream); + auto _graphEnd = std::chrono::steady_clock::now(); + double _graphMs = std::chrono::duration(_graphEnd - _graphStart).count(); + if (_graphMs > 500.0) { + ANS_DBG("TRT_SM100", "SLOW GRAPH: %.1fms batch=%d active=%d", + _graphMs, batchSize, s_globalActiveInf.load()); + } // CPU memcpy: pinned buffers -> featureVectors (interleaved by batch). for (int batch = 0; batch < batchSize; ++batch) { @@ -762,8 +886,16 @@ bool Engine::runInference(const std::vector>& i // ---------------------- // Used when pinned buffers are unavailable or graph capture failed. if (!graphUsed) { + ANS_DBG("TRT_Engine", "Direct path (no graph) batch=%d", batchSize); + auto enqueueStart = std::chrono::steady_clock::now(); bool success = TRT_ENQUEUE(m_context.get(), m_inferenceStream, m_buffers); + auto enqueueEnd = std::chrono::steady_clock::now(); + double enqueueMs = std::chrono::duration(enqueueEnd - enqueueStart).count(); + if (enqueueMs > 500.0) { + ANS_DBG("TRT_Engine", "SLOW ENQUEUE: %.1fms batch=%d (enqueueV3 blocked!)", enqueueMs, batchSize); + } if (!success) { + ANS_DBG("TRT_Engine", "ERROR: enqueueV3 FAILED batch=%d", batchSize); std::string debugInfo = "[Engine] runInference FAIL: enqueue returned false, batch=" + std::to_string(batchSize) + ", dimsSpecified=" + (m_context->allInputDimensionsSpecified() ? "YES" : "NO"); @@ -805,8 +937,16 @@ bool Engine::runInference(const std::vector>& i } } + auto syncStart = std::chrono::steady_clock::now(); cudaError_t syncErr = cudaStreamSynchronize_Safe(m_inferenceStream); + auto syncEnd = std::chrono::steady_clock::now(); + double syncMs = std::chrono::duration(syncEnd - syncStart).count(); + if (syncMs > 500.0) { + ANS_DBG("TRT_Engine", "SLOW INFERENCE SYNC: %.1fms batch=%d (direct path)", syncMs, batchSize); + } if (syncErr != cudaSuccess) { + ANS_DBG("TRT_Engine", "ERROR: cudaStreamSync FAILED err=%d (%s)", + (int)syncErr, cudaGetErrorString(syncErr)); std::string errMsg = "[Engine] runInference FAIL: cudaStreamSynchronize: " + std::string(cudaGetErrorString(syncErr)); std::cout << errMsg << std::endl; @@ -815,5 +955,33 @@ bool Engine::runInference(const std::vector>& i } } + // ============================================================================ + // SM=100% DETECTOR — end-of-inference timing + // ============================================================================ + { + static thread_local double s_ema = 0; + static thread_local std::chrono::steady_clock::time_point s_prevEnd; + static thread_local bool s_firstDone = false; + + auto _now = std::chrono::steady_clock::now(); + if (s_firstDone) { + double sinceLastMs = std::chrono::duration(_now - s_prevEnd).count(); + // If time between consecutive inferences jumps dramatically, + // something blocked the thread (SM=100% or mutex contention) + if (s_ema > 0 && sinceLastMs > s_ema * 3.0 && sinceLastMs > 500.0) { + size_t vf = 0, vt = 0; + cudaMemGetInfo(&vf, &vt); + ANS_DBG("TRT_SM100", "GAP DETECTED: %.1fms between inferences (avg=%.1fms, %.1fx) active=%d VRAM=%zuMB free", + sinceLastMs, s_ema, sinceLastMs / s_ema, + s_globalActiveInf.load(), vf / (1024*1024)); + } + s_ema = (s_ema == 0) ? sinceLastMs : (0.9 * s_ema + 0.1 * sinceLastMs); + } + s_prevEnd = _now; + s_firstDone = true; + + s_globalActiveInf.fetch_sub(1); + } + return true; } diff --git a/engines/TensorRTAPI/include/engine/EngineUtilities.inl b/engines/TensorRTAPI/include/engine/EngineUtilities.inl index 66bb59d..bc2e5c9 100644 --- a/engines/TensorRTAPI/include/engine/EngineUtilities.inl +++ b/engines/TensorRTAPI/include/engine/EngineUtilities.inl @@ -24,28 +24,32 @@ void Engine::transformOutput(std::vector>> &input, output = std::move(input[0][0]); } template -cv::cuda::GpuMat Engine::resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat& input, +cv::cuda::GpuMat Engine::resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat& input, size_t height, size_t width, const cv::Scalar& bgcolor) { - // Ensure input is valid if (input.empty()) { - return cv::cuda::GpuMat(); + return cv::cuda::GpuMat(); } - // Create a CUDA stream - cv::cuda::Stream stream; - // Calculate aspect ratio and unpadded dimensions + + // Use a thread_local stream to avoid creating a new CUDA stream per call. + // Creating cv::cuda::Stream() each call leaks stream handles under WDDM. + thread_local cv::cuda::Stream stream; + float r = std::min(static_cast(width) / input.cols, static_cast(height) / input.rows); size_t unpad_w = static_cast(r * input.cols); size_t unpad_h = static_cast(r * input.rows); + // Resize the input image cv::cuda::GpuMat re; - re.create(unpad_h, unpad_w, input.type()); + re.create(static_cast(unpad_h), static_cast(unpad_w), input.type()); cv::cuda::resize(input, re, re.size(), 0, 0, cv::INTER_LINEAR, stream); + // Create the output image and fill with the background color cv::cuda::GpuMat out; - out.create(height, width, input.type()); + out.create(static_cast(height), static_cast(width), input.type()); out.setTo(bgcolor, stream); - // Copy the resized content into the top-left corner of the output image + + // Copy the resized content into the top-left corner re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows)), stream); stream.waitForCompletion(); return out; @@ -195,41 +199,51 @@ cv::cuda::GpuMat Engine::blobFromGpuMats(const std::vector const int W = batchInput[0].cols; const int batch = static_cast(batchInput.size()); const size_t planeSize = static_cast(H) * W; // pixels per channel + const int totalElems = batch * 3 * static_cast(planeSize); - // Output blob: planar NCHW layout stored as a single-channel GpuMat. - // Total elements = batch * 3 * H * W. - cv::cuda::GpuMat blob(1, batch * 3 * static_cast(planeSize), CV_32FC1); + // thread_local cached buffers — reused across calls on the same thread. + // KEY: allocate for MAX seen size, never shrink. This prevents the VRAM leak + // caused by OpenCV's GpuMat pool growing unbounded when batch sizes alternate + // (e.g., batch=1,6,1,6 → each size triggers new alloc, old goes to pool, never freed). + thread_local cv::cuda::GpuMat tl_blob; + thread_local cv::cuda::GpuMat tl_floatImg; + thread_local int tl_blobMaxElems = 0; + + if (totalElems > tl_blobMaxElems) { + tl_blob = cv::cuda::GpuMat(1, totalElems, CV_32FC1); + tl_blobMaxElems = totalElems; + size_t blobBytes = static_cast(totalElems) * sizeof(float); + ANS_DBG("TRT_Preproc", "blobFromGpuMats: ALLOC blob batch=%d %dx%d %.1fMB (new max)", + batch, W, H, blobBytes / (1024.0 * 1024.0)); + } + // Use a sub-region of the cached blob for the current batch + cv::cuda::GpuMat blob = tl_blob.colRange(0, totalElems); for (int img = 0; img < batch; ++img) { - // 1. Convert to float and normalise while still in HWC (interleaved) format. - // Channel-wise subtract / divide operate correctly on interleaved data. - cv::cuda::GpuMat floatImg; if (normalize) { - batchInput[img].convertTo(floatImg, CV_32FC3, 1.f / 255.f, stream); + batchInput[img].convertTo(tl_floatImg, CV_32FC3, 1.f / 255.f, stream); } else { - batchInput[img].convertTo(floatImg, CV_32FC3, 1.0, stream); + batchInput[img].convertTo(tl_floatImg, CV_32FC3, 1.0, stream); } - cv::cuda::subtract(floatImg, cv::Scalar(subVals[0], subVals[1], subVals[2]), floatImg, cv::noArray(), -1, stream); - cv::cuda::divide(floatImg, cv::Scalar(divVals[0], divVals[1], divVals[2]), floatImg, 1, -1, stream); + cv::cuda::subtract(tl_floatImg, cv::Scalar(subVals[0], subVals[1], subVals[2]), tl_floatImg, cv::noArray(), -1, stream); + cv::cuda::divide(tl_floatImg, cv::Scalar(divVals[0], divVals[1], divVals[2]), tl_floatImg, 1, -1, stream); // 2. Split normalised HWC image into CHW planes directly into the blob. size_t offset = static_cast(img) * 3 * planeSize; if (swapRB) { - // BGR input -> RGB planes: B goes to plane 2, G to plane 1, R to plane 0 std::vector channels{ - cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr() + offset + 2 * planeSize), // B -> plane 2 - cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr() + offset + planeSize), // G -> plane 1 - cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr() + offset)}; // R -> plane 0 - cv::cuda::split(floatImg, channels, stream); + cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr() + offset + 2 * planeSize), + cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr() + offset + planeSize), + cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr() + offset)}; + cv::cuda::split(tl_floatImg, channels, stream); } else { - // BGR input -> BGR planes: keep channel order std::vector channels{ cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr() + offset), cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr() + offset + planeSize), cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr() + offset + 2 * planeSize)}; - cv::cuda::split(floatImg, channels, stream); + cv::cuda::split(tl_floatImg, channels, stream); } } @@ -239,7 +253,6 @@ cv::cuda::GpuMat Engine::blobFromGpuMats(const std::vector template void Engine::clearGpuBuffers() { if (!m_buffers.empty()) { // Free ALL I/O GPU buffers (both inputs and outputs). - // Previously only outputs were freed, leaking input allocations from loadNetwork(). for (void* ptr : m_buffers) { if (ptr) { Util::checkCudaErrorCode(cudaFree(ptr)); @@ -247,4 +260,8 @@ template void Engine::clearGpuBuffers() { } m_buffers.clear(); } + + // Note: blob/floatImg caches are thread_local inside blobFromGpuMats (static method). + // They are cleaned up automatically when threads exit. + ANS_DBG("TRT_Engine", "clearGpuBuffers: I/O buffers released"); } diff --git a/modules/ANSCV/ANSFLV.cpp b/modules/ANSCV/ANSFLV.cpp index 9daa0c7..a861c3f 100644 --- a/modules/ANSCV/ANSFLV.cpp +++ b/modules/ANSCV/ANSFLV.cpp @@ -218,44 +218,25 @@ namespace ANSCENTER { } bool ANSFLVClient::areImagesIdentical(const cv::Mat& img1, const cv::Mat& img2) { - // Quick size and type checks - if (img1.size() != img2.size() || img1.type() != img2.type()) { - return false; - } + // Use decoder frame age — returns "stale" only if no decoder output for 5+ seconds. + double ageMs = _playerClient->getLastFrameAgeMs(); + if (ageMs > 5000.0) return true; // Truly stale + if (ageMs > 0.0) return false; // Decoder alive - // Handle empty images - if (img1.empty()) { - return img2.empty(); - } + // Fallback for startup (no frame decoded yet) + if (img1.empty() && img2.empty()) return true; + if (img1.empty() || img2.empty()) return false; + if (img1.size() != img2.size() || img1.type() != img2.type()) return false; + if (img1.data == img2.data) return true; if (img1.isContinuous() && img2.isContinuous()) { const size_t totalBytes = img1.total() * img1.elemSize(); - - // Fast rejection: sample 5 positions across contiguous memory - const size_t quarter = totalBytes / 4; - const size_t half = totalBytes / 2; - const size_t threeQuarter = 3 * totalBytes / 4; - - if (img1.data[0] != img2.data[0] || - img1.data[quarter] != img2.data[quarter] || - img1.data[half] != img2.data[half] || - img1.data[threeQuarter] != img2.data[threeQuarter] || - img1.data[totalBytes - 1] != img2.data[totalBytes - 1]) { - return false; - } - - // Full comparison return std::memcmp(img1.data, img2.data, totalBytes) == 0; } - - // Row-by-row comparison for non-continuous images (e.g., ROI sub-matrices) const size_t rowSize = img1.cols * img1.elemSize(); for (int i = 0; i < img1.rows; i++) { - if (std::memcmp(img1.ptr(i), img2.ptr(i), rowSize) != 0) { - return false; - } + if (std::memcmp(img1.ptr(i), img2.ptr(i), rowSize) != 0) return false; } - return true; } cv::Mat ANSFLVClient::GetImage(int& width, int& height, int64_t& pts) { diff --git a/modules/ANSCV/ANSMJPEG.cpp b/modules/ANSCV/ANSMJPEG.cpp index cab859b..b320bae 100644 --- a/modules/ANSCV/ANSMJPEG.cpp +++ b/modules/ANSCV/ANSMJPEG.cpp @@ -208,44 +208,23 @@ namespace ANSCENTER { } bool ANSMJPEGClient::areImagesIdentical(const cv::Mat& img1, const cv::Mat& img2) { - // Quick size and type checks - if (img1.size() != img2.size() || img1.type() != img2.type()) { - return false; - } + double ageMs = _playerClient->getLastFrameAgeMs(); + if (ageMs > 5000.0) return true; + if (ageMs > 0.0) return false; - // Handle empty images - if (img1.empty()) { - return img2.empty(); - } + if (img1.empty() && img2.empty()) return true; + if (img1.empty() || img2.empty()) return false; + if (img1.size() != img2.size() || img1.type() != img2.type()) return false; + if (img1.data == img2.data) return true; if (img1.isContinuous() && img2.isContinuous()) { const size_t totalBytes = img1.total() * img1.elemSize(); - - // Fast rejection: sample 5 positions across contiguous memory - const size_t quarter = totalBytes / 4; - const size_t half = totalBytes / 2; - const size_t threeQuarter = 3 * totalBytes / 4; - - if (img1.data[0] != img2.data[0] || - img1.data[quarter] != img2.data[quarter] || - img1.data[half] != img2.data[half] || - img1.data[threeQuarter] != img2.data[threeQuarter] || - img1.data[totalBytes - 1] != img2.data[totalBytes - 1]) { - return false; - } - - // Full comparison return std::memcmp(img1.data, img2.data, totalBytes) == 0; } - - // Row-by-row comparison for non-continuous images (e.g., ROI sub-matrices) const size_t rowSize = img1.cols * img1.elemSize(); for (int i = 0; i < img1.rows; i++) { - if (std::memcmp(img1.ptr(i), img2.ptr(i), rowSize) != 0) { - return false; - } + if (std::memcmp(img1.ptr(i), img2.ptr(i), rowSize) != 0) return false; } - return true; } cv::Mat ANSMJPEGClient::GetImage(int& width, int& height, int64_t& pts) { diff --git a/modules/ANSCV/ANSRTMP.cpp b/modules/ANSCV/ANSRTMP.cpp index a9a0e3f..6b8343c 100644 --- a/modules/ANSCV/ANSRTMP.cpp +++ b/modules/ANSCV/ANSRTMP.cpp @@ -213,43 +213,22 @@ namespace ANSCENTER { } bool ANSRTMPClient::areImagesIdentical(const cv::Mat& img1, const cv::Mat& img2) { - // Quick size and type checks - if (img1.size() != img2.size() || img1.type() != img2.type()) { - return false; - } + double ageMs = _playerClient->getLastFrameAgeMs(); + if (ageMs > 5000.0) return true; + if (ageMs > 0.0) return false; - // Handle empty images - if (img1.empty()) { - return img2.empty(); - } + if (img1.empty() && img2.empty()) return true; + if (img1.empty() || img2.empty()) return false; + if (img1.size() != img2.size() || img1.type() != img2.type()) return false; + if (img1.data == img2.data) return true; if (img1.isContinuous() && img2.isContinuous()) { const size_t totalBytes = img1.total() * img1.elemSize(); - - // Fast rejection: sample 5 positions across contiguous memory - // Catches 99.99% of different frames immediately - const size_t quarter = totalBytes / 4; - const size_t half = totalBytes / 2; - const size_t threeQuarter = 3 * totalBytes / 4; - - if (img1.data[0] != img2.data[0] || - img1.data[quarter] != img2.data[quarter] || - img1.data[half] != img2.data[half] || - img1.data[threeQuarter] != img2.data[threeQuarter] || - img1.data[totalBytes - 1] != img2.data[totalBytes - 1]) { - return false; - } - - // Full comparison return std::memcmp(img1.data, img2.data, totalBytes) == 0; } - - // Row-by-row comparison for non-continuous images (e.g., ROI sub-matrices) const size_t rowSize = img1.cols * img1.elemSize(); for (int i = 0; i < img1.rows; i++) { - if (std::memcmp(img1.ptr(i), img2.ptr(i), rowSize) != 0) { - return false; - } + if (std::memcmp(img1.ptr(i), img2.ptr(i), rowSize) != 0) return false; } return true; diff --git a/modules/ANSCV/ANSRTSP.cpp b/modules/ANSCV/ANSRTSP.cpp index 90f3a1c..1ed7a04 100644 --- a/modules/ANSCV/ANSRTSP.cpp +++ b/modules/ANSCV/ANSRTSP.cpp @@ -2,7 +2,9 @@ #include "ANSMatRegistry.h" #include "ANSGpuFrameOps.h" #include "GpuNV12SlotPool.h" +#include "ANSLicense.h" // ANS_DBG macro #include +#include #include #include "media_codec.h" #include @@ -69,6 +71,7 @@ namespace ANSCENTER { } void ANSRTSPClient::Destroy() { + ANS_DBG("RTSP_Lifecycle", "DESTROY called: url=%s playing=%d", _url.c_str(), (int)_isPlaying); // Move the player client pointer out of the lock scope, then // close it OUTSIDE the mutex. close() calls cuArrayDestroy / // cuMemFree which acquire an EXCLUSIVE SRW lock inside nvcuda64. @@ -126,6 +129,24 @@ namespace ANSCENTER { // belong to the global GpuNV12SlotPool, not the decoder. if (clientToClose) { clientToClose->close(); + + // Force CUDA runtime to release all cached memory from the destroyed + // NVDEC decoder. Without this, cuMemFree returns memory to the CUDA + // driver's internal cache, and the next camera creation allocates fresh + // memory → VRAM grows by ~200-300MB per destroy/create cycle. + // cudaDeviceSynchronize ensures all pending GPU ops are done, then + // cudaMemPool trim releases the freed blocks back to the OS. + cudaDeviceSynchronize(); + cudaMemPool_t memPool = nullptr; + int currentDev = 0; + cudaGetDevice(¤tDev); + if (cudaDeviceGetDefaultMemPool(&memPool, currentDev) == cudaSuccess && memPool) { + cudaMemPoolTrimTo(memPool, 0); // Release all unused memory + } + size_t vramFree = 0, vramTotal = 0; + cudaMemGetInfo(&vramFree, &vramTotal); + ANS_DBG("RTSP_Destroy", "NVDEC closed + memPool trimmed GPU%d VRAM=%zuMB/%zuMB", + currentDev, (vramTotal - vramFree) / (1024*1024), vramFree / (1024*1024)); } } static void VerifyGlobalANSRTSPLicense(const std::string& licenseKey) { @@ -211,6 +232,7 @@ namespace ANSCENTER { _playerClient->setCrop(crop); } bool ANSRTSPClient::Reconnect() { + ANS_DBG("RTSP_Lifecycle", "RECONNECT called: url=%s playing=%d", _url.c_str(), (int)_isPlaying); // 1. Mark as not-playing under the mutex FIRST. This makes GetImage() // return the cached _pLastFrame instead of calling into the player, // and blocks new TryIncrementInFlight calls (no new NV12 attachments). @@ -253,8 +275,30 @@ namespace ANSCENTER { // completed (or timed out), so close() is safe. _logger.LogInfo("ANSRTSPClient::Reconnect", "calling close() — NVDEC decoder will be destroyed", __FILE__, __LINE__); + auto _rc0 = std::chrono::steady_clock::now(); RTSP_DBG("[Reconnect] BEFORE close() this=%p", (void*)this); _playerClient->close(); + auto _rc1 = std::chrono::steady_clock::now(); + + // Force CUDA runtime to release cached memory from the destroyed NVDEC decoder. + cudaDeviceSynchronize(); + auto _rc2 = std::chrono::steady_clock::now(); + cudaMemPool_t memPool = nullptr; + int currentDev = 0; + cudaGetDevice(¤tDev); + if (cudaDeviceGetDefaultMemPool(&memPool, currentDev) == cudaSuccess && memPool) { + cudaMemPoolTrimTo(memPool, 0); + } + auto _rc3 = std::chrono::steady_clock::now(); + { + size_t vf = 0, vt = 0; + cudaMemGetInfo(&vf, &vt); + double closeMs = std::chrono::duration(_rc1 - _rc0).count(); + double syncMs = std::chrono::duration(_rc2 - _rc1).count(); + double trimMs = std::chrono::duration(_rc3 - _rc2).count(); + ANS_DBG("RTSP_Reconnect", "close=%.1fms sync=%.1fms trim=%.1fms VRAM=%zuMB/%zuMB", + closeMs, syncMs, trimMs, (vt - vf) / (1024*1024), vf / (1024*1024)); + } RTSP_DBG("[Reconnect] AFTER close() this=%p", (void*)this); // 3. Re-setup and play under the mutex. @@ -283,12 +327,9 @@ namespace ANSCENTER { } bool ANSRTSPClient::Stop() { - // Grab the player pointer and clear _isPlaying under the lock, - // then call stop() OUTSIDE the mutex. stop() internally calls - // StopVideoDecoder -> decoder->flush() which does CUDA calls - // that can block on the nvcuda64 SRW lock. Holding _mutex - // during that time blocks all other operations on this client - // and contributes to the convoy when many clients stop at once. + // Stop playback but keep the RTSP connection and NVDEC decoder alive. + // LabVIEW uses Stop/Start to pause cameras when no AI task is subscribed. + // The camera resumes instantly on Start() without re-connecting. CRtspPlayer* player = nullptr; { std::lock_guard lock(_mutex); @@ -300,6 +341,7 @@ namespace ANSCENTER { if (player) { player->stop(); } + ANS_DBG("RTSP_Lifecycle", "STOP complete: handle=%p (connection kept alive)", (void*)this); return true; } bool ANSRTSPClient::Pause() { @@ -342,45 +384,44 @@ namespace ANSCENTER { } bool ANSRTSPClient::areImagesIdentical(const cv::Mat& img1, const cv::Mat& img2) { - // Quick size and type checks - if (img1.size() != img2.size() || img1.type() != img2.type()) { - return false; + double ageMs = _playerClient->getLastFrameAgeMs(); + + if (ageMs > 5000.0) { + ANS_DBG("RTSP_Stale", "FROZEN DETECTED: ageMs=%.1f url=%s playing=%d — camera truly stale", + ageMs, _url.c_str(), (int)_isPlaying); + return true; // Truly stale — no decoder output for 5+ seconds + } + if (ageMs > 0.0) { + return false; // Decoder is receiving frames — camera is alive } - // Handle empty images - if (img1.empty()) { - return img2.empty(); - } + // ageMs == 0 means no frame has been decoded yet (startup). + // Fall back to pixel comparison for backward compatibility. + if (img1.empty() && img2.empty()) return true; + if (img1.empty() || img2.empty()) return false; + if (img1.size() != img2.size() || img1.type() != img2.type()) return false; + // Same data pointer = same cv::Mat (shallow copy) + if (img1.data == img2.data) return true; + + // Quick 5-point sampling if (img1.isContinuous() && img2.isContinuous()) { const size_t totalBytes = img1.total() * img1.elemSize(); - - // Fast rejection: sample 5 positions across contiguous memory - // Catches 99.99% of different frames immediately const size_t quarter = totalBytes / 4; const size_t half = totalBytes / 2; - const size_t threeQuarter = 3 * totalBytes / 4; - if (img1.data[0] != img2.data[0] || img1.data[quarter] != img2.data[quarter] || img1.data[half] != img2.data[half] || - img1.data[threeQuarter] != img2.data[threeQuarter] || img1.data[totalBytes - 1] != img2.data[totalBytes - 1]) { return false; } - - // Full comparison return std::memcmp(img1.data, img2.data, totalBytes) == 0; } - // Row-by-row comparison for non-continuous images (e.g., ROI sub-matrices) const size_t rowSize = img1.cols * img1.elemSize(); for (int i = 0; i < img1.rows; i++) { - if (std::memcmp(img1.ptr(i), img2.ptr(i), rowSize) != 0) { - return false; - } + if (std::memcmp(img1.ptr(i), img2.ptr(i), rowSize) != 0) return false; } - return true; } cv::Mat ANSRTSPClient::GetImage(int& width, int& height, int64_t& pts) { @@ -414,6 +455,20 @@ namespace ANSCENTER { if (currentPts == _pts && !_pLastFrame.empty()) { width = _imageWidth; height = _imageHeight; + // Return timestamp based on decoder frame age so LabVIEW can distinguish + // "rate-limited duplicate" from "camera truly stale". + // If decoder is still receiving frames (age < 5s), advance PTS so LabVIEW + // sees a changing timestamp and doesn't trigger false reconnect. + // If decoder is stale (age > 5s), return same PTS so LabVIEW detects it. + double ageMs = _playerClient->getLastFrameAgeMs(); + if (ageMs > 0.0 && ageMs < 5000.0) { + // Camera alive but rate-limited — advance PTS to prevent false stale detection + _pts++; + } else if (ageMs >= 5000.0) { + // Camera stale — keep same PTS so LabVIEW triggers reconnect + ANS_DBG("RTSP_GetImage", "STALE PTS: ageMs=%.1f pts=%lld url=%s — not advancing PTS", + ageMs, (long long)_pts, _url.c_str()); + } pts = _pts; return _pLastFrame; } @@ -891,6 +946,10 @@ namespace ANSCENTER { std::lock_guard lock(_mutex); _useNV12FastPath = enable; } + double ANSRTSPClient::GetLastFrameAgeMs() { + std::lock_guard lock(_mutex); + return _playerClient->getLastFrameAgeMs(); + } AVFrame* ANSRTSPClient::GetNV12Frame() { std::lock_guard lock(_mutex); if (!_isPlaying) return nullptr; // Player may be mid-reconnect (CUDA resources freed) @@ -937,6 +996,7 @@ namespace ANSCENTER { } extern "C" __declspec(dllexport) int CreateANSRTSPHandle(ANSCENTER::ANSRTSPClient * *Handle, const char* licenseKey, const char* username, const char* password, const char* url) { + ANS_DBG("RTSP_Lifecycle", "CREATE: url=%s", url ? url : "null"); if (!Handle || !licenseKey || !url) return -1; try { auto ptr = std::make_unique(); @@ -946,11 +1006,10 @@ extern "C" __declspec(dllexport) int CreateANSRTSPHandle(ANSCENTER::ANSRTSPClien if (_username.empty() && _password.empty()) result = ptr->Init(licenseKey, url); else result = ptr->Init(licenseKey, username, password, url); if (result) { - // Default to CUDA/NVDEC HW decoding (mode 7) for NV12 zero-copy - // fast path. LabVIEW may not call SetRTSPHWDecoding after - // destroy+recreate cycles, so this ensures the new handle always - // uses the GPU decode path instead of falling back to D3D11VA/CPU. - ptr->SetHWDecoding(7); // HW_DECODING_CUDA + // Software decode by default — saves VRAM (no NVDEC DPB surfaces). + // With 100 cameras, HW decode would consume ~5-21 GB VRAM for idle decoders. + // User can enable HW decode per-camera via SetRTSPHWDecoding(handle, 7). + // ptr->SetHWDecoding(7); // Disabled — was HW_DECODING_CUDA *Handle = ptr.release(); extern void anscv_unregister_handle(void*); extern void anscv_register_handle(void*, void(*)(void*)); @@ -967,6 +1026,7 @@ extern "C" __declspec(dllexport) int CreateANSRTSPHandle(ANSCENTER::ANSRTSPClien } catch (...) { return -1; } } extern "C" __declspec(dllexport) int ReleaseANSRTSPHandle(ANSCENTER::ANSRTSPClient * *Handle) { + ANS_DBG("RTSP_Lifecycle", "RELEASE: handle=%p", Handle ? (void*)*Handle : nullptr); if (Handle == nullptr || *Handle == nullptr) return -1; try { extern void anscv_unregister_handle(void*); @@ -982,25 +1042,27 @@ extern "C" __declspec(dllexport) int ReleaseANSRTSPHandle(ANSCENTER::ANSRTSPClie // on any subsequent call, and prevents NEW NV12 GPU surface // pointers from being handed out. // - // Do NOT call Destroy()/close() here — close() frees the - // NVDEC GPU surfaces (cuArrayDestroy/cuMemFree) which may - // still be in use by a CUDA inference kernel that received - // the NV12 pointer from a GetRTSPCVImage call that already - // completed before this Release was called. + // Synchronous cleanup — ensures all GPU resources (NVDEC surfaces, VRAM) + // are fully released BEFORE LabVIEW creates a new camera. + // Previously deferred to a background thread, but that caused the old + // camera's resources to overlap with the new camera's allocations, + // leading to temporary VRAM doubling (~240MB per camera) and eventual + // VRAM exhaustion on cameras with frequent reconnects. { - // Use the client's _mutex to safely set _isPlaying = false. - // This is the same lock GetImage/GetNV12Frame acquire. - raw->Stop(); // sets _isPlaying = false, stops playback - } + auto t0 = std::chrono::steady_clock::now(); + raw->Stop(); + auto t1 = std::chrono::steady_clock::now(); + raw->Destroy(); + auto t2 = std::chrono::steady_clock::now(); + delete raw; + auto t3 = std::chrono::steady_clock::now(); - // Defer the full cleanup (Destroy + delete) to a background thread - // so LabVIEW's UI thread is not blocked. Destroy() now waits - // precisely for in-flight inference to finish (via _inFlightFrames - // counter + condition variable) instead of the old 500ms sleep hack. - std::thread([raw]() { - try { raw->Destroy(); } catch (...) {} - try { delete raw; } catch (...) {} - }).detach(); + double stopMs = std::chrono::duration(t1 - t0).count(); + double destroyMs = std::chrono::duration(t2 - t1).count(); + double deleteMs = std::chrono::duration(t3 - t2).count(); + ANS_DBG("RTSP_Lifecycle", "RELEASE complete: stop=%.1fms destroy=%.1fms delete=%.1fms total=%.1fms", + stopMs, destroyMs, deleteMs, stopMs + destroyMs + deleteMs); + } return 0; } catch (...) { @@ -1269,6 +1331,7 @@ extern "C" __declspec(dllexport) int GetRTSPImage(ANSCENTER::ANSRTSPClient** Han } } extern "C" __declspec(dllexport) int StartRTSP(ANSCENTER::ANSRTSPClient **Handle) { + ANS_DBG("RTSP_Lifecycle", "START: handle=%p", Handle ? (void*)*Handle : nullptr); if (Handle == nullptr || *Handle == nullptr) return -1; try { bool result = (*Handle)->Start(); @@ -1301,6 +1364,7 @@ extern "C" __declspec(dllexport) int ReconnectRTSP(ANSCENTER::ANSRTSPClient * *H } } extern "C" __declspec(dllexport) int StopRTSP(ANSCENTER::ANSRTSPClient * *Handle) { + ANS_DBG("RTSP_Lifecycle", "STOP: handle=%p", Handle ? (void*)*Handle : nullptr); if (Handle == nullptr || *Handle == nullptr) return -1; try { bool result = (*Handle)->Stop(); @@ -1462,9 +1526,15 @@ extern "C" __declspec(dllexport) void SetRTSPTargetFPS(ANSCENTER::ANSRTSPClient* extern "C" __declspec(dllexport) void SetRTSPNV12FastPath(ANSCENTER::ANSRTSPClient** Handle, int enable) { if (Handle == nullptr || *Handle == nullptr) return; try { - (*Handle)->SetNV12FastPath(enable != 0); // 0=original CPU path (stable), 1=NV12 GPU fast path + (*Handle)->SetNV12FastPath(enable != 0); } catch (...) { } } +extern "C" __declspec(dllexport) double GetRTSPLastFrameAgeMs(ANSCENTER::ANSRTSPClient** Handle) { + if (Handle == nullptr || *Handle == nullptr) return -1.0; + try { + return (*Handle)->GetLastFrameAgeMs(); + } catch (...) { return -1.0; } +} extern "C" __declspec(dllexport) int SetCropFlagRTSP(ANSCENTER::ANSRTSPClient** Handle, int cropFlag) { if (Handle == nullptr || *Handle == nullptr) return -1; try { diff --git a/modules/ANSCV/ANSRTSP.h b/modules/ANSCV/ANSRTSP.h index 5bc4bb5..abe09bb 100644 --- a/modules/ANSCV/ANSRTSP.h +++ b/modules/ANSCV/ANSRTSP.h @@ -106,6 +106,7 @@ namespace ANSCENTER void SetTargetFPS(double intervalMs); // Set min interval between processed frames in ms (0 = no limit, 100 = ~10 FPS, 200 = ~5 FPS) void SetNV12FastPath(bool enable); // true = NV12 GPU fast path (zero-copy inference), false = original CPU path (stable) bool IsNV12FastPath() const { return _useNV12FastPath; } + double GetLastFrameAgeMs(); // Milliseconds since last frame from decoder (detects truly stale cameras, unaffected by SetTargetFPS) AVFrame* GetNV12Frame(); // Returns cloned NV12 frame for GPU fast-path (caller must av_frame_free) AVFrame* GetCudaHWFrame(); // Returns CUDA HW frame (device ptrs) for zero-copy inference bool IsCudaHWAccel(); // true when decoder uses CUDA (NV12 stays in GPU VRAM) @@ -145,4 +146,5 @@ extern "C" __declspec(dllexport) void SetRTSPImageQuality(ANSCENTER::ANSRTSPClie extern "C" __declspec(dllexport) void SetRTSPDisplayResolution(ANSCENTER::ANSRTSPClient** Handle, int width, int height); extern "C" __declspec(dllexport) void SetRTSPTargetFPS(ANSCENTER::ANSRTSPClient** Handle, double intervalMs); extern "C" __declspec(dllexport) void SetRTSPNV12FastPath(ANSCENTER::ANSRTSPClient** Handle, int enable); +extern "C" __declspec(dllexport) double GetRTSPLastFrameAgeMs(ANSCENTER::ANSRTSPClient** Handle); #endif \ No newline at end of file diff --git a/modules/ANSCV/ANSSRT.cpp b/modules/ANSCV/ANSSRT.cpp index be14595..f003312 100644 --- a/modules/ANSCV/ANSSRT.cpp +++ b/modules/ANSCV/ANSSRT.cpp @@ -221,43 +221,22 @@ namespace ANSCENTER { } bool ANSSRTClient::areImagesIdentical(const cv::Mat& img1, const cv::Mat& img2) { - // Quick size and type checks - if (img1.size() != img2.size() || img1.type() != img2.type()) { - return false; - } + double ageMs = _playerClient->getLastFrameAgeMs(); + if (ageMs > 5000.0) return true; + if (ageMs > 0.0) return false; - // Handle empty images - if (img1.empty()) { - return img2.empty(); - } + if (img1.empty() && img2.empty()) return true; + if (img1.empty() || img2.empty()) return false; + if (img1.size() != img2.size() || img1.type() != img2.type()) return false; + if (img1.data == img2.data) return true; if (img1.isContinuous() && img2.isContinuous()) { const size_t totalBytes = img1.total() * img1.elemSize(); - - // Fast rejection: sample 5 positions across contiguous memory - // Catches 99.99% of different frames immediately - const size_t quarter = totalBytes / 4; - const size_t half = totalBytes / 2; - const size_t threeQuarter = 3 * totalBytes / 4; - - if (img1.data[0] != img2.data[0] || - img1.data[quarter] != img2.data[quarter] || - img1.data[half] != img2.data[half] || - img1.data[threeQuarter] != img2.data[threeQuarter] || - img1.data[totalBytes - 1] != img2.data[totalBytes - 1]) { - return false; - } - - // Full comparison return std::memcmp(img1.data, img2.data, totalBytes) == 0; } - - // Row-by-row comparison for non-continuous images (e.g., ROI sub-matrices) const size_t rowSize = img1.cols * img1.elemSize(); for (int i = 0; i < img1.rows; i++) { - if (std::memcmp(img1.ptr(i), img2.ptr(i), rowSize) != 0) { - return false; - } + if (std::memcmp(img1.ptr(i), img2.ptr(i), rowSize) != 0) return false; } return true; diff --git a/modules/ANSCV/ANSVideoPlayer.cpp b/modules/ANSCV/ANSVideoPlayer.cpp index 9ca451d..d45f1b5 100644 --- a/modules/ANSCV/ANSVideoPlayer.cpp +++ b/modules/ANSCV/ANSVideoPlayer.cpp @@ -136,7 +136,7 @@ namespace ANSCENTER { if (!_hwDecodeActive && !_hwPlayer) { try { auto hwp = std::make_unique(); - hwp->setHWDecoding(HW_DECODING_AUTO); // CUDA → D3D11VA → DXVA2 → software + hwp->setHWDecoding(HW_DECODING_DISABLE); // Software decode by default — saves VRAM if (hwp->open(_url)) { _hwPlayer = std::move(hwp); _hwDecodeActive = true; diff --git a/modules/ANSCV/VideoPlayer.cpp b/modules/ANSCV/VideoPlayer.cpp index 84bc57a..c71368f 100644 --- a/modules/ANSCV/VideoPlayer.cpp +++ b/modules/ANSCV/VideoPlayer.cpp @@ -93,7 +93,7 @@ CVideoPlayer::CVideoPlayer(): , m_bPaused(FALSE) , m_bSizeChanged(FALSE) //, m_nRenderMode(RENDER_MODE_KEEP) - , m_nHWDecoding(HW_DECODING_AUTO) + , m_nHWDecoding(HW_DECODING_DISABLE) // Software decode by default — saves VRAM , m_nDstVideoFmt(AV_PIX_FMT_YUV420P) , m_bUpdown(FALSE) , m_bSnapshot(FALSE) diff --git a/modules/ANSODEngine/ANSODEngine.cpp b/modules/ANSODEngine/ANSODEngine.cpp index 76004fd..bfe984a 100644 --- a/modules/ANSODEngine/ANSODEngine.cpp +++ b/modules/ANSODEngine/ANSODEngine.cpp @@ -3,6 +3,7 @@ #include #include #include "ANSODEngine.h" +#include "ANSLicense.h" // ANS_DBG macro #include "ANSYOLOOD.h" #include "ANSTENSORRTOD.h" #include "ANSTENSORRTCL.h" @@ -879,6 +880,9 @@ namespace ANSCENTER std::vector allResults; allResults.clear(); try { + ANS_DBG("ODEngine", "SAHI START: %dx%d tile=%dx%d overlap=%.1f cam=%s", + input.cols, input.rows, tiledWidth, tiledHeight, overLap, camera_id.c_str()); + auto _sahiStart = std::chrono::steady_clock::now(); cv::Mat image = input.clone(); if (image.empty() || !image.data || !image.u) { return allResults; @@ -920,6 +924,16 @@ namespace ANSCENTER //4. Apply Non-Maximum Suppression (NMS) to merge overlapping results float iouThreshold = 0.1; std::vector finalResults = ANSUtilityHelper::ApplyNMS(allResults, iouThreshold); + { + double _sahiMs = std::chrono::duration( + std::chrono::steady_clock::now() - _sahiStart).count(); + ANS_DBG("ODEngine", "SAHI DONE: %.1fms patches=%zu results=%zu cam=%s", + _sahiMs, patches.size() + 1, finalResults.size(), camera_id.c_str()); + if (_sahiMs > 2000.0) { + ANS_DBG("ODEngine", "SAHI SLOW: %.1fms — %zu patches held _mutex entire time!", + _sahiMs, patches.size() + 1); + } + } image.release(); return finalResults; } @@ -2103,6 +2117,8 @@ namespace ANSCENTER // No coarse _mutex — sub-components (engines, trackers) have their own locks. // LabVIEW semaphore controls concurrency at the caller level. try { + ANS_DBG("ODEngine", "RunInferenceWithOption: cam=%s %dx%d mode=%s", + camera_id.c_str(), input.cols, input.rows, activeROIMode.c_str()); int mode = 0; double confidenceThreshold = 0.35; std::vector trackingObjectIds; diff --git a/modules/ANSODEngine/ANSRTYOLO.cpp b/modules/ANSODEngine/ANSRTYOLO.cpp index 0f1ad4a..32b1f7a 100644 --- a/modules/ANSODEngine/ANSRTYOLO.cpp +++ b/modules/ANSODEngine/ANSRTYOLO.cpp @@ -1,5 +1,6 @@ #include "ANSRTYOLO.h" #include "Utility.h" +#include "ANSLicense.h" // ANS_DBG macro for DebugView #include #include #include @@ -903,7 +904,6 @@ namespace ANSCENTER { return {}; } - // Check if model is classification (output ndims <= 2) const auto& outputDims = m_trtEngine->getOutputDims(); const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2; @@ -914,11 +914,8 @@ namespace ANSCENTER { cv::cuda::GpuMat resized; if (imgRGB.rows != inputH || imgRGB.cols != inputW) { if (isClassification) { - // Classification: direct resize (no letterbox padding) - // Must use explicit stream to avoid conflict with CUDA Graph capture on null stream cv::cuda::resize(imgRGB, resized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR, stream); } else { - // Detection/Seg/Pose/OBB: letterbox resize + right-bottom pad resized = Engine::resizeKeepAspectRatioPadRightBottom(imgRGB, inputH, inputW); } } @@ -1831,8 +1828,7 @@ namespace ANSCENTER { } // --- 2. Preprocess under lock --- - // Try NV12 fast path first (12MB upload vs 24MB BGR for 4K) - // Falls back to standard GPU preprocessing if no NV12 data available. + ANS_DBG("YOLO", "Preprocess START %dx%d", inputImage.cols, inputImage.rows); ImageMetadata meta; std::vector> input; bool usedNV12 = false; @@ -1874,11 +1870,22 @@ namespace ANSCENTER { } // --- 3. TRT Inference (mutex released for concurrent GPU slots) --- + ANS_DBG("YOLO", "TRT inference START nv12=%d inputSize=%dx%d", + (int)usedNV12, + input.empty() ? 0 : (input[0].empty() ? 0 : input[0][0].cols), + input.empty() ? 0 : (input[0].empty() ? 0 : input[0][0].rows)); + auto _trtStart = std::chrono::steady_clock::now(); std::vector>> featureVectors; if (!m_trtEngine->runInference(input, featureVectors)) { + ANS_DBG("YOLO", "ERROR: TRT runInference FAILED"); _logger.LogError("ANSRTYOLO::DetectObjects", "Error running inference", __FILE__, __LINE__); return {}; } + auto _trtEnd = std::chrono::steady_clock::now(); + double _trtMs = std::chrono::duration(_trtEnd - _trtStart).count(); + if (_trtMs > 500.0) { + ANS_DBG("YOLO", "SLOW TRT inference: %.1fms", _trtMs); + } double msInference = dbg ? elapsed() : 0; // --- 4. Transform output --- diff --git a/modules/ANSODEngine/ANSRTYOLO.h b/modules/ANSODEngine/ANSRTYOLO.h index 47d0683..1973fba 100644 --- a/modules/ANSODEngine/ANSRTYOLO.h +++ b/modules/ANSODEngine/ANSRTYOLO.h @@ -81,6 +81,7 @@ namespace ANSCENTER { std::vector> PreprocessBatch( const std::vector& inputImages, BatchMetadata& outMetadata); + // ── Detection pipeline ─────────────────────────────────────────── std::vector DetectObjects(const cv::Mat& inputImage, const std::string& camera_id); diff --git a/modules/ANSODEngine/NV12PreprocessHelper.cpp b/modules/ANSODEngine/NV12PreprocessHelper.cpp index 13be033..dd519cc 100644 --- a/modules/ANSODEngine/NV12PreprocessHelper.cpp +++ b/modules/ANSODEngine/NV12PreprocessHelper.cpp @@ -1,6 +1,7 @@ #include "NV12PreprocessHelper.h" #include "ANSGpuFrameRegistry.h" #include "ANSEngineCommon.h" +#include "ANSLicense.h" // ANS_DBG macro #include #include #include diff --git a/modules/ANSODEngine/dllmain.cpp b/modules/ANSODEngine/dllmain.cpp index 56dada8..768be6c 100644 --- a/modules/ANSODEngine/dllmain.cpp +++ b/modules/ANSODEngine/dllmain.cpp @@ -6,6 +6,7 @@ #include "engine/TRTEngineCache.h" // clearAll() on DLL_PROCESS_DETACH #include "engine/EnginePoolManager.h" // clearAll() on DLL_PROCESS_DETACH #include // INT_MIN +#include "ANSLicense.h" // ANS_DBG macro for DebugView // Process-wide flag: when true, all engines force single-GPU path (no pool, no idle timers). // Defined here, declared extern in EngineBuildLoadNetwork.inl. @@ -1696,6 +1697,8 @@ static int RunInferenceComplete_LV_Impl( auto* engine = guard.get(); try { + auto _t0 = std::chrono::steady_clock::now(); + // Save/restore thread-local to support nested calls (custom model DLLs // calling back into ANSODEngine via ANSLIB.dll). GpuFrameData* savedFrame = tl_currentGpuFrame(); @@ -1708,6 +1711,10 @@ static int RunInferenceComplete_LV_Impl( int originalWidth = localImage.cols; int originalHeight = localImage.rows; + ANS_DBG("LV_Inference", "START cam=%s %dx%d gpuFrame=%p nv12=%s", + cameraId ? cameraId : "?", originalWidth, originalHeight, + (void*)gpuFrame, gpuFrame ? "YES" : "NO"); + if (originalWidth == 0 || originalHeight == 0) { tl_currentGpuFrame() = savedFrame; return -2; @@ -1717,8 +1724,17 @@ static int RunInferenceComplete_LV_Impl( // Safe: *cvImage holds a refcount, keeping gpuFrame alive during inference. // Only use OWN gpuFrame — never inherit outer caller's frame (dimension mismatch on crops). tl_currentGpuFrame() = gpuFrame; + auto _t1 = std::chrono::steady_clock::now(); std::vector outputs = engine->RunInferenceWithOption(localImage, cameraId, activeROIMode); + auto _t2 = std::chrono::steady_clock::now(); tl_currentGpuFrame() = savedFrame; + + double prepMs = std::chrono::duration(_t1 - _t0).count(); + double infMs = std::chrono::duration(_t2 - _t1).count(); + if (infMs > 500.0) { + ANS_DBG("LV_Inference", "SLOW cam=%s prep=%.1fms inf=%.1fms results=%zu", + cameraId ? cameraId : "?", prepMs, infMs, outputs.size()); + } bool getJpeg = (getJpegString == 1); std::string stImage; // NOTE: odMutex was removed here. All variables in this scope are local diff --git a/modules/ANSODEngine/engine.h b/modules/ANSODEngine/engine.h index c522447..59208f4 100644 --- a/modules/ANSODEngine/engine.h +++ b/modules/ANSODEngine/engine.h @@ -402,6 +402,9 @@ private: cudaStream_t m_memoryStream; // ADD THIS - separate stream for memory operations std::vector m_preprocessedInputs; // Keep inputs alive + // Note: blobFromGpuMats and resizeKeepAspectRatioPadRightBottom are static, + // so cached buffers use thread_local inside the functions themselves. + // Thermal management (ADD THESE) //int m_consecutiveInferences; @@ -431,7 +434,7 @@ private: Logger m_logger; bool m_verbose{ true }; // false for non-probe pool slots - bool m_disableGraphs{ false }; // true for pool slots — concurrent graph captures corrupt CUDA context + bool m_disableGraphs{ true }; // DISABLED by default — concurrent graph launches + uploads cause GPU deadlock on WDDM // -- Multi-GPU pool data --------------------------------------------------- diff --git a/tests/ANSLPR-UnitTest/ANSLPR-UnitTest.cpp b/tests/ANSLPR-UnitTest/ANSLPR-UnitTest.cpp index 557115f..2c86de3 100644 --- a/tests/ANSLPR-UnitTest/ANSLPR-UnitTest.cpp +++ b/tests/ANSLPR-UnitTest/ANSLPR-UnitTest.cpp @@ -814,8 +814,8 @@ static void ALPRWorkerThread(int taskId, g_log.add(prefix + " Empty frame (count=" + std::to_string(emptyFrames) + ")"); } if (emptyFrames > 300) { - g_log.add(prefix + " Too many empty frames, attempting reconnect..."); - ReconnectRTSP(&rtspClient); + g_log.add(prefix + " Too many empty frames (reconnect disabled for long test)"); + // ReconnectRTSP(&rtspClient); // Disabled for VRAM stability testing emptyFrames = 0; } streamLock.unlock(); @@ -1222,9 +1222,9 @@ int ANSLPR_MultiGPU_StressTest() { g_log.add(buf); printf("%s\n", buf); } else if (currentGpu != streamPreferredGpu[s]) { - // Decoder is active on wrong GPU — reconnect to move it + // Decoder is active on wrong GPU — reconnect disabled for VRAM stability testing SetRTSPHWDecoding(&rtspClients[s], 7, streamPreferredGpu[s]); - ReconnectRTSP(&rtspClients[s]); + // ReconnectRTSP(&rtspClients[s]); // Disabled for long test char buf[256]; snprintf(buf, sizeof(buf), "[Stream%d] NVDEC GPU realigned: GPU[%d] -> GPU[%d] (reconnected for zero-copy)", @@ -1279,7 +1279,7 @@ int ANSLPR_MultiGPU_StressTest() { // CUDA cleanup (cuArrayDestroy, cuMemFree) while inference is running. // This is the exact scenario that triggers the nvcuda64 SRW lock deadlock. // ========================================================================= - std::atomic chaosEnabled{true}; + std::atomic chaosEnabled{false}; // Disabled for VRAM stability long test std::thread chaosThread([&]() { std::mt19937 rng(std::random_device{}());