Use software decoder by default
This commit is contained in:
@@ -51,7 +51,25 @@
|
|||||||
"Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\logdebug1.txt''\\).Count\")",
|
"Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\logdebug1.txt''\\).Count\")",
|
||||||
"Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION20.log''\\).Count\")",
|
"Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION20.log''\\).Count\")",
|
||||||
"Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION21.log''\\).Count\")",
|
"Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION21.log''\\).Count\")",
|
||||||
"Bash(powershell -Command \"Select-String ''NEW slot'' ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION22.log'' | ForEach-Object { if \\($_-match ''\\(\\\\d+x\\\\d+\\)''\\) { $matches[1] } } | Group-Object | Sort-Object Count -Descending | Format-Table Name, Count\")"
|
"Bash(powershell -Command \"Select-String ''NEW slot'' ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION22.log'' | ForEach-Object { if \\($_-match ''\\(\\\\d+x\\\\d+\\)''\\) { $matches[1] } } | Group-Object | Sort-Object Count -Descending | Format-Table Name, Count\")",
|
||||||
|
"Bash(ls -la /c/Projects/CLionProjects/ANSCORE/modules/ANSODEngine/*.cpp /c/Projects/CLionProjects/ANSCORE/modules/ANSODEngine/*.h)",
|
||||||
|
"Bash(grep -r \"cudaMalloc\\\\|cudaFree\\\\|cudaStreamCreate\\\\|cudaStreamDestroy\" /c/Projects/CLionProjects/ANSCORE/modules/ANSODEngine/*.cpp)",
|
||||||
|
"Bash(grep -n \"cudaStreamCreate\\\\|cudaEventCreate\\\\|cudaEventDestroy\\\\|cudaStreamDestroy\\\\|cudaStreamSynchronize\" /c/Projects/CLionProjects/ANSCORE/engines/TensorRTAPI/include/engine/*.inl)",
|
||||||
|
"Bash(dir \"C:\\\\Projects\\\\CLionProjects\\\\ANSCORE\\\\engines\\\\TensorRTAPI\\\\include\\\\engine\\\\*.h\" /b)",
|
||||||
|
"Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION26.log''\\).Count\")",
|
||||||
|
"Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION27.log''\\).Count\")",
|
||||||
|
"Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION28.log''\\).Count\")",
|
||||||
|
"Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION29.log''\\).Count\")",
|
||||||
|
"Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION30.log''\\).Count\")",
|
||||||
|
"Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\logging4.txt''\\).Count\")",
|
||||||
|
"Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION31.log''\\).Count\")",
|
||||||
|
"Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\loging5.txt''\\).Count\")",
|
||||||
|
"Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION32.log''\\).Count\")",
|
||||||
|
"Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION33.log''\\).Count\")",
|
||||||
|
"Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION34.log''\\).Count\")",
|
||||||
|
"Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION35.log''\\).Count\")",
|
||||||
|
"Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION36.log''\\).Count\")",
|
||||||
|
"Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION37.log''\\).Count\")"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -258,7 +258,15 @@ void CRtspPlayer::stop()
|
|||||||
// Set flags BEFORE stopping decoder so TCP rx thread stops calling decode()
|
// Set flags BEFORE stopping decoder so TCP rx thread stops calling decode()
|
||||||
m_bPlaying = FALSE;
|
m_bPlaying = FALSE;
|
||||||
m_bPaused = FALSE;
|
m_bPaused = FALSE;
|
||||||
CVideoPlayer::StopVideoDecoder(); // Stop the video decoder
|
CVideoPlayer::StopVideoDecoder(); // Stop the video decoder + uninit (free VRAM)
|
||||||
|
|
||||||
|
// Close RTSP connection and shut down RX threads.
|
||||||
|
// Without this, stopped cameras keep TCP/UDP threads running,
|
||||||
|
// sockets open, and receiving network data — wasting CPU and
|
||||||
|
// network resources. With 100 cameras and only 5 running,
|
||||||
|
// 95 idle threads would consume CPU for no purpose.
|
||||||
|
// Start() → Setup() → open() will reconnect when needed.
|
||||||
|
m_rtsp.rtsp_close();
|
||||||
}
|
}
|
||||||
|
|
||||||
BOOL CRtspPlayer::pause()
|
BOOL CRtspPlayer::pause()
|
||||||
|
|||||||
@@ -1275,6 +1275,90 @@ cv::Mat CVideoPlayer::avframeNV12ToCvMat(const AVFrame* frame)
|
|||||||
return cv::Mat();
|
return cv::Mat();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
cv::Mat CVideoPlayer::avframeYUV420PToCvMat(const AVFrame* frame) {
|
||||||
|
try {
|
||||||
|
if (!frame || frame->width <= 0 || frame->height <= 0) {
|
||||||
|
return cv::Mat();
|
||||||
|
}
|
||||||
|
|
||||||
|
const int width = frame->width;
|
||||||
|
const int height = frame->height;
|
||||||
|
|
||||||
|
// YUV420P has 3 separate planes: Y (full res), U (half), V (half).
|
||||||
|
// OpenCV's cvtColor(COLOR_YUV2BGR_I420) expects a single contiguous buffer
|
||||||
|
// with Y on top (H rows) and U,V stacked below (H/2 rows total).
|
||||||
|
// Layout: [Y: W×H] [U: W/2 × H/2] [V: W/2 × H/2]
|
||||||
|
// Total height = H * 3/2, width = W, single channel.
|
||||||
|
|
||||||
|
// If all planes are contiguous with matching strides, wrap directly
|
||||||
|
const int yStride = frame->linesize[0];
|
||||||
|
const int uStride = frame->linesize[1];
|
||||||
|
const int vStride = frame->linesize[2];
|
||||||
|
|
||||||
|
// Fast path: planes are packed contiguously with stride == width
|
||||||
|
if (yStride == width && uStride == width / 2 && vStride == width / 2 &&
|
||||||
|
frame->data[1] == frame->data[0] + width * height &&
|
||||||
|
frame->data[2] == frame->data[1] + (width / 2) * (height / 2)) {
|
||||||
|
// Contiguous I420 — wrap directly, zero copy
|
||||||
|
cv::Mat yuv(height * 3 / 2, width, CV_8UC1, frame->data[0]);
|
||||||
|
cv::Mat bgrImage;
|
||||||
|
cv::cvtColor(yuv, bgrImage, cv::COLOR_YUV2BGR_I420);
|
||||||
|
if (m_nImageQuality == 1) {
|
||||||
|
bgrImage.convertTo(bgrImage, -1, 255.0 / 219.0, -16.0 * 255.0 / 219.0);
|
||||||
|
}
|
||||||
|
return bgrImage;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Slow path: planes have padding (linesize > width) — copy to contiguous buffer
|
||||||
|
const int uvWidth = width / 2;
|
||||||
|
const int uvHeight = height / 2;
|
||||||
|
const int totalSize = width * height + uvWidth * uvHeight * 2;
|
||||||
|
|
||||||
|
cv::Mat yuv(height * 3 / 2, width, CV_8UC1);
|
||||||
|
uint8_t* dst = yuv.data;
|
||||||
|
|
||||||
|
// Copy Y plane (line by line if stride != width)
|
||||||
|
if (yStride == width) {
|
||||||
|
std::memcpy(dst, frame->data[0], width * height);
|
||||||
|
} else {
|
||||||
|
for (int row = 0; row < height; ++row) {
|
||||||
|
std::memcpy(dst + row * width, frame->data[0] + row * yStride, width);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
dst += width * height;
|
||||||
|
|
||||||
|
// Copy U plane
|
||||||
|
if (uStride == uvWidth) {
|
||||||
|
std::memcpy(dst, frame->data[1], uvWidth * uvHeight);
|
||||||
|
} else {
|
||||||
|
for (int row = 0; row < uvHeight; ++row) {
|
||||||
|
std::memcpy(dst + row * uvWidth, frame->data[1] + row * uStride, uvWidth);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
dst += uvWidth * uvHeight;
|
||||||
|
|
||||||
|
// Copy V plane
|
||||||
|
if (vStride == uvWidth) {
|
||||||
|
std::memcpy(dst, frame->data[2], uvWidth * uvHeight);
|
||||||
|
} else {
|
||||||
|
for (int row = 0; row < uvHeight; ++row) {
|
||||||
|
std::memcpy(dst + row * uvWidth, frame->data[2] + row * vStride, uvWidth);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
cv::Mat bgrImage;
|
||||||
|
cv::cvtColor(yuv, bgrImage, cv::COLOR_YUV2BGR_I420);
|
||||||
|
if (m_nImageQuality == 1) {
|
||||||
|
bgrImage.convertTo(bgrImage, -1, 255.0 / 219.0, -16.0 * 255.0 / 219.0);
|
||||||
|
}
|
||||||
|
return bgrImage;
|
||||||
|
}
|
||||||
|
catch (const std::exception& e) {
|
||||||
|
std::cerr << "Exception in avframeYUV420PToCvMat: " << e.what() << std::endl;
|
||||||
|
return cv::Mat();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
cv::Mat CVideoPlayer::avframeToCVMat(const AVFrame* pFrame) {
|
cv::Mat CVideoPlayer::avframeToCVMat(const AVFrame* pFrame) {
|
||||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||||
try {
|
try {
|
||||||
@@ -1287,8 +1371,9 @@ cv::Mat CVideoPlayer::avframeToCVMat(const AVFrame* pFrame) {
|
|||||||
switch (pFrame->format) {
|
switch (pFrame->format) {
|
||||||
case AV_PIX_FMT_NV12:
|
case AV_PIX_FMT_NV12:
|
||||||
return avframeNV12ToCvMat(pFrame);
|
return avframeNV12ToCvMat(pFrame);
|
||||||
|
case AV_PIX_FMT_YUV420P:
|
||||||
case AV_PIX_FMT_YUVJ420P:
|
case AV_PIX_FMT_YUVJ420P:
|
||||||
return avframeAnyToCvmat(pFrame);
|
return avframeYUV420PToCvMat(pFrame);
|
||||||
default:
|
default:
|
||||||
return avframeAnyToCvmat(pFrame);
|
return avframeAnyToCvmat(pFrame);
|
||||||
|
|
||||||
@@ -1305,7 +1390,7 @@ CVideoPlayer::CVideoPlayer() :
|
|||||||
, m_bAudioInited(FALSE)
|
, m_bAudioInited(FALSE)
|
||||||
, m_bPlaying(FALSE)
|
, m_bPlaying(FALSE)
|
||||||
, m_bPaused(FALSE)
|
, m_bPaused(FALSE)
|
||||||
, m_nHWDecoding(HW_DECODING_AUTO)//(HW_DECODING_AUTO)// HW_DECODING_D3D11 //HW_DECODING_DISABLE
|
, m_nHWDecoding(HW_DECODING_DISABLE)// Software decode by default — saves VRAM (no NVDEC DPB surfaces)
|
||||||
, m_bUpdown(FALSE)
|
, m_bUpdown(FALSE)
|
||||||
, m_bSnapshot(FALSE)
|
, m_bSnapshot(FALSE)
|
||||||
, m_nSnapVideoFmt(AV_PIX_FMT_YUVJ420P)
|
, m_nSnapVideoFmt(AV_PIX_FMT_YUVJ420P)
|
||||||
@@ -1740,6 +1825,13 @@ void CVideoPlayer::StopVideoDecoder() {
|
|||||||
// Flush decoder to drain and discard any buffered frames,
|
// Flush decoder to drain and discard any buffered frames,
|
||||||
// so stale reference frames don't corrupt the next session
|
// so stale reference frames don't corrupt the next session
|
||||||
decoder->flush();
|
decoder->flush();
|
||||||
|
// Free NVDEC decoder context and all GPU surfaces (DPB buffers).
|
||||||
|
// Stopped cameras should not hold VRAM — with 100 cameras created
|
||||||
|
// but only 5 running, the 95 idle decoders would consume ~5-10 GB.
|
||||||
|
// The decoder will be re-initialized automatically when the next
|
||||||
|
// video packet arrives after Start() is called.
|
||||||
|
decoder->uninit();
|
||||||
|
m_bVideoInited = FALSE;
|
||||||
}
|
}
|
||||||
// Clear queue but KEEP m_currentImage and m_lastJpegImage —
|
// Clear queue but KEEP m_currentImage and m_lastJpegImage —
|
||||||
// getImage()/getJpegImage() will return the last good frame while decoder stabilizes
|
// getImage()/getJpegImage() will return the last good frame while decoder stabilizes
|
||||||
@@ -1842,6 +1934,13 @@ void CVideoPlayer::setTargetFPS(double intervalMs)
|
|||||||
m_targetIntervalMs = intervalMs;
|
m_targetIntervalMs = intervalMs;
|
||||||
m_targetFPSInitialized = false; // reset timing on change
|
m_targetFPSInitialized = false; // reset timing on change
|
||||||
}
|
}
|
||||||
|
double CVideoPlayer::getLastFrameAgeMs()
|
||||||
|
{
|
||||||
|
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||||
|
if (!m_lastDecoderFrameTimeSet) return 0.0;
|
||||||
|
auto now = std::chrono::steady_clock::now();
|
||||||
|
return std::chrono::duration<double, std::milli>(now - m_lastDecoderFrameTime).count();
|
||||||
|
}
|
||||||
void CVideoPlayer::playVideo(uint8* data, int len, uint32 ts, uint16 seq)
|
void CVideoPlayer::playVideo(uint8* data, int len, uint32 ts, uint16 seq)
|
||||||
{
|
{
|
||||||
if (m_bRecording)
|
if (m_bRecording)
|
||||||
@@ -2061,6 +2160,11 @@ void CVideoPlayer::onVideoFrame(AVFrame* frame)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Record wall-clock time of every decoded frame (even rate-limited ones).
|
||||||
|
// Used by getLastFrameAgeMs() to detect truly stale cameras.
|
||||||
|
m_lastDecoderFrameTime = std::chrono::steady_clock::now();
|
||||||
|
m_lastDecoderFrameTimeSet = true;
|
||||||
|
|
||||||
// --- Frame rate limiting ---
|
// --- Frame rate limiting ---
|
||||||
// Skip post-decode processing (clone, queue push, CUDA clone) if not enough
|
// Skip post-decode processing (clone, queue push, CUDA clone) if not enough
|
||||||
// time has elapsed since the last processed frame. The decode itself still
|
// time has elapsed since the last processed frame. The decode itself still
|
||||||
|
|||||||
@@ -148,6 +148,7 @@ public:
|
|||||||
// Image quality mode: 0=fast (OpenCV BT.601, ~2ms), 1=quality (sws BT.709+range, ~12ms)
|
// Image quality mode: 0=fast (OpenCV BT.601, ~2ms), 1=quality (sws BT.709+range, ~12ms)
|
||||||
virtual void setImageQuality(int mode) { m_nImageQuality = mode; }
|
virtual void setImageQuality(int mode) { m_nImageQuality = mode; }
|
||||||
void setTargetFPS(double intervalMs); // Set minimum interval between processed frames in ms (0 = no limit, 100 = ~10 FPS)
|
void setTargetFPS(double intervalMs); // Set minimum interval between processed frames in ms (0 = no limit, 100 = ~10 FPS)
|
||||||
|
double getLastFrameAgeMs(); // Milliseconds since last frame arrived from decoder (0 if no frame yet)
|
||||||
virtual void setRtpMulticast(BOOL flag) {}
|
virtual void setRtpMulticast(BOOL flag) {}
|
||||||
virtual void setRtpOverUdp(BOOL flag) {}
|
virtual void setRtpOverUdp(BOOL flag) {}
|
||||||
|
|
||||||
@@ -223,6 +224,7 @@ protected:
|
|||||||
|
|
||||||
cv::Mat avframeAnyToCvmat(const AVFrame* frame);
|
cv::Mat avframeAnyToCvmat(const AVFrame* frame);
|
||||||
cv::Mat avframeNV12ToCvMat(const AVFrame* frame);
|
cv::Mat avframeNV12ToCvMat(const AVFrame* frame);
|
||||||
|
cv::Mat avframeYUV420PToCvMat(const AVFrame* frame); // YUV420P/YUVJ420P → BGR (OpenCV, no sws_scale)
|
||||||
cv::Mat avframeYUVJ420PToCvmat(const AVFrame* frame);
|
cv::Mat avframeYUVJ420PToCvmat(const AVFrame* frame);
|
||||||
cv::Mat avframeToCVMat(const AVFrame* frame);
|
cv::Mat avframeToCVMat(const AVFrame* frame);
|
||||||
|
|
||||||
@@ -273,6 +275,12 @@ protected:
|
|||||||
std::chrono::steady_clock::time_point m_lastProcessedTime; // timestamp of last processed frame
|
std::chrono::steady_clock::time_point m_lastProcessedTime; // timestamp of last processed frame
|
||||||
bool m_targetFPSInitialized = false; // first-frame flag
|
bool m_targetFPSInitialized = false; // first-frame flag
|
||||||
|
|
||||||
|
// Wall-clock timestamp of last frame received from the decoder (NOT from getImage).
|
||||||
|
// Updated in onVideoFrame() for EVERY decoded frame, even rate-limited ones.
|
||||||
|
// Used by LabVIEW to detect truly stale cameras vs rate-limited ones.
|
||||||
|
std::chrono::steady_clock::time_point m_lastDecoderFrameTime;
|
||||||
|
bool m_lastDecoderFrameTimeSet = false;
|
||||||
|
|
||||||
BOOL m_bPlaying;
|
BOOL m_bPlaying;
|
||||||
BOOL m_bPaused;
|
BOOL m_bPaused;
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,33 @@
|
|||||||
#ifndef ANSLICENSE_H
|
#ifndef ANSLICENSE_H
|
||||||
#define ANSLICENSE_H
|
#define ANSLICENSE_H
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// Global debug toggle for DebugView (DbgView) logging.
|
||||||
|
// Define ANSCORE_DEBUGVIEW=1 to enable verbose OutputDebugStringA logging
|
||||||
|
// across all ANSCORE modules (ANSCV, ANSODEngine, TensorRT engine, etc.).
|
||||||
|
// Set to 0 for production builds to eliminate all debug output overhead.
|
||||||
|
// ============================================================================
|
||||||
|
#ifndef ANSCORE_DEBUGVIEW
|
||||||
|
#define ANSCORE_DEBUGVIEW 1 // 1 = enabled (debug), 0 = disabled (production)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// ANS_DBG: Debug logging macro for DebugView (OutputDebugStringA on Windows).
|
||||||
|
// Usage: ANS_DBG("MyModule", "value=%d ptr=%p", val, ptr);
|
||||||
|
// Output: [MyModule] value=42 ptr=0x1234
|
||||||
|
// When ANSCORE_DEBUGVIEW=0, compiles to nothing (zero overhead).
|
||||||
|
// NOTE: We avoid #include <windows.h> here to prevent winsock.h/winsock2.h
|
||||||
|
// conflicts. Instead, forward-declare OutputDebugStringA directly.
|
||||||
|
#if ANSCORE_DEBUGVIEW && defined(_WIN32)
|
||||||
|
extern "C" __declspec(dllimport) void __stdcall OutputDebugStringA(const char* lpOutputString);
|
||||||
|
#define ANS_DBG(tag, fmt, ...) do { \
|
||||||
|
char _ans_dbg_buf[1024]; \
|
||||||
|
snprintf(_ans_dbg_buf, sizeof(_ans_dbg_buf), "[" tag "] " fmt "\n", ##__VA_ARGS__); \
|
||||||
|
OutputDebugStringA(_ans_dbg_buf); \
|
||||||
|
} while(0)
|
||||||
|
#else
|
||||||
|
#define ANS_DBG(tag, fmt, ...) ((void)0)
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef ANSLICENSE_EXPORTS
|
#ifdef ANSLICENSE_EXPORTS
|
||||||
#define ANSLICENSE_API __declspec(dllexport)
|
#define ANSLICENSE_API __declspec(dllexport)
|
||||||
#else
|
#else
|
||||||
|
|||||||
@@ -623,6 +623,65 @@ bool Engine<T>::buildLoadNetwork(std::string onnxModelPath, const std::array<flo
|
|||||||
template <typename T>
|
template <typename T>
|
||||||
bool Engine<T>::loadNetwork(std::string trtModelPath, const std::array<float, 3>& subVals, const std::array<float, 3>& divVals, bool normalize)
|
bool Engine<T>::loadNetwork(std::string trtModelPath, const std::array<float, 3>& subVals, const std::array<float, 3>& divVals, bool normalize)
|
||||||
{
|
{
|
||||||
|
// Install a custom OpenCV CUDA allocator that uses cudaMallocAsync/cudaFreeAsync
|
||||||
|
// instead of the default cudaMalloc/cudaFree. The stream-ordered allocator
|
||||||
|
// respects the cudaMemPool release threshold (set to 0), so freed memory is
|
||||||
|
// returned to the GPU immediately instead of being cached forever.
|
||||||
|
//
|
||||||
|
// The default cudaMalloc/cudaFree allocator caches all freed blocks permanently
|
||||||
|
// (no API to force release), causing VRAM to grow monotonically when GpuMat
|
||||||
|
// objects of varying sizes are allocated and freed repeatedly (different batch
|
||||||
|
// sizes, different image resolutions across cameras).
|
||||||
|
{
|
||||||
|
static std::once_flag s_allocatorFlag;
|
||||||
|
std::call_once(s_allocatorFlag, []() {
|
||||||
|
// Set release threshold to 0 on all GPUs
|
||||||
|
int deviceCount = 0;
|
||||||
|
cudaGetDeviceCount(&deviceCount);
|
||||||
|
for (int d = 0; d < deviceCount; ++d) {
|
||||||
|
cudaMemPool_t pool = nullptr;
|
||||||
|
if (cudaDeviceGetDefaultMemPool(&pool, d) == cudaSuccess && pool) {
|
||||||
|
uint64_t threshold = 0;
|
||||||
|
cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Custom allocator: uses cudaMallocAsync on stream 0 (behaves like
|
||||||
|
// synchronous cudaMalloc but goes through the stream-ordered pool).
|
||||||
|
struct AsyncAllocator : cv::cuda::GpuMat::Allocator {
|
||||||
|
bool allocate(cv::cuda::GpuMat* mat, int rows, int cols, size_t elemSize) override {
|
||||||
|
// Same logic as OpenCV's default allocator, but using cudaMallocAsync
|
||||||
|
size_t step = elemSize * cols;
|
||||||
|
// Align step to 256 bytes (same as default allocator)
|
||||||
|
step = (step + 255) & ~size_t(255);
|
||||||
|
void* ptr = nullptr;
|
||||||
|
cudaError_t err = cudaMallocAsync(&ptr, step * rows, nullptr); // stream 0
|
||||||
|
if (err != cudaSuccess || !ptr) {
|
||||||
|
// Fallback to regular cudaMalloc if async not supported
|
||||||
|
err = cudaMalloc(&ptr, step * rows);
|
||||||
|
if (err != cudaSuccess) return false;
|
||||||
|
}
|
||||||
|
mat->data = static_cast<uchar*>(ptr);
|
||||||
|
mat->step = step;
|
||||||
|
mat->refcount = static_cast<int*>(cv::fastMalloc(sizeof(int)));
|
||||||
|
*mat->refcount = 1;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
void free(cv::cuda::GpuMat* mat) override {
|
||||||
|
cudaFreeAsync(mat->data, nullptr); // stream 0 — goes through pool with threshold=0
|
||||||
|
cv::fastFree(mat->refcount);
|
||||||
|
mat->data = nullptr;
|
||||||
|
mat->datastart = nullptr;
|
||||||
|
mat->dataend = nullptr;
|
||||||
|
mat->refcount = nullptr;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
static AsyncAllocator s_allocator;
|
||||||
|
cv::cuda::GpuMat::setDefaultAllocator(&s_allocator);
|
||||||
|
ANS_DBG("TRT_Load", "Custom CUDA async allocator installed — VRAM freed immediately on GpuMat release");
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
m_lastLoadFailedVRAM = false; // reset on each load attempt
|
m_lastLoadFailedVRAM = false; // reset on each load attempt
|
||||||
m_subVals = subVals;
|
m_subVals = subVals;
|
||||||
m_divVals = divVals;
|
m_divVals = divVals;
|
||||||
@@ -958,11 +1017,13 @@ trt_cache_create_context:
|
|||||||
|
|
||||||
m_context = std::unique_ptr<nvinfer1::IExecutionContext>(m_engine->createExecutionContext());
|
m_context = std::unique_ptr<nvinfer1::IExecutionContext>(m_engine->createExecutionContext());
|
||||||
if (!m_context) {
|
if (!m_context) {
|
||||||
|
ANS_DBG("TRT_Load", "ERROR: createExecutionContext returned null");
|
||||||
logEngineEvent("[Engine] loadNetwork FAIL: createExecutionContext returned null for "
|
logEngineEvent("[Engine] loadNetwork FAIL: createExecutionContext returned null for "
|
||||||
+ trtModelPath, true);
|
+ trtModelPath, true);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ANS_DBG("TRT_Load", "Execution context created OK for %s", trtModelPath.c_str());
|
||||||
if (m_verbose) std::cout << "Info: Execution context created successfully" << std::endl;
|
if (m_verbose) std::cout << "Info: Execution context created successfully" << std::endl;
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
@@ -1135,6 +1196,15 @@ trt_cache_create_context:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
size_t vramFree = 0, vramTotal = 0;
|
||||||
|
cudaMemGetInfo(&vramFree, &vramTotal);
|
||||||
|
ANS_DBG("TRT_Load", "Buffers allocated: %zuMB, VRAM: %zuMB used / %zuMB free / %zuMB total",
|
||||||
|
totalAllocated / (1024*1024),
|
||||||
|
(vramTotal - vramFree) / (1024*1024),
|
||||||
|
vramFree / (1024*1024),
|
||||||
|
vramTotal / (1024*1024));
|
||||||
|
}
|
||||||
if (m_verbose) std::cout << "\nInfo: Total GPU memory allocated: " << totalAllocated / (1024 * 1024) << " MiB" << std::endl;
|
if (m_verbose) std::cout << "\nInfo: Total GPU memory allocated: " << totalAllocated / (1024 * 1024) << " MiB" << std::endl;
|
||||||
|
|
||||||
// -- Pinned output buffers (CUDA graph prerequisite) -----------------------
|
// -- Pinned output buffers (CUDA graph prerequisite) -----------------------
|
||||||
|
|||||||
@@ -607,6 +607,7 @@ bool Engine<T>::runInferenceFromPool(
|
|||||||
// harmless — the second one finds a fresh slot immediately.
|
// harmless — the second one finds a fresh slot immediately.
|
||||||
InferenceSlot* slot = nullptr;
|
InferenceSlot* slot = nullptr;
|
||||||
bool kickedGrowth = false;
|
bool kickedGrowth = false;
|
||||||
|
auto _poolAcquireStart = std::chrono::steady_clock::now();
|
||||||
|
|
||||||
{
|
{
|
||||||
std::unique_lock<std::mutex> lock(m_slotMutex);
|
std::unique_lock<std::mutex> lock(m_slotMutex);
|
||||||
@@ -630,6 +631,8 @@ bool Engine<T>::runInferenceFromPool(
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (!slot) {
|
if (!slot) {
|
||||||
|
ANS_DBG("TRT_Pool", "ALL SLOTS BUSY: %zu slots, active=%d — waiting for free slot",
|
||||||
|
n, m_activeCount.load());
|
||||||
// All slots busy. In elastic mode, proactively grow the
|
// All slots busy. In elastic mode, proactively grow the
|
||||||
// pool in the background so the next request has a slot
|
// pool in the background so the next request has a slot
|
||||||
// on a different GPU. We only kick once per wait cycle.
|
// on a different GPU. We only kick once per wait cycle.
|
||||||
@@ -672,7 +675,17 @@ bool Engine<T>::runInferenceFromPool(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// -- 3. Still no slot => reject ---------------------------------------
|
// -- 3. Still no slot => reject ---------------------------------------
|
||||||
|
{
|
||||||
|
double _acquireMs = std::chrono::duration<double, std::milli>(
|
||||||
|
std::chrono::steady_clock::now() - _poolAcquireStart).count();
|
||||||
|
if (_acquireMs > 100.0) {
|
||||||
|
ANS_DBG("TRT_Pool", "SLOW slot acquire: %.1fms slot=%p gpu=%d active=%d/%zu",
|
||||||
|
_acquireMs, (void*)slot, slot ? slot->deviceIndex : -1,
|
||||||
|
m_activeCount.load(), m_slots.size());
|
||||||
|
}
|
||||||
|
}
|
||||||
if (!slot) {
|
if (!slot) {
|
||||||
|
ANS_DBG("TRT_Pool", "ERROR: No slot available — all %zu slots busy, timeout", m_slots.size());
|
||||||
std::string errMsg = "[Engine] runInferenceFromPool FAIL: Capacity reached -- all "
|
std::string errMsg = "[Engine] runInferenceFromPool FAIL: Capacity reached -- all "
|
||||||
+ std::to_string(m_activeCount.load()) + "/" + std::to_string(m_totalCapacity)
|
+ std::to_string(m_activeCount.load()) + "/" + std::to_string(m_totalCapacity)
|
||||||
+ " slot(s) busy"
|
+ " slot(s) busy"
|
||||||
@@ -699,12 +712,23 @@ bool Engine<T>::runInferenceFromPool(
|
|||||||
if (currentDev != slot->deviceIndex) {
|
if (currentDev != slot->deviceIndex) {
|
||||||
cudaSetDevice(slot->deviceIndex);
|
cudaSetDevice(slot->deviceIndex);
|
||||||
}
|
}
|
||||||
|
ANS_DBG("TRT_Pool", "Slot dispatch: gpu=%d active=%d/%zu",
|
||||||
|
slot->deviceIndex, m_activeCount.load(), m_slots.size());
|
||||||
|
auto _slotStart = std::chrono::steady_clock::now();
|
||||||
result = slot->engine->runInference(inputs, featureVectors);
|
result = slot->engine->runInference(inputs, featureVectors);
|
||||||
|
auto _slotEnd = std::chrono::steady_clock::now();
|
||||||
|
double _slotMs = std::chrono::duration<double, std::milli>(_slotEnd - _slotStart).count();
|
||||||
|
if (_slotMs > 500.0) {
|
||||||
|
ANS_DBG("TRT_Pool", "SLOW slot inference: %.1fms gpu=%d active=%d/%zu",
|
||||||
|
_slotMs, slot->deviceIndex, m_activeCount.load(), m_slots.size());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
catch (const std::exception& ex) {
|
catch (const std::exception& ex) {
|
||||||
|
ANS_DBG("TRT_Pool", "ERROR: runInference threw: %s", ex.what());
|
||||||
std::cout << "Error [Pool]: runInference threw: " << ex.what() << std::endl;
|
std::cout << "Error [Pool]: runInference threw: " << ex.what() << std::endl;
|
||||||
}
|
}
|
||||||
catch (...) {
|
catch (...) {
|
||||||
|
ANS_DBG("TRT_Pool", "ERROR: runInference threw unknown exception");
|
||||||
std::cout << "Error [Pool]: runInference threw unknown exception" << std::endl;
|
std::cout << "Error [Pool]: runInference threw unknown exception" << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,8 +1,10 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
|
#include <chrono>
|
||||||
#include <filesystem>
|
#include <filesystem>
|
||||||
#include <semaphore>
|
#include <semaphore>
|
||||||
#include "TRTCompat.h"
|
#include "TRTCompat.h"
|
||||||
|
#include "ANSLicense.h" // ANS_DBG macro for DebugView logging
|
||||||
|
|
||||||
// Per-device mutex for CUDA graph capture.
|
// Per-device mutex for CUDA graph capture.
|
||||||
// TRT's enqueueV3 uses shared internal resources (workspace, memory pools)
|
// TRT's enqueueV3 uses shared internal resources (workspace, memory pools)
|
||||||
@@ -86,11 +88,9 @@ static inline cudaError_t cudaStreamSynchronize_Safe(cudaStream_t stream) {
|
|||||||
cudaError_t err = cudaStreamQuery(stream);
|
cudaError_t err = cudaStreamQuery(stream);
|
||||||
if (err != cudaErrorNotReady) return err;
|
if (err != cudaErrorNotReady) return err;
|
||||||
|
|
||||||
|
auto syncStart = std::chrono::steady_clock::now();
|
||||||
|
|
||||||
// Short Sleep(0) fast path (~10 iterations) catches sub-ms kernel completions.
|
// Short Sleep(0) fast path (~10 iterations) catches sub-ms kernel completions.
|
||||||
// Then switch to Sleep(1) to give cleanup operations (cuArrayDestroy, cuMemFree)
|
|
||||||
// a window to acquire the exclusive nvcuda64 SRW lock.
|
|
||||||
// Previously used 1000 Sleep(0) iterations which hogged the SRW lock and
|
|
||||||
// caused ~20-second stalls when concurrent cleanup needed exclusive access.
|
|
||||||
for (int i = 0; i < 10; ++i) {
|
for (int i = 0; i < 10; ++i) {
|
||||||
Sleep(0);
|
Sleep(0);
|
||||||
err = cudaStreamQuery(stream);
|
err = cudaStreamQuery(stream);
|
||||||
@@ -98,10 +98,21 @@ static inline cudaError_t cudaStreamSynchronize_Safe(cudaStream_t stream) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// 1ms sleeps — adds negligible latency at 30 FPS but prevents SRW lock starvation.
|
// 1ms sleeps — adds negligible latency at 30 FPS but prevents SRW lock starvation.
|
||||||
|
int sleepCount = 0;
|
||||||
while (true) {
|
while (true) {
|
||||||
Sleep(1);
|
Sleep(1);
|
||||||
|
sleepCount++;
|
||||||
err = cudaStreamQuery(stream);
|
err = cudaStreamQuery(stream);
|
||||||
if (err != cudaErrorNotReady) return err;
|
if (err != cudaErrorNotReady) {
|
||||||
|
// Log if sync took too long (>500ms indicates GPU stall)
|
||||||
|
auto elapsed = std::chrono::duration<double, std::milli>(
|
||||||
|
std::chrono::steady_clock::now() - syncStart).count();
|
||||||
|
if (elapsed > 500.0) {
|
||||||
|
ANS_DBG("TRT_Engine", "SLOW SYNC: %.1fms (%d sleeps) stream=%p err=%d",
|
||||||
|
elapsed, sleepCount, (void*)stream, (int)err);
|
||||||
|
}
|
||||||
|
return err;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -368,6 +379,71 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// SM=100% DETECTOR — tracks inference timing trends to catch the exact
|
||||||
|
// moment GPU becomes saturated. Logs every 50 inferences with rolling
|
||||||
|
// average, and immediately when degradation is detected.
|
||||||
|
// ============================================================================
|
||||||
|
// Global (process-wide) counters shared across all engine instances/threads
|
||||||
|
static std::atomic<int64_t> s_globalInfCount{0};
|
||||||
|
static std::atomic<int> s_globalActiveInf{0}; // currently in-flight inferences
|
||||||
|
static std::atomic<double> s_globalLastAvgMs{0.0}; // last known avg inference time
|
||||||
|
|
||||||
|
const int64_t myInfNum = s_globalInfCount.fetch_add(1) + 1;
|
||||||
|
s_globalActiveInf.fetch_add(1);
|
||||||
|
|
||||||
|
// Per-thread tracking
|
||||||
|
{
|
||||||
|
static thread_local int64_t s_infCount = 0;
|
||||||
|
static thread_local std::chrono::steady_clock::time_point s_lastLog;
|
||||||
|
static thread_local double s_rollingAvgMs = 0.0;
|
||||||
|
static thread_local double s_baselineMs = 0.0; // avg during first 100 inferences
|
||||||
|
static thread_local double s_maxMs = 0.0;
|
||||||
|
static thread_local bool s_degradationLogged = false;
|
||||||
|
s_infCount++;
|
||||||
|
|
||||||
|
if (s_infCount == 1) {
|
||||||
|
s_lastLog = std::chrono::steady_clock::now();
|
||||||
|
ANS_DBG("TRT_SM100", "FIRST inference — engine alive, globalInf=%lld", myInfNum);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Log every 50 inferences (more frequent than 500 to catch transitions)
|
||||||
|
if (s_infCount % 50 == 0) {
|
||||||
|
auto now = std::chrono::steady_clock::now();
|
||||||
|
double elapsed = std::chrono::duration<double>(now - s_lastLog).count();
|
||||||
|
double fps = (elapsed > 0) ? (50.0 / elapsed) : 0;
|
||||||
|
s_lastLog = now;
|
||||||
|
|
||||||
|
size_t vramFree = 0, vramTotal = 0;
|
||||||
|
cudaMemGetInfo(&vramFree, &vramTotal);
|
||||||
|
size_t vramUsedMB = (vramTotal - vramFree) / (1024 * 1024);
|
||||||
|
size_t vramFreeMB = vramFree / (1024 * 1024);
|
||||||
|
|
||||||
|
ANS_DBG("TRT_SM100", "#%lld [global=%lld active=%d] %.1f inf/sec avgMs=%.1f maxMs=%.1f batch=%d graphs=%zu VRAM=%zuMB/%zuMB",
|
||||||
|
s_infCount, myInfNum, s_globalActiveInf.load(),
|
||||||
|
fps, s_rollingAvgMs, s_maxMs,
|
||||||
|
(int)inputs[0].size(), m_graphExecs.size(),
|
||||||
|
vramUsedMB, vramFreeMB);
|
||||||
|
|
||||||
|
// Capture baseline from first 100 inferences
|
||||||
|
if (s_infCount == 100) {
|
||||||
|
s_baselineMs = s_rollingAvgMs;
|
||||||
|
ANS_DBG("TRT_SM100", "BASELINE established: %.1fms/inference", s_baselineMs);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Detect degradation: avg >3x baseline AND baseline is set
|
||||||
|
if (s_baselineMs > 0 && s_rollingAvgMs > s_baselineMs * 3.0 && !s_degradationLogged) {
|
||||||
|
s_degradationLogged = true;
|
||||||
|
ANS_DBG("TRT_SM100", "*** DEGRADATION DETECTED *** avg=%.1fms baseline=%.1fms (%.1fx) VRAM=%zuMB/%zuMB active=%d",
|
||||||
|
s_rollingAvgMs, s_baselineMs, s_rollingAvgMs / s_baselineMs,
|
||||||
|
vramUsedMB, vramFreeMB, s_globalActiveInf.load());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reset max for next window
|
||||||
|
s_maxMs = 0.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const auto numInputs = m_inputDims.size();
|
const auto numInputs = m_inputDims.size();
|
||||||
if (inputs.size() != numInputs) {
|
if (inputs.size() != numInputs) {
|
||||||
std::cout << "Error: Wrong number of inputs. Expected: " << numInputs
|
std::cout << "Error: Wrong number of inputs. Expected: " << numInputs
|
||||||
@@ -457,6 +533,9 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (anyDimChanged) {
|
if (anyDimChanged) {
|
||||||
|
ANS_DBG("TRT_Engine", "Shape change detected: batch %d -> %d (graphsCached=%zu)",
|
||||||
|
m_lastBatchSize, batchSize, m_graphExecs.size());
|
||||||
|
|
||||||
// === First-time diagnostics (verbose, once) ===
|
// === First-time diagnostics (verbose, once) ===
|
||||||
const bool firstTime = !m_batchShapeChangeLogged;
|
const bool firstTime = !m_batchShapeChangeLogged;
|
||||||
|
|
||||||
@@ -536,7 +615,10 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
|
|||||||
<< newDims.d[3] << "]" << std::endl;
|
<< newDims.d[3] << "]" << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ANS_DBG("TRT_Engine", "setInputShape('%s') [%d,%d,%d,%d]",
|
||||||
|
tensorName, newDims.d[0], newDims.d[1], newDims.d[2], newDims.d[3]);
|
||||||
if (!m_context->setInputShape(tensorName, newDims)) {
|
if (!m_context->setInputShape(tensorName, newDims)) {
|
||||||
|
ANS_DBG("TRT_Engine", "ERROR: setInputShape FAILED for '%s'", tensorName);
|
||||||
std::cout << "Error: Failed to set input shape for '" << tensorName << "'" << std::endl;
|
std::cout << "Error: Failed to set input shape for '" << tensorName << "'" << std::endl;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@@ -576,6 +658,25 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
|
|||||||
|
|
||||||
m_lastBatchSize = batchSize;
|
m_lastBatchSize = batchSize;
|
||||||
m_batchShapeChangeLogged = true;
|
m_batchShapeChangeLogged = true;
|
||||||
|
|
||||||
|
// CRITICAL: Invalidate all cached CUDA graphs after shape change.
|
||||||
|
// Graphs were captured with the OLD context state (old tensor shapes).
|
||||||
|
// Launching them after setInputShape() produces undefined GPU behavior
|
||||||
|
// (invalid kernel sequences, SM lockup at 100%, inference hang).
|
||||||
|
if (!m_graphExecs.empty()) {
|
||||||
|
size_t destroyed = m_graphExecs.size();
|
||||||
|
for (auto& [bs, ge] : m_graphExecs) {
|
||||||
|
if (ge) cudaGraphExecDestroy(ge);
|
||||||
|
}
|
||||||
|
m_graphExecs.clear();
|
||||||
|
ANS_DBG("TRT_Engine", "INVALIDATED %zu cached CUDA graphs after shape change (batch=%d)",
|
||||||
|
destroyed, batchSize);
|
||||||
|
if (m_verbose || firstTime) {
|
||||||
|
std::cout << "Info: Invalidated " << destroyed
|
||||||
|
<< " cached CUDA graphs after shape change" << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (m_verbose && firstTime) {
|
if (m_verbose && firstTime) {
|
||||||
std::cout << "\nInfo: Input shapes updated successfully for batch size "
|
std::cout << "\nInfo: Input shapes updated successfully for batch size "
|
||||||
<< batchSize << " ✓\n" << std::endl;
|
<< batchSize << " ✓\n" << std::endl;
|
||||||
@@ -619,6 +720,7 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
|
|||||||
//
|
//
|
||||||
// GpuMat-lifetime: preprocessedBuffers keeps GpuMats alive past the final
|
// GpuMat-lifetime: preprocessedBuffers keeps GpuMats alive past the final
|
||||||
// cudaStreamSynchronize, so cudaFree() doesn't stall the pipeline.
|
// cudaStreamSynchronize, so cudaFree() doesn't stall the pipeline.
|
||||||
|
auto _prepStart = std::chrono::steady_clock::now();
|
||||||
cv::cuda::Stream cvInferStream = cv::cuda::StreamAccessor::wrapStream(m_inferenceStream);
|
cv::cuda::Stream cvInferStream = cv::cuda::StreamAccessor::wrapStream(m_inferenceStream);
|
||||||
std::vector<cv::cuda::GpuMat> preprocessedBuffers;
|
std::vector<cv::cuda::GpuMat> preprocessedBuffers;
|
||||||
preprocessedBuffers.reserve(numInputs);
|
preprocessedBuffers.reserve(numInputs);
|
||||||
@@ -647,6 +749,14 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
double _prepMs = std::chrono::duration<double, std::milli>(
|
||||||
|
std::chrono::steady_clock::now() - _prepStart).count();
|
||||||
|
if (_prepMs > 100.0) {
|
||||||
|
ANS_DBG("TRT_SM100", "SLOW PREPROCESS: %.1fms batch=%d (blobFromGpuMats+D2D)", _prepMs, batchSize);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
// PRE-ALLOCATE OUTPUT STRUCTURE
|
// PRE-ALLOCATE OUTPUT STRUCTURE
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
@@ -690,6 +800,8 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
|
|||||||
if (canGraph) {
|
if (canGraph) {
|
||||||
auto& graphExec = m_graphExecs[batchSize]; // inserts nullptr on first access
|
auto& graphExec = m_graphExecs[batchSize]; // inserts nullptr on first access
|
||||||
if (!graphExec) {
|
if (!graphExec) {
|
||||||
|
ANS_DBG("TRT_Engine", "CUDA graph CAPTURE starting for batch=%d (cached=%zu)",
|
||||||
|
batchSize, m_graphExecs.size());
|
||||||
// First call for this batchSize -- capture a new graph.
|
// First call for this batchSize -- capture a new graph.
|
||||||
// Serialise captures across all Engine instances on this device to
|
// Serialise captures across all Engine instances on this device to
|
||||||
// prevent TRT's shared workspace from creating cross-stream
|
// prevent TRT's shared workspace from creating cross-stream
|
||||||
@@ -727,9 +839,13 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
|
|||||||
if (cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0) == cudaSuccess)
|
if (cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0) == cudaSuccess)
|
||||||
graphExec = exec;
|
graphExec = exec;
|
||||||
cudaGraphDestroy(graph);
|
cudaGraphDestroy(graph);
|
||||||
|
ANS_DBG("TRT_Engine", "CUDA graph CAPTURED OK for batch=%d exec=%p",
|
||||||
|
batchSize, (void*)graphExec);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!graphExec) {
|
if (!graphExec) {
|
||||||
|
ANS_DBG("TRT_Engine", "CUDA graph capture FAILED for batch=%d — falling back to direct path",
|
||||||
|
batchSize);
|
||||||
std::cout << "Warning: CUDA graph capture failed for batchSize="
|
std::cout << "Warning: CUDA graph capture failed for batchSize="
|
||||||
<< batchSize << " -- falling back to direct inference path." << std::endl;
|
<< batchSize << " -- falling back to direct inference path." << std::endl;
|
||||||
// Disable graph acceleration for this Engine instance.
|
// Disable graph acceleration for this Engine instance.
|
||||||
@@ -740,9 +856,17 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (graphExec) {
|
if (graphExec) {
|
||||||
|
ANS_DBG("TRT_Engine", "CUDA graph LAUNCH batch=%d exec=%p", batchSize, (void*)graphExec);
|
||||||
// Launch the pre-captured graph (single API call replaces many).
|
// Launch the pre-captured graph (single API call replaces many).
|
||||||
|
auto _graphStart = std::chrono::steady_clock::now();
|
||||||
cudaGraphLaunch(graphExec, m_inferenceStream);
|
cudaGraphLaunch(graphExec, m_inferenceStream);
|
||||||
cudaStreamSynchronize_Safe(m_inferenceStream);
|
cudaStreamSynchronize_Safe(m_inferenceStream);
|
||||||
|
auto _graphEnd = std::chrono::steady_clock::now();
|
||||||
|
double _graphMs = std::chrono::duration<double, std::milli>(_graphEnd - _graphStart).count();
|
||||||
|
if (_graphMs > 500.0) {
|
||||||
|
ANS_DBG("TRT_SM100", "SLOW GRAPH: %.1fms batch=%d active=%d",
|
||||||
|
_graphMs, batchSize, s_globalActiveInf.load());
|
||||||
|
}
|
||||||
|
|
||||||
// CPU memcpy: pinned buffers -> featureVectors (interleaved by batch).
|
// CPU memcpy: pinned buffers -> featureVectors (interleaved by batch).
|
||||||
for (int batch = 0; batch < batchSize; ++batch) {
|
for (int batch = 0; batch < batchSize; ++batch) {
|
||||||
@@ -762,8 +886,16 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
|
|||||||
// ----------------------
|
// ----------------------
|
||||||
// Used when pinned buffers are unavailable or graph capture failed.
|
// Used when pinned buffers are unavailable or graph capture failed.
|
||||||
if (!graphUsed) {
|
if (!graphUsed) {
|
||||||
|
ANS_DBG("TRT_Engine", "Direct path (no graph) batch=%d", batchSize);
|
||||||
|
auto enqueueStart = std::chrono::steady_clock::now();
|
||||||
bool success = TRT_ENQUEUE(m_context.get(), m_inferenceStream, m_buffers);
|
bool success = TRT_ENQUEUE(m_context.get(), m_inferenceStream, m_buffers);
|
||||||
|
auto enqueueEnd = std::chrono::steady_clock::now();
|
||||||
|
double enqueueMs = std::chrono::duration<double, std::milli>(enqueueEnd - enqueueStart).count();
|
||||||
|
if (enqueueMs > 500.0) {
|
||||||
|
ANS_DBG("TRT_Engine", "SLOW ENQUEUE: %.1fms batch=%d (enqueueV3 blocked!)", enqueueMs, batchSize);
|
||||||
|
}
|
||||||
if (!success) {
|
if (!success) {
|
||||||
|
ANS_DBG("TRT_Engine", "ERROR: enqueueV3 FAILED batch=%d", batchSize);
|
||||||
std::string debugInfo = "[Engine] runInference FAIL: enqueue returned false, batch="
|
std::string debugInfo = "[Engine] runInference FAIL: enqueue returned false, batch="
|
||||||
+ std::to_string(batchSize)
|
+ std::to_string(batchSize)
|
||||||
+ ", dimsSpecified=" + (m_context->allInputDimensionsSpecified() ? "YES" : "NO");
|
+ ", dimsSpecified=" + (m_context->allInputDimensionsSpecified() ? "YES" : "NO");
|
||||||
@@ -805,8 +937,16 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
auto syncStart = std::chrono::steady_clock::now();
|
||||||
cudaError_t syncErr = cudaStreamSynchronize_Safe(m_inferenceStream);
|
cudaError_t syncErr = cudaStreamSynchronize_Safe(m_inferenceStream);
|
||||||
|
auto syncEnd = std::chrono::steady_clock::now();
|
||||||
|
double syncMs = std::chrono::duration<double, std::milli>(syncEnd - syncStart).count();
|
||||||
|
if (syncMs > 500.0) {
|
||||||
|
ANS_DBG("TRT_Engine", "SLOW INFERENCE SYNC: %.1fms batch=%d (direct path)", syncMs, batchSize);
|
||||||
|
}
|
||||||
if (syncErr != cudaSuccess) {
|
if (syncErr != cudaSuccess) {
|
||||||
|
ANS_DBG("TRT_Engine", "ERROR: cudaStreamSync FAILED err=%d (%s)",
|
||||||
|
(int)syncErr, cudaGetErrorString(syncErr));
|
||||||
std::string errMsg = "[Engine] runInference FAIL: cudaStreamSynchronize: "
|
std::string errMsg = "[Engine] runInference FAIL: cudaStreamSynchronize: "
|
||||||
+ std::string(cudaGetErrorString(syncErr));
|
+ std::string(cudaGetErrorString(syncErr));
|
||||||
std::cout << errMsg << std::endl;
|
std::cout << errMsg << std::endl;
|
||||||
@@ -815,5 +955,33 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// SM=100% DETECTOR — end-of-inference timing
|
||||||
|
// ============================================================================
|
||||||
|
{
|
||||||
|
static thread_local double s_ema = 0;
|
||||||
|
static thread_local std::chrono::steady_clock::time_point s_prevEnd;
|
||||||
|
static thread_local bool s_firstDone = false;
|
||||||
|
|
||||||
|
auto _now = std::chrono::steady_clock::now();
|
||||||
|
if (s_firstDone) {
|
||||||
|
double sinceLastMs = std::chrono::duration<double, std::milli>(_now - s_prevEnd).count();
|
||||||
|
// If time between consecutive inferences jumps dramatically,
|
||||||
|
// something blocked the thread (SM=100% or mutex contention)
|
||||||
|
if (s_ema > 0 && sinceLastMs > s_ema * 3.0 && sinceLastMs > 500.0) {
|
||||||
|
size_t vf = 0, vt = 0;
|
||||||
|
cudaMemGetInfo(&vf, &vt);
|
||||||
|
ANS_DBG("TRT_SM100", "GAP DETECTED: %.1fms between inferences (avg=%.1fms, %.1fx) active=%d VRAM=%zuMB free",
|
||||||
|
sinceLastMs, s_ema, sinceLastMs / s_ema,
|
||||||
|
s_globalActiveInf.load(), vf / (1024*1024));
|
||||||
|
}
|
||||||
|
s_ema = (s_ema == 0) ? sinceLastMs : (0.9 * s_ema + 0.1 * sinceLastMs);
|
||||||
|
}
|
||||||
|
s_prevEnd = _now;
|
||||||
|
s_firstDone = true;
|
||||||
|
|
||||||
|
s_globalActiveInf.fetch_sub(1);
|
||||||
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -27,25 +27,29 @@ template <typename T>
|
|||||||
cv::cuda::GpuMat Engine<T>::resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat& input,
|
cv::cuda::GpuMat Engine<T>::resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat& input,
|
||||||
size_t height, size_t width,
|
size_t height, size_t width,
|
||||||
const cv::Scalar& bgcolor) {
|
const cv::Scalar& bgcolor) {
|
||||||
// Ensure input is valid
|
|
||||||
if (input.empty()) {
|
if (input.empty()) {
|
||||||
return cv::cuda::GpuMat();
|
return cv::cuda::GpuMat();
|
||||||
}
|
}
|
||||||
// Create a CUDA stream
|
|
||||||
cv::cuda::Stream stream;
|
// Use a thread_local stream to avoid creating a new CUDA stream per call.
|
||||||
// Calculate aspect ratio and unpadded dimensions
|
// Creating cv::cuda::Stream() each call leaks stream handles under WDDM.
|
||||||
|
thread_local cv::cuda::Stream stream;
|
||||||
|
|
||||||
float r = std::min(static_cast<float>(width) / input.cols, static_cast<float>(height) / input.rows);
|
float r = std::min(static_cast<float>(width) / input.cols, static_cast<float>(height) / input.rows);
|
||||||
size_t unpad_w = static_cast<size_t>(r * input.cols);
|
size_t unpad_w = static_cast<size_t>(r * input.cols);
|
||||||
size_t unpad_h = static_cast<size_t>(r * input.rows);
|
size_t unpad_h = static_cast<size_t>(r * input.rows);
|
||||||
|
|
||||||
// Resize the input image
|
// Resize the input image
|
||||||
cv::cuda::GpuMat re;
|
cv::cuda::GpuMat re;
|
||||||
re.create(unpad_h, unpad_w, input.type());
|
re.create(static_cast<int>(unpad_h), static_cast<int>(unpad_w), input.type());
|
||||||
cv::cuda::resize(input, re, re.size(), 0, 0, cv::INTER_LINEAR, stream);
|
cv::cuda::resize(input, re, re.size(), 0, 0, cv::INTER_LINEAR, stream);
|
||||||
|
|
||||||
// Create the output image and fill with the background color
|
// Create the output image and fill with the background color
|
||||||
cv::cuda::GpuMat out;
|
cv::cuda::GpuMat out;
|
||||||
out.create(height, width, input.type());
|
out.create(static_cast<int>(height), static_cast<int>(width), input.type());
|
||||||
out.setTo(bgcolor, stream);
|
out.setTo(bgcolor, stream);
|
||||||
// Copy the resized content into the top-left corner of the output image
|
|
||||||
|
// Copy the resized content into the top-left corner
|
||||||
re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows)), stream);
|
re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows)), stream);
|
||||||
stream.waitForCompletion();
|
stream.waitForCompletion();
|
||||||
return out;
|
return out;
|
||||||
@@ -195,41 +199,51 @@ cv::cuda::GpuMat Engine<T>::blobFromGpuMats(const std::vector<cv::cuda::GpuMat>
|
|||||||
const int W = batchInput[0].cols;
|
const int W = batchInput[0].cols;
|
||||||
const int batch = static_cast<int>(batchInput.size());
|
const int batch = static_cast<int>(batchInput.size());
|
||||||
const size_t planeSize = static_cast<size_t>(H) * W; // pixels per channel
|
const size_t planeSize = static_cast<size_t>(H) * W; // pixels per channel
|
||||||
|
const int totalElems = batch * 3 * static_cast<int>(planeSize);
|
||||||
|
|
||||||
// Output blob: planar NCHW layout stored as a single-channel GpuMat.
|
// thread_local cached buffers — reused across calls on the same thread.
|
||||||
// Total elements = batch * 3 * H * W.
|
// KEY: allocate for MAX seen size, never shrink. This prevents the VRAM leak
|
||||||
cv::cuda::GpuMat blob(1, batch * 3 * static_cast<int>(planeSize), CV_32FC1);
|
// caused by OpenCV's GpuMat pool growing unbounded when batch sizes alternate
|
||||||
|
// (e.g., batch=1,6,1,6 → each size triggers new alloc, old goes to pool, never freed).
|
||||||
|
thread_local cv::cuda::GpuMat tl_blob;
|
||||||
|
thread_local cv::cuda::GpuMat tl_floatImg;
|
||||||
|
thread_local int tl_blobMaxElems = 0;
|
||||||
|
|
||||||
|
if (totalElems > tl_blobMaxElems) {
|
||||||
|
tl_blob = cv::cuda::GpuMat(1, totalElems, CV_32FC1);
|
||||||
|
tl_blobMaxElems = totalElems;
|
||||||
|
size_t blobBytes = static_cast<size_t>(totalElems) * sizeof(float);
|
||||||
|
ANS_DBG("TRT_Preproc", "blobFromGpuMats: ALLOC blob batch=%d %dx%d %.1fMB (new max)",
|
||||||
|
batch, W, H, blobBytes / (1024.0 * 1024.0));
|
||||||
|
}
|
||||||
|
// Use a sub-region of the cached blob for the current batch
|
||||||
|
cv::cuda::GpuMat blob = tl_blob.colRange(0, totalElems);
|
||||||
|
|
||||||
for (int img = 0; img < batch; ++img) {
|
for (int img = 0; img < batch; ++img) {
|
||||||
// 1. Convert to float and normalise while still in HWC (interleaved) format.
|
|
||||||
// Channel-wise subtract / divide operate correctly on interleaved data.
|
|
||||||
cv::cuda::GpuMat floatImg;
|
|
||||||
if (normalize) {
|
if (normalize) {
|
||||||
batchInput[img].convertTo(floatImg, CV_32FC3, 1.f / 255.f, stream);
|
batchInput[img].convertTo(tl_floatImg, CV_32FC3, 1.f / 255.f, stream);
|
||||||
} else {
|
} else {
|
||||||
batchInput[img].convertTo(floatImg, CV_32FC3, 1.0, stream);
|
batchInput[img].convertTo(tl_floatImg, CV_32FC3, 1.0, stream);
|
||||||
}
|
}
|
||||||
|
|
||||||
cv::cuda::subtract(floatImg, cv::Scalar(subVals[0], subVals[1], subVals[2]), floatImg, cv::noArray(), -1, stream);
|
cv::cuda::subtract(tl_floatImg, cv::Scalar(subVals[0], subVals[1], subVals[2]), tl_floatImg, cv::noArray(), -1, stream);
|
||||||
cv::cuda::divide(floatImg, cv::Scalar(divVals[0], divVals[1], divVals[2]), floatImg, 1, -1, stream);
|
cv::cuda::divide(tl_floatImg, cv::Scalar(divVals[0], divVals[1], divVals[2]), tl_floatImg, 1, -1, stream);
|
||||||
|
|
||||||
// 2. Split normalised HWC image into CHW planes directly into the blob.
|
// 2. Split normalised HWC image into CHW planes directly into the blob.
|
||||||
size_t offset = static_cast<size_t>(img) * 3 * planeSize;
|
size_t offset = static_cast<size_t>(img) * 3 * planeSize;
|
||||||
|
|
||||||
if (swapRB) {
|
if (swapRB) {
|
||||||
// BGR input -> RGB planes: B goes to plane 2, G to plane 1, R to plane 0
|
|
||||||
std::vector<cv::cuda::GpuMat> channels{
|
std::vector<cv::cuda::GpuMat> channels{
|
||||||
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + 2 * planeSize), // B -> plane 2
|
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + 2 * planeSize),
|
||||||
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + planeSize), // G -> plane 1
|
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + planeSize),
|
||||||
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset)}; // R -> plane 0
|
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset)};
|
||||||
cv::cuda::split(floatImg, channels, stream);
|
cv::cuda::split(tl_floatImg, channels, stream);
|
||||||
} else {
|
} else {
|
||||||
// BGR input -> BGR planes: keep channel order
|
|
||||||
std::vector<cv::cuda::GpuMat> channels{
|
std::vector<cv::cuda::GpuMat> channels{
|
||||||
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset),
|
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset),
|
||||||
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + planeSize),
|
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + planeSize),
|
||||||
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + 2 * planeSize)};
|
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + 2 * planeSize)};
|
||||||
cv::cuda::split(floatImg, channels, stream);
|
cv::cuda::split(tl_floatImg, channels, stream);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -239,7 +253,6 @@ cv::cuda::GpuMat Engine<T>::blobFromGpuMats(const std::vector<cv::cuda::GpuMat>
|
|||||||
template <typename T> void Engine<T>::clearGpuBuffers() {
|
template <typename T> void Engine<T>::clearGpuBuffers() {
|
||||||
if (!m_buffers.empty()) {
|
if (!m_buffers.empty()) {
|
||||||
// Free ALL I/O GPU buffers (both inputs and outputs).
|
// Free ALL I/O GPU buffers (both inputs and outputs).
|
||||||
// Previously only outputs were freed, leaking input allocations from loadNetwork().
|
|
||||||
for (void* ptr : m_buffers) {
|
for (void* ptr : m_buffers) {
|
||||||
if (ptr) {
|
if (ptr) {
|
||||||
Util::checkCudaErrorCode(cudaFree(ptr));
|
Util::checkCudaErrorCode(cudaFree(ptr));
|
||||||
@@ -247,4 +260,8 @@ template <typename T> void Engine<T>::clearGpuBuffers() {
|
|||||||
}
|
}
|
||||||
m_buffers.clear();
|
m_buffers.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Note: blob/floatImg caches are thread_local inside blobFromGpuMats (static method).
|
||||||
|
// They are cleaned up automatically when threads exit.
|
||||||
|
ANS_DBG("TRT_Engine", "clearGpuBuffers: I/O buffers released");
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -218,44 +218,25 @@ namespace ANSCENTER {
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool ANSFLVClient::areImagesIdentical(const cv::Mat& img1, const cv::Mat& img2) {
|
bool ANSFLVClient::areImagesIdentical(const cv::Mat& img1, const cv::Mat& img2) {
|
||||||
// Quick size and type checks
|
// Use decoder frame age — returns "stale" only if no decoder output for 5+ seconds.
|
||||||
if (img1.size() != img2.size() || img1.type() != img2.type()) {
|
double ageMs = _playerClient->getLastFrameAgeMs();
|
||||||
return false;
|
if (ageMs > 5000.0) return true; // Truly stale
|
||||||
}
|
if (ageMs > 0.0) return false; // Decoder alive
|
||||||
|
|
||||||
// Handle empty images
|
// Fallback for startup (no frame decoded yet)
|
||||||
if (img1.empty()) {
|
if (img1.empty() && img2.empty()) return true;
|
||||||
return img2.empty();
|
if (img1.empty() || img2.empty()) return false;
|
||||||
}
|
if (img1.size() != img2.size() || img1.type() != img2.type()) return false;
|
||||||
|
if (img1.data == img2.data) return true;
|
||||||
|
|
||||||
if (img1.isContinuous() && img2.isContinuous()) {
|
if (img1.isContinuous() && img2.isContinuous()) {
|
||||||
const size_t totalBytes = img1.total() * img1.elemSize();
|
const size_t totalBytes = img1.total() * img1.elemSize();
|
||||||
|
|
||||||
// Fast rejection: sample 5 positions across contiguous memory
|
|
||||||
const size_t quarter = totalBytes / 4;
|
|
||||||
const size_t half = totalBytes / 2;
|
|
||||||
const size_t threeQuarter = 3 * totalBytes / 4;
|
|
||||||
|
|
||||||
if (img1.data[0] != img2.data[0] ||
|
|
||||||
img1.data[quarter] != img2.data[quarter] ||
|
|
||||||
img1.data[half] != img2.data[half] ||
|
|
||||||
img1.data[threeQuarter] != img2.data[threeQuarter] ||
|
|
||||||
img1.data[totalBytes - 1] != img2.data[totalBytes - 1]) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Full comparison
|
|
||||||
return std::memcmp(img1.data, img2.data, totalBytes) == 0;
|
return std::memcmp(img1.data, img2.data, totalBytes) == 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Row-by-row comparison for non-continuous images (e.g., ROI sub-matrices)
|
|
||||||
const size_t rowSize = img1.cols * img1.elemSize();
|
const size_t rowSize = img1.cols * img1.elemSize();
|
||||||
for (int i = 0; i < img1.rows; i++) {
|
for (int i = 0; i < img1.rows; i++) {
|
||||||
if (std::memcmp(img1.ptr(i), img2.ptr(i), rowSize) != 0) {
|
if (std::memcmp(img1.ptr(i), img2.ptr(i), rowSize) != 0) return false;
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
cv::Mat ANSFLVClient::GetImage(int& width, int& height, int64_t& pts) {
|
cv::Mat ANSFLVClient::GetImage(int& width, int& height, int64_t& pts) {
|
||||||
|
|||||||
@@ -208,44 +208,23 @@ namespace ANSCENTER {
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool ANSMJPEGClient::areImagesIdentical(const cv::Mat& img1, const cv::Mat& img2) {
|
bool ANSMJPEGClient::areImagesIdentical(const cv::Mat& img1, const cv::Mat& img2) {
|
||||||
// Quick size and type checks
|
double ageMs = _playerClient->getLastFrameAgeMs();
|
||||||
if (img1.size() != img2.size() || img1.type() != img2.type()) {
|
if (ageMs > 5000.0) return true;
|
||||||
return false;
|
if (ageMs > 0.0) return false;
|
||||||
}
|
|
||||||
|
|
||||||
// Handle empty images
|
if (img1.empty() && img2.empty()) return true;
|
||||||
if (img1.empty()) {
|
if (img1.empty() || img2.empty()) return false;
|
||||||
return img2.empty();
|
if (img1.size() != img2.size() || img1.type() != img2.type()) return false;
|
||||||
}
|
if (img1.data == img2.data) return true;
|
||||||
|
|
||||||
if (img1.isContinuous() && img2.isContinuous()) {
|
if (img1.isContinuous() && img2.isContinuous()) {
|
||||||
const size_t totalBytes = img1.total() * img1.elemSize();
|
const size_t totalBytes = img1.total() * img1.elemSize();
|
||||||
|
|
||||||
// Fast rejection: sample 5 positions across contiguous memory
|
|
||||||
const size_t quarter = totalBytes / 4;
|
|
||||||
const size_t half = totalBytes / 2;
|
|
||||||
const size_t threeQuarter = 3 * totalBytes / 4;
|
|
||||||
|
|
||||||
if (img1.data[0] != img2.data[0] ||
|
|
||||||
img1.data[quarter] != img2.data[quarter] ||
|
|
||||||
img1.data[half] != img2.data[half] ||
|
|
||||||
img1.data[threeQuarter] != img2.data[threeQuarter] ||
|
|
||||||
img1.data[totalBytes - 1] != img2.data[totalBytes - 1]) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Full comparison
|
|
||||||
return std::memcmp(img1.data, img2.data, totalBytes) == 0;
|
return std::memcmp(img1.data, img2.data, totalBytes) == 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Row-by-row comparison for non-continuous images (e.g., ROI sub-matrices)
|
|
||||||
const size_t rowSize = img1.cols * img1.elemSize();
|
const size_t rowSize = img1.cols * img1.elemSize();
|
||||||
for (int i = 0; i < img1.rows; i++) {
|
for (int i = 0; i < img1.rows; i++) {
|
||||||
if (std::memcmp(img1.ptr(i), img2.ptr(i), rowSize) != 0) {
|
if (std::memcmp(img1.ptr(i), img2.ptr(i), rowSize) != 0) return false;
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
cv::Mat ANSMJPEGClient::GetImage(int& width, int& height, int64_t& pts) {
|
cv::Mat ANSMJPEGClient::GetImage(int& width, int& height, int64_t& pts) {
|
||||||
|
|||||||
@@ -213,43 +213,22 @@ namespace ANSCENTER {
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool ANSRTMPClient::areImagesIdentical(const cv::Mat& img1, const cv::Mat& img2) {
|
bool ANSRTMPClient::areImagesIdentical(const cv::Mat& img1, const cv::Mat& img2) {
|
||||||
// Quick size and type checks
|
double ageMs = _playerClient->getLastFrameAgeMs();
|
||||||
if (img1.size() != img2.size() || img1.type() != img2.type()) {
|
if (ageMs > 5000.0) return true;
|
||||||
return false;
|
if (ageMs > 0.0) return false;
|
||||||
}
|
|
||||||
|
|
||||||
// Handle empty images
|
if (img1.empty() && img2.empty()) return true;
|
||||||
if (img1.empty()) {
|
if (img1.empty() || img2.empty()) return false;
|
||||||
return img2.empty();
|
if (img1.size() != img2.size() || img1.type() != img2.type()) return false;
|
||||||
}
|
if (img1.data == img2.data) return true;
|
||||||
|
|
||||||
if (img1.isContinuous() && img2.isContinuous()) {
|
if (img1.isContinuous() && img2.isContinuous()) {
|
||||||
const size_t totalBytes = img1.total() * img1.elemSize();
|
const size_t totalBytes = img1.total() * img1.elemSize();
|
||||||
|
|
||||||
// Fast rejection: sample 5 positions across contiguous memory
|
|
||||||
// Catches 99.99% of different frames immediately
|
|
||||||
const size_t quarter = totalBytes / 4;
|
|
||||||
const size_t half = totalBytes / 2;
|
|
||||||
const size_t threeQuarter = 3 * totalBytes / 4;
|
|
||||||
|
|
||||||
if (img1.data[0] != img2.data[0] ||
|
|
||||||
img1.data[quarter] != img2.data[quarter] ||
|
|
||||||
img1.data[half] != img2.data[half] ||
|
|
||||||
img1.data[threeQuarter] != img2.data[threeQuarter] ||
|
|
||||||
img1.data[totalBytes - 1] != img2.data[totalBytes - 1]) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Full comparison
|
|
||||||
return std::memcmp(img1.data, img2.data, totalBytes) == 0;
|
return std::memcmp(img1.data, img2.data, totalBytes) == 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Row-by-row comparison for non-continuous images (e.g., ROI sub-matrices)
|
|
||||||
const size_t rowSize = img1.cols * img1.elemSize();
|
const size_t rowSize = img1.cols * img1.elemSize();
|
||||||
for (int i = 0; i < img1.rows; i++) {
|
for (int i = 0; i < img1.rows; i++) {
|
||||||
if (std::memcmp(img1.ptr(i), img2.ptr(i), rowSize) != 0) {
|
if (std::memcmp(img1.ptr(i), img2.ptr(i), rowSize) != 0) return false;
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
|
|||||||
@@ -2,7 +2,9 @@
|
|||||||
#include "ANSMatRegistry.h"
|
#include "ANSMatRegistry.h"
|
||||||
#include "ANSGpuFrameOps.h"
|
#include "ANSGpuFrameOps.h"
|
||||||
#include "GpuNV12SlotPool.h"
|
#include "GpuNV12SlotPool.h"
|
||||||
|
#include "ANSLicense.h" // ANS_DBG macro
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
#include <chrono>
|
||||||
#include <format>
|
#include <format>
|
||||||
#include "media_codec.h"
|
#include "media_codec.h"
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
@@ -69,6 +71,7 @@ namespace ANSCENTER {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void ANSRTSPClient::Destroy() {
|
void ANSRTSPClient::Destroy() {
|
||||||
|
ANS_DBG("RTSP_Lifecycle", "DESTROY called: url=%s playing=%d", _url.c_str(), (int)_isPlaying);
|
||||||
// Move the player client pointer out of the lock scope, then
|
// Move the player client pointer out of the lock scope, then
|
||||||
// close it OUTSIDE the mutex. close() calls cuArrayDestroy /
|
// close it OUTSIDE the mutex. close() calls cuArrayDestroy /
|
||||||
// cuMemFree which acquire an EXCLUSIVE SRW lock inside nvcuda64.
|
// cuMemFree which acquire an EXCLUSIVE SRW lock inside nvcuda64.
|
||||||
@@ -126,6 +129,24 @@ namespace ANSCENTER {
|
|||||||
// belong to the global GpuNV12SlotPool, not the decoder.
|
// belong to the global GpuNV12SlotPool, not the decoder.
|
||||||
if (clientToClose) {
|
if (clientToClose) {
|
||||||
clientToClose->close();
|
clientToClose->close();
|
||||||
|
|
||||||
|
// Force CUDA runtime to release all cached memory from the destroyed
|
||||||
|
// NVDEC decoder. Without this, cuMemFree returns memory to the CUDA
|
||||||
|
// driver's internal cache, and the next camera creation allocates fresh
|
||||||
|
// memory → VRAM grows by ~200-300MB per destroy/create cycle.
|
||||||
|
// cudaDeviceSynchronize ensures all pending GPU ops are done, then
|
||||||
|
// cudaMemPool trim releases the freed blocks back to the OS.
|
||||||
|
cudaDeviceSynchronize();
|
||||||
|
cudaMemPool_t memPool = nullptr;
|
||||||
|
int currentDev = 0;
|
||||||
|
cudaGetDevice(¤tDev);
|
||||||
|
if (cudaDeviceGetDefaultMemPool(&memPool, currentDev) == cudaSuccess && memPool) {
|
||||||
|
cudaMemPoolTrimTo(memPool, 0); // Release all unused memory
|
||||||
|
}
|
||||||
|
size_t vramFree = 0, vramTotal = 0;
|
||||||
|
cudaMemGetInfo(&vramFree, &vramTotal);
|
||||||
|
ANS_DBG("RTSP_Destroy", "NVDEC closed + memPool trimmed GPU%d VRAM=%zuMB/%zuMB",
|
||||||
|
currentDev, (vramTotal - vramFree) / (1024*1024), vramFree / (1024*1024));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
static void VerifyGlobalANSRTSPLicense(const std::string& licenseKey) {
|
static void VerifyGlobalANSRTSPLicense(const std::string& licenseKey) {
|
||||||
@@ -211,6 +232,7 @@ namespace ANSCENTER {
|
|||||||
_playerClient->setCrop(crop);
|
_playerClient->setCrop(crop);
|
||||||
}
|
}
|
||||||
bool ANSRTSPClient::Reconnect() {
|
bool ANSRTSPClient::Reconnect() {
|
||||||
|
ANS_DBG("RTSP_Lifecycle", "RECONNECT called: url=%s playing=%d", _url.c_str(), (int)_isPlaying);
|
||||||
// 1. Mark as not-playing under the mutex FIRST. This makes GetImage()
|
// 1. Mark as not-playing under the mutex FIRST. This makes GetImage()
|
||||||
// return the cached _pLastFrame instead of calling into the player,
|
// return the cached _pLastFrame instead of calling into the player,
|
||||||
// and blocks new TryIncrementInFlight calls (no new NV12 attachments).
|
// and blocks new TryIncrementInFlight calls (no new NV12 attachments).
|
||||||
@@ -253,8 +275,30 @@ namespace ANSCENTER {
|
|||||||
// completed (or timed out), so close() is safe.
|
// completed (or timed out), so close() is safe.
|
||||||
_logger.LogInfo("ANSRTSPClient::Reconnect",
|
_logger.LogInfo("ANSRTSPClient::Reconnect",
|
||||||
"calling close() — NVDEC decoder will be destroyed", __FILE__, __LINE__);
|
"calling close() — NVDEC decoder will be destroyed", __FILE__, __LINE__);
|
||||||
|
auto _rc0 = std::chrono::steady_clock::now();
|
||||||
RTSP_DBG("[Reconnect] BEFORE close() this=%p", (void*)this);
|
RTSP_DBG("[Reconnect] BEFORE close() this=%p", (void*)this);
|
||||||
_playerClient->close();
|
_playerClient->close();
|
||||||
|
auto _rc1 = std::chrono::steady_clock::now();
|
||||||
|
|
||||||
|
// Force CUDA runtime to release cached memory from the destroyed NVDEC decoder.
|
||||||
|
cudaDeviceSynchronize();
|
||||||
|
auto _rc2 = std::chrono::steady_clock::now();
|
||||||
|
cudaMemPool_t memPool = nullptr;
|
||||||
|
int currentDev = 0;
|
||||||
|
cudaGetDevice(¤tDev);
|
||||||
|
if (cudaDeviceGetDefaultMemPool(&memPool, currentDev) == cudaSuccess && memPool) {
|
||||||
|
cudaMemPoolTrimTo(memPool, 0);
|
||||||
|
}
|
||||||
|
auto _rc3 = std::chrono::steady_clock::now();
|
||||||
|
{
|
||||||
|
size_t vf = 0, vt = 0;
|
||||||
|
cudaMemGetInfo(&vf, &vt);
|
||||||
|
double closeMs = std::chrono::duration<double, std::milli>(_rc1 - _rc0).count();
|
||||||
|
double syncMs = std::chrono::duration<double, std::milli>(_rc2 - _rc1).count();
|
||||||
|
double trimMs = std::chrono::duration<double, std::milli>(_rc3 - _rc2).count();
|
||||||
|
ANS_DBG("RTSP_Reconnect", "close=%.1fms sync=%.1fms trim=%.1fms VRAM=%zuMB/%zuMB",
|
||||||
|
closeMs, syncMs, trimMs, (vt - vf) / (1024*1024), vf / (1024*1024));
|
||||||
|
}
|
||||||
RTSP_DBG("[Reconnect] AFTER close() this=%p", (void*)this);
|
RTSP_DBG("[Reconnect] AFTER close() this=%p", (void*)this);
|
||||||
|
|
||||||
// 3. Re-setup and play under the mutex.
|
// 3. Re-setup and play under the mutex.
|
||||||
@@ -283,12 +327,9 @@ namespace ANSCENTER {
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool ANSRTSPClient::Stop() {
|
bool ANSRTSPClient::Stop() {
|
||||||
// Grab the player pointer and clear _isPlaying under the lock,
|
// Stop playback but keep the RTSP connection and NVDEC decoder alive.
|
||||||
// then call stop() OUTSIDE the mutex. stop() internally calls
|
// LabVIEW uses Stop/Start to pause cameras when no AI task is subscribed.
|
||||||
// StopVideoDecoder -> decoder->flush() which does CUDA calls
|
// The camera resumes instantly on Start() without re-connecting.
|
||||||
// that can block on the nvcuda64 SRW lock. Holding _mutex
|
|
||||||
// during that time blocks all other operations on this client
|
|
||||||
// and contributes to the convoy when many clients stop at once.
|
|
||||||
CRtspPlayer* player = nullptr;
|
CRtspPlayer* player = nullptr;
|
||||||
{
|
{
|
||||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||||
@@ -300,6 +341,7 @@ namespace ANSCENTER {
|
|||||||
if (player) {
|
if (player) {
|
||||||
player->stop();
|
player->stop();
|
||||||
}
|
}
|
||||||
|
ANS_DBG("RTSP_Lifecycle", "STOP complete: handle=%p (connection kept alive)", (void*)this);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
bool ANSRTSPClient::Pause() {
|
bool ANSRTSPClient::Pause() {
|
||||||
@@ -342,45 +384,44 @@ namespace ANSCENTER {
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool ANSRTSPClient::areImagesIdentical(const cv::Mat& img1, const cv::Mat& img2) {
|
bool ANSRTSPClient::areImagesIdentical(const cv::Mat& img1, const cv::Mat& img2) {
|
||||||
// Quick size and type checks
|
double ageMs = _playerClient->getLastFrameAgeMs();
|
||||||
if (img1.size() != img2.size() || img1.type() != img2.type()) {
|
|
||||||
return false;
|
if (ageMs > 5000.0) {
|
||||||
|
ANS_DBG("RTSP_Stale", "FROZEN DETECTED: ageMs=%.1f url=%s playing=%d — camera truly stale",
|
||||||
|
ageMs, _url.c_str(), (int)_isPlaying);
|
||||||
|
return true; // Truly stale — no decoder output for 5+ seconds
|
||||||
|
}
|
||||||
|
if (ageMs > 0.0) {
|
||||||
|
return false; // Decoder is receiving frames — camera is alive
|
||||||
}
|
}
|
||||||
|
|
||||||
// Handle empty images
|
// ageMs == 0 means no frame has been decoded yet (startup).
|
||||||
if (img1.empty()) {
|
// Fall back to pixel comparison for backward compatibility.
|
||||||
return img2.empty();
|
if (img1.empty() && img2.empty()) return true;
|
||||||
}
|
if (img1.empty() || img2.empty()) return false;
|
||||||
|
if (img1.size() != img2.size() || img1.type() != img2.type()) return false;
|
||||||
|
|
||||||
|
// Same data pointer = same cv::Mat (shallow copy)
|
||||||
|
if (img1.data == img2.data) return true;
|
||||||
|
|
||||||
|
// Quick 5-point sampling
|
||||||
if (img1.isContinuous() && img2.isContinuous()) {
|
if (img1.isContinuous() && img2.isContinuous()) {
|
||||||
const size_t totalBytes = img1.total() * img1.elemSize();
|
const size_t totalBytes = img1.total() * img1.elemSize();
|
||||||
|
|
||||||
// Fast rejection: sample 5 positions across contiguous memory
|
|
||||||
// Catches 99.99% of different frames immediately
|
|
||||||
const size_t quarter = totalBytes / 4;
|
const size_t quarter = totalBytes / 4;
|
||||||
const size_t half = totalBytes / 2;
|
const size_t half = totalBytes / 2;
|
||||||
const size_t threeQuarter = 3 * totalBytes / 4;
|
|
||||||
|
|
||||||
if (img1.data[0] != img2.data[0] ||
|
if (img1.data[0] != img2.data[0] ||
|
||||||
img1.data[quarter] != img2.data[quarter] ||
|
img1.data[quarter] != img2.data[quarter] ||
|
||||||
img1.data[half] != img2.data[half] ||
|
img1.data[half] != img2.data[half] ||
|
||||||
img1.data[threeQuarter] != img2.data[threeQuarter] ||
|
|
||||||
img1.data[totalBytes - 1] != img2.data[totalBytes - 1]) {
|
img1.data[totalBytes - 1] != img2.data[totalBytes - 1]) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Full comparison
|
|
||||||
return std::memcmp(img1.data, img2.data, totalBytes) == 0;
|
return std::memcmp(img1.data, img2.data, totalBytes) == 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Row-by-row comparison for non-continuous images (e.g., ROI sub-matrices)
|
|
||||||
const size_t rowSize = img1.cols * img1.elemSize();
|
const size_t rowSize = img1.cols * img1.elemSize();
|
||||||
for (int i = 0; i < img1.rows; i++) {
|
for (int i = 0; i < img1.rows; i++) {
|
||||||
if (std::memcmp(img1.ptr(i), img2.ptr(i), rowSize) != 0) {
|
if (std::memcmp(img1.ptr(i), img2.ptr(i), rowSize) != 0) return false;
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
cv::Mat ANSRTSPClient::GetImage(int& width, int& height, int64_t& pts) {
|
cv::Mat ANSRTSPClient::GetImage(int& width, int& height, int64_t& pts) {
|
||||||
@@ -414,6 +455,20 @@ namespace ANSCENTER {
|
|||||||
if (currentPts == _pts && !_pLastFrame.empty()) {
|
if (currentPts == _pts && !_pLastFrame.empty()) {
|
||||||
width = _imageWidth;
|
width = _imageWidth;
|
||||||
height = _imageHeight;
|
height = _imageHeight;
|
||||||
|
// Return timestamp based on decoder frame age so LabVIEW can distinguish
|
||||||
|
// "rate-limited duplicate" from "camera truly stale".
|
||||||
|
// If decoder is still receiving frames (age < 5s), advance PTS so LabVIEW
|
||||||
|
// sees a changing timestamp and doesn't trigger false reconnect.
|
||||||
|
// If decoder is stale (age > 5s), return same PTS so LabVIEW detects it.
|
||||||
|
double ageMs = _playerClient->getLastFrameAgeMs();
|
||||||
|
if (ageMs > 0.0 && ageMs < 5000.0) {
|
||||||
|
// Camera alive but rate-limited — advance PTS to prevent false stale detection
|
||||||
|
_pts++;
|
||||||
|
} else if (ageMs >= 5000.0) {
|
||||||
|
// Camera stale — keep same PTS so LabVIEW triggers reconnect
|
||||||
|
ANS_DBG("RTSP_GetImage", "STALE PTS: ageMs=%.1f pts=%lld url=%s — not advancing PTS",
|
||||||
|
ageMs, (long long)_pts, _url.c_str());
|
||||||
|
}
|
||||||
pts = _pts;
|
pts = _pts;
|
||||||
return _pLastFrame;
|
return _pLastFrame;
|
||||||
}
|
}
|
||||||
@@ -891,6 +946,10 @@ namespace ANSCENTER {
|
|||||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||||
_useNV12FastPath = enable;
|
_useNV12FastPath = enable;
|
||||||
}
|
}
|
||||||
|
double ANSRTSPClient::GetLastFrameAgeMs() {
|
||||||
|
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||||
|
return _playerClient->getLastFrameAgeMs();
|
||||||
|
}
|
||||||
AVFrame* ANSRTSPClient::GetNV12Frame() {
|
AVFrame* ANSRTSPClient::GetNV12Frame() {
|
||||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||||
if (!_isPlaying) return nullptr; // Player may be mid-reconnect (CUDA resources freed)
|
if (!_isPlaying) return nullptr; // Player may be mid-reconnect (CUDA resources freed)
|
||||||
@@ -937,6 +996,7 @@ namespace ANSCENTER {
|
|||||||
}
|
}
|
||||||
|
|
||||||
extern "C" __declspec(dllexport) int CreateANSRTSPHandle(ANSCENTER::ANSRTSPClient * *Handle, const char* licenseKey, const char* username, const char* password, const char* url) {
|
extern "C" __declspec(dllexport) int CreateANSRTSPHandle(ANSCENTER::ANSRTSPClient * *Handle, const char* licenseKey, const char* username, const char* password, const char* url) {
|
||||||
|
ANS_DBG("RTSP_Lifecycle", "CREATE: url=%s", url ? url : "null");
|
||||||
if (!Handle || !licenseKey || !url) return -1;
|
if (!Handle || !licenseKey || !url) return -1;
|
||||||
try {
|
try {
|
||||||
auto ptr = std::make_unique<ANSCENTER::ANSRTSPClient>();
|
auto ptr = std::make_unique<ANSCENTER::ANSRTSPClient>();
|
||||||
@@ -946,11 +1006,10 @@ extern "C" __declspec(dllexport) int CreateANSRTSPHandle(ANSCENTER::ANSRTSPClien
|
|||||||
if (_username.empty() && _password.empty()) result = ptr->Init(licenseKey, url);
|
if (_username.empty() && _password.empty()) result = ptr->Init(licenseKey, url);
|
||||||
else result = ptr->Init(licenseKey, username, password, url);
|
else result = ptr->Init(licenseKey, username, password, url);
|
||||||
if (result) {
|
if (result) {
|
||||||
// Default to CUDA/NVDEC HW decoding (mode 7) for NV12 zero-copy
|
// Software decode by default — saves VRAM (no NVDEC DPB surfaces).
|
||||||
// fast path. LabVIEW may not call SetRTSPHWDecoding after
|
// With 100 cameras, HW decode would consume ~5-21 GB VRAM for idle decoders.
|
||||||
// destroy+recreate cycles, so this ensures the new handle always
|
// User can enable HW decode per-camera via SetRTSPHWDecoding(handle, 7).
|
||||||
// uses the GPU decode path instead of falling back to D3D11VA/CPU.
|
// ptr->SetHWDecoding(7); // Disabled — was HW_DECODING_CUDA
|
||||||
ptr->SetHWDecoding(7); // HW_DECODING_CUDA
|
|
||||||
*Handle = ptr.release();
|
*Handle = ptr.release();
|
||||||
extern void anscv_unregister_handle(void*);
|
extern void anscv_unregister_handle(void*);
|
||||||
extern void anscv_register_handle(void*, void(*)(void*));
|
extern void anscv_register_handle(void*, void(*)(void*));
|
||||||
@@ -967,6 +1026,7 @@ extern "C" __declspec(dllexport) int CreateANSRTSPHandle(ANSCENTER::ANSRTSPClien
|
|||||||
} catch (...) { return -1; }
|
} catch (...) { return -1; }
|
||||||
}
|
}
|
||||||
extern "C" __declspec(dllexport) int ReleaseANSRTSPHandle(ANSCENTER::ANSRTSPClient * *Handle) {
|
extern "C" __declspec(dllexport) int ReleaseANSRTSPHandle(ANSCENTER::ANSRTSPClient * *Handle) {
|
||||||
|
ANS_DBG("RTSP_Lifecycle", "RELEASE: handle=%p", Handle ? (void*)*Handle : nullptr);
|
||||||
if (Handle == nullptr || *Handle == nullptr) return -1;
|
if (Handle == nullptr || *Handle == nullptr) return -1;
|
||||||
try {
|
try {
|
||||||
extern void anscv_unregister_handle(void*);
|
extern void anscv_unregister_handle(void*);
|
||||||
@@ -982,25 +1042,27 @@ extern "C" __declspec(dllexport) int ReleaseANSRTSPHandle(ANSCENTER::ANSRTSPClie
|
|||||||
// on any subsequent call, and prevents NEW NV12 GPU surface
|
// on any subsequent call, and prevents NEW NV12 GPU surface
|
||||||
// pointers from being handed out.
|
// pointers from being handed out.
|
||||||
//
|
//
|
||||||
// Do NOT call Destroy()/close() here — close() frees the
|
// Synchronous cleanup — ensures all GPU resources (NVDEC surfaces, VRAM)
|
||||||
// NVDEC GPU surfaces (cuArrayDestroy/cuMemFree) which may
|
// are fully released BEFORE LabVIEW creates a new camera.
|
||||||
// still be in use by a CUDA inference kernel that received
|
// Previously deferred to a background thread, but that caused the old
|
||||||
// the NV12 pointer from a GetRTSPCVImage call that already
|
// camera's resources to overlap with the new camera's allocations,
|
||||||
// completed before this Release was called.
|
// leading to temporary VRAM doubling (~240MB per camera) and eventual
|
||||||
|
// VRAM exhaustion on cameras with frequent reconnects.
|
||||||
{
|
{
|
||||||
// Use the client's _mutex to safely set _isPlaying = false.
|
auto t0 = std::chrono::steady_clock::now();
|
||||||
// This is the same lock GetImage/GetNV12Frame acquire.
|
raw->Stop();
|
||||||
raw->Stop(); // sets _isPlaying = false, stops playback
|
auto t1 = std::chrono::steady_clock::now();
|
||||||
}
|
raw->Destroy();
|
||||||
|
auto t2 = std::chrono::steady_clock::now();
|
||||||
|
delete raw;
|
||||||
|
auto t3 = std::chrono::steady_clock::now();
|
||||||
|
|
||||||
// Defer the full cleanup (Destroy + delete) to a background thread
|
double stopMs = std::chrono::duration<double, std::milli>(t1 - t0).count();
|
||||||
// so LabVIEW's UI thread is not blocked. Destroy() now waits
|
double destroyMs = std::chrono::duration<double, std::milli>(t2 - t1).count();
|
||||||
// precisely for in-flight inference to finish (via _inFlightFrames
|
double deleteMs = std::chrono::duration<double, std::milli>(t3 - t2).count();
|
||||||
// counter + condition variable) instead of the old 500ms sleep hack.
|
ANS_DBG("RTSP_Lifecycle", "RELEASE complete: stop=%.1fms destroy=%.1fms delete=%.1fms total=%.1fms",
|
||||||
std::thread([raw]() {
|
stopMs, destroyMs, deleteMs, stopMs + destroyMs + deleteMs);
|
||||||
try { raw->Destroy(); } catch (...) {}
|
}
|
||||||
try { delete raw; } catch (...) {}
|
|
||||||
}).detach();
|
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
@@ -1269,6 +1331,7 @@ extern "C" __declspec(dllexport) int GetRTSPImage(ANSCENTER::ANSRTSPClient** Han
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
extern "C" __declspec(dllexport) int StartRTSP(ANSCENTER::ANSRTSPClient **Handle) {
|
extern "C" __declspec(dllexport) int StartRTSP(ANSCENTER::ANSRTSPClient **Handle) {
|
||||||
|
ANS_DBG("RTSP_Lifecycle", "START: handle=%p", Handle ? (void*)*Handle : nullptr);
|
||||||
if (Handle == nullptr || *Handle == nullptr) return -1;
|
if (Handle == nullptr || *Handle == nullptr) return -1;
|
||||||
try {
|
try {
|
||||||
bool result = (*Handle)->Start();
|
bool result = (*Handle)->Start();
|
||||||
@@ -1301,6 +1364,7 @@ extern "C" __declspec(dllexport) int ReconnectRTSP(ANSCENTER::ANSRTSPClient * *H
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
extern "C" __declspec(dllexport) int StopRTSP(ANSCENTER::ANSRTSPClient * *Handle) {
|
extern "C" __declspec(dllexport) int StopRTSP(ANSCENTER::ANSRTSPClient * *Handle) {
|
||||||
|
ANS_DBG("RTSP_Lifecycle", "STOP: handle=%p", Handle ? (void*)*Handle : nullptr);
|
||||||
if (Handle == nullptr || *Handle == nullptr) return -1;
|
if (Handle == nullptr || *Handle == nullptr) return -1;
|
||||||
try {
|
try {
|
||||||
bool result = (*Handle)->Stop();
|
bool result = (*Handle)->Stop();
|
||||||
@@ -1462,9 +1526,15 @@ extern "C" __declspec(dllexport) void SetRTSPTargetFPS(ANSCENTER::ANSRTSPClient*
|
|||||||
extern "C" __declspec(dllexport) void SetRTSPNV12FastPath(ANSCENTER::ANSRTSPClient** Handle, int enable) {
|
extern "C" __declspec(dllexport) void SetRTSPNV12FastPath(ANSCENTER::ANSRTSPClient** Handle, int enable) {
|
||||||
if (Handle == nullptr || *Handle == nullptr) return;
|
if (Handle == nullptr || *Handle == nullptr) return;
|
||||||
try {
|
try {
|
||||||
(*Handle)->SetNV12FastPath(enable != 0); // 0=original CPU path (stable), 1=NV12 GPU fast path
|
(*Handle)->SetNV12FastPath(enable != 0);
|
||||||
} catch (...) { }
|
} catch (...) { }
|
||||||
}
|
}
|
||||||
|
extern "C" __declspec(dllexport) double GetRTSPLastFrameAgeMs(ANSCENTER::ANSRTSPClient** Handle) {
|
||||||
|
if (Handle == nullptr || *Handle == nullptr) return -1.0;
|
||||||
|
try {
|
||||||
|
return (*Handle)->GetLastFrameAgeMs();
|
||||||
|
} catch (...) { return -1.0; }
|
||||||
|
}
|
||||||
extern "C" __declspec(dllexport) int SetCropFlagRTSP(ANSCENTER::ANSRTSPClient** Handle, int cropFlag) {
|
extern "C" __declspec(dllexport) int SetCropFlagRTSP(ANSCENTER::ANSRTSPClient** Handle, int cropFlag) {
|
||||||
if (Handle == nullptr || *Handle == nullptr) return -1;
|
if (Handle == nullptr || *Handle == nullptr) return -1;
|
||||||
try {
|
try {
|
||||||
|
|||||||
@@ -106,6 +106,7 @@ namespace ANSCENTER
|
|||||||
void SetTargetFPS(double intervalMs); // Set min interval between processed frames in ms (0 = no limit, 100 = ~10 FPS, 200 = ~5 FPS)
|
void SetTargetFPS(double intervalMs); // Set min interval between processed frames in ms (0 = no limit, 100 = ~10 FPS, 200 = ~5 FPS)
|
||||||
void SetNV12FastPath(bool enable); // true = NV12 GPU fast path (zero-copy inference), false = original CPU path (stable)
|
void SetNV12FastPath(bool enable); // true = NV12 GPU fast path (zero-copy inference), false = original CPU path (stable)
|
||||||
bool IsNV12FastPath() const { return _useNV12FastPath; }
|
bool IsNV12FastPath() const { return _useNV12FastPath; }
|
||||||
|
double GetLastFrameAgeMs(); // Milliseconds since last frame from decoder (detects truly stale cameras, unaffected by SetTargetFPS)
|
||||||
AVFrame* GetNV12Frame(); // Returns cloned NV12 frame for GPU fast-path (caller must av_frame_free)
|
AVFrame* GetNV12Frame(); // Returns cloned NV12 frame for GPU fast-path (caller must av_frame_free)
|
||||||
AVFrame* GetCudaHWFrame(); // Returns CUDA HW frame (device ptrs) for zero-copy inference
|
AVFrame* GetCudaHWFrame(); // Returns CUDA HW frame (device ptrs) for zero-copy inference
|
||||||
bool IsCudaHWAccel(); // true when decoder uses CUDA (NV12 stays in GPU VRAM)
|
bool IsCudaHWAccel(); // true when decoder uses CUDA (NV12 stays in GPU VRAM)
|
||||||
@@ -145,4 +146,5 @@ extern "C" __declspec(dllexport) void SetRTSPImageQuality(ANSCENTER::ANSRTSPClie
|
|||||||
extern "C" __declspec(dllexport) void SetRTSPDisplayResolution(ANSCENTER::ANSRTSPClient** Handle, int width, int height);
|
extern "C" __declspec(dllexport) void SetRTSPDisplayResolution(ANSCENTER::ANSRTSPClient** Handle, int width, int height);
|
||||||
extern "C" __declspec(dllexport) void SetRTSPTargetFPS(ANSCENTER::ANSRTSPClient** Handle, double intervalMs);
|
extern "C" __declspec(dllexport) void SetRTSPTargetFPS(ANSCENTER::ANSRTSPClient** Handle, double intervalMs);
|
||||||
extern "C" __declspec(dllexport) void SetRTSPNV12FastPath(ANSCENTER::ANSRTSPClient** Handle, int enable);
|
extern "C" __declspec(dllexport) void SetRTSPNV12FastPath(ANSCENTER::ANSRTSPClient** Handle, int enable);
|
||||||
|
extern "C" __declspec(dllexport) double GetRTSPLastFrameAgeMs(ANSCENTER::ANSRTSPClient** Handle);
|
||||||
#endif
|
#endif
|
||||||
@@ -221,43 +221,22 @@ namespace ANSCENTER {
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool ANSSRTClient::areImagesIdentical(const cv::Mat& img1, const cv::Mat& img2) {
|
bool ANSSRTClient::areImagesIdentical(const cv::Mat& img1, const cv::Mat& img2) {
|
||||||
// Quick size and type checks
|
double ageMs = _playerClient->getLastFrameAgeMs();
|
||||||
if (img1.size() != img2.size() || img1.type() != img2.type()) {
|
if (ageMs > 5000.0) return true;
|
||||||
return false;
|
if (ageMs > 0.0) return false;
|
||||||
}
|
|
||||||
|
|
||||||
// Handle empty images
|
if (img1.empty() && img2.empty()) return true;
|
||||||
if (img1.empty()) {
|
if (img1.empty() || img2.empty()) return false;
|
||||||
return img2.empty();
|
if (img1.size() != img2.size() || img1.type() != img2.type()) return false;
|
||||||
}
|
if (img1.data == img2.data) return true;
|
||||||
|
|
||||||
if (img1.isContinuous() && img2.isContinuous()) {
|
if (img1.isContinuous() && img2.isContinuous()) {
|
||||||
const size_t totalBytes = img1.total() * img1.elemSize();
|
const size_t totalBytes = img1.total() * img1.elemSize();
|
||||||
|
|
||||||
// Fast rejection: sample 5 positions across contiguous memory
|
|
||||||
// Catches 99.99% of different frames immediately
|
|
||||||
const size_t quarter = totalBytes / 4;
|
|
||||||
const size_t half = totalBytes / 2;
|
|
||||||
const size_t threeQuarter = 3 * totalBytes / 4;
|
|
||||||
|
|
||||||
if (img1.data[0] != img2.data[0] ||
|
|
||||||
img1.data[quarter] != img2.data[quarter] ||
|
|
||||||
img1.data[half] != img2.data[half] ||
|
|
||||||
img1.data[threeQuarter] != img2.data[threeQuarter] ||
|
|
||||||
img1.data[totalBytes - 1] != img2.data[totalBytes - 1]) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Full comparison
|
|
||||||
return std::memcmp(img1.data, img2.data, totalBytes) == 0;
|
return std::memcmp(img1.data, img2.data, totalBytes) == 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Row-by-row comparison for non-continuous images (e.g., ROI sub-matrices)
|
|
||||||
const size_t rowSize = img1.cols * img1.elemSize();
|
const size_t rowSize = img1.cols * img1.elemSize();
|
||||||
for (int i = 0; i < img1.rows; i++) {
|
for (int i = 0; i < img1.rows; i++) {
|
||||||
if (std::memcmp(img1.ptr(i), img2.ptr(i), rowSize) != 0) {
|
if (std::memcmp(img1.ptr(i), img2.ptr(i), rowSize) != 0) return false;
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
|
|||||||
@@ -136,7 +136,7 @@ namespace ANSCENTER {
|
|||||||
if (!_hwDecodeActive && !_hwPlayer) {
|
if (!_hwDecodeActive && !_hwPlayer) {
|
||||||
try {
|
try {
|
||||||
auto hwp = std::make_unique<CFilePlayer>();
|
auto hwp = std::make_unique<CFilePlayer>();
|
||||||
hwp->setHWDecoding(HW_DECODING_AUTO); // CUDA → D3D11VA → DXVA2 → software
|
hwp->setHWDecoding(HW_DECODING_DISABLE); // Software decode by default — saves VRAM
|
||||||
if (hwp->open(_url)) {
|
if (hwp->open(_url)) {
|
||||||
_hwPlayer = std::move(hwp);
|
_hwPlayer = std::move(hwp);
|
||||||
_hwDecodeActive = true;
|
_hwDecodeActive = true;
|
||||||
|
|||||||
@@ -93,7 +93,7 @@ CVideoPlayer::CVideoPlayer():
|
|||||||
, m_bPaused(FALSE)
|
, m_bPaused(FALSE)
|
||||||
, m_bSizeChanged(FALSE)
|
, m_bSizeChanged(FALSE)
|
||||||
//, m_nRenderMode(RENDER_MODE_KEEP)
|
//, m_nRenderMode(RENDER_MODE_KEEP)
|
||||||
, m_nHWDecoding(HW_DECODING_AUTO)
|
, m_nHWDecoding(HW_DECODING_DISABLE) // Software decode by default — saves VRAM
|
||||||
, m_nDstVideoFmt(AV_PIX_FMT_YUV420P)
|
, m_nDstVideoFmt(AV_PIX_FMT_YUV420P)
|
||||||
, m_bUpdown(FALSE)
|
, m_bUpdown(FALSE)
|
||||||
, m_bSnapshot(FALSE)
|
, m_bSnapshot(FALSE)
|
||||||
|
|||||||
@@ -3,6 +3,7 @@
|
|||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <json.hpp>
|
#include <json.hpp>
|
||||||
#include "ANSODEngine.h"
|
#include "ANSODEngine.h"
|
||||||
|
#include "ANSLicense.h" // ANS_DBG macro
|
||||||
#include "ANSYOLOOD.h"
|
#include "ANSYOLOOD.h"
|
||||||
#include "ANSTENSORRTOD.h"
|
#include "ANSTENSORRTOD.h"
|
||||||
#include "ANSTENSORRTCL.h"
|
#include "ANSTENSORRTCL.h"
|
||||||
@@ -879,6 +880,9 @@ namespace ANSCENTER
|
|||||||
std::vector<Object> allResults;
|
std::vector<Object> allResults;
|
||||||
allResults.clear();
|
allResults.clear();
|
||||||
try {
|
try {
|
||||||
|
ANS_DBG("ODEngine", "SAHI START: %dx%d tile=%dx%d overlap=%.1f cam=%s",
|
||||||
|
input.cols, input.rows, tiledWidth, tiledHeight, overLap, camera_id.c_str());
|
||||||
|
auto _sahiStart = std::chrono::steady_clock::now();
|
||||||
cv::Mat image = input.clone();
|
cv::Mat image = input.clone();
|
||||||
if (image.empty() || !image.data || !image.u) {
|
if (image.empty() || !image.data || !image.u) {
|
||||||
return allResults;
|
return allResults;
|
||||||
@@ -920,6 +924,16 @@ namespace ANSCENTER
|
|||||||
//4. Apply Non-Maximum Suppression (NMS) to merge overlapping results
|
//4. Apply Non-Maximum Suppression (NMS) to merge overlapping results
|
||||||
float iouThreshold = 0.1;
|
float iouThreshold = 0.1;
|
||||||
std::vector<Object> finalResults = ANSUtilityHelper::ApplyNMS(allResults, iouThreshold);
|
std::vector<Object> finalResults = ANSUtilityHelper::ApplyNMS(allResults, iouThreshold);
|
||||||
|
{
|
||||||
|
double _sahiMs = std::chrono::duration<double, std::milli>(
|
||||||
|
std::chrono::steady_clock::now() - _sahiStart).count();
|
||||||
|
ANS_DBG("ODEngine", "SAHI DONE: %.1fms patches=%zu results=%zu cam=%s",
|
||||||
|
_sahiMs, patches.size() + 1, finalResults.size(), camera_id.c_str());
|
||||||
|
if (_sahiMs > 2000.0) {
|
||||||
|
ANS_DBG("ODEngine", "SAHI SLOW: %.1fms — %zu patches held _mutex entire time!",
|
||||||
|
_sahiMs, patches.size() + 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
image.release();
|
image.release();
|
||||||
return finalResults;
|
return finalResults;
|
||||||
}
|
}
|
||||||
@@ -2103,6 +2117,8 @@ namespace ANSCENTER
|
|||||||
// No coarse _mutex — sub-components (engines, trackers) have their own locks.
|
// No coarse _mutex — sub-components (engines, trackers) have their own locks.
|
||||||
// LabVIEW semaphore controls concurrency at the caller level.
|
// LabVIEW semaphore controls concurrency at the caller level.
|
||||||
try {
|
try {
|
||||||
|
ANS_DBG("ODEngine", "RunInferenceWithOption: cam=%s %dx%d mode=%s",
|
||||||
|
camera_id.c_str(), input.cols, input.rows, activeROIMode.c_str());
|
||||||
int mode = 0;
|
int mode = 0;
|
||||||
double confidenceThreshold = 0.35;
|
double confidenceThreshold = 0.35;
|
||||||
std::vector<int> trackingObjectIds;
|
std::vector<int> trackingObjectIds;
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
#include "ANSRTYOLO.h"
|
#include "ANSRTYOLO.h"
|
||||||
#include "Utility.h"
|
#include "Utility.h"
|
||||||
|
#include "ANSLicense.h" // ANS_DBG macro for DebugView
|
||||||
#include <future>
|
#include <future>
|
||||||
#include <numeric>
|
#include <numeric>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
@@ -903,7 +904,6 @@ namespace ANSCENTER {
|
|||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check if model is classification (output ndims <= 2)
|
|
||||||
const auto& outputDims = m_trtEngine->getOutputDims();
|
const auto& outputDims = m_trtEngine->getOutputDims();
|
||||||
const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2;
|
const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2;
|
||||||
|
|
||||||
@@ -914,11 +914,8 @@ namespace ANSCENTER {
|
|||||||
cv::cuda::GpuMat resized;
|
cv::cuda::GpuMat resized;
|
||||||
if (imgRGB.rows != inputH || imgRGB.cols != inputW) {
|
if (imgRGB.rows != inputH || imgRGB.cols != inputW) {
|
||||||
if (isClassification) {
|
if (isClassification) {
|
||||||
// Classification: direct resize (no letterbox padding)
|
|
||||||
// Must use explicit stream to avoid conflict with CUDA Graph capture on null stream
|
|
||||||
cv::cuda::resize(imgRGB, resized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR, stream);
|
cv::cuda::resize(imgRGB, resized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR, stream);
|
||||||
} else {
|
} else {
|
||||||
// Detection/Seg/Pose/OBB: letterbox resize + right-bottom pad
|
|
||||||
resized = Engine<float>::resizeKeepAspectRatioPadRightBottom(imgRGB, inputH, inputW);
|
resized = Engine<float>::resizeKeepAspectRatioPadRightBottom(imgRGB, inputH, inputW);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1831,8 +1828,7 @@ namespace ANSCENTER {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// --- 2. Preprocess under lock ---
|
// --- 2. Preprocess under lock ---
|
||||||
// Try NV12 fast path first (12MB upload vs 24MB BGR for 4K)
|
ANS_DBG("YOLO", "Preprocess START %dx%d", inputImage.cols, inputImage.rows);
|
||||||
// Falls back to standard GPU preprocessing if no NV12 data available.
|
|
||||||
ImageMetadata meta;
|
ImageMetadata meta;
|
||||||
std::vector<std::vector<cv::cuda::GpuMat>> input;
|
std::vector<std::vector<cv::cuda::GpuMat>> input;
|
||||||
bool usedNV12 = false;
|
bool usedNV12 = false;
|
||||||
@@ -1874,11 +1870,22 @@ namespace ANSCENTER {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// --- 3. TRT Inference (mutex released for concurrent GPU slots) ---
|
// --- 3. TRT Inference (mutex released for concurrent GPU slots) ---
|
||||||
|
ANS_DBG("YOLO", "TRT inference START nv12=%d inputSize=%dx%d",
|
||||||
|
(int)usedNV12,
|
||||||
|
input.empty() ? 0 : (input[0].empty() ? 0 : input[0][0].cols),
|
||||||
|
input.empty() ? 0 : (input[0].empty() ? 0 : input[0][0].rows));
|
||||||
|
auto _trtStart = std::chrono::steady_clock::now();
|
||||||
std::vector<std::vector<std::vector<float>>> featureVectors;
|
std::vector<std::vector<std::vector<float>>> featureVectors;
|
||||||
if (!m_trtEngine->runInference(input, featureVectors)) {
|
if (!m_trtEngine->runInference(input, featureVectors)) {
|
||||||
|
ANS_DBG("YOLO", "ERROR: TRT runInference FAILED");
|
||||||
_logger.LogError("ANSRTYOLO::DetectObjects", "Error running inference", __FILE__, __LINE__);
|
_logger.LogError("ANSRTYOLO::DetectObjects", "Error running inference", __FILE__, __LINE__);
|
||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
|
auto _trtEnd = std::chrono::steady_clock::now();
|
||||||
|
double _trtMs = std::chrono::duration<double, std::milli>(_trtEnd - _trtStart).count();
|
||||||
|
if (_trtMs > 500.0) {
|
||||||
|
ANS_DBG("YOLO", "SLOW TRT inference: %.1fms", _trtMs);
|
||||||
|
}
|
||||||
double msInference = dbg ? elapsed() : 0;
|
double msInference = dbg ? elapsed() : 0;
|
||||||
|
|
||||||
// --- 4. Transform output ---
|
// --- 4. Transform output ---
|
||||||
|
|||||||
@@ -81,6 +81,7 @@ namespace ANSCENTER {
|
|||||||
std::vector<std::vector<cv::cuda::GpuMat>> PreprocessBatch(
|
std::vector<std::vector<cv::cuda::GpuMat>> PreprocessBatch(
|
||||||
const std::vector<cv::Mat>& inputImages, BatchMetadata& outMetadata);
|
const std::vector<cv::Mat>& inputImages, BatchMetadata& outMetadata);
|
||||||
|
|
||||||
|
|
||||||
// ── Detection pipeline ───────────────────────────────────────────
|
// ── Detection pipeline ───────────────────────────────────────────
|
||||||
std::vector<Object> DetectObjects(const cv::Mat& inputImage,
|
std::vector<Object> DetectObjects(const cv::Mat& inputImage,
|
||||||
const std::string& camera_id);
|
const std::string& camera_id);
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
#include "NV12PreprocessHelper.h"
|
#include "NV12PreprocessHelper.h"
|
||||||
#include "ANSGpuFrameRegistry.h"
|
#include "ANSGpuFrameRegistry.h"
|
||||||
#include "ANSEngineCommon.h"
|
#include "ANSEngineCommon.h"
|
||||||
|
#include "ANSLicense.h" // ANS_DBG macro
|
||||||
#include <opencv2/cudaimgproc.hpp>
|
#include <opencv2/cudaimgproc.hpp>
|
||||||
#include <opencv2/cudawarping.hpp>
|
#include <opencv2/cudawarping.hpp>
|
||||||
#include <opencv2/core/cuda_stream_accessor.hpp>
|
#include <opencv2/core/cuda_stream_accessor.hpp>
|
||||||
|
|||||||
@@ -6,6 +6,7 @@
|
|||||||
#include "engine/TRTEngineCache.h" // clearAll() on DLL_PROCESS_DETACH
|
#include "engine/TRTEngineCache.h" // clearAll() on DLL_PROCESS_DETACH
|
||||||
#include "engine/EnginePoolManager.h" // clearAll() on DLL_PROCESS_DETACH
|
#include "engine/EnginePoolManager.h" // clearAll() on DLL_PROCESS_DETACH
|
||||||
#include <climits> // INT_MIN
|
#include <climits> // INT_MIN
|
||||||
|
#include "ANSLicense.h" // ANS_DBG macro for DebugView
|
||||||
|
|
||||||
// Process-wide flag: when true, all engines force single-GPU path (no pool, no idle timers).
|
// Process-wide flag: when true, all engines force single-GPU path (no pool, no idle timers).
|
||||||
// Defined here, declared extern in EngineBuildLoadNetwork.inl.
|
// Defined here, declared extern in EngineBuildLoadNetwork.inl.
|
||||||
@@ -1696,6 +1697,8 @@ static int RunInferenceComplete_LV_Impl(
|
|||||||
auto* engine = guard.get();
|
auto* engine = guard.get();
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
auto _t0 = std::chrono::steady_clock::now();
|
||||||
|
|
||||||
// Save/restore thread-local to support nested calls (custom model DLLs
|
// Save/restore thread-local to support nested calls (custom model DLLs
|
||||||
// calling back into ANSODEngine via ANSLIB.dll).
|
// calling back into ANSODEngine via ANSLIB.dll).
|
||||||
GpuFrameData* savedFrame = tl_currentGpuFrame();
|
GpuFrameData* savedFrame = tl_currentGpuFrame();
|
||||||
@@ -1708,6 +1711,10 @@ static int RunInferenceComplete_LV_Impl(
|
|||||||
int originalWidth = localImage.cols;
|
int originalWidth = localImage.cols;
|
||||||
int originalHeight = localImage.rows;
|
int originalHeight = localImage.rows;
|
||||||
|
|
||||||
|
ANS_DBG("LV_Inference", "START cam=%s %dx%d gpuFrame=%p nv12=%s",
|
||||||
|
cameraId ? cameraId : "?", originalWidth, originalHeight,
|
||||||
|
(void*)gpuFrame, gpuFrame ? "YES" : "NO");
|
||||||
|
|
||||||
if (originalWidth == 0 || originalHeight == 0) {
|
if (originalWidth == 0 || originalHeight == 0) {
|
||||||
tl_currentGpuFrame() = savedFrame;
|
tl_currentGpuFrame() = savedFrame;
|
||||||
return -2;
|
return -2;
|
||||||
@@ -1717,8 +1724,17 @@ static int RunInferenceComplete_LV_Impl(
|
|||||||
// Safe: *cvImage holds a refcount, keeping gpuFrame alive during inference.
|
// Safe: *cvImage holds a refcount, keeping gpuFrame alive during inference.
|
||||||
// Only use OWN gpuFrame — never inherit outer caller's frame (dimension mismatch on crops).
|
// Only use OWN gpuFrame — never inherit outer caller's frame (dimension mismatch on crops).
|
||||||
tl_currentGpuFrame() = gpuFrame;
|
tl_currentGpuFrame() = gpuFrame;
|
||||||
|
auto _t1 = std::chrono::steady_clock::now();
|
||||||
std::vector<ANSCENTER::Object> outputs = engine->RunInferenceWithOption(localImage, cameraId, activeROIMode);
|
std::vector<ANSCENTER::Object> outputs = engine->RunInferenceWithOption(localImage, cameraId, activeROIMode);
|
||||||
|
auto _t2 = std::chrono::steady_clock::now();
|
||||||
tl_currentGpuFrame() = savedFrame;
|
tl_currentGpuFrame() = savedFrame;
|
||||||
|
|
||||||
|
double prepMs = std::chrono::duration<double, std::milli>(_t1 - _t0).count();
|
||||||
|
double infMs = std::chrono::duration<double, std::milli>(_t2 - _t1).count();
|
||||||
|
if (infMs > 500.0) {
|
||||||
|
ANS_DBG("LV_Inference", "SLOW cam=%s prep=%.1fms inf=%.1fms results=%zu",
|
||||||
|
cameraId ? cameraId : "?", prepMs, infMs, outputs.size());
|
||||||
|
}
|
||||||
bool getJpeg = (getJpegString == 1);
|
bool getJpeg = (getJpegString == 1);
|
||||||
std::string stImage;
|
std::string stImage;
|
||||||
// NOTE: odMutex was removed here. All variables in this scope are local
|
// NOTE: odMutex was removed here. All variables in this scope are local
|
||||||
|
|||||||
@@ -402,6 +402,9 @@ private:
|
|||||||
cudaStream_t m_memoryStream; // ADD THIS - separate stream for memory operations
|
cudaStream_t m_memoryStream; // ADD THIS - separate stream for memory operations
|
||||||
std::vector<cv::cuda::GpuMat> m_preprocessedInputs; // Keep inputs alive
|
std::vector<cv::cuda::GpuMat> m_preprocessedInputs; // Keep inputs alive
|
||||||
|
|
||||||
|
// Note: blobFromGpuMats and resizeKeepAspectRatioPadRightBottom are static,
|
||||||
|
// so cached buffers use thread_local inside the functions themselves.
|
||||||
|
|
||||||
|
|
||||||
// Thermal management (ADD THESE)
|
// Thermal management (ADD THESE)
|
||||||
//int m_consecutiveInferences;
|
//int m_consecutiveInferences;
|
||||||
@@ -431,7 +434,7 @@ private:
|
|||||||
|
|
||||||
Logger m_logger;
|
Logger m_logger;
|
||||||
bool m_verbose{ true }; // false for non-probe pool slots
|
bool m_verbose{ true }; // false for non-probe pool slots
|
||||||
bool m_disableGraphs{ false }; // true for pool slots — concurrent graph captures corrupt CUDA context
|
bool m_disableGraphs{ true }; // DISABLED by default — concurrent graph launches + uploads cause GPU deadlock on WDDM
|
||||||
|
|
||||||
// -- Multi-GPU pool data ---------------------------------------------------
|
// -- Multi-GPU pool data ---------------------------------------------------
|
||||||
|
|
||||||
|
|||||||
@@ -814,8 +814,8 @@ static void ALPRWorkerThread(int taskId,
|
|||||||
g_log.add(prefix + " Empty frame (count=" + std::to_string(emptyFrames) + ")");
|
g_log.add(prefix + " Empty frame (count=" + std::to_string(emptyFrames) + ")");
|
||||||
}
|
}
|
||||||
if (emptyFrames > 300) {
|
if (emptyFrames > 300) {
|
||||||
g_log.add(prefix + " Too many empty frames, attempting reconnect...");
|
g_log.add(prefix + " Too many empty frames (reconnect disabled for long test)");
|
||||||
ReconnectRTSP(&rtspClient);
|
// ReconnectRTSP(&rtspClient); // Disabled for VRAM stability testing
|
||||||
emptyFrames = 0;
|
emptyFrames = 0;
|
||||||
}
|
}
|
||||||
streamLock.unlock();
|
streamLock.unlock();
|
||||||
@@ -1222,9 +1222,9 @@ int ANSLPR_MultiGPU_StressTest() {
|
|||||||
g_log.add(buf);
|
g_log.add(buf);
|
||||||
printf("%s\n", buf);
|
printf("%s\n", buf);
|
||||||
} else if (currentGpu != streamPreferredGpu[s]) {
|
} else if (currentGpu != streamPreferredGpu[s]) {
|
||||||
// Decoder is active on wrong GPU — reconnect to move it
|
// Decoder is active on wrong GPU — reconnect disabled for VRAM stability testing
|
||||||
SetRTSPHWDecoding(&rtspClients[s], 7, streamPreferredGpu[s]);
|
SetRTSPHWDecoding(&rtspClients[s], 7, streamPreferredGpu[s]);
|
||||||
ReconnectRTSP(&rtspClients[s]);
|
// ReconnectRTSP(&rtspClients[s]); // Disabled for long test
|
||||||
char buf[256];
|
char buf[256];
|
||||||
snprintf(buf, sizeof(buf),
|
snprintf(buf, sizeof(buf),
|
||||||
"[Stream%d] NVDEC GPU realigned: GPU[%d] -> GPU[%d] (reconnected for zero-copy)",
|
"[Stream%d] NVDEC GPU realigned: GPU[%d] -> GPU[%d] (reconnected for zero-copy)",
|
||||||
@@ -1279,7 +1279,7 @@ int ANSLPR_MultiGPU_StressTest() {
|
|||||||
// CUDA cleanup (cuArrayDestroy, cuMemFree) while inference is running.
|
// CUDA cleanup (cuArrayDestroy, cuMemFree) while inference is running.
|
||||||
// This is the exact scenario that triggers the nvcuda64 SRW lock deadlock.
|
// This is the exact scenario that triggers the nvcuda64 SRW lock deadlock.
|
||||||
// =========================================================================
|
// =========================================================================
|
||||||
std::atomic<bool> chaosEnabled{true};
|
std::atomic<bool> chaosEnabled{false}; // Disabled for VRAM stability long test
|
||||||
std::thread chaosThread([&]() {
|
std::thread chaosThread([&]() {
|
||||||
std::mt19937 rng(std::random_device{}());
|
std::mt19937 rng(std::random_device{}());
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user