From 98681f4da6521f9840e223da098095c38cda9795 Mon Sep 17 00:00:00 2001 From: Tuan Nghia Nguyen Date: Sat, 4 Apr 2026 22:29:08 +1100 Subject: [PATCH] Use CPU resize before upload to GPU to remove PCIe bottleneck --- .claude/settings.local.json | 8 +- .../include/engine/EngineRunInference.inl | 20 +++ .../include/engine/EngineUtilities.inl | 23 +++ modules/ANSFR/ANSFaceRecognizer.cpp | 76 ++++----- modules/ANSFR/ARCFaceRT.cpp | 61 +++---- modules/ANSOCR/ANSRTOCR/RTOCRDetector.cpp | 8 +- modules/ANSODEngine/ANSRTYOLO.cpp | 156 +++++++++--------- modules/ANSODEngine/ANSTENSORRTCL.cpp | 100 +++++------ modules/ANSODEngine/ANSTENSORRTPOSE.cpp | 106 +++++++----- modules/ANSODEngine/ANSTENSORRTSEG.cpp | 112 +++++++------ modules/ANSODEngine/ANSTENSORTRTOD.cpp | 116 +++++++------ modules/ANSODEngine/ANSYOLOV10RTOD.cpp | 115 ++++++------- modules/ANSODEngine/ANSYOLOV12RTOD.cpp | 111 +++++++------ modules/ANSODEngine/SCRFDFaceDetector.cpp | 51 +++--- modules/ANSODEngine/engine.h | 2 + 15 files changed, 572 insertions(+), 493 deletions(-) diff --git a/.claude/settings.local.json b/.claude/settings.local.json index ac63ccb..aaec1c5 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -69,7 +69,13 @@ "Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION34.log''\\).Count\")", "Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION35.log''\\).Count\")", "Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION36.log''\\).Count\")", - "Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION37.log''\\).Count\")" + "Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION37.log''\\).Count\")", + "Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION38.log''\\).Count\")", + "Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION39.log''\\).Count\")", + "Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION40.log''\\).Count\")", + "Bash(python -c \":*)", + "Bash(find /c/Projects/CLionProjects/ANSCORE -type d -name *ANSODEngine*)", + "Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION41.log''\\).Count\")" ] } } diff --git a/engines/TensorRTAPI/include/engine/EngineRunInference.inl b/engines/TensorRTAPI/include/engine/EngineRunInference.inl index 4a7a950..c1fe6e0 100644 --- a/engines/TensorRTAPI/include/engine/EngineRunInference.inl +++ b/engines/TensorRTAPI/include/engine/EngineRunInference.inl @@ -284,7 +284,13 @@ bool Engine::runInference(const std::vector>& i // fatal "illegal memory access" that permanently corrupts the CUDA context. // // Pool-mode slots have their own busy-flag dispatch so they do NOT need this. + auto _mutexWaitStart = std::chrono::steady_clock::now(); std::lock_guard inferenceLock(m_inferenceMutex); + auto _mutexAcquired = std::chrono::steady_clock::now(); + double _mutexWaitMs = std::chrono::duration(_mutexAcquired - _mutexWaitStart).count(); + if (_mutexWaitMs > 50.0) { + ANS_DBG("TRT_Engine", "MUTEX WAIT: %.1fms (queued behind another inference)", _mutexWaitMs); + } // ============================================================================ // THREAD-SAFE GPU CONTEXT @@ -955,6 +961,20 @@ bool Engine::runInference(const std::vector>& i } } + // ============================================================================ + // Per-inference total timing breakdown (mutex wait + preprocess + GPU) + // ============================================================================ + { + double totalMs = std::chrono::duration( + std::chrono::steady_clock::now() - _mutexWaitStart).count(); + double gpuMs = totalMs - _mutexWaitMs; // Everything after mutex acquired + // Log every inference that takes >100ms total (including mutex wait) + if (totalMs > 100.0) { + ANS_DBG("TRT_Timing", "total=%.1fms (mutex=%.1fms gpu=%.1fms) batch=%d active=%d", + totalMs, _mutexWaitMs, gpuMs, batchSize, s_globalActiveInf.load()); + } + } + // ============================================================================ // SM=100% DETECTOR — end-of-inference timing // ============================================================================ diff --git a/engines/TensorRTAPI/include/engine/EngineUtilities.inl b/engines/TensorRTAPI/include/engine/EngineUtilities.inl index bc2e5c9..d366775 100644 --- a/engines/TensorRTAPI/include/engine/EngineUtilities.inl +++ b/engines/TensorRTAPI/include/engine/EngineUtilities.inl @@ -23,6 +23,29 @@ void Engine::transformOutput(std::vector>> &input, } output = std::move(input[0][0]); } +// CPU letterbox resize — same logic as the GPU version but runs on CPU. +// Used in Preprocess to resize BEFORE GPU upload, reducing PCIe transfer +// from 25 MB (4K) to 1.2 MB (640×640) — 20x less bandwidth. +template +cv::Mat Engine::cpuResizeKeepAspectRatioPadRightBottom(const cv::Mat& input, + size_t height, size_t width, + const cv::Scalar& bgcolor) { + if (input.empty()) return cv::Mat(); + + float r = std::min(static_cast(width) / input.cols, + static_cast(height) / input.rows); + int unpad_w = static_cast(r * input.cols); + int unpad_h = static_cast(r * input.rows); + + cv::Mat re; + cv::resize(input, re, cv::Size(unpad_w, unpad_h), 0, 0, cv::INTER_LINEAR); + + cv::Mat out(static_cast(height), static_cast(width), input.type(), bgcolor); + re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows))); + + return out; +} + template cv::cuda::GpuMat Engine::resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat& input, size_t height, size_t width, diff --git a/modules/ANSFR/ANSFaceRecognizer.cpp b/modules/ANSFR/ANSFaceRecognizer.cpp index d0c508f..22391a4 100644 --- a/modules/ANSFR/ANSFaceRecognizer.cpp +++ b/modules/ANSFR/ANSFaceRecognizer.cpp @@ -674,26 +674,22 @@ namespace ANSCENTER { } try { - // Fix #8: Use pooled GPU buffers to avoid per-frame allocation - m_gpuImg.upload(inputImage, m_gpuStream); - - // Handle grayscale conversion on GPU - if (inputImage.channels() == 1) { - cv::cuda::cvtColor(m_gpuImg, m_gpuRgb, cv::COLOR_GRAY2BGR, 0, m_gpuStream); - std::swap(m_gpuImg, m_gpuRgb); + // CPU preprocessing: resize + BGR→RGB before GPU upload + // Reduces PCIe transfer and eliminates GPU cvtColor/resize overhead + cv::Mat srcImg = inputImage; + if (srcImg.channels() == 1) { + cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR); } - - // Resize on GPU if needed - if (inputImage.cols != GPU_FACE_WIDTH || inputImage.rows != GPU_FACE_HEIGHT) { - cv::cuda::resize(m_gpuImg, m_gpuResized, cv::Size(GPU_FACE_WIDTH, GPU_FACE_HEIGHT), - 0, 0, cv::INTER_LINEAR, m_gpuStream); + cv::Mat cpuResized; + if (srcImg.cols != GPU_FACE_WIDTH || srcImg.rows != GPU_FACE_HEIGHT) { + cv::resize(srcImg, cpuResized, cv::Size(GPU_FACE_WIDTH, GPU_FACE_HEIGHT), 0, 0, cv::INTER_LINEAR); + } else { + cpuResized = srcImg; } - else { - m_gpuResized = m_gpuImg; - } - - // BGR to RGB conversion on GPU - cv::cuda::cvtColor(m_gpuResized, m_gpuRgb, cv::COLOR_BGR2RGB, 0, m_gpuStream); + cv::Mat cpuRGB; + cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB); + m_gpuRgb.upload(cpuRGB, m_gpuStream); + m_gpuStream.waitForCompletion(); // Prepare inference inputs std::vector inputVec; @@ -781,33 +777,39 @@ namespace ANSCENTER { batchGpu.reserve(chunkEnd - chunkStart); for (size_t i = chunkStart; i < chunkEnd; i++) { - cv::cuda::GpuMat d_img; - // Use GPU-resident face if available (NV12 affine warp path), - // otherwise upload from CPU (standard path) + // otherwise do CPU resize + BGR→RGB before upload if (i < gpuFaceROIs.size() && !gpuFaceROIs[i].empty()) { - d_img = gpuFaceROIs[i]; // already on GPU — skip upload + cv::cuda::GpuMat d_img = gpuFaceROIs[i]; // already on GPU + if (d_img.cols != GPU_FACE_WIDTH || d_img.rows != GPU_FACE_HEIGHT) { + cv::cuda::GpuMat d_resized; + cv::cuda::resize(d_img, d_resized, targetSize, 0, 0, cv::INTER_LINEAR, m_gpuStream); + d_img = d_resized; + } + cv::cuda::GpuMat d_rgb; + cv::cuda::cvtColor(d_img, d_rgb, cv::COLOR_BGR2RGB, 0, m_gpuStream); + batchGpu.emplace_back(std::move(d_rgb)); } else { const auto& roi = faceROIs[i]; if (roi.empty()) continue; - d_img.upload(roi, m_gpuStream); - if (roi.channels() == 1) { - cv::cuda::GpuMat d_bgr; - cv::cuda::cvtColor(d_img, d_bgr, cv::COLOR_GRAY2BGR, 0, m_gpuStream); - d_img = d_bgr; + // CPU preprocessing: resize + BGR→RGB before upload + cv::Mat srcImg = roi; + if (srcImg.channels() == 1) { + cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR); } + cv::Mat cpuResized; + if (srcImg.cols != GPU_FACE_WIDTH || srcImg.rows != GPU_FACE_HEIGHT) { + cv::resize(srcImg, cpuResized, targetSize, 0, 0, cv::INTER_LINEAR); + } else { + cpuResized = srcImg; + } + cv::Mat cpuRGB; + cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB); + cv::cuda::GpuMat d_rgb; + d_rgb.upload(cpuRGB, m_gpuStream); + batchGpu.emplace_back(std::move(d_rgb)); } - - if (d_img.cols != GPU_FACE_WIDTH || d_img.rows != GPU_FACE_HEIGHT) { - cv::cuda::GpuMat d_resized; - cv::cuda::resize(d_img, d_resized, targetSize, 0, 0, cv::INTER_LINEAR, m_gpuStream); - d_img = d_resized; - } - - cv::cuda::GpuMat d_rgb; - cv::cuda::cvtColor(d_img, d_rgb, cv::COLOR_BGR2RGB, 0, m_gpuStream); - batchGpu.emplace_back(std::move(d_rgb)); } FR_END_TIMER(gpu_preproc, "RunArcFaceBatch GPU preprocess (" + std::to_string(batchGpu.size()) + " faces)"); diff --git a/modules/ANSFR/ARCFaceRT.cpp b/modules/ANSFR/ARCFaceRT.cpp index ee0ca97..ba66d2d 100644 --- a/modules/ANSFR/ARCFaceRT.cpp +++ b/modules/ANSFR/ARCFaceRT.cpp @@ -303,31 +303,27 @@ namespace ANSCENTER { return embedding; } - // GPU preprocessing pipeline + // CPU preprocessing: resize + color convert, then upload small image cv::cuda::Stream stream; - cv::cuda::GpuMat d_img; - // Upload to GPU - d_img.upload(inputImage, stream); - - // Handle grayscale conversion on GPU - if (inputImage.channels() == 1) { - cv::cuda::GpuMat d_bgr; - cv::cuda::cvtColor(d_img, d_bgr, cv::COLOR_GRAY2BGR, 0, stream); - d_img = d_bgr; + cv::Mat srcImg = inputImage; + if (srcImg.channels() == 1) { + cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR); } - // Resize on GPU if needed - if (inputImage.cols != FACE_WIDTH || inputImage.rows != FACE_HEIGHT) { - cv::cuda::GpuMat d_resized; - cv::cuda::resize(d_img, d_resized, cv::Size(FACE_WIDTH, FACE_HEIGHT), - 0, 0, cv::INTER_LINEAR, stream); - d_img = d_resized; + cv::Mat cpuResized; + if (srcImg.rows != FACE_HEIGHT || srcImg.cols != FACE_WIDTH) { + cv::resize(srcImg, cpuResized, cv::Size(FACE_WIDTH, FACE_HEIGHT), 0, 0, cv::INTER_LINEAR); + } else { + cpuResized = srcImg; } - // BGR to RGB conversion on GPU + cv::Mat cpuRGB; + cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB); + cv::cuda::GpuMat d_rgb; - cv::cuda::cvtColor(d_img, d_rgb, cv::COLOR_BGR2RGB, 0, stream); + d_rgb.upload(cpuRGB, stream); + stream.waitForCompletion(); // Prepare inference inputs std::vector inputVec; @@ -404,27 +400,24 @@ namespace ANSCENTER { continue; } - // Upload to GPU - cv::cuda::GpuMat d_img; - d_img.upload(roi, stream); - - // Handle grayscale conversion on GPU - if (roi.channels() == 1) { - cv::cuda::GpuMat d_bgr; - cv::cuda::cvtColor(d_img, d_bgr, cv::COLOR_GRAY2BGR, 0, stream); - d_img = d_bgr; + // CPU preprocessing: resize + color convert, then upload small image + cv::Mat srcImg = roi; + if (srcImg.channels() == 1) { + cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR); } - // Resize on GPU if needed - if (roi.cols != FACE_WIDTH || roi.rows != FACE_HEIGHT) { - cv::cuda::GpuMat d_resized; - cv::cuda::resize(d_img, d_resized, targetSize, 0, 0, cv::INTER_LINEAR, stream); - d_img = d_resized; + cv::Mat cpuResized; + if (srcImg.rows != FACE_HEIGHT || srcImg.cols != FACE_WIDTH) { + cv::resize(srcImg, cpuResized, targetSize, 0, 0, cv::INTER_LINEAR); + } else { + cpuResized = srcImg; } - // BGR to RGB conversion on GPU + cv::Mat cpuRGB; + cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB); + cv::cuda::GpuMat d_rgb; - cv::cuda::cvtColor(d_img, d_rgb, cv::COLOR_BGR2RGB, 0, stream); + d_rgb.upload(cpuRGB, stream); batchGpu.emplace_back(std::move(d_rgb)); } diff --git a/modules/ANSOCR/ANSRTOCR/RTOCRDetector.cpp b/modules/ANSOCR/ANSRTOCR/RTOCRDetector.cpp index 41c290c..dc38de9 100644 --- a/modules/ANSOCR/ANSRTOCR/RTOCRDetector.cpp +++ b/modules/ANSOCR/ANSRTOCR/RTOCRDetector.cpp @@ -178,10 +178,10 @@ std::vector RTOCRDetector::Detect(const cv::Mat& image, } if (!usedNV12) { - // Fallback: standard BGR upload - cv::cuda::GpuMat gpuImg; - gpuImg.upload(image); - cv::cuda::resize(gpuImg, gpuResized, resizeShape); + // Fallback: CPU resize then upload small image to GPU + cv::Mat cpuResized; + cv::resize(image, cpuResized, resizeShape, 0, 0, cv::INTER_LINEAR); + gpuResized.upload(cpuResized); } // Keep BGR order (PaddleOCR official does NOT convert BGR->RGB) diff --git a/modules/ANSODEngine/ANSRTYOLO.cpp b/modules/ANSODEngine/ANSRTYOLO.cpp index 32b1f7a..a8efec9 100644 --- a/modules/ANSODEngine/ANSRTYOLO.cpp +++ b/modules/ANSODEngine/ANSRTYOLO.cpp @@ -462,50 +462,46 @@ namespace ANSCENTER { // Early-out if CUDA context is dead (sticky error from CUVID crash etc.) if (!m_nv12Helper.isCudaContextHealthy(_logger, "ANSRTYOLO")) return {}; - cv::cuda::Stream stream; - cv::cuda::GpuMat gpuImg; - - // Resolve source Mat (handle grayscale → BGR on CPU first) - if (inputImage.channels() == 1) { - cv::Mat img3Channel; - cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR); - gpuImg.upload(img3Channel, stream); - } else { - gpuImg.upload(inputImage, stream); + // --- CPU preprocessing: resize + BGR→RGB before GPU upload --- + // Reduces PCIe transfer from 25 MB (4K BGR) to 1.2 MB (640×640 RGB). + // With 12 AI tasks uploading concurrently, this eliminates the WDDM + // SRW lock convoy that causes 400-580ms preprocess spikes. + cv::Mat srcImg = inputImage; + if (srcImg.channels() == 1) { + cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR); } - // GPU: BGR → RGB - cv::cuda::GpuMat gpuRGB; - cv::cuda::cvtColor(gpuImg, gpuRGB, cv::COLOR_BGR2RGB, 0, stream); - - outMeta.imgHeight = static_cast(gpuRGB.rows); - outMeta.imgWidth = static_cast(gpuRGB.cols); + outMeta.imgHeight = static_cast(srcImg.rows); + outMeta.imgWidth = static_cast(srcImg.cols); if (outMeta.imgHeight > 0 && outMeta.imgWidth > 0) { outMeta.ratio = 1.f / std::min( - inputDims[0].d[2] / static_cast(gpuRGB.cols), - inputDims[0].d[1] / static_cast(gpuRGB.rows)); + inputDims[0].d[2] / static_cast(srcImg.cols), + inputDims[0].d[1] / static_cast(srcImg.rows)); - // Check if model is classification (output ndims <= 2) const auto& outputDims = m_trtEngine->getOutputDims(); const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2; - cv::cuda::GpuMat gpuResized; - if (gpuRGB.rows != inputH || gpuRGB.cols != inputW) { + // CPU resize to model input size + cv::Mat cpuResized; + if (srcImg.rows != inputH || srcImg.cols != inputW) { if (isClassification) { - // Classification: direct resize (no letterbox padding) - cv::cuda::resize(gpuRGB, gpuResized, cv::Size(inputW, inputH), - 0, 0, cv::INTER_LINEAR, stream); - } - else { - // Detection/Seg/Pose/OBB: letterbox resize + right-bottom pad (on GPU) - gpuResized = Engine::resizeKeepAspectRatioPadRightBottom( - gpuRGB, inputH, inputW); + cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR); + } else { + cpuResized = Engine::cpuResizeKeepAspectRatioPadRightBottom(srcImg, inputH, inputW); } } else { - gpuResized = gpuRGB; + cpuResized = srcImg; } + // CPU BGR → RGB + cv::Mat cpuRGB; + cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB); + + // Upload small image to GPU (1.2 MB instead of 25 MB for 4K) + cv::cuda::Stream stream; + cv::cuda::GpuMat gpuResized; + gpuResized.upload(cpuRGB, stream); stream.waitForCompletion(); std::vector input{ std::move(gpuResized) }; @@ -878,26 +874,18 @@ namespace ANSCENTER { "Empty input image at index " + std::to_string(i), __FILE__, __LINE__); return {}; } - cv::cuda::GpuMat img; - if (inputImage.channels() == 1) { - cv::Mat img3Channel; - cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR); - img.upload(img3Channel, stream); - } - else if (inputImage.channels() == 3) { - img.upload(inputImage, stream); - } - else { + // CPU preprocessing: resize + BGR→RGB before GPU upload + cv::Mat srcImg = inputImage; + if (srcImg.channels() == 1) { + cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR); + } else if (srcImg.channels() != 3) { _logger.LogError("ANSRTYOLO::PreprocessBatch", "Unsupported channel count at index " + std::to_string(i), __FILE__, __LINE__); return {}; } - cv::cuda::GpuMat imgRGB; - cv::cuda::cvtColor(img, imgRGB, cv::COLOR_BGR2RGB, 0, stream); - - outMetadata.imgHeights[i] = imgRGB.rows; - outMetadata.imgWidths[i] = imgRGB.cols; + outMetadata.imgHeights[i] = srcImg.rows; + outMetadata.imgWidths[i] = srcImg.cols; if (outMetadata.imgHeights[i] <= 0 || outMetadata.imgWidths[i] <= 0) { _logger.LogError("ANSRTYOLO::PreprocessBatch", "Invalid dimensions for image " + std::to_string(i), __FILE__, __LINE__); @@ -907,23 +895,27 @@ namespace ANSCENTER { const auto& outputDims = m_trtEngine->getOutputDims(); const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2; - const float scaleW = inputW / static_cast(imgRGB.cols); - const float scaleH = inputH / static_cast(imgRGB.rows); + const float scaleW = inputW / static_cast(srcImg.cols); + const float scaleH = inputH / static_cast(srcImg.rows); outMetadata.ratios[i] = isClassification ? 1.f : 1.f / std::min(scaleW, scaleH); - cv::cuda::GpuMat resized; - if (imgRGB.rows != inputH || imgRGB.cols != inputW) { + cv::Mat cpuResized; + if (srcImg.rows != inputH || srcImg.cols != inputW) { if (isClassification) { - cv::cuda::resize(imgRGB, resized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR, stream); + cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR); } else { - resized = Engine::resizeKeepAspectRatioPadRightBottom(imgRGB, inputH, inputW); + cpuResized = Engine::cpuResizeKeepAspectRatioPadRightBottom(srcImg, inputH, inputW); } - } - else { - resized = imgRGB; + } else { + cpuResized = srcImg; } - batchProcessed.push_back(std::move(resized)); + cv::Mat cpuRGB; + cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB); + + cv::cuda::GpuMat gpuResized; + gpuResized.upload(cpuRGB, stream); + batchProcessed.push_back(std::move(gpuResized)); } stream.waitForCompletion(); @@ -1804,10 +1796,10 @@ namespace ANSCENTER { std::vector ANSRTYOLO::DetectObjects(const cv::Mat& inputImage, const std::string& camera_id) { try { - // --- Debug timer helper (zero-cost when _debugFlag == false) --- + // --- Debug timer helper --- using Clock = std::chrono::steady_clock; const bool dbg = _debugFlag; - auto t0 = dbg ? Clock::now() : Clock::time_point{}; + auto t0 = Clock::now(); // Always set — used by ANS_DBG timing output auto tPrev = t0; auto elapsed = [&]() -> double { auto now = Clock::now(); @@ -2045,13 +2037,21 @@ namespace ANSCENTER { } // --- 6. Total pipeline time --- - if (dbg) { + { double msTotal = std::chrono::duration(Clock::now() - t0).count(); - _logger.LogInfo("ANSRTYOLO::DetectObjects", - "[DEBUG] " + camera_id + " | TOTAL=" + std::to_string(msTotal) + - "ms (" + std::to_string(inputImage.cols) + "x" + std::to_string(inputImage.rows) + - ") Results=" + std::to_string(results.size()), - __FILE__, __LINE__); + if (dbg) { + _logger.LogInfo("ANSRTYOLO::DetectObjects", + "[DEBUG] " + camera_id + " | TOTAL=" + std::to_string(msTotal) + + "ms (" + std::to_string(inputImage.cols) + "x" + std::to_string(inputImage.rows) + + ") Results=" + std::to_string(results.size()), + __FILE__, __LINE__); + } + // DebugView output — controlled by ANSCORE_DEBUGVIEW + double msPreproc = std::chrono::duration(_trtStart - t0).count(); + ANS_DBG("YOLO_Timing", "cam=%s total=%.1fms preproc=%.1fms inf=%.1fms %dx%d det=%zu %s", + camera_id.c_str(), msTotal, msPreproc, _trtMs, + inputImage.cols, inputImage.rows, results.size(), + usedNV12 ? "NV12" : "BGR"); } return results; @@ -2101,7 +2101,7 @@ namespace ANSCENTER { // --- Debug timer helper --- using Clock = std::chrono::steady_clock; const bool dbg = _debugFlag; - auto t0 = dbg ? Clock::now() : Clock::time_point{}; + auto t0 = Clock::now(); // Always set — used by ANS_DBG timing output auto tPrev = t0; auto elapsed = [&]() -> double { auto now = Clock::now(); @@ -2350,19 +2350,23 @@ namespace ANSCENTER { } } - if (dbg) { - double msPostprocess = elapsed(); + { + double msPostprocess = dbg ? elapsed() : 0; double msTotal = std::chrono::duration(Clock::now() - t0).count(); - _logger.LogInfo("ANSRTYOLO::DetectObjectsBatch", - "[DEBUG] " + camera_id + - " batch=" + std::to_string(realCount) + - " | SetDev=" + std::to_string(msSetDevice) + - "ms Pad=" + std::to_string(msPad) + - "ms Preproc=" + std::to_string(msPreprocess) + - "ms Inf=" + std::to_string(msInference) + - "ms Postproc=" + std::to_string(msPostprocess) + - "ms TOTAL=" + std::to_string(msTotal) + "ms", - __FILE__, __LINE__); + if (dbg) { + _logger.LogInfo("ANSRTYOLO::DetectObjectsBatch", + "[DEBUG] " + camera_id + + " batch=" + std::to_string(realCount) + + " | SetDev=" + std::to_string(msSetDevice) + + "ms Pad=" + std::to_string(msPad) + + "ms Preproc=" + std::to_string(msPreprocess) + + "ms Inf=" + std::to_string(msInference) + + "ms Postproc=" + std::to_string(msPostprocess) + + "ms TOTAL=" + std::to_string(msTotal) + "ms", + __FILE__, __LINE__); + } + ANS_DBG("YOLO_Timing", "cam=%s batch=%d total=%.1fms preproc=%.1fms inf=%.1fms", + camera_id.c_str(), realCount, msTotal, msPreprocess, msInference); } return batchDetections; diff --git a/modules/ANSODEngine/ANSTENSORRTCL.cpp b/modules/ANSODEngine/ANSTENSORRTCL.cpp index fc1139a..af3532d 100644 --- a/modules/ANSODEngine/ANSTENSORRTCL.cpp +++ b/modules/ANSODEngine/ANSTENSORRTCL.cpp @@ -534,27 +534,15 @@ namespace ANSCENTER const int inputH = inputDims[0].d[1]; const int inputW = inputDims[0].d[2]; - // Upload the image to GPU memory - cv::cuda::Stream stream; // Create a custom stream - cv::cuda::GpuMat img; - - if (inputImage.channels() == 1) { - // Convert grayscale to 3-channel BGR before uploading - cv::Mat img3Channel; - cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR); - img.upload(img3Channel, stream); + // --- CPU preprocessing: resize + BGR->RGB before GPU upload --- + cv::Mat srcImg = inputImage; + if (srcImg.channels() == 1) { + cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR); } - else { - img.upload(inputImage, stream); - } - - // Convert BGR to RGB - cv::cuda::GpuMat imgRGB; - cv::cuda::cvtColor(img, imgRGB, cv::COLOR_BGR2RGB, 0, stream); // These parameters will be used in the post-processing stage - outMeta.imgHeight = imgRGB.rows; - outMeta.imgWidth = imgRGB.cols; + outMeta.imgHeight = srcImg.rows; + outMeta.imgWidth = srcImg.cols; if (outMeta.imgHeight <= 0 || outMeta.imgWidth <= 0) { _logger.LogFatal("TENSORRTCL::Preprocess", "Image height or width is zero", __FILE__, __LINE__); @@ -564,19 +552,26 @@ namespace ANSCENTER if (outMeta.imgHeight > 0 && outMeta.imgWidth > 0) { outMeta.ratio = 1.f; - cv::cuda::GpuMat resized = imgRGB; - - // Classification: direct resize (no letterbox padding) — matches ANSONNXCL - // Must use explicit stream to avoid conflict with CUDA Graph capture on null stream - if (resized.rows != inputDims[0].d[1] || resized.cols != inputDims[0].d[2]) { - cv::cuda::resize(imgRGB, resized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR, stream); + // Classification: direct CPU resize (no letterbox padding) + cv::Mat cpuResized; + if (srcImg.rows != inputH || srcImg.cols != inputW) { + cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR); + } else { + cpuResized = srcImg; } - // Wait for all GPU ops to complete before returning GpuMats + // CPU BGR -> RGB + cv::Mat cpuRGB; + cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB); + + // Upload small image to GPU + cv::cuda::Stream stream; + cv::cuda::GpuMat gpuResized; + gpuResized.upload(cpuRGB, stream); stream.waitForCompletion(); // Convert to format expected by our inference engine - std::vector input{ std::move(resized) }; + std::vector input{ std::move(gpuResized) }; std::vector> inputs{ std::move(input) }; return inputs; } @@ -811,25 +806,17 @@ namespace ANSCENTER return {}; } - // Upload to GPU - cv::cuda::GpuMat img; - if (inputImage.channels() == 1) { - // Convert grayscale to BGR + // CPU preprocessing: resize + BGR->RGB before GPU upload + cv::Mat srcImg = inputImage; + if (srcImg.channels() == 1) { cv::Mat img3Channel; - cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR); - img.upload(img3Channel, stream); + cv::cvtColor(srcImg, img3Channel, cv::COLOR_GRAY2BGR); + srcImg = img3Channel; } - else { - img.upload(inputImage, stream); - } - - // Convert BGR to RGB - cv::cuda::GpuMat imgRGB; - cv::cuda::cvtColor(img, imgRGB, cv::COLOR_BGR2RGB, 0, stream); // Store original dimensions - int imgHeight = imgRGB.rows; - int imgWidth = imgRGB.cols; + int imgHeight = srcImg.rows; + int imgWidth = srcImg.cols; if (imgHeight <= 0 || imgWidth <= 0) { _logger.LogFatal("TENSORRTCL::PreprocessBatch", @@ -841,26 +828,25 @@ namespace ANSCENTER outMetadata.imgHeights.push_back(imgHeight); outMetadata.imgWidths.push_back(imgWidth); - // Calculate resize ratio - float ratio = 1.f / std::min( - inputDims[0].d[2] / static_cast(imgRGB.cols), - inputDims[0].d[1] / static_cast(imgRGB.rows) - ); - outMetadata.ratios.push_back(ratio); + // Classification: ratio is always 1.0 + outMetadata.ratios.push_back(1.f); - // Resize maintaining aspect ratio with padding - cv::cuda::GpuMat resized; - if (imgRGB.rows != inputDims[0].d[1] || imgRGB.cols != inputDims[0].d[2]) { - resized = Engine::resizeKeepAspectRatioPadRightBottom( - imgRGB, inputDims[0].d[1], inputDims[0].d[2] - ); - } - else { - resized = imgRGB; + // Classification: direct CPU resize (no letterbox padding) + cv::Mat cpuResized; + if (srcImg.rows != inputH || srcImg.cols != inputW) { + cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR); + } else { + cpuResized = srcImg; } + cv::Mat cpuRGB; + cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB); + + cv::cuda::GpuMat gpuResized; + gpuResized.upload(cpuRGB, stream); + // Add to batch - batchedImages.push_back(std::move(resized)); + batchedImages.push_back(std::move(gpuResized)); } // Wait for all GPU operations to complete diff --git a/modules/ANSODEngine/ANSTENSORRTPOSE.cpp b/modules/ANSODEngine/ANSTENSORRTPOSE.cpp index 239deda..f6c1a2c 100644 --- a/modules/ANSODEngine/ANSTENSORRTPOSE.cpp +++ b/modules/ANSODEngine/ANSTENSORRTPOSE.cpp @@ -508,41 +508,46 @@ namespace ANSCENTER const auto& inputDims = m_trtEngine->getInputDims(); const int inputH = inputDims[0].d[1]; const int inputW = inputDims[0].d[2]; - // Upload the image to GPU memory - cv::cuda::Stream stream; - cv::cuda::GpuMat img; - - if (inputImage.channels() == 1) { - // Convert grayscale to 3-channel BGR before uploading - cv::Mat img3Channel; - cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR); - img.upload(img3Channel, stream); - } - else { - img.upload(inputImage, stream); + // --- CPU preprocessing: resize + BGR->RGB before GPU upload --- + cv::Mat srcImg = inputImage; + if (srcImg.channels() == 1) { + cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR); } - // Convert to RGB - cv::cuda::GpuMat imgRGB; - cv::cuda::cvtColor(img, imgRGB, cv::COLOR_BGR2RGB, 0, stream); - stream.waitForCompletion(); - - // Set image size parameters - outMeta.imgHeight = imgRGB.rows; - outMeta.imgWidth = imgRGB.cols; + // Set image size parameters from ORIGINAL image + outMeta.imgHeight = srcImg.rows; + outMeta.imgWidth = srcImg.cols; if (outMeta.imgHeight > 0 && outMeta.imgWidth > 0) { - outMeta.ratio = 1.f / std::min(inputDims[0].d[2] / static_cast(imgRGB.cols), - inputDims[0].d[1] / static_cast(imgRGB.rows)); + outMeta.ratio = 1.f / std::min(inputDims[0].d[2] / static_cast(srcImg.cols), + inputDims[0].d[1] / static_cast(srcImg.rows)); - cv::cuda::GpuMat resized = imgRGB; + const auto& outputDims = m_trtEngine->getOutputDims(); + const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2; - // Resize to the model's expected input size while maintaining aspect ratio with padding - if (resized.rows != inputDims[0].d[1] || resized.cols != inputDims[0].d[2]) { - resized = Engine::resizeKeepAspectRatioPadRightBottom(imgRGB, inputDims[0].d[1], inputDims[0].d[2]); + // CPU resize to model input size + cv::Mat cpuResized; + if (srcImg.rows != inputH || srcImg.cols != inputW) { + if (isClassification) { + cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR); + } else { + cpuResized = Engine::cpuResizeKeepAspectRatioPadRightBottom(srcImg, inputH, inputW); + } + } else { + cpuResized = srcImg; } + // CPU BGR -> RGB + cv::Mat cpuRGB; + cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB); + + // Upload small image to GPU + cv::cuda::Stream stream; + cv::cuda::GpuMat gpuResized; + gpuResized.upload(cpuRGB, stream); + stream.waitForCompletion(); + // Convert to format expected by our inference engine - std::vector input{ std::move(resized) }; + std::vector input{ std::move(gpuResized) }; std::vector> inputs{ std::move(input) }; return inputs; } @@ -793,19 +798,13 @@ namespace ANSCENTER "Empty input image at index " + std::to_string(i), __FILE__, __LINE__); return {}; } - cv::cuda::GpuMat img; - if (inputImage.channels() == 1) { - cv::Mat img3Channel; - cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR); - img.upload(img3Channel, stream); + // CPU preprocessing: resize + BGR->RGB before GPU upload + cv::Mat srcImg = inputImage; + if (srcImg.channels() == 1) { + cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR); } - else { - img.upload(inputImage, stream); - } - cv::cuda::GpuMat imgRGB; - cv::cuda::cvtColor(img, imgRGB, cv::COLOR_BGR2RGB, 0, stream); - outMetadata.imgHeights[i] = imgRGB.rows; - outMetadata.imgWidths[i] = imgRGB.cols; + outMetadata.imgHeights[i] = srcImg.rows; + outMetadata.imgWidths[i] = srcImg.cols; if (outMetadata.imgHeights[i] <= 0 || outMetadata.imgWidths[i] <= 0) { _logger.LogFatal("ANSTENSORRTPOSE::PreprocessBatch", "Image " + std::to_string(i) + " has invalid dimensions (Width: " + @@ -813,13 +812,30 @@ namespace ANSCENTER std::to_string(outMetadata.imgHeights[i]) + ")", __FILE__, __LINE__); return {}; } - outMetadata.ratios[i] = 1.f / std::min(inputW / static_cast(imgRGB.cols), - inputH / static_cast(imgRGB.rows)); - cv::cuda::GpuMat resized = imgRGB; - if (resized.rows != inputH || resized.cols != inputW) { - resized = Engine::resizeKeepAspectRatioPadRightBottom(imgRGB, inputH, inputW); + + const auto& outputDims = m_trtEngine->getOutputDims(); + const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2; + + outMetadata.ratios[i] = isClassification ? 1.f : 1.f / std::min(inputW / static_cast(srcImg.cols), + inputH / static_cast(srcImg.rows)); + + cv::Mat cpuResized; + if (srcImg.rows != inputH || srcImg.cols != inputW) { + if (isClassification) { + cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR); + } else { + cpuResized = Engine::cpuResizeKeepAspectRatioPadRightBottom(srcImg, inputH, inputW); + } + } else { + cpuResized = srcImg; } - batchProcessed.push_back(std::move(resized)); + + cv::Mat cpuRGB; + cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB); + + cv::cuda::GpuMat gpuResized; + gpuResized.upload(cpuRGB, stream); + batchProcessed.push_back(std::move(gpuResized)); } stream.waitForCompletion(); std::vector> inputs; diff --git a/modules/ANSODEngine/ANSTENSORRTSEG.cpp b/modules/ANSODEngine/ANSTENSORRTSEG.cpp index 85a7b90..91eaeb9 100644 --- a/modules/ANSODEngine/ANSTENSORRTSEG.cpp +++ b/modules/ANSODEngine/ANSTENSORRTSEG.cpp @@ -561,41 +561,46 @@ namespace ANSCENTER const auto& inputDims = m_trtEngine->getInputDims(); const int inputH = inputDims[0].d[1]; const int inputW = inputDims[0].d[2]; - // Upload the image to GPU memory - cv::cuda::Stream stream; - cv::cuda::GpuMat img; - - if (inputImage.channels() == 1) { - // Convert grayscale to 3-channel BGR before uploading - cv::Mat img3Channel; - cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR); - img.upload(img3Channel, stream); - } - else { - img.upload(inputImage, stream); + // --- CPU preprocessing: resize + BGR->RGB before GPU upload --- + cv::Mat srcImg = inputImage; + if (srcImg.channels() == 1) { + cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR); } - // Convert to RGB - cv::cuda::GpuMat imgRGB; - cv::cuda::cvtColor(img, imgRGB, cv::COLOR_BGR2RGB, 0, stream); - stream.waitForCompletion(); - - // Set image size parameters - outMeta.imgHeight = imgRGB.rows; - outMeta.imgWidth = imgRGB.cols; + // Set image size parameters from ORIGINAL image + outMeta.imgHeight = srcImg.rows; + outMeta.imgWidth = srcImg.cols; if (outMeta.imgHeight > 0 && outMeta.imgWidth > 0) { - outMeta.ratio = 1.f / std::min(inputDims[0].d[2] / static_cast(imgRGB.cols), - inputDims[0].d[1] / static_cast(imgRGB.rows)); + outMeta.ratio = 1.f / std::min(inputDims[0].d[2] / static_cast(srcImg.cols), + inputDims[0].d[1] / static_cast(srcImg.rows)); - cv::cuda::GpuMat resized = imgRGB; + const auto& outputDims = m_trtEngine->getOutputDims(); + const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2; - // Resize to the model's expected input size while maintaining aspect ratio with padding - if (resized.rows != inputDims[0].d[1] || resized.cols != inputDims[0].d[2]) { - resized = Engine::resizeKeepAspectRatioPadRightBottom(imgRGB, inputDims[0].d[1], inputDims[0].d[2]); + // CPU resize to model input size + cv::Mat cpuResized; + if (srcImg.rows != inputH || srcImg.cols != inputW) { + if (isClassification) { + cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR); + } else { + cpuResized = Engine::cpuResizeKeepAspectRatioPadRightBottom(srcImg, inputH, inputW); + } + } else { + cpuResized = srcImg; } + // CPU BGR -> RGB + cv::Mat cpuRGB; + cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB); + + // Upload small image to GPU + cv::cuda::Stream stream; + cv::cuda::GpuMat gpuResized; + gpuResized.upload(cpuRGB, stream); + stream.waitForCompletion(); + // Convert to format expected by our inference engine - std::vector input{ std::move(resized) }; + std::vector input{ std::move(gpuResized) }; std::vector> inputs{ std::move(input) }; return inputs; } @@ -891,26 +896,15 @@ namespace ANSCENTER return {}; } - // Upload to GPU - cv::cuda::GpuMat img; - - // Convert grayscale to BGR if needed - if (inputImage.channels() == 1) { - cv::Mat img3Channel; - cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR); - img.upload(img3Channel, stream); + // CPU preprocessing: resize + BGR->RGB before GPU upload + cv::Mat srcImg = inputImage; + if (srcImg.channels() == 1) { + cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR); } - else { - img.upload(inputImage, stream); - } - - // Convert to RGB - cv::cuda::GpuMat imgRGB; - cv::cuda::cvtColor(img, imgRGB, cv::COLOR_BGR2RGB, 0, stream); // Store original dimensions - outMetadata.imgHeights[i] = imgRGB.rows; - outMetadata.imgWidths[i] = imgRGB.cols; + outMetadata.imgHeights[i] = srcImg.rows; + outMetadata.imgWidths[i] = srcImg.cols; if (outMetadata.imgHeights[i] <= 0 || outMetadata.imgWidths[i] <= 0) { _logger.LogFatal("TENSORRTSEG::PreprocessBatch", @@ -921,17 +915,31 @@ namespace ANSCENTER return {}; } - // Calculate ratio for this image - outMetadata.ratios[i] = 1.f / std::min(inputW / static_cast(imgRGB.cols), - inputH / static_cast(imgRGB.rows)); + const auto& outputDims = m_trtEngine->getOutputDims(); + const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2; - // Resize with padding - cv::cuda::GpuMat resized = imgRGB; - if (resized.rows != inputH || resized.cols != inputW) { - resized = Engine::resizeKeepAspectRatioPadRightBottom(imgRGB, inputH, inputW); + // Calculate ratio for this image + outMetadata.ratios[i] = isClassification ? 1.f : 1.f / std::min(inputW / static_cast(srcImg.cols), + inputH / static_cast(srcImg.rows)); + + // CPU resize to model input size + cv::Mat cpuResized; + if (srcImg.rows != inputH || srcImg.cols != inputW) { + if (isClassification) { + cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR); + } else { + cpuResized = Engine::cpuResizeKeepAspectRatioPadRightBottom(srcImg, inputH, inputW); + } + } else { + cpuResized = srcImg; } - batchProcessed.push_back(std::move(resized)); + cv::Mat cpuRGB; + cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB); + + cv::cuda::GpuMat gpuResized; + gpuResized.upload(cpuRGB, stream); + batchProcessed.push_back(std::move(gpuResized)); } stream.waitForCompletion(); diff --git a/modules/ANSODEngine/ANSTENSORTRTOD.cpp b/modules/ANSODEngine/ANSTENSORTRTOD.cpp index 4fafa33..9306a35 100644 --- a/modules/ANSODEngine/ANSTENSORTRTOD.cpp +++ b/modules/ANSODEngine/ANSTENSORTRTOD.cpp @@ -587,41 +587,46 @@ namespace ANSCENTER const auto& inputDims = m_trtEngine->getInputDims(); const int inputH = inputDims[0].d[1]; const int inputW = inputDims[0].d[2]; - // Upload the image to GPU memory - cv::cuda::Stream stream; - cv::cuda::GpuMat img; - - if (inputImage.channels() == 1) { - // Convert grayscale to 3-channel BGR before uploading - cv::Mat img3Channel; - cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR); - img.upload(img3Channel, stream); - } - else { - img.upload(inputImage, stream); + // --- CPU preprocessing: resize + BGR->RGB before GPU upload --- + cv::Mat srcImg = inputImage; + if (srcImg.channels() == 1) { + cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR); } - // Convert to RGB - cv::cuda::GpuMat imgRGB; - cv::cuda::cvtColor(img, imgRGB, cv::COLOR_BGR2RGB, 0, stream); - stream.waitForCompletion(); - - // Set image size parameters into per-call metadata (not shared members) - outMeta.imgHeight = static_cast(imgRGB.rows); - outMeta.imgWidth = static_cast(imgRGB.cols); + // Set image size parameters from ORIGINAL image (before resize) + outMeta.imgHeight = static_cast(srcImg.rows); + outMeta.imgWidth = static_cast(srcImg.cols); if (outMeta.imgHeight > 0 && outMeta.imgWidth > 0) { - outMeta.ratio = 1.f / std::min(inputDims[0].d[2] / static_cast(imgRGB.cols), - inputDims[0].d[1] / static_cast(imgRGB.rows)); + outMeta.ratio = 1.f / std::min(inputDims[0].d[2] / static_cast(srcImg.cols), + inputDims[0].d[1] / static_cast(srcImg.rows)); - cv::cuda::GpuMat resized = imgRGB; + const auto& outputDims = m_trtEngine->getOutputDims(); + const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2; - // Resize to the model's expected input size while maintaining aspect ratio with padding - if (resized.rows != inputDims[0].d[1] || resized.cols != inputDims[0].d[2]) { - resized = Engine::resizeKeepAspectRatioPadRightBottom(imgRGB, inputDims[0].d[1], inputDims[0].d[2]); + // CPU resize to model input size + cv::Mat cpuResized; + if (srcImg.rows != inputH || srcImg.cols != inputW) { + if (isClassification) { + cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR); + } else { + cpuResized = Engine::cpuResizeKeepAspectRatioPadRightBottom(srcImg, inputH, inputW); + } + } else { + cpuResized = srcImg; } + // CPU BGR -> RGB + cv::Mat cpuRGB; + cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB); + + // Upload small image to GPU + cv::cuda::Stream stream; + cv::cuda::GpuMat gpuResized; + gpuResized.upload(cpuRGB, stream); + stream.waitForCompletion(); + // Convert to format expected by our inference engine - std::vector input{ std::move(resized) }; + std::vector input{ std::move(gpuResized) }; std::vector> inputs{ std::move(input) }; return inputs; } @@ -1174,29 +1179,20 @@ namespace ANSCENTER return {}; } - cv::cuda::GpuMat img; - - if (inputImage.channels() == 1) { - cv::Mat img3Channel; - cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR); - img.upload(img3Channel, stream); - } - else if (inputImage.channels() == 3) { - img.upload(inputImage, stream); - } - else { + // CPU preprocessing: resize + BGR->RGB before GPU upload + cv::Mat srcImg = inputImage; + if (srcImg.channels() == 1) { + cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR); + } else if (srcImg.channels() != 3) { _logger.LogError("TENSORRTOD::PreprocessBatch", "Unsupported channel count at index " + std::to_string(i), __FILE__, __LINE__); return {}; } - cv::cuda::GpuMat imgRGB; - cv::cuda::cvtColor(img, imgRGB, cv::COLOR_BGR2RGB, 0, stream); - - // Store in output metadata - outMetadata.imgHeights[i] = imgRGB.rows; - outMetadata.imgWidths[i] = imgRGB.cols; + // Store in output metadata from ORIGINAL image + outMetadata.imgHeights[i] = srcImg.rows; + outMetadata.imgWidths[i] = srcImg.cols; if (outMetadata.imgHeights[i] <= 0 || outMetadata.imgWidths[i] <= 0) { _logger.LogError("TENSORRTOD::PreprocessBatch", @@ -1205,20 +1201,30 @@ namespace ANSCENTER return {}; } - const float scaleW = inputW / static_cast(imgRGB.cols); - const float scaleH = inputH / static_cast(imgRGB.rows); - outMetadata.ratios[i] = 1.f / std::min(scaleW, scaleH); + const auto& outputDims = m_trtEngine->getOutputDims(); + const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2; - cv::cuda::GpuMat resized; - if (imgRGB.rows != inputH || imgRGB.cols != inputW) { - resized = Engine::resizeKeepAspectRatioPadRightBottom( - imgRGB, inputH, inputW); - } - else { - resized = imgRGB; + const float scaleW = inputW / static_cast(srcImg.cols); + const float scaleH = inputH / static_cast(srcImg.rows); + outMetadata.ratios[i] = isClassification ? 1.f : 1.f / std::min(scaleW, scaleH); + + cv::Mat cpuResized; + if (srcImg.rows != inputH || srcImg.cols != inputW) { + if (isClassification) { + cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR); + } else { + cpuResized = Engine::cpuResizeKeepAspectRatioPadRightBottom(srcImg, inputH, inputW); + } + } else { + cpuResized = srcImg; } - batchProcessed.push_back(std::move(resized)); + cv::Mat cpuRGB; + cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB); + + cv::cuda::GpuMat gpuResized; + gpuResized.upload(cpuRGB, stream); + batchProcessed.push_back(std::move(gpuResized)); } stream.waitForCompletion(); diff --git a/modules/ANSODEngine/ANSYOLOV10RTOD.cpp b/modules/ANSODEngine/ANSYOLOV10RTOD.cpp index 8eec9b6..d16c184 100644 --- a/modules/ANSODEngine/ANSYOLOV10RTOD.cpp +++ b/modules/ANSODEngine/ANSYOLOV10RTOD.cpp @@ -519,46 +519,46 @@ namespace ANSCENTER const int inputH = inputDims[0].d[1]; const int inputW = inputDims[0].d[2]; - // Upload input image to GPU - cv::cuda::Stream stream; - cv::cuda::GpuMat img; - - if (inputImage.empty()) { - _logger.LogFatal("ANSYOLOV10RTOD::Preprocess", "Empty input image", __FILE__, __LINE__); - return {}; + // --- CPU preprocessing: resize + BGR->RGB before GPU upload --- + cv::Mat srcImg = inputImage; + if (srcImg.channels() == 1) { + cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR); } - // Convert grayscale to BGR if needed - if (inputImage.channels() == 1) { - cv::Mat img3Channel; - cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR); - img.upload(img3Channel, stream); - } - else { - img.upload(inputImage, stream); - } - - // Convert to RGB - cv::cuda::GpuMat imgRGB; - cv::cuda::cvtColor(img, imgRGB, cv::COLOR_BGR2RGB, 0, stream); - stream.waitForCompletion(); - - outMeta.imgHeight = imgRGB.rows; - outMeta.imgWidth = imgRGB.cols; + outMeta.imgHeight = srcImg.rows; + outMeta.imgWidth = srcImg.cols; if (outMeta.imgHeight > 0 && outMeta.imgWidth > 0) { - outMeta.ratio = 1.f / std::min(inputDims[0].d[2] / static_cast(imgRGB.cols), - inputDims[0].d[1] / static_cast(imgRGB.rows)); + outMeta.ratio = 1.f / std::min(inputDims[0].d[2] / static_cast(srcImg.cols), + inputDims[0].d[1] / static_cast(srcImg.rows)); - cv::cuda::GpuMat resized = imgRGB; + const auto& outputDims = m_trtEngine->getOutputDims(); + const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2; - // Resize to the model's expected input size while maintaining aspect ratio with padding - if (resized.rows != inputDims[0].d[1] || resized.cols != inputDims[0].d[2]) { - resized = Engine::resizeKeepAspectRatioPadRightBottom(imgRGB, inputDims[0].d[1], inputDims[0].d[2]); + // CPU resize to model input size + cv::Mat cpuResized; + if (srcImg.rows != inputH || srcImg.cols != inputW) { + if (isClassification) { + cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR); + } else { + cpuResized = Engine::cpuResizeKeepAspectRatioPadRightBottom(srcImg, inputH, inputW); + } + } else { + cpuResized = srcImg; } + // CPU BGR -> RGB + cv::Mat cpuRGB; + cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB); + + // Upload small image to GPU + cv::cuda::Stream stream; + cv::cuda::GpuMat gpuResized; + gpuResized.upload(cpuRGB, stream); + stream.waitForCompletion(); + // Convert to format expected by our inference engine - std::vector input{ std::move(resized) }; + std::vector input{ std::move(gpuResized) }; std::vector> inputs{ std::move(input) }; return inputs; } @@ -1058,26 +1058,15 @@ namespace ANSCENTER return {}; } - // Upload to GPU - cv::cuda::GpuMat img; - - // Convert grayscale to BGR if needed - if (inputImage.channels() == 1) { - cv::Mat img3Channel; - cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR); - img.upload(img3Channel, stream); + // CPU preprocessing: resize + BGR->RGB before GPU upload + cv::Mat srcImg = inputImage; + if (srcImg.channels() == 1) { + cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR); } - else { - img.upload(inputImage, stream); - } - - // Convert to RGB - cv::cuda::GpuMat imgRGB; - cv::cuda::cvtColor(img, imgRGB, cv::COLOR_BGR2RGB, 0, stream); // Store original dimensions - outMetadata.imgHeights[i] = imgRGB.rows; - outMetadata.imgWidths[i] = imgRGB.cols; + outMetadata.imgHeights[i] = srcImg.rows; + outMetadata.imgWidths[i] = srcImg.cols; if (outMetadata.imgHeights[i] <= 0 || outMetadata.imgWidths[i] <= 0) { _logger.LogFatal("ANSYOLOV10RTOD::PreprocessBatch", @@ -1088,17 +1077,31 @@ namespace ANSCENTER return {}; } - // Calculate ratio for this image - outMetadata.ratios[i] = 1.f / std::min(inputW / static_cast(imgRGB.cols), - inputH / static_cast(imgRGB.rows)); + const auto& outputDims = m_trtEngine->getOutputDims(); + const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2; - // Resize with padding - cv::cuda::GpuMat resized = imgRGB; - if (resized.rows != inputH || resized.cols != inputW) { - resized = Engine::resizeKeepAspectRatioPadRightBottom(imgRGB, inputH, inputW); + // Calculate ratio for this image + outMetadata.ratios[i] = isClassification ? 1.f : 1.f / std::min(inputW / static_cast(srcImg.cols), + inputH / static_cast(srcImg.rows)); + + // CPU resize to model input size + cv::Mat cpuResized; + if (srcImg.rows != inputH || srcImg.cols != inputW) { + if (isClassification) { + cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR); + } else { + cpuResized = Engine::cpuResizeKeepAspectRatioPadRightBottom(srcImg, inputH, inputW); + } + } else { + cpuResized = srcImg; } - batchProcessed.push_back(std::move(resized)); + cv::Mat cpuRGB; + cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB); + + cv::cuda::GpuMat gpuResized; + gpuResized.upload(cpuRGB, stream); + batchProcessed.push_back(std::move(gpuResized)); } stream.waitForCompletion(); diff --git a/modules/ANSODEngine/ANSYOLOV12RTOD.cpp b/modules/ANSODEngine/ANSYOLOV12RTOD.cpp index 92273d5..e8103c1 100644 --- a/modules/ANSODEngine/ANSYOLOV12RTOD.cpp +++ b/modules/ANSODEngine/ANSYOLOV12RTOD.cpp @@ -484,39 +484,47 @@ namespace ANSCENTER } const auto& inputDims = m_trtEngine->getInputDims(); - cv::cuda::Stream stream; - cv::cuda::GpuMat img; - - // Upload to GPU - if (inputImage.channels() == 1) { - cv::Mat img3Channel; - cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR); - img.upload(img3Channel, stream); - } - else { - img.upload(inputImage, stream); + // --- CPU preprocessing: resize + BGR->RGB before GPU upload --- + cv::Mat srcImg = inputImage; + if (srcImg.channels() == 1) { + cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR); } - // Convert BGR to RGB - cv::cuda::GpuMat imgRGB; - cv::cuda::cvtColor(img, imgRGB, cv::COLOR_BGR2RGB, 0, stream); - stream.waitForCompletion(); - - outMeta.imgHeight = imgRGB.rows; - outMeta.imgWidth = imgRGB.cols; + outMeta.imgHeight = srcImg.rows; + outMeta.imgWidth = srcImg.cols; if (outMeta.imgHeight > 0 && outMeta.imgWidth > 0) { - outMeta.ratio = 1.f / std::min(inputDims[0].d[2] / static_cast(imgRGB.cols), - inputDims[0].d[1] / static_cast(imgRGB.rows)); + outMeta.ratio = 1.f / std::min(inputDims[0].d[2] / static_cast(srcImg.cols), + inputDims[0].d[1] / static_cast(srcImg.rows)); - cv::cuda::GpuMat resized = imgRGB; + const auto& outputDims = m_trtEngine->getOutputDims(); + const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2; - // Resize to the model's expected input size while maintaining aspect ratio with padding - if (resized.rows != inputDims[0].d[1] || resized.cols != inputDims[0].d[2]) { - resized = Engine::resizeKeepAspectRatioPadRightBottom(imgRGB, inputDims[0].d[1], inputDims[0].d[2]); + // CPU resize to model input size + cv::Mat cpuResized; + const int inputH = inputDims[0].d[1]; + const int inputW = inputDims[0].d[2]; + if (srcImg.rows != inputH || srcImg.cols != inputW) { + if (isClassification) { + cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR); + } else { + cpuResized = Engine::cpuResizeKeepAspectRatioPadRightBottom(srcImg, inputH, inputW); + } + } else { + cpuResized = srcImg; } + // CPU BGR -> RGB + cv::Mat cpuRGB; + cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB); + + // Upload small image to GPU + cv::cuda::Stream stream; + cv::cuda::GpuMat gpuResized; + gpuResized.upload(cpuRGB, stream); + stream.waitForCompletion(); + // Convert to format expected by our inference engine - std::vector input{ std::move(resized) }; + std::vector input{ std::move(gpuResized) }; std::vector> inputs{ std::move(input) }; return inputs; } @@ -744,26 +752,15 @@ namespace ANSCENTER return {}; } - // Upload to GPU - cv::cuda::GpuMat img; - - // Convert grayscale to BGR if needed - if (inputImage.channels() == 1) { - cv::Mat img3Channel; - cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR); - img.upload(img3Channel, stream); + // CPU preprocessing: resize + BGR->RGB before GPU upload + cv::Mat srcImg = inputImage; + if (srcImg.channels() == 1) { + cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR); } - else { - img.upload(inputImage, stream); - } - - // Convert BGR to RGB - cv::cuda::GpuMat imgRGB; - cv::cuda::cvtColor(img, imgRGB, cv::COLOR_BGR2RGB, 0, stream); // Store original dimensions - outMetadata.imgHeights[i] = imgRGB.rows; - outMetadata.imgWidths[i] = imgRGB.cols; + outMetadata.imgHeights[i] = srcImg.rows; + outMetadata.imgWidths[i] = srcImg.cols; if (outMetadata.imgHeights[i] <= 0 || outMetadata.imgWidths[i] <= 0) { _logger.LogFatal("ANSYOLOV12RTOD::PreprocessBatch", @@ -774,17 +771,31 @@ namespace ANSCENTER return {}; } - // Calculate ratio for this image - outMetadata.ratios[i] = 1.f / std::min(inputW / static_cast(imgRGB.cols), - inputH / static_cast(imgRGB.rows)); + const auto& outputDims = m_trtEngine->getOutputDims(); + const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2; - // Resize with padding - cv::cuda::GpuMat resized = imgRGB; - if (resized.rows != inputH || resized.cols != inputW) { - resized = Engine::resizeKeepAspectRatioPadRightBottom(imgRGB, inputH, inputW); + // Calculate ratio for this image + outMetadata.ratios[i] = isClassification ? 1.f : 1.f / std::min(inputW / static_cast(srcImg.cols), + inputH / static_cast(srcImg.rows)); + + // CPU resize to model input size + cv::Mat cpuResized; + if (srcImg.rows != inputH || srcImg.cols != inputW) { + if (isClassification) { + cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR); + } else { + cpuResized = Engine::cpuResizeKeepAspectRatioPadRightBottom(srcImg, inputH, inputW); + } + } else { + cpuResized = srcImg; } - batchProcessed.push_back(std::move(resized)); + cv::Mat cpuRGB; + cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB); + + cv::cuda::GpuMat gpuResized; + gpuResized.upload(cpuRGB, stream); + batchProcessed.push_back(std::move(gpuResized)); } stream.waitForCompletion(); diff --git a/modules/ANSODEngine/SCRFDFaceDetector.cpp b/modules/ANSODEngine/SCRFDFaceDetector.cpp index da84189..ca65172 100644 --- a/modules/ANSODEngine/SCRFDFaceDetector.cpp +++ b/modules/ANSODEngine/SCRFDFaceDetector.cpp @@ -665,38 +665,37 @@ namespace ANSCENTER { } if (!usedNV12) { - // Standard BGR upload + resize + center-pad path - cv::cuda::Stream stream; - cv::cuda::GpuMat d_bgr; - + // CPU center-padded letterbox + BGR->RGB, then upload small image + cv::Mat srcImg; if (input.channels() == 1) { - cv::Mat img3Channel; - cv::cvtColor(input, img3Channel, cv::COLOR_GRAY2BGR); - d_bgr.upload(img3Channel, stream); - } - else if (input.channels() == 3) { - d_bgr.upload(input, stream); - } - else { + cv::cvtColor(input, srcImg, cv::COLOR_GRAY2BGR); + } else if (input.channels() == 3) { + srcImg = input; + } else { this->_logger.LogError("ANSSCRFDFD::Detect", "Unsupported channel count", __FILE__, __LINE__); return {}; } - cv::cuda::GpuMat d_rgb; - cv::cuda::GpuMat d_resized; + // CPU resize to unpadded size + cv::Mat cpuResized; + if (srcImg.rows != new_unpad_h || srcImg.cols != new_unpad_w) { + cv::resize(srcImg, cpuResized, cv::Size(new_unpad_w, new_unpad_h), 0, 0, cv::INTER_LINEAR); + } else { + cpuResized = srcImg; + } + + // CPU center-pad to net_w x net_h + cv::Mat cpuPadded(net_h, net_w, CV_8UC3, cv::Scalar(0, 0, 0)); + cpuResized.copyTo(cpuPadded(cv::Rect(dw, dh, new_unpad_w, new_unpad_h))); + + // CPU BGR -> RGB + cv::Mat cpuRGB; + cv::cvtColor(cpuPadded, cpuRGB, cv::COLOR_BGR2RGB); + + // Upload small padded image to GPU + cv::cuda::Stream stream; cv::cuda::GpuMat d_padded; - - cv::cuda::cvtColor(d_bgr, d_rgb, cv::COLOR_BGR2RGB, 0, stream); - cv::cuda::resize(d_rgb, d_resized, cv::Size(new_unpad_w, new_unpad_h), 0, 0, cv::INTER_LINEAR, stream); - - d_padded.create(net_h, net_w, d_resized.type()); - d_padded.setTo(cv::Scalar(0, 0, 0), stream); - - cv::Rect roi(dw, dh, new_unpad_w, new_unpad_h > 0 ? new_unpad_h : 0); - roi.width = new_unpad_w; - roi.height = new_unpad_h; - d_resized.copyTo(d_padded(roi), stream); - + d_padded.upload(cpuRGB, stream); stream.waitForCompletion(); std::vector inputVec; diff --git a/modules/ANSODEngine/engine.h b/modules/ANSODEngine/engine.h index 59208f4..49a0b69 100644 --- a/modules/ANSODEngine/engine.h +++ b/modules/ANSODEngine/engine.h @@ -173,6 +173,8 @@ public: // to the original reference frame. static cv::cuda::GpuMat resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat &input, size_t height, size_t width, const cv::Scalar &bgcolor = cv::Scalar(0, 0, 0)); + static cv::Mat cpuResizeKeepAspectRatioPadRightBottom(const cv::Mat &input, size_t height, size_t width, + const cv::Scalar &bgcolor = cv::Scalar(114, 114, 114)); [[nodiscard]] const std::vector &getInputDims() const override { return m_inputDims; }; [[nodiscard]] const std::vector &getOutputDims() const override { return m_outputDims; };