Use CPU resize before upload to GPU to remove PCIe bottleneck

2026-04-04 22:29:08 +11:00
parent e134ebdf15
commit 98681f4da6
15 changed files with 572 additions and 493 deletions
--- a/modules/ANSODEngine/ANSRTYOLO.cpp
+++ b/modules/ANSODEngine/ANSRTYOLO.cpp
@@ -462,50 +462,46 @@ namespace ANSCENTER {
            // Early-out if CUDA context is dead (sticky error from CUVID crash etc.)
            if (!m_nv12Helper.isCudaContextHealthy(_logger, "ANSRTYOLO")) return {};

-            cv::cuda::Stream stream;
-            cv::cuda::GpuMat gpuImg;
-
-            // Resolve source Mat (handle grayscale → BGR on CPU first)
-            if (inputImage.channels() == 1) {
-                cv::Mat img3Channel;
-                cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR);
-                gpuImg.upload(img3Channel, stream);
-            } else {
-                gpuImg.upload(inputImage, stream);
+            // --- CPU preprocessing: resize + BGR→RGB before GPU upload ---
+            // Reduces PCIe transfer from 25 MB (4K BGR) to 1.2 MB (640×640 RGB).
+            // With 12 AI tasks uploading concurrently, this eliminates the WDDM
+            // SRW lock convoy that causes 400-580ms preprocess spikes.
+            cv::Mat srcImg = inputImage;
+            if (srcImg.channels() == 1) {
+                cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR);
            }

-            // GPU: BGR → RGB
-            cv::cuda::GpuMat gpuRGB;
-            cv::cuda::cvtColor(gpuImg, gpuRGB, cv::COLOR_BGR2RGB, 0, stream);
-
-            outMeta.imgHeight = static_cast<float>(gpuRGB.rows);
-            outMeta.imgWidth  = static_cast<float>(gpuRGB.cols);
+            outMeta.imgHeight = static_cast<float>(srcImg.rows);
+            outMeta.imgWidth  = static_cast<float>(srcImg.cols);

            if (outMeta.imgHeight > 0 && outMeta.imgWidth > 0) {
                outMeta.ratio = 1.f / std::min(
-                    inputDims[0].d[2] / static_cast<float>(gpuRGB.cols),
-                    inputDims[0].d[1] / static_cast<float>(gpuRGB.rows));
+                    inputDims[0].d[2] / static_cast<float>(srcImg.cols),
+                    inputDims[0].d[1] / static_cast<float>(srcImg.rows));

-                // Check if model is classification (output ndims <= 2)
                const auto& outputDims = m_trtEngine->getOutputDims();
                const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2;

-                cv::cuda::GpuMat gpuResized;
-                if (gpuRGB.rows != inputH || gpuRGB.cols != inputW) {
+                // CPU resize to model input size
+                cv::Mat cpuResized;
+                if (srcImg.rows != inputH || srcImg.cols != inputW) {
                    if (isClassification) {
-                        // Classification: direct resize (no letterbox padding)
-                        cv::cuda::resize(gpuRGB, gpuResized, cv::Size(inputW, inputH),
-                                         0, 0, cv::INTER_LINEAR, stream);
-                    }
-                    else {
-                        // Detection/Seg/Pose/OBB: letterbox resize + right-bottom pad (on GPU)
-                        gpuResized = Engine<float>::resizeKeepAspectRatioPadRightBottom(
-                            gpuRGB, inputH, inputW);
+                        cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR);
+                    } else {
+                        cpuResized = Engine<float>::cpuResizeKeepAspectRatioPadRightBottom(srcImg, inputH, inputW);
                    }
                } else {
-                    gpuResized = gpuRGB;
+                    cpuResized = srcImg;
                }

+                // CPU BGR → RGB
+                cv::Mat cpuRGB;
+                cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB);
+
+                // Upload small image to GPU (1.2 MB instead of 25 MB for 4K)
+                cv::cuda::Stream stream;
+                cv::cuda::GpuMat gpuResized;
+                gpuResized.upload(cpuRGB, stream);
                stream.waitForCompletion();

                std::vector<cv::cuda::GpuMat> input{ std::move(gpuResized) };
@@ -878,26 +874,18 @@ namespace ANSCENTER {
                        "Empty input image at index " + std::to_string(i), __FILE__, __LINE__);
                    return {};
                }
-                cv::cuda::GpuMat img;
-                if (inputImage.channels() == 1) {
-                    cv::Mat img3Channel;
-                    cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR);
-                    img.upload(img3Channel, stream);
-                }
-                else if (inputImage.channels() == 3) {
-                    img.upload(inputImage, stream);
-                }
-                else {
+                // CPU preprocessing: resize + BGR→RGB before GPU upload
+                cv::Mat srcImg = inputImage;
+                if (srcImg.channels() == 1) {
+                    cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR);
+                } else if (srcImg.channels() != 3) {
                    _logger.LogError("ANSRTYOLO::PreprocessBatch",
                        "Unsupported channel count at index " + std::to_string(i), __FILE__, __LINE__);
                    return {};
                }

-                cv::cuda::GpuMat imgRGB;
-                cv::cuda::cvtColor(img, imgRGB, cv::COLOR_BGR2RGB, 0, stream);
-
-                outMetadata.imgHeights[i] = imgRGB.rows;
-                outMetadata.imgWidths[i]  = imgRGB.cols;
+                outMetadata.imgHeights[i] = srcImg.rows;
+                outMetadata.imgWidths[i]  = srcImg.cols;
                if (outMetadata.imgHeights[i] <= 0 || outMetadata.imgWidths[i] <= 0) {
                    _logger.LogError("ANSRTYOLO::PreprocessBatch",
                        "Invalid dimensions for image " + std::to_string(i), __FILE__, __LINE__);
@@ -907,23 +895,27 @@ namespace ANSCENTER {
                const auto& outputDims = m_trtEngine->getOutputDims();
                const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2;

-                const float scaleW = inputW / static_cast<float>(imgRGB.cols);
-                const float scaleH = inputH / static_cast<float>(imgRGB.rows);
+                const float scaleW = inputW / static_cast<float>(srcImg.cols);
+                const float scaleH = inputH / static_cast<float>(srcImg.rows);
                outMetadata.ratios[i] = isClassification ? 1.f : 1.f / std::min(scaleW, scaleH);

-                cv::cuda::GpuMat resized;
-                if (imgRGB.rows != inputH || imgRGB.cols != inputW) {
+                cv::Mat cpuResized;
+                if (srcImg.rows != inputH || srcImg.cols != inputW) {
                    if (isClassification) {
-                        cv::cuda::resize(imgRGB, resized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR, stream);
+                        cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR);
                    } else {
-                        resized = Engine<float>::resizeKeepAspectRatioPadRightBottom(imgRGB, inputH, inputW);
+                        cpuResized = Engine<float>::cpuResizeKeepAspectRatioPadRightBottom(srcImg, inputH, inputW);
                    }
-                }
-                else {
-                    resized = imgRGB;
+                } else {
+                    cpuResized = srcImg;
                }

-                batchProcessed.push_back(std::move(resized));
+                cv::Mat cpuRGB;
+                cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB);
+
+                cv::cuda::GpuMat gpuResized;
+                gpuResized.upload(cpuRGB, stream);
+                batchProcessed.push_back(std::move(gpuResized));
            }
            stream.waitForCompletion();

@@ -1804,10 +1796,10 @@ namespace ANSCENTER {
    std::vector<Object> ANSRTYOLO::DetectObjects(const cv::Mat& inputImage,
                                                  const std::string& camera_id) {
        try {
-            // --- Debug timer helper (zero-cost when _debugFlag == false) ---
+            // --- Debug timer helper ---
            using Clock = std::chrono::steady_clock;
            const bool dbg = _debugFlag;
-            auto t0 = dbg ? Clock::now() : Clock::time_point{};
+            auto t0 = Clock::now();  // Always set — used by ANS_DBG timing output
            auto tPrev = t0;
            auto elapsed = [&]() -> double {
                auto now = Clock::now();
@@ -2045,13 +2037,21 @@ namespace ANSCENTER {
            }

            // --- 6. Total pipeline time ---
-            if (dbg) {
+            {
                double msTotal = std::chrono::duration<double, std::milli>(Clock::now() - t0).count();
-                _logger.LogInfo("ANSRTYOLO::DetectObjects",
-                    "[DEBUG] " + camera_id + " | TOTAL=" + std::to_string(msTotal) +
-                    "ms (" + std::to_string(inputImage.cols) + "x" + std::to_string(inputImage.rows) +
-                    ") Results=" + std::to_string(results.size()),
-                    __FILE__, __LINE__);
+                if (dbg) {
+                    _logger.LogInfo("ANSRTYOLO::DetectObjects",
+                        "[DEBUG] " + camera_id + " | TOTAL=" + std::to_string(msTotal) +
+                        "ms (" + std::to_string(inputImage.cols) + "x" + std::to_string(inputImage.rows) +
+                        ") Results=" + std::to_string(results.size()),
+                        __FILE__, __LINE__);
+                }
+                // DebugView output — controlled by ANSCORE_DEBUGVIEW
+                double msPreproc = std::chrono::duration<double, std::milli>(_trtStart - t0).count();
+                ANS_DBG("YOLO_Timing", "cam=%s total=%.1fms preproc=%.1fms inf=%.1fms %dx%d det=%zu %s",
+                        camera_id.c_str(), msTotal, msPreproc, _trtMs,
+                        inputImage.cols, inputImage.rows, results.size(),
+                        usedNV12 ? "NV12" : "BGR");
            }

            return results;
@@ -2101,7 +2101,7 @@ namespace ANSCENTER {
            // --- Debug timer helper ---
            using Clock = std::chrono::steady_clock;
            const bool dbg = _debugFlag;
-            auto t0 = dbg ? Clock::now() : Clock::time_point{};
+            auto t0 = Clock::now();  // Always set — used by ANS_DBG timing output
            auto tPrev = t0;
            auto elapsed = [&]() -> double {
                auto now = Clock::now();
@@ -2350,19 +2350,23 @@ namespace ANSCENTER {
                }
            }

-            if (dbg) {
-                double msPostprocess = elapsed();
+            {
+                double msPostprocess = dbg ? elapsed() : 0;
                double msTotal = std::chrono::duration<double, std::milli>(Clock::now() - t0).count();
-                _logger.LogInfo("ANSRTYOLO::DetectObjectsBatch",
-                    "[DEBUG] " + camera_id +
-                    " batch=" + std::to_string(realCount) +
-                    " | SetDev=" + std::to_string(msSetDevice) +
-                    "ms Pad=" + std::to_string(msPad) +
-                    "ms Preproc=" + std::to_string(msPreprocess) +
-                    "ms Inf=" + std::to_string(msInference) +
-                    "ms Postproc=" + std::to_string(msPostprocess) +
-                    "ms TOTAL=" + std::to_string(msTotal) + "ms",
-                    __FILE__, __LINE__);
+                if (dbg) {
+                    _logger.LogInfo("ANSRTYOLO::DetectObjectsBatch",
+                        "[DEBUG] " + camera_id +
+                        " batch=" + std::to_string(realCount) +
+                        " | SetDev=" + std::to_string(msSetDevice) +
+                        "ms Pad=" + std::to_string(msPad) +
+                        "ms Preproc=" + std::to_string(msPreprocess) +
+                        "ms Inf=" + std::to_string(msInference) +
+                        "ms Postproc=" + std::to_string(msPostprocess) +
+                        "ms TOTAL=" + std::to_string(msTotal) + "ms",
+                        __FILE__, __LINE__);
+                }
+                ANS_DBG("YOLO_Timing", "cam=%s batch=%d total=%.1fms preproc=%.1fms inf=%.1fms",
+                        camera_id.c_str(), realCount, msTotal, msPreprocess, msInference);
            }

            return batchDetections;