Use CPU resize before upload to GPU to remove PCIe bottleneck

2026-04-04 22:29:08 +11:00
parent e134ebdf15
commit 98681f4da6
15 changed files with 572 additions and 493 deletions
--- a/modules/ANSODEngine/ANSYOLOV10RTOD.cpp
+++ b/modules/ANSODEngine/ANSYOLOV10RTOD.cpp
@@ -519,46 +519,46 @@ namespace ANSCENTER
            const int inputH = inputDims[0].d[1];
            const int inputW = inputDims[0].d[2];

-            // Upload input image to GPU
-            cv::cuda::Stream stream;
-            cv::cuda::GpuMat img;
-
-            if (inputImage.empty()) {
-                _logger.LogFatal("ANSYOLOV10RTOD::Preprocess", "Empty input image", __FILE__, __LINE__);
-                return {};
+            // --- CPU preprocessing: resize + BGR->RGB before GPU upload ---
+            cv::Mat srcImg = inputImage;
+            if (srcImg.channels() == 1) {
+                cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR);
            }

-            // Convert grayscale to BGR if needed
-            if (inputImage.channels() == 1) {
-                cv::Mat img3Channel;
-                cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR);
-                img.upload(img3Channel, stream);
-            }
-            else {
-                img.upload(inputImage, stream);
-            }
-
-            // Convert to RGB
-            cv::cuda::GpuMat imgRGB;
-            cv::cuda::cvtColor(img, imgRGB, cv::COLOR_BGR2RGB, 0, stream);
-            stream.waitForCompletion();
-
-            outMeta.imgHeight = imgRGB.rows;
-            outMeta.imgWidth = imgRGB.cols;
+            outMeta.imgHeight = srcImg.rows;
+            outMeta.imgWidth = srcImg.cols;

            if (outMeta.imgHeight > 0 && outMeta.imgWidth > 0) {
-                outMeta.ratio = 1.f / std::min(inputDims[0].d[2] / static_cast<float>(imgRGB.cols),
-                    inputDims[0].d[1] / static_cast<float>(imgRGB.rows));
+                outMeta.ratio = 1.f / std::min(inputDims[0].d[2] / static_cast<float>(srcImg.cols),
+                    inputDims[0].d[1] / static_cast<float>(srcImg.rows));

-                cv::cuda::GpuMat resized = imgRGB;
+                const auto& outputDims = m_trtEngine->getOutputDims();
+                const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2;

-                // Resize to the model's expected input size while maintaining aspect ratio with padding
-                if (resized.rows != inputDims[0].d[1] || resized.cols != inputDims[0].d[2]) {
-                    resized = Engine<float>::resizeKeepAspectRatioPadRightBottom(imgRGB, inputDims[0].d[1], inputDims[0].d[2]);
+                // CPU resize to model input size
+                cv::Mat cpuResized;
+                if (srcImg.rows != inputH || srcImg.cols != inputW) {
+                    if (isClassification) {
+                        cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR);
+                    } else {
+                        cpuResized = Engine<float>::cpuResizeKeepAspectRatioPadRightBottom(srcImg, inputH, inputW);
+                    }
+                } else {
+                    cpuResized = srcImg;
                }

+                // CPU BGR -> RGB
+                cv::Mat cpuRGB;
+                cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB);
+
+                // Upload small image to GPU
+                cv::cuda::Stream stream;
+                cv::cuda::GpuMat gpuResized;
+                gpuResized.upload(cpuRGB, stream);
+                stream.waitForCompletion();
+
                // Convert to format expected by our inference engine
-                std::vector<cv::cuda::GpuMat> input{ std::move(resized) };
+                std::vector<cv::cuda::GpuMat> input{ std::move(gpuResized) };
                std::vector<std::vector<cv::cuda::GpuMat>> inputs{ std::move(input) };
                return inputs;
            }
@@ -1058,26 +1058,15 @@ namespace ANSCENTER
                    return {};
                }

-                // Upload to GPU
-                cv::cuda::GpuMat img;
-
-                // Convert grayscale to BGR if needed
-                if (inputImage.channels() == 1) {
-                    cv::Mat img3Channel;
-                    cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR);
-                    img.upload(img3Channel, stream);
+                // CPU preprocessing: resize + BGR->RGB before GPU upload
+                cv::Mat srcImg = inputImage;
+                if (srcImg.channels() == 1) {
+                    cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR);
                }
-                else {
-                    img.upload(inputImage, stream);
-                }
-
-                // Convert to RGB
-                cv::cuda::GpuMat imgRGB;
-                cv::cuda::cvtColor(img, imgRGB, cv::COLOR_BGR2RGB, 0, stream);

                // Store original dimensions
-                outMetadata.imgHeights[i] = imgRGB.rows;
-                outMetadata.imgWidths[i] = imgRGB.cols;
+                outMetadata.imgHeights[i] = srcImg.rows;
+                outMetadata.imgWidths[i] = srcImg.cols;

                if (outMetadata.imgHeights[i] <= 0 || outMetadata.imgWidths[i] <= 0) {
                    _logger.LogFatal("ANSYOLOV10RTOD::PreprocessBatch",
@@ -1088,17 +1077,31 @@ namespace ANSCENTER
                    return {};
                }

-                // Calculate ratio for this image
-                outMetadata.ratios[i] = 1.f / std::min(inputW / static_cast<float>(imgRGB.cols),
-                    inputH / static_cast<float>(imgRGB.rows));
+                const auto& outputDims = m_trtEngine->getOutputDims();
+                const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2;

-                // Resize with padding
-                cv::cuda::GpuMat resized = imgRGB;
-                if (resized.rows != inputH || resized.cols != inputW) {
-                    resized = Engine<float>::resizeKeepAspectRatioPadRightBottom(imgRGB, inputH, inputW);
+                // Calculate ratio for this image
+                outMetadata.ratios[i] = isClassification ? 1.f : 1.f / std::min(inputW / static_cast<float>(srcImg.cols),
+                    inputH / static_cast<float>(srcImg.rows));
+
+                // CPU resize to model input size
+                cv::Mat cpuResized;
+                if (srcImg.rows != inputH || srcImg.cols != inputW) {
+                    if (isClassification) {
+                        cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR);
+                    } else {
+                        cpuResized = Engine<float>::cpuResizeKeepAspectRatioPadRightBottom(srcImg, inputH, inputW);
+                    }
+                } else {
+                    cpuResized = srcImg;
                }

-                batchProcessed.push_back(std::move(resized));
+                cv::Mat cpuRGB;
+                cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB);
+
+                cv::cuda::GpuMat gpuResized;
+                gpuResized.upload(cpuRGB, stream);
+                batchProcessed.push_back(std::move(gpuResized));
            }

            stream.waitForCompletion();