Resize and convert BGR->RGB on the CPU before uploading to the GPU, so only the small network-input-sized image crosses PCIe

2026-04-04 22:29:08 +11:00
parent e134ebdf15
commit 98681f4da6
15 changed files with 572 additions and 493 deletions
--- a/modules/ANSODEngine/ANSTENSORRTCL.cpp
+++ b/modules/ANSODEngine/ANSTENSORRTCL.cpp
@@ -534,27 +534,15 @@ namespace ANSCENTER
 			const int inputH = inputDims[0].d[1];
 			const int inputW = inputDims[0].d[2];

-			// Upload the image to GPU memory
-			cv::cuda::Stream stream;  // Create a custom stream
-			cv::cuda::GpuMat img;
-
-			if (inputImage.channels() == 1) {
-				// Convert grayscale to 3-channel BGR before uploading
-				cv::Mat img3Channel;
-				cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR);
-				img.upload(img3Channel, stream);
+			// --- CPU preprocessing: resize + BGR->RGB before GPU upload ---
+			cv::Mat srcImg = inputImage;
+			if (srcImg.channels() == 1) {
+				cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR);
 			}
-			else {
-				img.upload(inputImage, stream);
-			}
-
-			// Convert BGR to RGB
-			cv::cuda::GpuMat imgRGB;
-			cv::cuda::cvtColor(img, imgRGB, cv::COLOR_BGR2RGB, 0, stream);

 			// These parameters will be used in the post-processing stage
-			outMeta.imgHeight = imgRGB.rows;
-			outMeta.imgWidth = imgRGB.cols;
+			outMeta.imgHeight = srcImg.rows;
+			outMeta.imgWidth = srcImg.cols;

 			if (outMeta.imgHeight <= 0 || outMeta.imgWidth <= 0) {
 				_logger.LogFatal("TENSORRTCL::Preprocess", "Image height or width is zero", __FILE__, __LINE__);
@@ -564,19 +552,26 @@ namespace ANSCENTER
 			if (outMeta.imgHeight > 0 && outMeta.imgWidth > 0) {
 				outMeta.ratio = 1.f;

-				cv::cuda::GpuMat resized = imgRGB;
-
-				// Classification: direct resize (no letterbox padding) — matches ANSONNXCL
-				// Must use explicit stream to avoid conflict with CUDA Graph capture on null stream
-				if (resized.rows != inputDims[0].d[1] || resized.cols != inputDims[0].d[2]) {
-					cv::cuda::resize(imgRGB, resized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR, stream);
+				// Classification: direct CPU resize (no letterbox padding)
+				cv::Mat cpuResized;
+				if (srcImg.rows != inputH || srcImg.cols != inputW) {
+					cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR);
+				} else {
+					cpuResized = srcImg;
 				}

-				// Wait for all GPU ops to complete before returning GpuMats
+				// CPU BGR -> RGB
+				cv::Mat cpuRGB;
+				cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB);
+
+				// Upload small image to GPU
+				cv::cuda::Stream stream;
+				cv::cuda::GpuMat gpuResized;
+				gpuResized.upload(cpuRGB, stream);
 				stream.waitForCompletion();

 				// Convert to format expected by our inference engine
-				std::vector<cv::cuda::GpuMat> input{ std::move(resized) };
+				std::vector<cv::cuda::GpuMat> input{ std::move(gpuResized) };
 				std::vector<std::vector<cv::cuda::GpuMat>> inputs{ std::move(input) };
 				return inputs;
 			}
@@ -811,25 +806,17 @@ namespace ANSCENTER
 					return {};
 				}

-				// Upload to GPU
-				cv::cuda::GpuMat img;
-				if (inputImage.channels() == 1) {
-					// Convert grayscale to BGR
+				// CPU preprocessing: resize + BGR->RGB before GPU upload
+				cv::Mat srcImg = inputImage;
+				if (srcImg.channels() == 1) {
 					cv::Mat img3Channel;
-					cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR);
-					img.upload(img3Channel, stream);
+					cv::cvtColor(srcImg, img3Channel, cv::COLOR_GRAY2BGR);
+					srcImg = img3Channel;
 				}
-				else {
-					img.upload(inputImage, stream);
-				}
-
-				// Convert BGR to RGB
-				cv::cuda::GpuMat imgRGB;
-				cv::cuda::cvtColor(img, imgRGB, cv::COLOR_BGR2RGB, 0, stream);

 				// Store original dimensions
-				int imgHeight = imgRGB.rows;
-				int imgWidth = imgRGB.cols;
+				int imgHeight = srcImg.rows;
+				int imgWidth = srcImg.cols;

 				if (imgHeight <= 0 || imgWidth <= 0) {
 					_logger.LogFatal("TENSORRTCL::PreprocessBatch",
@@ -841,26 +828,25 @@ namespace ANSCENTER
 				outMetadata.imgHeights.push_back(imgHeight);
 				outMetadata.imgWidths.push_back(imgWidth);

-				// Calculate resize ratio
-				float ratio = 1.f / std::min(
-					inputDims[0].d[2] / static_cast<float>(imgRGB.cols),
-					inputDims[0].d[1] / static_cast<float>(imgRGB.rows)
-				);
-				outMetadata.ratios.push_back(ratio);
+				// Classification: ratio is always 1.0
+				outMetadata.ratios.push_back(1.f);

-				// Resize maintaining aspect ratio with padding
-				cv::cuda::GpuMat resized;
-				if (imgRGB.rows != inputDims[0].d[1] || imgRGB.cols != inputDims[0].d[2]) {
-					resized = Engine<float>::resizeKeepAspectRatioPadRightBottom(
-						imgRGB, inputDims[0].d[1], inputDims[0].d[2]
-					);
-				}
-				else {
-					resized = imgRGB;
+				// Classification: direct CPU resize (no letterbox padding)
+				cv::Mat cpuResized;
+				if (srcImg.rows != inputH || srcImg.cols != inputW) {
+					cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR);
+				} else {
+					cpuResized = srcImg;
 				}

+				cv::Mat cpuRGB;
+				cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB);
+
+				cv::cuda::GpuMat gpuResized;
+				gpuResized.upload(cpuRGB, stream);
+
 				// Add to batch
-				batchedImages.push_back(std::move(resized));
+				batchedImages.push_back(std::move(gpuResized));
 			}

 			// Wait for all GPU operations to complete