Use CPU resize before upload to GPU to remove PCIe bottleneck

2026-04-04 22:29:08 +11:00
parent e134ebdf15
commit 98681f4da6
15 changed files with 572 additions and 493 deletions
--- a/modules/ANSODEngine/ANSTENSORTRTOD.cpp
+++ b/modules/ANSODEngine/ANSTENSORTRTOD.cpp
@@ -587,41 +587,46 @@ namespace ANSCENTER
 			const auto& inputDims = m_trtEngine->getInputDims();
 			const int inputH = inputDims[0].d[1];
 			const int inputW = inputDims[0].d[2];
-			// Upload the image to GPU memory
-			cv::cuda::Stream stream;
-			cv::cuda::GpuMat img;
-
-			if (inputImage.channels() == 1) {
-				// Convert grayscale to 3-channel BGR before uploading
-				cv::Mat img3Channel;
-				cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR);
-				img.upload(img3Channel, stream);
-			}
-			else {
-				img.upload(inputImage, stream);
+			// --- CPU preprocessing: resize + BGR->RGB before GPU upload ---
+			cv::Mat srcImg = inputImage;
+			if (srcImg.channels() == 1) {
+				cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR);
 			}

-			// Convert to RGB
-			cv::cuda::GpuMat imgRGB;
-			cv::cuda::cvtColor(img, imgRGB, cv::COLOR_BGR2RGB, 0, stream);
-			stream.waitForCompletion();
-
-			// Set image size parameters into per-call metadata (not shared members)
-			outMeta.imgHeight = static_cast<float>(imgRGB.rows);
-			outMeta.imgWidth  = static_cast<float>(imgRGB.cols);
+			// Set image size parameters from ORIGINAL image (before resize)
+			outMeta.imgHeight = static_cast<float>(srcImg.rows);
+			outMeta.imgWidth  = static_cast<float>(srcImg.cols);
 			if (outMeta.imgHeight > 0 && outMeta.imgWidth > 0) {
-				outMeta.ratio = 1.f / std::min(inputDims[0].d[2] / static_cast<float>(imgRGB.cols),
-					inputDims[0].d[1] / static_cast<float>(imgRGB.rows));
+				outMeta.ratio = 1.f / std::min(inputDims[0].d[2] / static_cast<float>(srcImg.cols),
+					inputDims[0].d[1] / static_cast<float>(srcImg.rows));

-				cv::cuda::GpuMat resized = imgRGB;
+				const auto& outputDims = m_trtEngine->getOutputDims();
+				const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2;

-				// Resize to the model's expected input size while maintaining aspect ratio with padding
-				if (resized.rows != inputDims[0].d[1] || resized.cols != inputDims[0].d[2]) {
-					resized = Engine<float>::resizeKeepAspectRatioPadRightBottom(imgRGB, inputDims[0].d[1], inputDims[0].d[2]);
+				// CPU resize to model input size
+				cv::Mat cpuResized;
+				if (srcImg.rows != inputH || srcImg.cols != inputW) {
+					if (isClassification) {
+						cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR);
+					} else {
+						cpuResized = Engine<float>::cpuResizeKeepAspectRatioPadRightBottom(srcImg, inputH, inputW);
+					}
+				} else {
+					cpuResized = srcImg;
 				}

+				// CPU BGR -> RGB
+				cv::Mat cpuRGB;
+				cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB);
+
+				// Upload small image to GPU
+				cv::cuda::Stream stream;
+				cv::cuda::GpuMat gpuResized;
+				gpuResized.upload(cpuRGB, stream);
+				stream.waitForCompletion();
+
 				// Convert to format expected by our inference engine
-				std::vector<cv::cuda::GpuMat> input{ std::move(resized) };
+				std::vector<cv::cuda::GpuMat> input{ std::move(gpuResized) };
 				std::vector<std::vector<cv::cuda::GpuMat>> inputs{ std::move(input) };
 				return inputs;
 			}
@@ -1174,29 +1179,20 @@ namespace ANSCENTER
 				return {};
 			}

-			cv::cuda::GpuMat img;
-
-			if (inputImage.channels() == 1) {
-				cv::Mat img3Channel;
-				cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR);
-				img.upload(img3Channel, stream);
-			}
-			else if (inputImage.channels() == 3) {
-				img.upload(inputImage, stream);
-			}
-			else {
+			// CPU preprocessing: resize + BGR->RGB before GPU upload
+			cv::Mat srcImg = inputImage;
+			if (srcImg.channels() == 1) {
+				cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR);
+			} else if (srcImg.channels() != 3) {
 				_logger.LogError("TENSORRTOD::PreprocessBatch",
 					"Unsupported channel count at index " + std::to_string(i),
 					__FILE__, __LINE__);
 				return {};
 			}

-			cv::cuda::GpuMat imgRGB;
-			cv::cuda::cvtColor(img, imgRGB, cv::COLOR_BGR2RGB, 0, stream);
-
-			// Store in output metadata
-			outMetadata.imgHeights[i] = imgRGB.rows;
-			outMetadata.imgWidths[i] = imgRGB.cols;
+			// Store in output metadata from ORIGINAL image
+			outMetadata.imgHeights[i] = srcImg.rows;
+			outMetadata.imgWidths[i] = srcImg.cols;

 			if (outMetadata.imgHeights[i] <= 0 || outMetadata.imgWidths[i] <= 0) {
 				_logger.LogError("TENSORRTOD::PreprocessBatch",
@@ -1205,20 +1201,30 @@ namespace ANSCENTER
 				return {};
 			}

-			const float scaleW = inputW / static_cast<float>(imgRGB.cols);
-			const float scaleH = inputH / static_cast<float>(imgRGB.rows);
-			outMetadata.ratios[i] = 1.f / std::min(scaleW, scaleH);
+			const auto& outputDims = m_trtEngine->getOutputDims();
+			const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2;

-			cv::cuda::GpuMat resized;
-			if (imgRGB.rows != inputH || imgRGB.cols != inputW) {
-				resized = Engine<float>::resizeKeepAspectRatioPadRightBottom(
-					imgRGB, inputH, inputW);
-			}
-			else {
-				resized = imgRGB;
+			const float scaleW = inputW / static_cast<float>(srcImg.cols);
+			const float scaleH = inputH / static_cast<float>(srcImg.rows);
+			outMetadata.ratios[i] = isClassification ? 1.f : 1.f / std::min(scaleW, scaleH);
+
+			cv::Mat cpuResized;
+			if (srcImg.rows != inputH || srcImg.cols != inputW) {
+				if (isClassification) {
+					cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR);
+				} else {
+					cpuResized = Engine<float>::cpuResizeKeepAspectRatioPadRightBottom(srcImg, inputH, inputW);
+				}
+			} else {
+				cpuResized = srcImg;
 			}

-			batchProcessed.push_back(std::move(resized));
+			cv::Mat cpuRGB;
+			cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB);
+
+			cv::cuda::GpuMat gpuResized;
+			gpuResized.upload(cpuRGB, stream);
+			batchProcessed.push_back(std::move(gpuResized));
 			}

 			stream.waitForCompletion();