Resize on the CPU before uploading to the GPU to remove the PCIe bottleneck

This commit is contained in:
2026-04-04 22:29:08 +11:00
parent e134ebdf15
commit 98681f4da6
15 changed files with 572 additions and 493 deletions

View File

@@ -534,27 +534,15 @@ namespace ANSCENTER
const int inputH = inputDims[0].d[1];
const int inputW = inputDims[0].d[2];
// Upload the image to GPU memory
cv::cuda::Stream stream; // Create a custom stream
cv::cuda::GpuMat img;
if (inputImage.channels() == 1) {
// Convert grayscale to 3-channel BGR before uploading
cv::Mat img3Channel;
cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR);
img.upload(img3Channel, stream);
// --- CPU preprocessing: resize + BGR->RGB before GPU upload ---
cv::Mat srcImg = inputImage;
if (srcImg.channels() == 1) {
cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR);
}
else {
img.upload(inputImage, stream);
}
// Convert BGR to RGB
cv::cuda::GpuMat imgRGB;
cv::cuda::cvtColor(img, imgRGB, cv::COLOR_BGR2RGB, 0, stream);
// These parameters will be used in the post-processing stage
outMeta.imgHeight = imgRGB.rows;
outMeta.imgWidth = imgRGB.cols;
outMeta.imgHeight = srcImg.rows;
outMeta.imgWidth = srcImg.cols;
if (outMeta.imgHeight <= 0 || outMeta.imgWidth <= 0) {
_logger.LogFatal("TENSORRTCL::Preprocess", "Image height or width is zero", __FILE__, __LINE__);
@@ -564,19 +552,26 @@ namespace ANSCENTER
if (outMeta.imgHeight > 0 && outMeta.imgWidth > 0) {
outMeta.ratio = 1.f;
cv::cuda::GpuMat resized = imgRGB;
// Classification: direct resize (no letterbox padding) — matches ANSONNXCL
// Must use explicit stream to avoid conflict with CUDA Graph capture on null stream
if (resized.rows != inputDims[0].d[1] || resized.cols != inputDims[0].d[2]) {
cv::cuda::resize(imgRGB, resized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR, stream);
// Classification: direct CPU resize (no letterbox padding)
cv::Mat cpuResized;
if (srcImg.rows != inputH || srcImg.cols != inputW) {
cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR);
} else {
cpuResized = srcImg;
}
// Wait for all GPU ops to complete before returning GpuMats
// CPU BGR -> RGB
cv::Mat cpuRGB;
cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB);
// Upload small image to GPU
cv::cuda::Stream stream;
cv::cuda::GpuMat gpuResized;
gpuResized.upload(cpuRGB, stream);
stream.waitForCompletion();
// Convert to format expected by our inference engine
std::vector<cv::cuda::GpuMat> input{ std::move(resized) };
std::vector<cv::cuda::GpuMat> input{ std::move(gpuResized) };
std::vector<std::vector<cv::cuda::GpuMat>> inputs{ std::move(input) };
return inputs;
}
@@ -811,25 +806,17 @@ namespace ANSCENTER
return {};
}
// Upload to GPU
cv::cuda::GpuMat img;
if (inputImage.channels() == 1) {
// Convert grayscale to BGR
// CPU preprocessing: resize + BGR->RGB before GPU upload
cv::Mat srcImg = inputImage;
if (srcImg.channels() == 1) {
cv::Mat img3Channel;
cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR);
img.upload(img3Channel, stream);
cv::cvtColor(srcImg, img3Channel, cv::COLOR_GRAY2BGR);
srcImg = img3Channel;
}
else {
img.upload(inputImage, stream);
}
// Convert BGR to RGB
cv::cuda::GpuMat imgRGB;
cv::cuda::cvtColor(img, imgRGB, cv::COLOR_BGR2RGB, 0, stream);
// Store original dimensions
int imgHeight = imgRGB.rows;
int imgWidth = imgRGB.cols;
int imgHeight = srcImg.rows;
int imgWidth = srcImg.cols;
if (imgHeight <= 0 || imgWidth <= 0) {
_logger.LogFatal("TENSORRTCL::PreprocessBatch",
@@ -841,26 +828,25 @@ namespace ANSCENTER
outMetadata.imgHeights.push_back(imgHeight);
outMetadata.imgWidths.push_back(imgWidth);
// Calculate resize ratio
float ratio = 1.f / std::min(
inputDims[0].d[2] / static_cast<float>(imgRGB.cols),
inputDims[0].d[1] / static_cast<float>(imgRGB.rows)
);
outMetadata.ratios.push_back(ratio);
// Classification: ratio is always 1.0
outMetadata.ratios.push_back(1.f);
// Resize maintaining aspect ratio with padding
cv::cuda::GpuMat resized;
if (imgRGB.rows != inputDims[0].d[1] || imgRGB.cols != inputDims[0].d[2]) {
resized = Engine<float>::resizeKeepAspectRatioPadRightBottom(
imgRGB, inputDims[0].d[1], inputDims[0].d[2]
);
}
else {
resized = imgRGB;
// Classification: direct CPU resize (no letterbox padding)
cv::Mat cpuResized;
if (srcImg.rows != inputH || srcImg.cols != inputW) {
cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR);
} else {
cpuResized = srcImg;
}
cv::Mat cpuRGB;
cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB);
cv::cuda::GpuMat gpuResized;
gpuResized.upload(cpuRGB, stream);
// Add to batch
batchedImages.push_back(std::move(resized));
batchedImages.push_back(std::move(gpuResized));
}
// Wait for all GPU operations to complete