Resize on the CPU before uploading to the GPU to remove the PCIe bottleneck

This commit is contained in:
2026-04-04 22:29:08 +11:00
parent e134ebdf15
commit 98681f4da6
15 changed files with 572 additions and 493 deletions

View File

@@ -519,46 +519,46 @@ namespace ANSCENTER
const int inputH = inputDims[0].d[1];
const int inputW = inputDims[0].d[2];
// Upload input image to GPU
cv::cuda::Stream stream;
cv::cuda::GpuMat img;
if (inputImage.empty()) {
_logger.LogFatal("ANSYOLOV10RTOD::Preprocess", "Empty input image", __FILE__, __LINE__);
return {};
// --- CPU preprocessing: resize + BGR->RGB before GPU upload ---
cv::Mat srcImg = inputImage;
if (srcImg.channels() == 1) {
cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR);
}
// Convert grayscale to BGR if needed
if (inputImage.channels() == 1) {
cv::Mat img3Channel;
cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR);
img.upload(img3Channel, stream);
}
else {
img.upload(inputImage, stream);
}
// Convert to RGB
cv::cuda::GpuMat imgRGB;
cv::cuda::cvtColor(img, imgRGB, cv::COLOR_BGR2RGB, 0, stream);
stream.waitForCompletion();
outMeta.imgHeight = imgRGB.rows;
outMeta.imgWidth = imgRGB.cols;
outMeta.imgHeight = srcImg.rows;
outMeta.imgWidth = srcImg.cols;
if (outMeta.imgHeight > 0 && outMeta.imgWidth > 0) {
outMeta.ratio = 1.f / std::min(inputDims[0].d[2] / static_cast<float>(imgRGB.cols),
inputDims[0].d[1] / static_cast<float>(imgRGB.rows));
outMeta.ratio = 1.f / std::min(inputDims[0].d[2] / static_cast<float>(srcImg.cols),
inputDims[0].d[1] / static_cast<float>(srcImg.rows));
cv::cuda::GpuMat resized = imgRGB;
const auto& outputDims = m_trtEngine->getOutputDims();
const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2;
// Resize to the model's expected input size while maintaining aspect ratio with padding
if (resized.rows != inputDims[0].d[1] || resized.cols != inputDims[0].d[2]) {
resized = Engine<float>::resizeKeepAspectRatioPadRightBottom(imgRGB, inputDims[0].d[1], inputDims[0].d[2]);
// CPU resize to model input size
cv::Mat cpuResized;
if (srcImg.rows != inputH || srcImg.cols != inputW) {
if (isClassification) {
cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR);
} else {
cpuResized = Engine<float>::cpuResizeKeepAspectRatioPadRightBottom(srcImg, inputH, inputW);
}
} else {
cpuResized = srcImg;
}
// CPU BGR -> RGB
cv::Mat cpuRGB;
cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB);
// Upload small image to GPU
cv::cuda::Stream stream;
cv::cuda::GpuMat gpuResized;
gpuResized.upload(cpuRGB, stream);
stream.waitForCompletion();
// Convert to format expected by our inference engine
std::vector<cv::cuda::GpuMat> input{ std::move(resized) };
std::vector<cv::cuda::GpuMat> input{ std::move(gpuResized) };
std::vector<std::vector<cv::cuda::GpuMat>> inputs{ std::move(input) };
return inputs;
}
@@ -1058,26 +1058,15 @@ namespace ANSCENTER
return {};
}
// Upload to GPU
cv::cuda::GpuMat img;
// Convert grayscale to BGR if needed
if (inputImage.channels() == 1) {
cv::Mat img3Channel;
cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR);
img.upload(img3Channel, stream);
// CPU preprocessing: resize + BGR->RGB before GPU upload
cv::Mat srcImg = inputImage;
if (srcImg.channels() == 1) {
cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR);
}
else {
img.upload(inputImage, stream);
}
// Convert to RGB
cv::cuda::GpuMat imgRGB;
cv::cuda::cvtColor(img, imgRGB, cv::COLOR_BGR2RGB, 0, stream);
// Store original dimensions
outMetadata.imgHeights[i] = imgRGB.rows;
outMetadata.imgWidths[i] = imgRGB.cols;
outMetadata.imgHeights[i] = srcImg.rows;
outMetadata.imgWidths[i] = srcImg.cols;
if (outMetadata.imgHeights[i] <= 0 || outMetadata.imgWidths[i] <= 0) {
_logger.LogFatal("ANSYOLOV10RTOD::PreprocessBatch",
@@ -1088,17 +1077,31 @@ namespace ANSCENTER
return {};
}
// Calculate ratio for this image
outMetadata.ratios[i] = 1.f / std::min(inputW / static_cast<float>(imgRGB.cols),
inputH / static_cast<float>(imgRGB.rows));
const auto& outputDims = m_trtEngine->getOutputDims();
const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2;
// Resize with padding
cv::cuda::GpuMat resized = imgRGB;
if (resized.rows != inputH || resized.cols != inputW) {
resized = Engine<float>::resizeKeepAspectRatioPadRightBottom(imgRGB, inputH, inputW);
// Calculate ratio for this image
outMetadata.ratios[i] = isClassification ? 1.f : 1.f / std::min(inputW / static_cast<float>(srcImg.cols),
inputH / static_cast<float>(srcImg.rows));
// CPU resize to model input size
cv::Mat cpuResized;
if (srcImg.rows != inputH || srcImg.cols != inputW) {
if (isClassification) {
cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR);
} else {
cpuResized = Engine<float>::cpuResizeKeepAspectRatioPadRightBottom(srcImg, inputH, inputW);
}
} else {
cpuResized = srcImg;
}
batchProcessed.push_back(std::move(resized));
cv::Mat cpuRGB;
cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB);
cv::cuda::GpuMat gpuResized;
gpuResized.upload(cpuRGB, stream);
batchProcessed.push_back(std::move(gpuResized));
}
stream.waitForCompletion();