Resize and convert BGR->RGB on the CPU before uploading to the GPU, so only the model-sized image crosses PCIe

This commit is contained in:
2026-04-04 22:29:08 +11:00
parent e134ebdf15
commit 98681f4da6
15 changed files with 572 additions and 493 deletions

View File

@@ -587,41 +587,46 @@ namespace ANSCENTER
const auto& inputDims = m_trtEngine->getInputDims();
const int inputH = inputDims[0].d[1];
const int inputW = inputDims[0].d[2];
// Upload the image to GPU memory
cv::cuda::Stream stream;
cv::cuda::GpuMat img;
if (inputImage.channels() == 1) {
// Convert grayscale to 3-channel BGR before uploading
cv::Mat img3Channel;
cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR);
img.upload(img3Channel, stream);
}
else {
img.upload(inputImage, stream);
// --- CPU preprocessing: resize + BGR->RGB before GPU upload ---
cv::Mat srcImg = inputImage;
if (srcImg.channels() == 1) {
cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR);
}
// Convert to RGB
cv::cuda::GpuMat imgRGB;
cv::cuda::cvtColor(img, imgRGB, cv::COLOR_BGR2RGB, 0, stream);
stream.waitForCompletion();
// Set image size parameters into per-call metadata (not shared members)
outMeta.imgHeight = static_cast<float>(imgRGB.rows);
outMeta.imgWidth = static_cast<float>(imgRGB.cols);
// Set image size parameters from ORIGINAL image (before resize)
outMeta.imgHeight = static_cast<float>(srcImg.rows);
outMeta.imgWidth = static_cast<float>(srcImg.cols);
if (outMeta.imgHeight > 0 && outMeta.imgWidth > 0) {
outMeta.ratio = 1.f / std::min(inputDims[0].d[2] / static_cast<float>(imgRGB.cols),
inputDims[0].d[1] / static_cast<float>(imgRGB.rows));
outMeta.ratio = 1.f / std::min(inputDims[0].d[2] / static_cast<float>(srcImg.cols),
inputDims[0].d[1] / static_cast<float>(srcImg.rows));
cv::cuda::GpuMat resized = imgRGB;
const auto& outputDims = m_trtEngine->getOutputDims();
const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2;
// Resize to the model's expected input size while maintaining aspect ratio with padding
if (resized.rows != inputDims[0].d[1] || resized.cols != inputDims[0].d[2]) {
resized = Engine<float>::resizeKeepAspectRatioPadRightBottom(imgRGB, inputDims[0].d[1], inputDims[0].d[2]);
// CPU resize to model input size
cv::Mat cpuResized;
if (srcImg.rows != inputH || srcImg.cols != inputW) {
if (isClassification) {
cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR);
} else {
cpuResized = Engine<float>::cpuResizeKeepAspectRatioPadRightBottom(srcImg, inputH, inputW);
}
} else {
cpuResized = srcImg;
}
// CPU BGR -> RGB
cv::Mat cpuRGB;
cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB);
// Upload small image to GPU
cv::cuda::Stream stream;
cv::cuda::GpuMat gpuResized;
gpuResized.upload(cpuRGB, stream);
stream.waitForCompletion();
// Convert to format expected by our inference engine
std::vector<cv::cuda::GpuMat> input{ std::move(resized) };
std::vector<cv::cuda::GpuMat> input{ std::move(gpuResized) };
std::vector<std::vector<cv::cuda::GpuMat>> inputs{ std::move(input) };
return inputs;
}
@@ -1174,29 +1179,20 @@ namespace ANSCENTER
return {};
}
cv::cuda::GpuMat img;
if (inputImage.channels() == 1) {
cv::Mat img3Channel;
cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR);
img.upload(img3Channel, stream);
}
else if (inputImage.channels() == 3) {
img.upload(inputImage, stream);
}
else {
// CPU preprocessing: resize + BGR->RGB before GPU upload
cv::Mat srcImg = inputImage;
if (srcImg.channels() == 1) {
cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR);
} else if (srcImg.channels() != 3) {
_logger.LogError("TENSORRTOD::PreprocessBatch",
"Unsupported channel count at index " + std::to_string(i),
__FILE__, __LINE__);
return {};
}
cv::cuda::GpuMat imgRGB;
cv::cuda::cvtColor(img, imgRGB, cv::COLOR_BGR2RGB, 0, stream);
// Store in output metadata
outMetadata.imgHeights[i] = imgRGB.rows;
outMetadata.imgWidths[i] = imgRGB.cols;
// Store in output metadata from ORIGINAL image
outMetadata.imgHeights[i] = srcImg.rows;
outMetadata.imgWidths[i] = srcImg.cols;
if (outMetadata.imgHeights[i] <= 0 || outMetadata.imgWidths[i] <= 0) {
_logger.LogError("TENSORRTOD::PreprocessBatch",
@@ -1205,20 +1201,30 @@ namespace ANSCENTER
return {};
}
const float scaleW = inputW / static_cast<float>(imgRGB.cols);
const float scaleH = inputH / static_cast<float>(imgRGB.rows);
outMetadata.ratios[i] = 1.f / std::min(scaleW, scaleH);
const auto& outputDims = m_trtEngine->getOutputDims();
const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2;
cv::cuda::GpuMat resized;
if (imgRGB.rows != inputH || imgRGB.cols != inputW) {
resized = Engine<float>::resizeKeepAspectRatioPadRightBottom(
imgRGB, inputH, inputW);
}
else {
resized = imgRGB;
const float scaleW = inputW / static_cast<float>(srcImg.cols);
const float scaleH = inputH / static_cast<float>(srcImg.rows);
outMetadata.ratios[i] = isClassification ? 1.f : 1.f / std::min(scaleW, scaleH);
cv::Mat cpuResized;
if (srcImg.rows != inputH || srcImg.cols != inputW) {
if (isClassification) {
cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR);
} else {
cpuResized = Engine<float>::cpuResizeKeepAspectRatioPadRightBottom(srcImg, inputH, inputW);
}
} else {
cpuResized = srcImg;
}
batchProcessed.push_back(std::move(resized));
cv::Mat cpuRGB;
cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB);
cv::cuda::GpuMat gpuResized;
gpuResized.upload(cpuRGB, stream);
batchProcessed.push_back(std::move(gpuResized));
}
stream.waitForCompletion();