Use CPU resize before upload to GPU to remove PCIe bottleneck
This commit is contained in:
@@ -587,41 +587,46 @@ namespace ANSCENTER
|
||||
const auto& inputDims = m_trtEngine->getInputDims();
|
||||
const int inputH = inputDims[0].d[1];
|
||||
const int inputW = inputDims[0].d[2];
|
||||
// Upload the image to GPU memory
|
||||
cv::cuda::Stream stream;
|
||||
cv::cuda::GpuMat img;
|
||||
|
||||
if (inputImage.channels() == 1) {
|
||||
// Convert grayscale to 3-channel BGR before uploading
|
||||
cv::Mat img3Channel;
|
||||
cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR);
|
||||
img.upload(img3Channel, stream);
|
||||
}
|
||||
else {
|
||||
img.upload(inputImage, stream);
|
||||
// --- CPU preprocessing: resize + BGR->RGB before GPU upload ---
|
||||
cv::Mat srcImg = inputImage;
|
||||
if (srcImg.channels() == 1) {
|
||||
cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR);
|
||||
}
|
||||
|
||||
// Convert to RGB
|
||||
cv::cuda::GpuMat imgRGB;
|
||||
cv::cuda::cvtColor(img, imgRGB, cv::COLOR_BGR2RGB, 0, stream);
|
||||
stream.waitForCompletion();
|
||||
|
||||
// Set image size parameters into per-call metadata (not shared members)
|
||||
outMeta.imgHeight = static_cast<float>(imgRGB.rows);
|
||||
outMeta.imgWidth = static_cast<float>(imgRGB.cols);
|
||||
// Set image size parameters from ORIGINAL image (before resize)
|
||||
outMeta.imgHeight = static_cast<float>(srcImg.rows);
|
||||
outMeta.imgWidth = static_cast<float>(srcImg.cols);
|
||||
if (outMeta.imgHeight > 0 && outMeta.imgWidth > 0) {
|
||||
outMeta.ratio = 1.f / std::min(inputDims[0].d[2] / static_cast<float>(imgRGB.cols),
|
||||
inputDims[0].d[1] / static_cast<float>(imgRGB.rows));
|
||||
outMeta.ratio = 1.f / std::min(inputDims[0].d[2] / static_cast<float>(srcImg.cols),
|
||||
inputDims[0].d[1] / static_cast<float>(srcImg.rows));
|
||||
|
||||
cv::cuda::GpuMat resized = imgRGB;
|
||||
const auto& outputDims = m_trtEngine->getOutputDims();
|
||||
const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2;
|
||||
|
||||
// Resize to the model's expected input size while maintaining aspect ratio with padding
|
||||
if (resized.rows != inputDims[0].d[1] || resized.cols != inputDims[0].d[2]) {
|
||||
resized = Engine<float>::resizeKeepAspectRatioPadRightBottom(imgRGB, inputDims[0].d[1], inputDims[0].d[2]);
|
||||
// CPU resize to model input size
|
||||
cv::Mat cpuResized;
|
||||
if (srcImg.rows != inputH || srcImg.cols != inputW) {
|
||||
if (isClassification) {
|
||||
cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR);
|
||||
} else {
|
||||
cpuResized = Engine<float>::cpuResizeKeepAspectRatioPadRightBottom(srcImg, inputH, inputW);
|
||||
}
|
||||
} else {
|
||||
cpuResized = srcImg;
|
||||
}
|
||||
|
||||
// CPU BGR -> RGB
|
||||
cv::Mat cpuRGB;
|
||||
cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB);
|
||||
|
||||
// Upload small image to GPU
|
||||
cv::cuda::Stream stream;
|
||||
cv::cuda::GpuMat gpuResized;
|
||||
gpuResized.upload(cpuRGB, stream);
|
||||
stream.waitForCompletion();
|
||||
|
||||
// Convert to format expected by our inference engine
|
||||
std::vector<cv::cuda::GpuMat> input{ std::move(resized) };
|
||||
std::vector<cv::cuda::GpuMat> input{ std::move(gpuResized) };
|
||||
std::vector<std::vector<cv::cuda::GpuMat>> inputs{ std::move(input) };
|
||||
return inputs;
|
||||
}
|
||||
@@ -1174,29 +1179,20 @@ namespace ANSCENTER
|
||||
return {};
|
||||
}
|
||||
|
||||
cv::cuda::GpuMat img;
|
||||
|
||||
if (inputImage.channels() == 1) {
|
||||
cv::Mat img3Channel;
|
||||
cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR);
|
||||
img.upload(img3Channel, stream);
|
||||
}
|
||||
else if (inputImage.channels() == 3) {
|
||||
img.upload(inputImage, stream);
|
||||
}
|
||||
else {
|
||||
// CPU preprocessing: resize + BGR->RGB before GPU upload
|
||||
cv::Mat srcImg = inputImage;
|
||||
if (srcImg.channels() == 1) {
|
||||
cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR);
|
||||
} else if (srcImg.channels() != 3) {
|
||||
_logger.LogError("TENSORRTOD::PreprocessBatch",
|
||||
"Unsupported channel count at index " + std::to_string(i),
|
||||
__FILE__, __LINE__);
|
||||
return {};
|
||||
}
|
||||
|
||||
cv::cuda::GpuMat imgRGB;
|
||||
cv::cuda::cvtColor(img, imgRGB, cv::COLOR_BGR2RGB, 0, stream);
|
||||
|
||||
// Store in output metadata
|
||||
outMetadata.imgHeights[i] = imgRGB.rows;
|
||||
outMetadata.imgWidths[i] = imgRGB.cols;
|
||||
// Store in output metadata from ORIGINAL image
|
||||
outMetadata.imgHeights[i] = srcImg.rows;
|
||||
outMetadata.imgWidths[i] = srcImg.cols;
|
||||
|
||||
if (outMetadata.imgHeights[i] <= 0 || outMetadata.imgWidths[i] <= 0) {
|
||||
_logger.LogError("TENSORRTOD::PreprocessBatch",
|
||||
@@ -1205,20 +1201,30 @@ namespace ANSCENTER
|
||||
return {};
|
||||
}
|
||||
|
||||
const float scaleW = inputW / static_cast<float>(imgRGB.cols);
|
||||
const float scaleH = inputH / static_cast<float>(imgRGB.rows);
|
||||
outMetadata.ratios[i] = 1.f / std::min(scaleW, scaleH);
|
||||
const auto& outputDims = m_trtEngine->getOutputDims();
|
||||
const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2;
|
||||
|
||||
cv::cuda::GpuMat resized;
|
||||
if (imgRGB.rows != inputH || imgRGB.cols != inputW) {
|
||||
resized = Engine<float>::resizeKeepAspectRatioPadRightBottom(
|
||||
imgRGB, inputH, inputW);
|
||||
}
|
||||
else {
|
||||
resized = imgRGB;
|
||||
const float scaleW = inputW / static_cast<float>(srcImg.cols);
|
||||
const float scaleH = inputH / static_cast<float>(srcImg.rows);
|
||||
outMetadata.ratios[i] = isClassification ? 1.f : 1.f / std::min(scaleW, scaleH);
|
||||
|
||||
cv::Mat cpuResized;
|
||||
if (srcImg.rows != inputH || srcImg.cols != inputW) {
|
||||
if (isClassification) {
|
||||
cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR);
|
||||
} else {
|
||||
cpuResized = Engine<float>::cpuResizeKeepAspectRatioPadRightBottom(srcImg, inputH, inputW);
|
||||
}
|
||||
} else {
|
||||
cpuResized = srcImg;
|
||||
}
|
||||
|
||||
batchProcessed.push_back(std::move(resized));
|
||||
cv::Mat cpuRGB;
|
||||
cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB);
|
||||
|
||||
cv::cuda::GpuMat gpuResized;
|
||||
gpuResized.upload(cpuRGB, stream);
|
||||
batchProcessed.push_back(std::move(gpuResized));
|
||||
}
|
||||
|
||||
stream.waitForCompletion();
|
||||
|
||||
Reference in New Issue
Block a user