Resize on the CPU before uploading to the GPU to remove the PCIe bottleneck
This commit is contained in:
@@ -534,27 +534,15 @@ namespace ANSCENTER
|
||||
const int inputH = inputDims[0].d[1];
|
||||
const int inputW = inputDims[0].d[2];
|
||||
|
||||
// Upload the image to GPU memory
|
||||
cv::cuda::Stream stream; // Create a custom stream
|
||||
cv::cuda::GpuMat img;
|
||||
|
||||
if (inputImage.channels() == 1) {
|
||||
// Convert grayscale to 3-channel BGR before uploading
|
||||
cv::Mat img3Channel;
|
||||
cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR);
|
||||
img.upload(img3Channel, stream);
|
||||
// --- CPU preprocessing: resize + BGR->RGB before GPU upload ---
|
||||
cv::Mat srcImg = inputImage;
|
||||
if (srcImg.channels() == 1) {
|
||||
cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR);
|
||||
}
|
||||
else {
|
||||
img.upload(inputImage, stream);
|
||||
}
|
||||
|
||||
// Convert BGR to RGB
|
||||
cv::cuda::GpuMat imgRGB;
|
||||
cv::cuda::cvtColor(img, imgRGB, cv::COLOR_BGR2RGB, 0, stream);
|
||||
|
||||
// These parameters will be used in the post-processing stage
|
||||
outMeta.imgHeight = imgRGB.rows;
|
||||
outMeta.imgWidth = imgRGB.cols;
|
||||
outMeta.imgHeight = srcImg.rows;
|
||||
outMeta.imgWidth = srcImg.cols;
|
||||
|
||||
if (outMeta.imgHeight <= 0 || outMeta.imgWidth <= 0) {
|
||||
_logger.LogFatal("TENSORRTCL::Preprocess", "Image height or width is zero", __FILE__, __LINE__);
|
||||
@@ -564,19 +552,26 @@ namespace ANSCENTER
|
||||
if (outMeta.imgHeight > 0 && outMeta.imgWidth > 0) {
|
||||
outMeta.ratio = 1.f;
|
||||
|
||||
cv::cuda::GpuMat resized = imgRGB;
|
||||
|
||||
// Classification: direct resize (no letterbox padding) — matches ANSONNXCL
|
||||
// Must use explicit stream to avoid conflict with CUDA Graph capture on null stream
|
||||
if (resized.rows != inputDims[0].d[1] || resized.cols != inputDims[0].d[2]) {
|
||||
cv::cuda::resize(imgRGB, resized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR, stream);
|
||||
// Classification: direct CPU resize (no letterbox padding)
|
||||
cv::Mat cpuResized;
|
||||
if (srcImg.rows != inputH || srcImg.cols != inputW) {
|
||||
cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR);
|
||||
} else {
|
||||
cpuResized = srcImg;
|
||||
}
|
||||
|
||||
// Wait for all GPU ops to complete before returning GpuMats
|
||||
// CPU BGR -> RGB
|
||||
cv::Mat cpuRGB;
|
||||
cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB);
|
||||
|
||||
// Upload small image to GPU
|
||||
cv::cuda::Stream stream;
|
||||
cv::cuda::GpuMat gpuResized;
|
||||
gpuResized.upload(cpuRGB, stream);
|
||||
stream.waitForCompletion();
|
||||
|
||||
// Convert to format expected by our inference engine
|
||||
std::vector<cv::cuda::GpuMat> input{ std::move(resized) };
|
||||
std::vector<cv::cuda::GpuMat> input{ std::move(gpuResized) };
|
||||
std::vector<std::vector<cv::cuda::GpuMat>> inputs{ std::move(input) };
|
||||
return inputs;
|
||||
}
|
||||
@@ -811,25 +806,17 @@ namespace ANSCENTER
|
||||
return {};
|
||||
}
|
||||
|
||||
// Upload to GPU
|
||||
cv::cuda::GpuMat img;
|
||||
if (inputImage.channels() == 1) {
|
||||
// Convert grayscale to BGR
|
||||
// CPU preprocessing: resize + BGR->RGB before GPU upload
|
||||
cv::Mat srcImg = inputImage;
|
||||
if (srcImg.channels() == 1) {
|
||||
cv::Mat img3Channel;
|
||||
cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR);
|
||||
img.upload(img3Channel, stream);
|
||||
cv::cvtColor(srcImg, img3Channel, cv::COLOR_GRAY2BGR);
|
||||
srcImg = img3Channel;
|
||||
}
|
||||
else {
|
||||
img.upload(inputImage, stream);
|
||||
}
|
||||
|
||||
// Convert BGR to RGB
|
||||
cv::cuda::GpuMat imgRGB;
|
||||
cv::cuda::cvtColor(img, imgRGB, cv::COLOR_BGR2RGB, 0, stream);
|
||||
|
||||
// Store original dimensions
|
||||
int imgHeight = imgRGB.rows;
|
||||
int imgWidth = imgRGB.cols;
|
||||
int imgHeight = srcImg.rows;
|
||||
int imgWidth = srcImg.cols;
|
||||
|
||||
if (imgHeight <= 0 || imgWidth <= 0) {
|
||||
_logger.LogFatal("TENSORRTCL::PreprocessBatch",
|
||||
@@ -841,26 +828,25 @@ namespace ANSCENTER
|
||||
outMetadata.imgHeights.push_back(imgHeight);
|
||||
outMetadata.imgWidths.push_back(imgWidth);
|
||||
|
||||
// Calculate resize ratio
|
||||
float ratio = 1.f / std::min(
|
||||
inputDims[0].d[2] / static_cast<float>(imgRGB.cols),
|
||||
inputDims[0].d[1] / static_cast<float>(imgRGB.rows)
|
||||
);
|
||||
outMetadata.ratios.push_back(ratio);
|
||||
// Classification: ratio is always 1.0
|
||||
outMetadata.ratios.push_back(1.f);
|
||||
|
||||
// Resize maintaining aspect ratio with padding
|
||||
cv::cuda::GpuMat resized;
|
||||
if (imgRGB.rows != inputDims[0].d[1] || imgRGB.cols != inputDims[0].d[2]) {
|
||||
resized = Engine<float>::resizeKeepAspectRatioPadRightBottom(
|
||||
imgRGB, inputDims[0].d[1], inputDims[0].d[2]
|
||||
);
|
||||
}
|
||||
else {
|
||||
resized = imgRGB;
|
||||
// Classification: direct CPU resize (no letterbox padding)
|
||||
cv::Mat cpuResized;
|
||||
if (srcImg.rows != inputH || srcImg.cols != inputW) {
|
||||
cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR);
|
||||
} else {
|
||||
cpuResized = srcImg;
|
||||
}
|
||||
|
||||
cv::Mat cpuRGB;
|
||||
cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB);
|
||||
|
||||
cv::cuda::GpuMat gpuResized;
|
||||
gpuResized.upload(cpuRGB, stream);
|
||||
|
||||
// Add to batch
|
||||
batchedImages.push_back(std::move(resized));
|
||||
batchedImages.push_back(std::move(gpuResized));
|
||||
}
|
||||
|
||||
// Wait for all GPU operations to complete
|
||||
|
||||
Reference in New Issue
Block a user