Resize on the CPU before uploading to the GPU, so that only the small letterboxed image crosses PCIe, to remove the PCIe bottleneck

This commit is contained in:
2026-04-04 22:29:08 +11:00
parent e134ebdf15
commit 98681f4da6
15 changed files with 572 additions and 493 deletions

View File

@@ -665,38 +665,37 @@ namespace ANSCENTER {
}
if (!usedNV12) {
// Standard BGR upload + resize + center-pad path
cv::cuda::Stream stream;
cv::cuda::GpuMat d_bgr;
// CPU center-padded letterbox + BGR->RGB, then upload small image
cv::Mat srcImg;
if (input.channels() == 1) {
cv::Mat img3Channel;
cv::cvtColor(input, img3Channel, cv::COLOR_GRAY2BGR);
d_bgr.upload(img3Channel, stream);
}
else if (input.channels() == 3) {
d_bgr.upload(input, stream);
}
else {
cv::cvtColor(input, srcImg, cv::COLOR_GRAY2BGR);
} else if (input.channels() == 3) {
srcImg = input;
} else {
this->_logger.LogError("ANSSCRFDFD::Detect", "Unsupported channel count", __FILE__, __LINE__);
return {};
}
cv::cuda::GpuMat d_rgb;
cv::cuda::GpuMat d_resized;
// CPU resize to unpadded size
cv::Mat cpuResized;
if (srcImg.rows != new_unpad_h || srcImg.cols != new_unpad_w) {
cv::resize(srcImg, cpuResized, cv::Size(new_unpad_w, new_unpad_h), 0, 0, cv::INTER_LINEAR);
} else {
cpuResized = srcImg;
}
// CPU center-pad to net_w x net_h
cv::Mat cpuPadded(net_h, net_w, CV_8UC3, cv::Scalar(0, 0, 0));
cpuResized.copyTo(cpuPadded(cv::Rect(dw, dh, new_unpad_w, new_unpad_h)));
// CPU BGR -> RGB
cv::Mat cpuRGB;
cv::cvtColor(cpuPadded, cpuRGB, cv::COLOR_BGR2RGB);
// Upload small padded image to GPU
cv::cuda::Stream stream;
cv::cuda::GpuMat d_padded;
cv::cuda::cvtColor(d_bgr, d_rgb, cv::COLOR_BGR2RGB, 0, stream);
cv::cuda::resize(d_rgb, d_resized, cv::Size(new_unpad_w, new_unpad_h), 0, 0, cv::INTER_LINEAR, stream);
d_padded.create(net_h, net_w, d_resized.type());
d_padded.setTo(cv::Scalar(0, 0, 0), stream);
cv::Rect roi(dw, dh, new_unpad_w, new_unpad_h > 0 ? new_unpad_h : 0);
roi.width = new_unpad_w;
roi.height = new_unpad_h;
d_resized.copyTo(d_padded(roi), stream);
d_padded.upload(cpuRGB, stream);
stream.waitForCompletion();
std::vector<cv::cuda::GpuMat> inputVec;