Use CPU resize before upload to GPU to remove PCIe bottleneck

This commit is contained in:
2026-04-04 22:29:08 +11:00
parent e134ebdf15
commit 98681f4da6
15 changed files with 572 additions and 493 deletions

View File

@@ -674,26 +674,22 @@ namespace ANSCENTER {
}
try {
// Fix #8: Use pooled GPU buffers to avoid per-frame allocation
m_gpuImg.upload(inputImage, m_gpuStream);
// Handle grayscale conversion on GPU
if (inputImage.channels() == 1) {
cv::cuda::cvtColor(m_gpuImg, m_gpuRgb, cv::COLOR_GRAY2BGR, 0, m_gpuStream);
std::swap(m_gpuImg, m_gpuRgb);
// CPU preprocessing: resize + BGR→RGB before GPU upload
// Reduces PCIe transfer and eliminates GPU cvtColor/resize overhead
cv::Mat srcImg = inputImage;
if (srcImg.channels() == 1) {
cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR);
}
// Resize on GPU if needed
if (inputImage.cols != GPU_FACE_WIDTH || inputImage.rows != GPU_FACE_HEIGHT) {
cv::cuda::resize(m_gpuImg, m_gpuResized, cv::Size(GPU_FACE_WIDTH, GPU_FACE_HEIGHT),
0, 0, cv::INTER_LINEAR, m_gpuStream);
cv::Mat cpuResized;
if (srcImg.cols != GPU_FACE_WIDTH || srcImg.rows != GPU_FACE_HEIGHT) {
cv::resize(srcImg, cpuResized, cv::Size(GPU_FACE_WIDTH, GPU_FACE_HEIGHT), 0, 0, cv::INTER_LINEAR);
} else {
cpuResized = srcImg;
}
else {
m_gpuResized = m_gpuImg;
}
// BGR to RGB conversion on GPU
cv::cuda::cvtColor(m_gpuResized, m_gpuRgb, cv::COLOR_BGR2RGB, 0, m_gpuStream);
cv::Mat cpuRGB;
cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB);
m_gpuRgb.upload(cpuRGB, m_gpuStream);
m_gpuStream.waitForCompletion();
// Prepare inference inputs
std::vector<cv::cuda::GpuMat> inputVec;
@@ -781,33 +777,39 @@ namespace ANSCENTER {
batchGpu.reserve(chunkEnd - chunkStart);
for (size_t i = chunkStart; i < chunkEnd; i++) {
cv::cuda::GpuMat d_img;
// Use GPU-resident face if available (NV12 affine warp path),
// otherwise upload from CPU (standard path)
// otherwise do CPU resize + BGR→RGB before upload
if (i < gpuFaceROIs.size() && !gpuFaceROIs[i].empty()) {
d_img = gpuFaceROIs[i]; // already on GPU — skip upload
cv::cuda::GpuMat d_img = gpuFaceROIs[i]; // already on GPU
if (d_img.cols != GPU_FACE_WIDTH || d_img.rows != GPU_FACE_HEIGHT) {
cv::cuda::GpuMat d_resized;
cv::cuda::resize(d_img, d_resized, targetSize, 0, 0, cv::INTER_LINEAR, m_gpuStream);
d_img = d_resized;
}
cv::cuda::GpuMat d_rgb;
cv::cuda::cvtColor(d_img, d_rgb, cv::COLOR_BGR2RGB, 0, m_gpuStream);
batchGpu.emplace_back(std::move(d_rgb));
} else {
const auto& roi = faceROIs[i];
if (roi.empty()) continue;
d_img.upload(roi, m_gpuStream);
if (roi.channels() == 1) {
cv::cuda::GpuMat d_bgr;
cv::cuda::cvtColor(d_img, d_bgr, cv::COLOR_GRAY2BGR, 0, m_gpuStream);
d_img = d_bgr;
// CPU preprocessing: resize + BGR→RGB before upload
cv::Mat srcImg = roi;
if (srcImg.channels() == 1) {
cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR);
}
cv::Mat cpuResized;
if (srcImg.cols != GPU_FACE_WIDTH || srcImg.rows != GPU_FACE_HEIGHT) {
cv::resize(srcImg, cpuResized, targetSize, 0, 0, cv::INTER_LINEAR);
} else {
cpuResized = srcImg;
}
cv::Mat cpuRGB;
cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB);
cv::cuda::GpuMat d_rgb;
d_rgb.upload(cpuRGB, m_gpuStream);
batchGpu.emplace_back(std::move(d_rgb));
}
if (d_img.cols != GPU_FACE_WIDTH || d_img.rows != GPU_FACE_HEIGHT) {
cv::cuda::GpuMat d_resized;
cv::cuda::resize(d_img, d_resized, targetSize, 0, 0, cv::INTER_LINEAR, m_gpuStream);
d_img = d_resized;
}
cv::cuda::GpuMat d_rgb;
cv::cuda::cvtColor(d_img, d_rgb, cv::COLOR_BGR2RGB, 0, m_gpuStream);
batchGpu.emplace_back(std::move(d_rgb));
}
FR_END_TIMER(gpu_preproc, "RunArcFaceBatch GPU preprocess (" + std::to_string(batchGpu.size()) + " faces)");

View File

@@ -303,31 +303,27 @@ namespace ANSCENTER {
return embedding;
}
// GPU preprocessing pipeline
// CPU preprocessing: resize + color convert, then upload small image
cv::cuda::Stream stream;
cv::cuda::GpuMat d_img;
// Upload to GPU
d_img.upload(inputImage, stream);
// Handle grayscale conversion on GPU
if (inputImage.channels() == 1) {
cv::cuda::GpuMat d_bgr;
cv::cuda::cvtColor(d_img, d_bgr, cv::COLOR_GRAY2BGR, 0, stream);
d_img = d_bgr;
cv::Mat srcImg = inputImage;
if (srcImg.channels() == 1) {
cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR);
}
// Resize on GPU if needed
if (inputImage.cols != FACE_WIDTH || inputImage.rows != FACE_HEIGHT) {
cv::cuda::GpuMat d_resized;
cv::cuda::resize(d_img, d_resized, cv::Size(FACE_WIDTH, FACE_HEIGHT),
0, 0, cv::INTER_LINEAR, stream);
d_img = d_resized;
cv::Mat cpuResized;
if (srcImg.rows != FACE_HEIGHT || srcImg.cols != FACE_WIDTH) {
cv::resize(srcImg, cpuResized, cv::Size(FACE_WIDTH, FACE_HEIGHT), 0, 0, cv::INTER_LINEAR);
} else {
cpuResized = srcImg;
}
// BGR to RGB conversion on GPU
cv::Mat cpuRGB;
cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB);
cv::cuda::GpuMat d_rgb;
cv::cuda::cvtColor(d_img, d_rgb, cv::COLOR_BGR2RGB, 0, stream);
d_rgb.upload(cpuRGB, stream);
stream.waitForCompletion();
// Prepare inference inputs
std::vector<cv::cuda::GpuMat> inputVec;
@@ -404,27 +400,24 @@ namespace ANSCENTER {
continue;
}
// Upload to GPU
cv::cuda::GpuMat d_img;
d_img.upload(roi, stream);
// Handle grayscale conversion on GPU
if (roi.channels() == 1) {
cv::cuda::GpuMat d_bgr;
cv::cuda::cvtColor(d_img, d_bgr, cv::COLOR_GRAY2BGR, 0, stream);
d_img = d_bgr;
// CPU preprocessing: resize + color convert, then upload small image
cv::Mat srcImg = roi;
if (srcImg.channels() == 1) {
cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR);
}
// Resize on GPU if needed
if (roi.cols != FACE_WIDTH || roi.rows != FACE_HEIGHT) {
cv::cuda::GpuMat d_resized;
cv::cuda::resize(d_img, d_resized, targetSize, 0, 0, cv::INTER_LINEAR, stream);
d_img = d_resized;
cv::Mat cpuResized;
if (srcImg.rows != FACE_HEIGHT || srcImg.cols != FACE_WIDTH) {
cv::resize(srcImg, cpuResized, targetSize, 0, 0, cv::INTER_LINEAR);
} else {
cpuResized = srcImg;
}
// BGR to RGB conversion on GPU
cv::Mat cpuRGB;
cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB);
cv::cuda::GpuMat d_rgb;
cv::cuda::cvtColor(d_img, d_rgb, cv::COLOR_BGR2RGB, 0, stream);
d_rgb.upload(cpuRGB, stream);
batchGpu.emplace_back(std::move(d_rgb));
}