Use CPU resize before upload to GPU to remove PCIe bottleneck
This commit is contained in:
@@ -284,7 +284,13 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
|
||||
// fatal "illegal memory access" that permanently corrupts the CUDA context.
|
||||
//
|
||||
// Pool-mode slots have their own busy-flag dispatch so they do NOT need this.
|
||||
auto _mutexWaitStart = std::chrono::steady_clock::now();
|
||||
std::lock_guard<std::mutex> inferenceLock(m_inferenceMutex);
|
||||
auto _mutexAcquired = std::chrono::steady_clock::now();
|
||||
double _mutexWaitMs = std::chrono::duration<double, std::milli>(_mutexAcquired - _mutexWaitStart).count();
|
||||
if (_mutexWaitMs > 50.0) {
|
||||
ANS_DBG("TRT_Engine", "MUTEX WAIT: %.1fms (queued behind another inference)", _mutexWaitMs);
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// THREAD-SAFE GPU CONTEXT
|
||||
@@ -955,6 +961,20 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Per-inference total timing breakdown (mutex wait + preprocess + GPU)
|
||||
// ============================================================================
|
||||
{
|
||||
double totalMs = std::chrono::duration<double, std::milli>(
|
||||
std::chrono::steady_clock::now() - _mutexWaitStart).count();
|
||||
double gpuMs = totalMs - _mutexWaitMs; // Everything after mutex acquired
|
||||
// Log every inference that takes >100ms total (including mutex wait)
|
||||
if (totalMs > 100.0) {
|
||||
ANS_DBG("TRT_Timing", "total=%.1fms (mutex=%.1fms gpu=%.1fms) batch=%d active=%d",
|
||||
totalMs, _mutexWaitMs, gpuMs, batchSize, s_globalActiveInf.load());
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// SM=100% DETECTOR — end-of-inference timing
|
||||
// ============================================================================
|
||||
|
||||
@@ -23,6 +23,29 @@ void Engine<T>::transformOutput(std::vector<std::vector<std::vector<T>>> &input,
|
||||
}
|
||||
output = std::move(input[0][0]);
|
||||
}
|
||||
// CPU letterbox resize — same logic as the GPU version but runs on CPU.
|
||||
// Used in Preprocess to resize BEFORE GPU upload, reducing PCIe transfer
|
||||
// from 25 MB (4K) to 1.2 MB (640×640) — 20x less bandwidth.
|
||||
template <typename T>
|
||||
cv::Mat Engine<T>::cpuResizeKeepAspectRatioPadRightBottom(const cv::Mat& input,
|
||||
size_t height, size_t width,
|
||||
const cv::Scalar& bgcolor) {
|
||||
if (input.empty()) return cv::Mat();
|
||||
|
||||
float r = std::min(static_cast<float>(width) / input.cols,
|
||||
static_cast<float>(height) / input.rows);
|
||||
int unpad_w = static_cast<int>(r * input.cols);
|
||||
int unpad_h = static_cast<int>(r * input.rows);
|
||||
|
||||
cv::Mat re;
|
||||
cv::resize(input, re, cv::Size(unpad_w, unpad_h), 0, 0, cv::INTER_LINEAR);
|
||||
|
||||
cv::Mat out(static_cast<int>(height), static_cast<int>(width), input.type(), bgcolor);
|
||||
re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows)));
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
cv::cuda::GpuMat Engine<T>::resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat& input,
|
||||
size_t height, size_t width,
|
||||
|
||||
Reference in New Issue
Block a user