Fix double stop in ANSVideoPlayer
This commit is contained in:
@@ -1167,7 +1167,12 @@ trt_cache_create_context:
     // -- Pinned output buffers (CUDA graph prerequisite) -----------------------
     // Invalidate any graphs captured by a previous loadNetwork() call on this instance.
-    for (auto& [bs, ge] : m_graphExecs) { if (ge) cudaGraphExecDestroy(ge); }
+    for (auto& [bs, ge] : m_graphExecs) {
+        if (ge) {
+            cudaGraphExecDestroy(ge);
+            m_trtGraphDestroys.fetch_add(1, std::memory_order_relaxed);
+        }
+    }
     m_graphExecs.clear();
     // Free any previously allocated pinned buffers.
     for (T* p : m_pinnedOutputBuffers) { if (p) cudaFreeHost(p); }
||||
@@ -731,7 +731,10 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
     if (!m_graphExecs.empty()) {
         size_t destroyed = m_graphExecs.size();
         for (auto& [bs, ge] : m_graphExecs) {
-            if (ge) cudaGraphExecDestroy(ge);
+            if (ge) {
+                cudaGraphExecDestroy(ge);
+                m_trtGraphDestroys.fetch_add(1, std::memory_order_relaxed);
+            }
         }
         m_graphExecs.clear();
         ANS_DBG("TRT_Engine", "INVALIDATED %zu cached CUDA graphs after shape change (batch=%d)",
@@ -901,8 +904,10 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i

     if (captureOk) {
         cudaGraphExec_t exec = nullptr;
-        if (cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0) == cudaSuccess)
+        if (cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0) == cudaSuccess) {
             graphExec = exec;
+            m_trtGraphCreates.fetch_add(1, std::memory_order_relaxed);
+        }
         cudaGraphDestroy(graph);
         ANS_DBG("TRT_Engine", "CUDA graph CAPTURED OK for batch=%d exec=%p",
                 batchSize, (void*)graphExec);
@@ -1053,5 +1058,32 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
         s_globalActiveInf.fetch_sub(1);
     }

+    // Leak diagnostic — one [TRT_Leak] line per engine instance per 60 s.
+    // Reports CUDA graph create/destroy balance and current cache size.
+    // If (creates - destroys) climbs monotonically, graph execs are being
+    // leaked on every shape change; each leaked exec is tens of MB.
+    // Lock-free window claim via compare_exchange — concurrent inference
+    // threads race to log but only one wins per 60-s window.
+    {
+        using clk = std::chrono::steady_clock;
+        const long long tick = clk::now().time_since_epoch().count();
+        long long expected = m_trtLeakNextLogTick.load(std::memory_order_relaxed);
+        if (tick >= expected) {
+            const long long deadline = tick +
+                std::chrono::duration_cast<clk::duration>(
+                    std::chrono::seconds(60)).count();
+            if (m_trtLeakNextLogTick.compare_exchange_strong(
+                    expected, deadline, std::memory_order_relaxed)) {
+                const int64_t cr = m_trtGraphCreates.load(std::memory_order_relaxed);
+                const int64_t ds = m_trtGraphDestroys.load(std::memory_order_relaxed);
+                ANS_DBG("TRT_Leak",
+                        "engine=%p creates=%lld destroys=%lld net=%lld cached=%zu",
+                        (void*)this,
+                        (long long)cr, (long long)ds, (long long)(cr - ds),
+                        m_graphExecs.size());
+            }
+        }
+    }
+
     return true;
 }
Reference in New Issue
Block a user