Fix NV12 crash when recreating the camera object

This commit is contained in:
2026-04-02 22:07:27 +11:00
parent 4bedf3a3a2
commit 958cab6ae3
25 changed files with 1459 additions and 393 deletions

View File

@@ -1455,7 +1455,7 @@ namespace ANSCENTER
}
}
std::vector<Object> ANSODBase::RunStaticInference(const cv::Mat& input, cv::Rect Bbox, const std::string& camera_id) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
// No coarse _mutex — only uses local variables and virtual RunInference() which has its own engine lock
std::vector<Object> output;
output.clear();
try {
@@ -2100,7 +2100,8 @@ namespace ANSCENTER
}
}
std::vector<Object> ANSODBase::RunInferenceWithOption(const cv::Mat& input, const std::string& camera_id, const std::string activeROIMode) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
// No coarse _mutex — sub-components (engines, trackers) have their own locks.
// LabVIEW semaphore controls concurrency at the caller level.
try {
int mode = 0;
double confidenceThreshold = 0.35;
@@ -2116,8 +2117,11 @@ namespace ANSCENTER
if (confidenceThreshold <= 0) confidenceThreshold = 0;
if (confidenceThreshold > 1) confidenceThreshold = 1;
// Update model configuration with the new parameters
if(confidenceThreshold>0)_modelConfig.detectionScoreThreshold = confidenceThreshold;
// Update model configuration with the new parameters (brief lock for config)
if (confidenceThreshold > 0) {
std::lock_guard<std::recursive_mutex> cfgLock(_mutex);
_modelConfig.detectionScoreThreshold = confidenceThreshold;
}
switch (mode) {
case 0: // Normal mode
return RunInference(input, camera_id); //RunInference

View File

@@ -275,6 +275,26 @@ namespace ANSCENTER {
gpuData->gpuIndex == inferenceGpu;
const bool useZeroCopy = isCudaDevice && gpuMatch;
// --- Debug: log pointer state before reading ---
// Formats a one-line snapshot of the frame's pointer/cache state and writes
// it to the debugger output (Windows builds only) and to stderr.
{
    char _nv12_dbg[512];
    // Every %p argument is cast to void* and every %d argument to int where
    // its declared type is not visible here: a mismatched type in a variadic
    // printf-family call is undefined behaviour.
    snprintf(_nv12_dbg, sizeof(_nv12_dbg),
        "[NV12Helper] tryNV12: gpuData=%p yPlane=%p uvPlane=%p isCuda=%d "
        "gpuIdx=%d infGpu=%d gpuMatch=%d zeroCopy=%d "
        "gpuCacheY=%p gpuCacheUV=%p gpuCacheValid=%d refcount=%d %dx%d\n",
        (void*)gpuData, (void*)gpuData->yPlane, (void*)gpuData->uvPlane,
        (int)isCudaDevice, gpuData->gpuIndex, inferenceGpu,
        (int)gpuMatch, (int)useZeroCopy,
        (void*)gpuData->gpuCacheY, (void*)gpuData->gpuCacheUV,
        (int)gpuData->gpuCacheValid,
        (int)gpuData->refcount.load(),
        frameW, frameH);
#ifdef _WIN32
    OutputDebugStringA(_nv12_dbg);
#endif
    fprintf(stderr, "%s", _nv12_dbg);
}
// Effective plane pointers — for zero-copy, use CUDA device ptrs;
// for CPU upload, use the CPU snapshot buffers.
uint8_t* effYPlane;
@@ -283,7 +303,7 @@ namespace ANSCENTER {
int effUvLinesize;
if (useZeroCopy) {
// Same GPU: wrap NVDEC device pointers directly
// Same GPU: wrap owned CUDA device pointers directly
effYPlane = gpuData->yPlane;
effUvPlane = gpuData->uvPlane;
effYLinesize = gpuData->yLinesize;
@@ -435,6 +455,18 @@ namespace ANSCENTER {
gpuResized.create(inputH, inputW, CV_8UC3);
cudaStream_t rawStream = cv::cuda::StreamAccessor::getStream(stream);
{
    // Trace the launch parameters: both source planes (pointer and
    // cols x rows), the inputW x inputH target, and the zero-copy flag.
    char launchMsg[256];
    const void* const yPtr  = (void*)gpuY.data;
    const void* const uvPtr = (void*)gpuUV.data;
    const int zeroCopyFlag  = (int)useZeroCopy;
    snprintf(launchMsg, sizeof(launchMsg),
        "[NV12Helper] KERNEL LAUNCH: gpuY=%p(%dx%d) gpuUV=%p(%dx%d) -> %dx%d zeroCopy=%d\n",
        yPtr, gpuY.cols, gpuY.rows,
        uvPtr, gpuUV.cols, gpuUV.rows,
        inputW, inputH, zeroCopyFlag);
    // Same message goes to the debugger (Windows builds only) and to stderr.
#ifdef _WIN32
    OutputDebugStringA(launchMsg);
#endif
    fprintf(stderr, "%s", launchMsg);
}
launcher(gpuY, gpuUV, gpuResized, frameW, frameH, inputW, inputH, rawStream);
stream.waitForCompletion();
@@ -945,7 +977,15 @@ namespace ANSCENTER {
inputW, inputH, frameW, frameH, stream);
}
cudaStreamSynchronize(stream);
// Use polling sync instead of cudaStreamSynchronize to avoid
// holding nvcuda64 SRW lock continuously (WDDM deadlock prevention).
{
cudaError_t err = cudaStreamQuery(stream);
while (err == cudaErrorNotReady) {
Sleep(0);
err = cudaStreamQuery(stream);
}
}
// (No registry lock to release — data kept alive by refcount)

View File

@@ -8,6 +8,9 @@
#include <cuda_runtime.h>
#include <cstdint>
#ifdef _WIN32
#include <windows.h> // Sleep()
#endif
#include <cstdio>
// ── Shared YUV→RGB computation ───────────────────────────────────────────
@@ -651,7 +654,24 @@ int ANSGpuNV12ToBGR(
width * 3, height,
cudaMemcpyDeviceToHost, t_bufs.stream);
cudaStreamSynchronize(t_bufs.stream);
// Wait for the kernel and device-to-host copy queued on t_bufs.stream.
#ifdef _WIN32
// Use polling sync instead of cudaStreamSynchronize to avoid
// holding nvcuda64 SRW lock continuously (WDDM deadlock prevention).
// Short Sleep(0) fast path for sub-ms kernels, then Sleep(1) to give
// cleanup operations (cuArrayDestroy, cuMemFree) a window to acquire
// the exclusive SRW lock.
{
    cudaError_t qerr = cudaStreamQuery(t_bufs.stream);
    // Phase 1: up to 10 yield-only polls.
    for (int i = 0; i < 10 && qerr == cudaErrorNotReady; ++i) {
        Sleep(0);
        qerr = cudaStreamQuery(t_bufs.stream);
    }
    // Phase 2: still not ready — back off to 1 ms sleeps between polls.
    while (qerr == cudaErrorNotReady) {
        Sleep(1);
        qerr = cudaStreamQuery(t_bufs.stream);
    }
}
#else
// <windows.h> (and therefore Sleep) is only included under _WIN32, and the
// WDDM concern above is Windows-specific, so other platforms use the plain
// blocking synchronize.
cudaStreamSynchronize(t_bufs.stream);
#endif
// Check for errors
cudaError_t err = cudaGetLastError();