Disable NV12 path for ANSCV by default. Currently use cv::Mat** directly

This commit is contained in:
2026-04-04 10:09:47 +11:00
parent 445abefebe
commit 3a21026790
19 changed files with 575 additions and 232 deletions

View File

@@ -41,7 +41,17 @@
"mcp__desktop-commander__get_file_info", "mcp__desktop-commander__get_file_info",
"mcp__desktop-commander__interact_with_process", "mcp__desktop-commander__interact_with_process",
"Bash(sort -t= -k2 -rn)", "Bash(sort -t= -k2 -rn)",
"Bash(sort -t= -k3 -rn)" "Bash(sort -t= -k3 -rn)",
"Bash(powershell -Command \"Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\logdebug.txt'' | Select-Object -Last 30\")",
"Bash(powershell -Command \"\\(Select-String -Path ''C:\\\\Users\\\\nghia\\\\Downloads\\\\logdebug.txt'' -Pattern ''POOL FULL''\\).Count\")",
"Bash(powershell -Command \"\\(Select-String -Path ''C:\\\\Users\\\\nghia\\\\Downloads\\\\logdebug.txt'' -Pattern ''Cam\\(\\\\d+\\)'' -AllMatches | ForEach-Object { $_Matches } | ForEach-Object { $_Groups[1].Value } | Sort-Object -Unique\\)\")",
"Bash(powershell -Command \"Select-String -Path ''C:\\\\Users\\\\nghia\\\\Downloads\\\\logdebug.txt'' -Pattern ''Cam\\(\\\\d+\\)'' -AllMatches | ForEach-Object { $_Matches[0].Groups[1].Value } | Sort-Object | Get-Unique\")",
"Bash(powershell -Command \"$lines = Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\logdebug.txt''; $first = \\($lines | Select-String ''07:1'' | Select-Object -First 1\\).Line; $last = \\($lines | Select-String ''07:1'' | Select-Object -Last 1\\).Line; Write-Host ''First: '' $first; Write-Host ''Last: '' $last; Write-Host ''Total lines: '' $lines.Count\")",
"Bash(powershell -Command \"$c = \\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\logdebug.txt''\\).Count; Write-Host ''Total lines:'' $c\")",
"Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\logdebug1.txt''\\).Count\")",
"Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION20.log''\\).Count\")",
"Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION21.log''\\).Count\")",
"Bash(powershell -Command \"Select-String ''NEW slot'' ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION22.log'' | ForEach-Object { if \\($_-match ''\\(\\\\d+x\\\\d+\\)''\\) { $matches[1] } } | Group-Object | Sort-Object Count -Descending | Format-Table Name, Count\")"
] ]
} }
} }

View File

@@ -1250,50 +1250,25 @@ cv::Mat CVideoPlayer::avframeNV12ToCvMat(const AVFrame* frame)
m_nv12OrigWidth = width; m_nv12OrigWidth = width;
m_nv12OrigHeight = height; m_nv12OrigHeight = height;
// Display optimization: resize NV12 planes to max 1080p before color conversion. // Return full-resolution BGR image.
// For 4K (3840x2160), this reduces pixel count by 4x: // No forced downscale — LabVIEW manages display resolution via SetDisplayResolution().
// - 4K NV12→BGR: ~13-76ms on slow CPU (Xeon 2GHz), ~2ms on fast CPU // If the caller needs a specific display size, SetDisplayResolution(w, h) applies
// - 1080p NV12→BGR: ~3-5ms on slow CPU, ~0.5ms on fast CPU // resizing in GetImage() at the ANSRTSP/ANS*Client level after this returns.
// The full-res NV12 is preserved separately for inference (m_currentNV12Frame).
const int MAX_DISPLAY_HEIGHT = 1080; // Store original NV12 dimensions for inference coordinate mapping
bool needsResize = (height > MAX_DISPLAY_HEIGHT); m_nv12OrigWidth = width;
m_nv12OrigHeight = height;
cv::Mat yPlane(height, width, CV_8UC1, frame->data[0], frame->linesize[0]); cv::Mat yPlane(height, width, CV_8UC1, frame->data[0], frame->linesize[0]);
cv::Mat uvPlane(height / 2, width / 2, CV_8UC2, frame->data[1], frame->linesize[1]); cv::Mat uvPlane(height / 2, width / 2, CV_8UC2, frame->data[1], frame->linesize[1]);
if (needsResize) { cv::Mat bgrImage;
// Scale to fit within 1080p, maintaining aspect ratio cv::cvtColorTwoPlane(yPlane, uvPlane, bgrImage, cv::COLOR_YUV2BGR_NV12);
double scale = (double)MAX_DISPLAY_HEIGHT / height;
int dstW = (int)(width * scale) & ~1; // even width for NV12
int dstH = (int)(height * scale) & ~1; // even height for NV12
cv::Mat yResized, uvResized; if (m_nImageQuality == 1) {
cv::resize(yPlane, yResized, cv::Size(dstW, dstH), 0, 0, cv::INTER_LINEAR);
cv::resize(uvPlane, uvResized, cv::Size(dstW / 2, dstH / 2), 0, 0, cv::INTER_LINEAR);
cv::Mat bgrImage;
cv::cvtColorTwoPlane(yResized, uvResized, bgrImage, cv::COLOR_YUV2BGR_NV12);
if (m_nImageQuality == 1) {
bgrImage.convertTo(bgrImage, -1, 255.0 / 219.0, -16.0 * 255.0 / 219.0);
}
return bgrImage;
}
// No resize needed (already <= 1080p)
if (m_nImageQuality == 0) {
cv::Mat bgrImage;
cv::cvtColorTwoPlane(yPlane, uvPlane, bgrImage, cv::COLOR_YUV2BGR_NV12);
return bgrImage;
}
// Quality path with range expansion
{
cv::Mat bgrImage;
cv::cvtColorTwoPlane(yPlane, uvPlane, bgrImage, cv::COLOR_YUV2BGR_NV12);
bgrImage.convertTo(bgrImage, -1, 255.0 / 219.0, -16.0 * 255.0 / 219.0); bgrImage.convertTo(bgrImage, -1, 255.0 / 219.0, -16.0 * 255.0 / 219.0);
return bgrImage;
} }
return bgrImage;
} }
catch (const std::exception& e) { catch (const std::exception& e) {
std::cerr << "Exception in avframeNV12ToCvMat: " << e.what() << std::endl; std::cerr << "Exception in avframeNV12ToCvMat: " << e.what() << std::endl;
@@ -1861,6 +1836,12 @@ double CVideoPlayer::getFrameRate()
return 0; return 0;
} }
void CVideoPlayer::setTargetFPS(double intervalMs)
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
m_targetIntervalMs = intervalMs;
m_targetFPSInitialized = false; // reset timing on change
}
void CVideoPlayer::playVideo(uint8* data, int len, uint32 ts, uint16 seq) void CVideoPlayer::playVideo(uint8* data, int len, uint32 ts, uint16 seq)
{ {
if (m_bRecording) if (m_bRecording)
@@ -2080,6 +2061,25 @@ void CVideoPlayer::onVideoFrame(AVFrame* frame)
} }
} }
// --- Frame rate limiting ---
// Skip post-decode processing (clone, queue push, CUDA clone) if not enough
// time has elapsed since the last processed frame. The decode itself still
// runs for every packet to maintain the H.264/H.265 reference frame chain.
if (m_targetIntervalMs > 0.0) {
auto now = std::chrono::steady_clock::now();
if (!m_targetFPSInitialized) {
m_lastProcessedTime = now;
m_targetFPSInitialized = true;
} else {
auto elapsed = std::chrono::duration<double, std::milli>(now - m_lastProcessedTime).count();
if (elapsed < m_targetIntervalMs) {
return; // Skip this frame — too soon
}
}
m_lastProcessedTime = now;
}
// --- End frame rate limiting ---
// Push frame to queue; during settle period getImage() will ignore the queue // Push frame to queue; during settle period getImage() will ignore the queue
// and keep returning the last good cached image // and keep returning the last good cached image
g_frameQueue.pushFrame(frame); // pushFrame() clones the frame internally g_frameQueue.pushFrame(frame); // pushFrame() clones the frame internally

View File

@@ -15,6 +15,7 @@
#include <opencv2/highgui.hpp> #include <opencv2/highgui.hpp>
#include <opencv2/opencv.hpp> #include <opencv2/opencv.hpp>
#include <turbojpeg.h> #include <turbojpeg.h>
#include <chrono>
typedef struct typedef struct
{ {
@@ -146,6 +147,7 @@ public:
} }
// Image quality mode: 0=fast (OpenCV BT.601, ~2ms), 1=quality (sws BT.709+range, ~12ms) // Image quality mode: 0=fast (OpenCV BT.601, ~2ms), 1=quality (sws BT.709+range, ~12ms)
virtual void setImageQuality(int mode) { m_nImageQuality = mode; } virtual void setImageQuality(int mode) { m_nImageQuality = mode; }
void setTargetFPS(double intervalMs); // Set minimum interval between processed frames in ms (0 = no limit, 100 = ~10 FPS)
virtual void setRtpMulticast(BOOL flag) {} virtual void setRtpMulticast(BOOL flag) {}
virtual void setRtpOverUdp(BOOL flag) {} virtual void setRtpOverUdp(BOOL flag) {}
@@ -266,6 +268,11 @@ protected:
int m_cleanFrameCount = 0; // Count of clean frames after keyframe int m_cleanFrameCount = 0; // Count of clean frames after keyframe
static const int SETTLE_FRAME_COUNT = 5; // Number of clean frames before delivering new frames static const int SETTLE_FRAME_COUNT = 5; // Number of clean frames before delivering new frames
// Frame rate limiting — skip post-decode processing for frames beyond target interval
double m_targetIntervalMs = 100.0; // default 100ms (~10 FPS), 0 = no limit (process all frames)
std::chrono::steady_clock::time_point m_lastProcessedTime; // timestamp of last processed frame
bool m_targetFPSInitialized = false; // first-frame flag
BOOL m_bPlaying; BOOL m_bPlaying;
BOOL m_bPaused; BOOL m_bPaused;

133
NV12_GLOBAL_POOL_FIX_V2.md Normal file
View File

@@ -0,0 +1,133 @@
# NV12 Global Slot Pool Fix — Complete Reference (v2)
## Problem Statement
When RTSP cameras disconnect in LabVIEW, the flow is `ReleaseANSRTSPHandle → Destroy() → delete → CreateANSRTSPHandle`. The old per-camera GPU buffer pool was destroyed during this cycle, causing:
1. **Frozen inference** — `forceReleaseByOwner` deleted GpuFrameData mid-inference
2. **Processing spikes** — `cudaDeviceSynchronize` blocked ALL GPU work (900ms+)
3. **Crashes** — inference read freed pool buffers after camera deletion
## Architecture: Global GpuNV12SlotPool
GPU buffer ownership is **decoupled from camera lifetime**:
- Buffers live in a **process-wide singleton** (`GpuNV12SlotPool`)
- Slots are **recycled** (never freed during camera Destroy)
- **50ms cooldown** prevents slot reuse while GPU kernels still read
- **Per-slot non-blocking CUDA stream** avoids NULL-stream implicit sync
- **Background av_frame_free thread** removes SRW lock blocking from hot path
## Files Modified (from original codebase)
### NEW FILES (3 + copies)
| File | Purpose |
|------|---------|
| `include/GpuNV12SlotPool.h` | Global pool header — GpuNV12Slot struct, GpuNV12SlotPool class |
| `modules/ANSCV/GpuNV12SlotPool.cpp` | Canonical singleton + acquire() implementation (CUDA) |
| `modules/ANSODEngine/GpuNV12SlotPool.cpp` | Cross-DLL resolver via GetProcAddress |
| `modules/ANSOCR/GpuNV12SlotPool.cpp` | Same resolver (copy of ANSODEngine version) |
| `modules/ANSFR/GpuNV12SlotPool.cpp` | Same resolver (copy of ANSODEngine version) |
| `modules/ANSLPR/GpuNV12SlotPool.cpp` | Same resolver (copy of ANSODEngine version) |
### MODIFIED FILES
| File | Changes |
|------|---------|
| `include/ANSGpuFrameRegistry.h` | Added `#include "GpuNV12SlotPool.h"`, `GpuNV12Slot* poolSlot` field in GpuFrameData, move constructor transfers poolSlot, `freeOwnedBuffers_locked()` calls `deferRelease(poolSlot)`, added `pushPendingFree_locked()`, debug macros guarded by `ANSCORE_GPU_DEBUG` |
| `modules/ANSCV/ANSGpuFrameOps.h` | `gpu_frame_attach_cuda()` rewritten: sync D2D on per-slot stream, deferred av_frame_free, CPU snapshot only on fallback, background av_frame_free thread in `gpu_frame_evict_stale()`. Debug macros guarded. |
| `modules/ANSCV/ANSRTSP.h` | Removed `GpuNV12Pool` struct, `EnsureGpuPool()`, `DestroyGpuPool()`, `GetGpuPool()` |
| `modules/ANSCV/ANSRTSP.cpp` | Removed `EnsureGpuPool`/`DestroyGpuPool` implementations. `Destroy()` and `Reconnect()` simplified: no `forceReleaseByOwner`, no `cudaDeviceSynchronize`, no `DestroyGpuPool`. `GetRTSPCVImage()` uses `GpuNV12SlotPool::instance().acquire()`. Added SLOW FRAME timing log (>500ms, to both spdlog and DebugView). Debug macros guarded. |
| `modules/ANSODEngine/NV12PreprocessHelper.cpp` | Debug logging blocks guarded by `ANSCORE_GPU_DEBUG`. One-time `[NV12 ACTIVE]` log to DebugView when NV12 fast path activates. |
| `modules/ANSODEngine/CMakeLists.txt` | Added `GpuNV12SlotPool.cpp` to source list |
| `modules/ANSFR/CMakeLists.txt` | Added `GpuNV12SlotPool.cpp` to source list |
| `modules/ANSLPR/CMakeLists.txt` | Added `GpuNV12SlotPool.cpp` to source list |
| (ANSOCR uses file GLOB — auto-included) | |
## Key Design Decisions
| Decision | Rationale |
|----------|-----------|
| **Sync D2D on per-slot stream** | Non-blocking stream avoids NULL-stream implicit sync with inference (was causing 1-2s stalls). `cudaStreamSynchronize` waits only for the 2 copies (~1.5ms). Slot held briefly → pool stays small (64 slots for 20+ cameras). |
| **50ms cooldown on slot reuse** | GPU kernels complete in <10ms. 50ms = 5× safety margin. Prevents buffer overwrite while inference reads. Short enough to keep pool pressure low. |
| **Background av_frame_free thread** | `av_frame_free` on CUDA-mapped frames acquires nvcuda64 SRW lock (5-20ms each). Background thread frees in batches every 50ms, removing all SRW lock blocking from camera hot path. |
| **CPU NV12 snapshot deferred to fallback only** | 4K snapshot = ~12MB malloc+memcpy+free per frame (~276MB/s). Only needed for cross-GPU fallback (rare). Skipping on pool-success path eliminates heap churn. |
| **Debug logging guarded by ANSCORE_GPU_DEBUG** | 500-2000 OutputDebugString calls/sec caused process-wide mutex convoy stalls. Default off. Add `-DANSCORE_GPU_DEBUG=1` to CMake to re-enable. |
| **Always-on diagnostics** | NEW slot allocation, POOL FULL, SLOW FRAME (>500ms), and NV12 ACTIVE path selection always log to DebugView (low volume, ~1-10 per session). |
## Data Flow
```
GetRTSPCVImage (camera thread):
1. GetImage() → BGR frame (shallow copy)
2. anscv_mat_replace → swap Mat pointer
3. TryIncrementInFlight() → atomic guard
4. GetCudaHWFrame() → NVDEC device pointers
5. GetNV12Frame() → CPU NV12 AVFrame (cloned)
6. slot = GpuNV12SlotPool::acquire(gpuIdx, w, h)
└─ drainCooledSlots_locked() first (COOLING→FREE if >50ms)
7. gpu_frame_attach_cuda(*image, cudaFrame, gpuIdx, pts, cpuNV12, slot):
a. cudaMemcpy2DAsync(slot->bufY, ..., nvdecY, ..., slot->copyStream)
b. cudaMemcpy2DAsync(slot->bufUV, ..., nvdecUV, ..., slot->copyStream)
c. cudaStreamSynchronize(slot->copyStream) — waits ~1.5ms (copy only)
d. data.poolSlot = slot
e. DEFERRED: push cudaFrame+cpuNV12 to m_pendingFree (NOT av_frame_free)
f. registry.attach(mat, data)
8. Wire onReleaseFn → DecrementInFlight
9. return (~3-5ms total)
Inference (engine thread):
1. gpuFrame = lookup(*cvImage) → GpuFrameData*
2. tl_currentGpuFrame() = gpuFrame
3. tryNV12(): reads yPlane/uvPlane → slot buffers (data is valid, sync done)
4. NV12→RGB kernel launch → reads from slot buffer
5. Inference finishes → clone released → refcount→0
→ freeOwnedBuffers_locked → deferRelease(poolSlot) → COOLING
→ onReleaseFn → DecrementInFlight
Background av_frame_free thread (started once):
- Every 50ms: drain m_pendingFree → av_frame_free each
- Runs independently of camera/inference threads
- SRW lock blocking happens HERE, not in hot path
Slot lifecycle:
acquire() → STATE_ACTIVE
refcount→0 → deferRelease → STATE_COOLING (cooldownStart = now)
50ms later → drainCooledSlots_locked → STATE_FREE
next acquire() → reuse
Destroy (camera thread) — LIGHTWEIGHT:
1. _isPlaying = false
2. Wait _inFlightFrames == 0 (fast — sync copy means in-flight = GetRTSPCVImage only)
3. invalidateOwner(this) — prevent stale callbacks
4. close() — destroys NVDEC decoder only
*** NO forceReleaseByOwner ***
*** NO cudaDeviceSynchronize ***
*** NO DestroyGpuPool ***
Pool slots survive — inference keeps reading safely.
```
## DebugView Diagnostics (always-on)
```
[NV12Pool] NEW slot #1: 1920x1080 gpu=0 Y=0000001764000000 UV=... stream=...
[NV12Pool] NEW slot #2: 3840x2160 gpu=0 Y=... UV=... stream=...
[NV12 ACTIVE] ANSRTYOLO Path: CUDA_ZERO_COPY | isCuda=1 gpuMatch=1 decodeGpu=0 infGpu=0 frame=1920x1080
[GetRTSPCVImage] SLOW FRAME: total=523.1ms (getImage=2.1ms cuda=521.0ms) 3840x2160
[NV12Pool] POOL FULL (64 slots) — fallback to CPU
```
## Build Configuration
- **Production (default):** Debug logging OFF. Only slot allocation, POOL FULL, SLOW FRAME, and NV12 ACTIVE visible in DebugView.
- **Debug:** Add `-DANSCORE_GPU_DEBUG=1` to CMake. Enables per-frame verbose logging (WARNING: causes performance degradation from OutputDebugString lock contention at high frame rates).
## Test Checklist
- [ ] Start multiple RTSP cameras with HW decoding + multiple AI engines
- [ ] Verify DebugView shows: NEW slot allocations, NV12 ACTIVE with CUDA_ZERO_COPY
- [ ] Verify: zero POOL FULL entries
- [ ] Verify: zero or very few SLOW FRAME entries (>500ms)
- [ ] Trigger camera reconnect (disconnect cable or ReleaseHandle+CreateHandle)
- [ ] Verify: no crash, inference continues on remaining cameras
- [ ] Verify: processing time chart stable (no multi-second spikes)
- [ ] Check nvidia-smi: VRAM stable (slots recycled, not growing)
- [ ] Long run: 1+ hours with cameras reconnecting periodically

View File

@@ -132,6 +132,13 @@ struct GpuFrameData {
// freed while any consumer is still reading it. // freed while any consumer is still reading it.
GpuNV12Slot* poolSlot = nullptr; GpuNV12Slot* poolSlot = nullptr;
// --- Async D2D copy stream ---
// The CUDA stream used for the async D2D copy from NVDEC surface to pool buffer.
// Inference MUST call cudaStreamSynchronize on this before reading yPlane/uvPlane
// to ensure the copy has completed. Stored as void* to avoid cuda_runtime.h here.
// nullptr means D2D was synchronous (legacy path) or no D2D copy was done.
void* d2dCopyStream = nullptr;
// Default constructor // Default constructor
GpuFrameData() = default; GpuFrameData() = default;
@@ -151,6 +158,7 @@ struct GpuFrameData {
, refcount(o.refcount.load()), createdAt(o.createdAt) , refcount(o.refcount.load()), createdAt(o.createdAt)
, ownerClient(o.ownerClient), onReleaseFn(o.onReleaseFn) , ownerClient(o.ownerClient), onReleaseFn(o.onReleaseFn)
, poolSlot(o.poolSlot) , poolSlot(o.poolSlot)
, d2dCopyStream(o.d2dCopyStream)
{ {
// Null out source to prevent double-free of owned pointers // Null out source to prevent double-free of owned pointers
o.cpuYPlane = nullptr; o.cpuYPlane = nullptr;
@@ -165,6 +173,7 @@ struct GpuFrameData {
o.ownerClient = nullptr; o.ownerClient = nullptr;
o.onReleaseFn = nullptr; o.onReleaseFn = nullptr;
o.poolSlot = nullptr; o.poolSlot = nullptr;
o.d2dCopyStream = nullptr;
} }
// No copy // No copy
@@ -360,6 +369,12 @@ public:
return result; return result;
} }
// Push an AVFrame* (as void*) for deferred freeing.
// Caller MUST hold the lock via acquire_lock().
void pushPendingFree_locked(void* ptr) {
if (ptr) m_pendingFree.push_back(ptr);
}
// --- Drain pending GPU device pointers for caller to cudaFree --- // --- Drain pending GPU device pointers for caller to cudaFree ---
// Each entry includes the device index for cudaSetDevice before cudaFree. // Each entry includes the device index for cudaSetDevice before cudaFree.
// If minAgeMs > 0, only drain entries older than minAgeMs milliseconds. // If minAgeMs > 0, only drain entries older than minAgeMs milliseconds.

View File

@@ -97,7 +97,8 @@ struct GpuNV12Slot {
// first — causing 1-2 second stalls. Using a dedicated non-blocking // first — causing 1-2 second stalls. Using a dedicated non-blocking
// stream avoids this implicit sync entirely. // stream avoids this implicit sync entirely.
// Stored as void* to avoid cuda_runtime.h in the header. // Stored as void* to avoid cuda_runtime.h in the header.
void* copyStream = nullptr; // cudaStream_t void* copyStream = nullptr; // cudaStream_t
}; };
class GpuNV12SlotPool { class GpuNV12SlotPool {
@@ -119,6 +120,7 @@ public:
// Returns nullptr if pool full — caller falls back to CPU path. // Returns nullptr if pool full — caller falls back to CPU path.
GpuNV12Slot* acquire(int gpuIdx, int w, int h); GpuNV12Slot* acquire(int gpuIdx, int w, int h);
// Deferred release: moves slot from ACTIVE → COOLING. // Deferred release: moves slot from ACTIVE → COOLING.
// Called from freeOwnedBuffers_locked() when GpuFrameData refcount → 0. // Called from freeOwnedBuffers_locked() when GpuFrameData refcount → 0.
// The slot becomes FREE after SLOT_COOLDOWN_MS elapses (checked in acquire). // The slot becomes FREE after SLOT_COOLDOWN_MS elapses (checked in acquire).

View File

@@ -621,6 +621,14 @@ namespace ANSCENTER {
std::lock_guard<std::recursive_mutex> lock(_mutex); std::lock_guard<std::recursive_mutex> lock(_mutex);
_playerClient->setImageQuality(mode); // 0=fast (AI), 1=quality (display) _playerClient->setImageQuality(mode); // 0=fast (AI), 1=quality (display)
} }
void ANSFLVClient::SetTargetFPS(double intervalMs) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
_playerClient->setTargetFPS(intervalMs); // 0=no limit, 100=~10FPS, 200=~5FPS
}
void ANSFLVClient::SetNV12FastPath(bool enable) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
_useNV12FastPath = enable;
}
AVFrame* ANSFLVClient::GetNV12Frame() { AVFrame* ANSFLVClient::GetNV12Frame() {
std::lock_guard<std::recursive_mutex> lock(_mutex); std::lock_guard<std::recursive_mutex> lock(_mutex);
return _playerClient->getNV12Frame(); // Returns clone, caller must av_frame_free return _playerClient->getNV12Frame(); // Returns clone, caller must av_frame_free
@@ -767,17 +775,18 @@ extern "C" __declspec(dllexport) int GetFLVCVImage(ANSCENTER::ANSFLVClient** Han
// Thread-safe Mat pointer swap (anscv_mat_replace has its own internal lock) // Thread-safe Mat pointer swap (anscv_mat_replace has its own internal lock)
anscv_mat_replace(image, std::move(img)); anscv_mat_replace(image, std::move(img));
// Attach NV12 frame for GPU fast-path inference (side-table registry) // NV12 GPU fast path (optional — disabled by default for stability)
// attach() takes ownership — do NOT av_frame_free here if ((*Handle)->IsNV12FastPath()) {
int gpuIdx = (*Handle)->GetHWDecodingGpuIndex(); int gpuIdx = (*Handle)->GetHWDecodingGpuIndex();
AVFrame* cudaHW = (*Handle)->GetCudaHWFrame(); AVFrame* cudaHW = (*Handle)->GetCudaHWFrame();
if (cudaHW) { if (cudaHW) {
AVFrame* cpuNV12 = (*Handle)->GetNV12Frame(); AVFrame* cpuNV12 = (*Handle)->GetNV12Frame();
gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12); gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12);
} else { } else {
AVFrame* nv12 = (*Handle)->GetNV12Frame(); AVFrame* nv12 = (*Handle)->GetNV12Frame();
if (nv12) { if (nv12) {
gpu_frame_attach(*image, nv12, gpuIdx, timeStamp); gpu_frame_attach(*image, nv12, gpuIdx, timeStamp);
}
} }
} }
@@ -952,6 +961,18 @@ extern "C" __declspec(dllexport) void SetFLVDisplayResolution(ANSCENTER::ANSFLVC
(*Handle)->SetDisplayResolution(width, height); (*Handle)->SetDisplayResolution(width, height);
} catch (...) { } } catch (...) { }
} }
extern "C" __declspec(dllexport) void SetFLVTargetFPS(ANSCENTER::ANSFLVClient** Handle, double intervalMs) {
if (Handle == nullptr || *Handle == nullptr) return;
try {
(*Handle)->SetTargetFPS(intervalMs);
} catch (...) { }
}
extern "C" __declspec(dllexport) void SetFLVNV12FastPath(ANSCENTER::ANSFLVClient** Handle, int enable) {
if (Handle == nullptr || *Handle == nullptr) return;
try {
(*Handle)->SetNV12FastPath(enable != 0);
} catch (...) { }
}
// ============================================================================ // ============================================================================
// V2 entry points — accept handle by value (uint64_t) instead of Handle** // V2 entry points — accept handle by value (uint64_t) instead of Handle**

View File

@@ -36,6 +36,7 @@ namespace ANSCENTER
int _imageWidth, _imageHeight; int _imageWidth, _imageHeight;
int64_t _pts; int64_t _pts;
bool _isPlaying; bool _isPlaying;
bool _useNV12FastPath = false; // false = original stable CPU path, true = NV12 GPU fast path
std::recursive_mutex _mutex; std::recursive_mutex _mutex;
public: public:
ANSFLVClient(); ANSFLVClient();
@@ -71,6 +72,9 @@ namespace ANSCENTER
int GetHWDecodingGpuIndex(); int GetHWDecodingGpuIndex();
void SetDisplayResolution(int width, int height); // Set display output size; 0,0 = original (no resize) void SetDisplayResolution(int width, int height); // Set display output size; 0,0 = original (no resize)
void SetImageQuality(int mode); // 0=fast (AI), 1=quality (display) void SetImageQuality(int mode); // 0=fast (AI), 1=quality (display)
void SetTargetFPS(double intervalMs); // Set min interval between processed frames in ms (0 = no limit, 100 = ~10 FPS, 200 = ~5 FPS)
void SetNV12FastPath(bool enable); // true = NV12 GPU fast path, false = original CPU path (stable)
bool IsNV12FastPath() const { return _useNV12FastPath; }
AVFrame* GetNV12Frame(); // Returns cloned NV12 frame for GPU fast-path (caller must av_frame_free) AVFrame* GetNV12Frame(); // Returns cloned NV12 frame for GPU fast-path (caller must av_frame_free)
AVFrame* GetCudaHWFrame(); // Returns CUDA HW frame (device ptrs) for zero-copy inference AVFrame* GetCudaHWFrame(); // Returns CUDA HW frame (device ptrs) for zero-copy inference
bool IsCudaHWAccel(); // true when decoder uses CUDA (NV12 stays in GPU VRAM) bool IsCudaHWAccel(); // true when decoder uses CUDA (NV12 stays in GPU VRAM)
@@ -108,4 +112,6 @@ extern "C" __declspec(dllexport) int IsFLVHWDecodingActive(ANSCENTER::ANSFLVCli
extern "C" __declspec(dllexport) int GetFLVHWDecodingGpuIndex(ANSCENTER::ANSFLVClient** Handle); extern "C" __declspec(dllexport) int GetFLVHWDecodingGpuIndex(ANSCENTER::ANSFLVClient** Handle);
extern "C" __declspec(dllexport) void SetFLVImageQuality(ANSCENTER::ANSFLVClient** Handle, int mode); extern "C" __declspec(dllexport) void SetFLVImageQuality(ANSCENTER::ANSFLVClient** Handle, int mode);
extern "C" __declspec(dllexport) void SetFLVDisplayResolution(ANSCENTER::ANSFLVClient** Handle, int width, int height); extern "C" __declspec(dllexport) void SetFLVDisplayResolution(ANSCENTER::ANSFLVClient** Handle, int width, int height);
extern "C" __declspec(dllexport) void SetFLVTargetFPS(ANSCENTER::ANSFLVClient** Handle, double intervalMs);
extern "C" __declspec(dllexport) void SetFLVNV12FastPath(ANSCENTER::ANSFLVClient** Handle, int enable);
#endif #endif

View File

@@ -23,6 +23,8 @@ extern "C" {
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <cstring> #include <cstring>
#include <cstdlib> #include <cstdlib>
#include <thread>
#include <mutex>
#include <cstdio> #include <cstdio>
#ifdef _WIN32 #ifdef _WIN32
@@ -166,16 +168,13 @@ inline void gpu_frame_attach(cv::Mat* mat, AVFrame* nv12, int gpuIdx, int64_t pt
void* old = ANSGpuFrameRegistry::instance().attach(mat, std::move(data)); void* old = ANSGpuFrameRegistry::instance().attach(mat, std::move(data));
if (old) { if (old) {
AVFrame* oldFrame = static_cast<AVFrame*>(old); // Defer old frame's AVFrame free
av_frame_free(&oldFrame); auto& reg = ANSGpuFrameRegistry::instance();
auto lk = reg.acquire_lock();
reg.pushPendingFree_locked(old);
} }
// Free stale entries evicted by TTL or previous attach // NOTE: No drain_pending() here (hot path). Freed by evict_stale.
auto pending = ANSGpuFrameRegistry::instance().drain_pending();
for (void* p : pending) {
AVFrame* stale = static_cast<AVFrame*>(p);
av_frame_free(&stale);
}
} }
// Attach CUDA HW frame — copies NV12 from NVDEC surfaces to owned GPU memory. // Attach CUDA HW frame — copies NV12 from NVDEC surfaces to owned GPU memory.
@@ -226,13 +225,10 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
if (slot && slot->bufY && slot->bufUV && slot->pitchY > 0 && slot->pitchUV > 0) { if (slot && slot->bufY && slot->bufUV && slot->pitchY > 0 && slot->pitchUV > 0) {
// --- Global pool path: D2D copy on per-slot non-blocking stream --- // --- Global pool path: D2D copy on per-slot non-blocking stream ---
// CRITICAL: Using the NULL stream (cudaMemcpy2D without stream) causes // cudaMemcpy2DAsync + cudaStreamSynchronize(slotStream):
// 1-2 second stalls on WDDM because it implicitly synchronizes with // - Non-blocking stream avoids NULL-stream implicit sync with inference
// ALL other streams before executing. By using cudaMemcpy2DAsync on // - Sync waits ONLY for the 2 copies (~1.5ms for 4K, ~0.3ms for 1080p)
// the slot's own non-blocking stream + cudaStreamSynchronize, we: // - Data valid after sync — av_frame_free is safe
// 1. Submit the copy immediately (no wait for inference kernels)
// 2. Wait ONLY for this copy to finish (~0.3ms 1080p, ~1.2ms 4K)
// 3. Data is valid after sync — av_frame_free is safe
int prevDev = -1; int prevDev = -1;
cudaGetDevice(&prevDev); cudaGetDevice(&prevDev);
if (gpuIdx >= 0) cudaSetDevice(gpuIdx); if (gpuIdx >= 0) cudaSetDevice(gpuIdx);
@@ -247,13 +243,13 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
e4 = cudaMemcpy2DAsync(slot->bufUV, slot->pitchUV, e4 = cudaMemcpy2DAsync(slot->bufUV, slot->pitchUV,
cudaFrame->data[1], cudaFrame->linesize[1], cudaFrame->data[1], cudaFrame->linesize[1],
w, h / 2, cudaMemcpyDeviceToDevice, copyStream); w, h / 2, cudaMemcpyDeviceToDevice, copyStream);
if (e3 == cudaSuccess && e4 == cudaSuccess) { // NO cudaStreamSynchronize here — let the copy run asynchronously.
// Wait ONLY for this stream's 2 copies (~0.3-1.2ms). // The camera thread is NOT blocked by the WDDM SRW lock.
// Does NOT wait for inference kernels on other streams. // Inference will call cudaStreamSynchronize(d2dCopyStream) in tryNV12()
cudaStreamSynchronize(copyStream); // before reading the buffer. By that time (~50-200ms later), the copy
} // (~0.3ms for 1080p, ~1.5ms for 4K) has long completed, so the sync
// returns immediately with zero blocking.
} else { } else {
// Fallback if stream creation failed — NULL stream (may stall)
e3 = cudaMemcpy2D(slot->bufY, slot->pitchY, e3 = cudaMemcpy2D(slot->bufY, slot->pitchY,
cudaFrame->data[0], cudaFrame->linesize[0], cudaFrame->data[0], cudaFrame->linesize[0],
w, h, cudaMemcpyDeviceToDevice); w, h, cudaMemcpyDeviceToDevice);
@@ -270,15 +266,14 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
data.uvPlane = static_cast<uint8_t*>(slot->bufUV); data.uvPlane = static_cast<uint8_t*>(slot->bufUV);
data.yLinesize = static_cast<int>(slot->pitchY); data.yLinesize = static_cast<int>(slot->pitchY);
data.uvLinesize = static_cast<int>(slot->pitchUV); data.uvLinesize = static_cast<int>(slot->pitchUV);
data.poolSlot = slot; // Track for deferred release data.poolSlot = slot;
// gpuCacheY/UV stay nullptr — global pool owns the buffers data.d2dCopyStream = copyStream; // Inference syncs on this before reading
d2dOk = true; d2dOk = true;
GPU_FRAME_DBG("attach_cuda: D2D OK (global pool) Y=%p UV=%p yPitch=%zu uvPitch=%zu", GPU_FRAME_DBG("attach_cuda: D2D OK (global pool, async) Y=%p UV=%p yPitch=%zu uvPitch=%zu stream=%p",
slot->bufY, slot->bufUV, slot->pitchY, slot->pitchUV); slot->bufY, slot->bufUV, slot->pitchY, slot->pitchUV, copyStream);
} else { } else {
GPU_FRAME_DBG("attach_cuda: D2D COPY FAILED (pool) e3=%d e4=%d — fallback", GPU_FRAME_DBG("attach_cuda: D2D COPY FAILED (pool) e3=%d e4=%d — fallback",
(int)e3, (int)e4); (int)e3, (int)e4);
// Release slot back to pool on failure (immediate, no cooldown needed)
slot->state.store(GpuNV12Slot::STATE_FREE, std::memory_order_release); slot->state.store(GpuNV12Slot::STATE_FREE, std::memory_order_release);
} }
} }
@@ -364,13 +359,34 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
data.uvLinesize = data.cpuUvLinesize; data.uvLinesize = data.cpuUvLinesize;
} }
// Free AVFrames immediately — synchronous D2D copy has completed, // AVFrame lifetime management:
// so NVDEC surfaces can be returned to the decoder's surface pool. // - If D2D was ASYNC (d2dCopyStream != null): keep cudaFrame alive in
GPU_FRAME_DBG("attach_cuda: freeing AVFrames cudaFrame=%p cpuNV12=%p", // GpuFrameData.avframe so the NVDEC surface (copy source) remains valid
(void*)cudaFrame, (void*)cpuNV12); // until the async copy completes. The AVFrame is freed when GpuFrameData
av_frame_free(&cudaFrame); // is released (after inference), by which time the 0.3ms copy is long done.
if (cpuNV12) av_frame_free(&cpuNV12); // - If D2D was SYNC or failed: push to pending free immediately (old behavior).
data.avframe = nullptr; if (data.d2dCopyStream && cudaFrame) {
// Async D2D — keep AVFrame alive, inference will outlive the copy
data.avframe = cudaFrame;
GPU_FRAME_DBG("attach_cuda: keeping AVFrame alive for async D2D cudaFrame=%p",
(void*)cudaFrame);
} else {
// Sync D2D or fallback — safe to defer free now
GPU_FRAME_DBG("attach_cuda: deferring AVFrame free cudaFrame=%p",
(void*)cudaFrame);
if (cudaFrame) {
auto& reg = ANSGpuFrameRegistry::instance();
auto lk = reg.acquire_lock();
reg.pushPendingFree_locked(cudaFrame);
}
data.avframe = nullptr;
}
// cpuNV12 is always safe to defer — CPU snapshot (if taken) is already copied
if (cpuNV12) {
auto& reg = ANSGpuFrameRegistry::instance();
auto lk = reg.acquire_lock();
reg.pushPendingFree_locked(cpuNV12);
}
data.cpuAvframe = nullptr; data.cpuAvframe = nullptr;
GPU_FRAME_DBG("attach_cuda: FINAL yPlane=%p uvPlane=%p isCuda=%d poolSlot=%p", GPU_FRAME_DBG("attach_cuda: FINAL yPlane=%p uvPlane=%p isCuda=%d poolSlot=%p",
@@ -379,16 +395,16 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
void* old = ANSGpuFrameRegistry::instance().attach(mat, std::move(data)); void* old = ANSGpuFrameRegistry::instance().attach(mat, std::move(data));
if (old) { if (old) {
AVFrame* oldFrame = static_cast<AVFrame*>(old); // Old frame's AVFrame returned — defer its free too
av_frame_free(&oldFrame); auto& reg = ANSGpuFrameRegistry::instance();
auto lk = reg.acquire_lock();
reg.pushPendingFree_locked(old);
} }
// Free stale AVFrames evicted by TTL or previous attach // NOTE: No drain_pending() here (hot path). AVFrames accumulate in
auto pending = ANSGpuFrameRegistry::instance().drain_pending(); // m_pendingFree and are freed by gpu_frame_evict_stale() which runs
for (void* p : pending) { // every 500ms from anscv_mat_replace. This removes av_frame_free
AVFrame* stale = static_cast<AVFrame*>(p); // (5-20ms SRW lock per call) from the camera frame-grabbing path.
av_frame_free(&stale);
}
} }
// Release entry by cv::Mat* and free any returned AVFrames. // Release entry by cv::Mat* and free any returned AVFrames.
@@ -400,14 +416,7 @@ inline void gpu_frame_remove(cv::Mat* mat) {
GPU_FRAME_DBG("gpu_frame_remove: mat=%p", (void*)mat); GPU_FRAME_DBG("gpu_frame_remove: mat=%p", (void*)mat);
ANSGpuFrameRegistry::instance().release(mat); ANSGpuFrameRegistry::instance().release(mat);
// Free any AVFrames that became pending from this release or prior eviction // NOTE: No drain_pending() here (hot path). AVFrames freed by evict_stale.
auto pending = ANSGpuFrameRegistry::instance().drain_pending();
for (void* p : pending) {
AVFrame* stale = static_cast<AVFrame*>(p);
av_frame_free(&stale);
}
// GPU device pointers deferred — see gpu_frame_evict_stale() / Destroy()
} }
// Alias for remove — used in ANSCV mutating functions to drop stale GPU data. // Alias for remove — used in ANSCV mutating functions to drop stale GPU data.
@@ -425,10 +434,39 @@ inline void gpu_frame_invalidate(cv::Mat* mat) {
inline void gpu_frame_evict_stale() { inline void gpu_frame_evict_stale() {
ANSGpuFrameRegistry::instance().evictStaleFrames(); ANSGpuFrameRegistry::instance().evictStaleFrames();
auto pending = ANSGpuFrameRegistry::instance().drain_pending(); // Drain and free AVFrames on a background thread to avoid blocking the
for (void* p : pending) { // camera hot path. av_frame_free on CUDA-mapped frames can take 5-20ms
AVFrame* stale = static_cast<AVFrame*>(p); // per call due to nvcuda64 SRW lock. The background thread frees them
av_frame_free(&stale); // periodically (every 50ms) in batches.
{
static std::once_flag s_initOnce;
static std::mutex s_avFreeMutex;
static std::vector<void*> s_avFreeQueue;
// Move pending AVFrames to the background queue
auto pending = ANSGpuFrameRegistry::instance().drain_pending();
if (!pending.empty()) {
std::lock_guard<std::mutex> lock(s_avFreeMutex);
s_avFreeQueue.insert(s_avFreeQueue.end(), pending.begin(), pending.end());
}
// Start background free thread on first call
std::call_once(s_initOnce, []() {
std::thread([]() {
while (true) {
std::vector<void*> batch;
{
std::lock_guard<std::mutex> lock(s_avFreeMutex);
batch.swap(s_avFreeQueue);
}
for (void* p : batch) {
AVFrame* f = static_cast<AVFrame*>(p);
av_frame_free(&f);
}
std::this_thread::sleep_for(std::chrono::milliseconds(50));
}
}).detach();
});
} }
// Free GPU device pointers from evicted/released frames (legacy path). // Free GPU device pointers from evicted/released frames (legacy path).

View File

@@ -621,6 +621,14 @@ namespace ANSCENTER {
std::lock_guard<std::recursive_mutex> lock(_mutex); std::lock_guard<std::recursive_mutex> lock(_mutex);
_playerClient->setImageQuality(mode); // 0=fast (AI), 1=quality (display) _playerClient->setImageQuality(mode); // 0=fast (AI), 1=quality (display)
} }
void ANSMJPEGClient::SetTargetFPS(double intervalMs) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
_playerClient->setTargetFPS(intervalMs); // 0=no limit, 100=~10FPS, 200=~5FPS
}
void ANSMJPEGClient::SetNV12FastPath(bool enable) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
_useNV12FastPath = enable;
}
AVFrame* ANSMJPEGClient::GetNV12Frame() { AVFrame* ANSMJPEGClient::GetNV12Frame() {
std::lock_guard<std::recursive_mutex> lock(_mutex); std::lock_guard<std::recursive_mutex> lock(_mutex);
return _playerClient->getNV12Frame(); // Returns clone, caller must av_frame_free return _playerClient->getNV12Frame(); // Returns clone, caller must av_frame_free
@@ -768,20 +776,18 @@ extern "C" __declspec(dllexport) int GetMJPEGCVImage(ANSCENTER::ANSMJPEGClient**
// Thread-safe Mat pointer swap (anscv_mat_replace has its own internal lock) // Thread-safe Mat pointer swap (anscv_mat_replace has its own internal lock)
anscv_mat_replace(image, std::move(img)); anscv_mat_replace(image, std::move(img));
// Attach NV12 frame for GPU fast-path inference (side-table registry) // NV12 GPU fast path (optional — disabled by default for stability)
// attach() takes ownership — do NOT av_frame_free here if ((*Handle)->IsNV12FastPath()) {
int gpuIdx = (*Handle)->GetHWDecodingGpuIndex(); int gpuIdx = (*Handle)->GetHWDecodingGpuIndex();
AVFrame* cudaHW = (*Handle)->GetCudaHWFrame(); AVFrame* cudaHW = (*Handle)->GetCudaHWFrame();
if (cudaHW) { if (cudaHW) {
// CUDA zero-copy: frame data[0]/data[1] are CUDA device pointers. AVFrame* cpuNV12 = (*Handle)->GetNV12Frame();
// Also attach CPU NV12 as fallback for cross-GPU inference gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12);
// (when decode GPU != inference GPU, CUDA ptrs aren't accessible). } else {
AVFrame* cpuNV12 = (*Handle)->GetNV12Frame(); AVFrame* nv12 = (*Handle)->GetNV12Frame();
gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12); if (nv12) {
} else { gpu_frame_attach(*image, nv12, gpuIdx, timeStamp);
AVFrame* nv12 = (*Handle)->GetNV12Frame(); }
if (nv12) {
gpu_frame_attach(*image, nv12, gpuIdx, timeStamp);
} }
} }
@@ -956,6 +962,18 @@ extern "C" __declspec(dllexport) void SetMJPEGDisplayResolution(ANSCENTER::ANSMJ
(*Handle)->SetDisplayResolution(width, height); (*Handle)->SetDisplayResolution(width, height);
} catch (...) { } } catch (...) { }
} }
extern "C" __declspec(dllexport) void SetMJPEGTargetFPS(ANSCENTER::ANSMJPEGClient** Handle, double intervalMs) {
if (Handle == nullptr || *Handle == nullptr) return;
try {
(*Handle)->SetTargetFPS(intervalMs);
} catch (...) { }
}
extern "C" __declspec(dllexport) void SetMJPEGNV12FastPath(ANSCENTER::ANSMJPEGClient** Handle, int enable) {
if (Handle == nullptr || *Handle == nullptr) return;
try {
(*Handle)->SetNV12FastPath(enable != 0);
} catch (...) { }
}
// ============================================================================ // ============================================================================
// V2 entry points — accept handle as uint64_t by value (LabVIEW safe) // V2 entry points — accept handle as uint64_t by value (LabVIEW safe)

View File

@@ -35,6 +35,7 @@ namespace ANSCENTER
int _imageWidth, _imageHeight; int _imageWidth, _imageHeight;
int64_t _pts; int64_t _pts;
bool _isPlaying; bool _isPlaying;
bool _useNV12FastPath = false;
std::recursive_mutex _mutex; std::recursive_mutex _mutex;
public: public:
ANSMJPEGClient(); ANSMJPEGClient();
@@ -70,6 +71,9 @@ namespace ANSCENTER
int GetHWDecodingGpuIndex(); int GetHWDecodingGpuIndex();
void SetDisplayResolution(int width, int height); // Set display output size; 0,0 = original (no resize) void SetDisplayResolution(int width, int height); // Set display output size; 0,0 = original (no resize)
void SetImageQuality(int mode); // 0=fast (AI), 1=quality (display) void SetImageQuality(int mode); // 0=fast (AI), 1=quality (display)
void SetTargetFPS(double intervalMs); // Set min interval between processed frames in ms (0 = no limit, 100 = ~10 FPS, 200 = ~5 FPS)
void SetNV12FastPath(bool enable); // true = NV12 GPU fast path, false = original CPU path (stable)
bool IsNV12FastPath() const { return _useNV12FastPath; }
AVFrame* GetNV12Frame(); // Returns cloned NV12 frame for GPU fast-path (caller must av_frame_free) AVFrame* GetNV12Frame(); // Returns cloned NV12 frame for GPU fast-path (caller must av_frame_free)
AVFrame* GetCudaHWFrame(); // Returns CUDA HW frame (device ptrs) for zero-copy inference AVFrame* GetCudaHWFrame(); // Returns CUDA HW frame (device ptrs) for zero-copy inference
bool IsCudaHWAccel(); // true when decoder uses CUDA (NV12 stays in GPU VRAM) bool IsCudaHWAccel(); // true when decoder uses CUDA (NV12 stays in GPU VRAM)
@@ -108,4 +112,6 @@ extern "C" __declspec(dllexport) int IsMJPEGHWDecodingActive(ANSCENTER::ANSMJPE
extern "C" __declspec(dllexport) int GetMJPEGHWDecodingGpuIndex(ANSCENTER::ANSMJPEGClient** Handle); extern "C" __declspec(dllexport) int GetMJPEGHWDecodingGpuIndex(ANSCENTER::ANSMJPEGClient** Handle);
extern "C" __declspec(dllexport) void SetMJPEGImageQuality(ANSCENTER::ANSMJPEGClient** Handle, int mode); extern "C" __declspec(dllexport) void SetMJPEGImageQuality(ANSCENTER::ANSMJPEGClient** Handle, int mode);
extern "C" __declspec(dllexport) void SetMJPEGDisplayResolution(ANSCENTER::ANSMJPEGClient** Handle, int width, int height); extern "C" __declspec(dllexport) void SetMJPEGDisplayResolution(ANSCENTER::ANSMJPEGClient** Handle, int width, int height);
extern "C" __declspec(dllexport) void SetMJPEGTargetFPS(ANSCENTER::ANSMJPEGClient** Handle, double intervalMs);
extern "C" __declspec(dllexport) void SetMJPEGNV12FastPath(ANSCENTER::ANSMJPEGClient** Handle, int enable);
#endif #endif

View File

@@ -635,6 +635,14 @@ namespace ANSCENTER {
std::lock_guard<std::recursive_mutex> lock(_mutex); std::lock_guard<std::recursive_mutex> lock(_mutex);
_playerClient->setImageQuality(mode); // 0=fast (AI), 1=quality (display) _playerClient->setImageQuality(mode); // 0=fast (AI), 1=quality (display)
} }
void ANSRTMPClient::SetTargetFPS(double intervalMs) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
_playerClient->setTargetFPS(intervalMs); // 0=no limit, 100=~10FPS, 200=~5FPS
}
void ANSRTMPClient::SetNV12FastPath(bool enable) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
_useNV12FastPath = enable;
}
AVFrame* ANSRTMPClient::GetNV12Frame() { AVFrame* ANSRTMPClient::GetNV12Frame() {
std::lock_guard<std::recursive_mutex> lock(_mutex); std::lock_guard<std::recursive_mutex> lock(_mutex);
return _playerClient->getNV12Frame(); // Returns clone, caller must av_frame_free return _playerClient->getNV12Frame(); // Returns clone, caller must av_frame_free
@@ -792,20 +800,18 @@ extern "C" __declspec(dllexport) int GetRTMPCVImage(ANSCENTER::ANSRTMPClient** H
// Thread-safe Mat pointer swap (anscv_mat_replace has its own internal lock) // Thread-safe Mat pointer swap (anscv_mat_replace has its own internal lock)
anscv_mat_replace(image, std::move(img)); anscv_mat_replace(image, std::move(img));
// Attach NV12 frame for GPU fast-path inference (side-table registry) // NV12 GPU fast path (optional — disabled by default for stability)
// attach() takes ownership — do NOT av_frame_free here if ((*Handle)->IsNV12FastPath()) {
int gpuIdx = (*Handle)->GetHWDecodingGpuIndex(); int gpuIdx = (*Handle)->GetHWDecodingGpuIndex();
AVFrame* cudaHW = (*Handle)->GetCudaHWFrame(); AVFrame* cudaHW = (*Handle)->GetCudaHWFrame();
if (cudaHW) { if (cudaHW) {
// CUDA zero-copy: frame data[0]/data[1] are CUDA device pointers. AVFrame* cpuNV12 = (*Handle)->GetNV12Frame();
// Also attach CPU NV12 as fallback for cross-GPU inference gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12);
// (when decode GPU != inference GPU, CUDA ptrs aren't accessible). } else {
AVFrame* cpuNV12 = (*Handle)->GetNV12Frame(); AVFrame* nv12 = (*Handle)->GetNV12Frame();
gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12); if (nv12) {
} else { gpu_frame_attach(*image, nv12, gpuIdx, timeStamp);
AVFrame* nv12 = (*Handle)->GetNV12Frame(); }
if (nv12) {
gpu_frame_attach(*image, nv12, gpuIdx, timeStamp);
} }
} }
@@ -978,6 +984,18 @@ extern "C" __declspec(dllexport) void SetRTMPDisplayResolution(ANSCENTER::ANSRTM
(*Handle)->SetDisplayResolution(width, height); (*Handle)->SetDisplayResolution(width, height);
} catch (...) { } } catch (...) { }
} }
extern "C" __declspec(dllexport) void SetRTMPTargetFPS(ANSCENTER::ANSRTMPClient** Handle, double intervalMs) {
if (Handle == nullptr || *Handle == nullptr) return;
try {
(*Handle)->SetTargetFPS(intervalMs);
} catch (...) { }
}
extern "C" __declspec(dllexport) void SetRTMPNV12FastPath(ANSCENTER::ANSRTMPClient** Handle, int enable) {
if (Handle == nullptr || *Handle == nullptr) return;
try {
(*Handle)->SetNV12FastPath(enable != 0);
} catch (...) { }
}
// ============================================================================ // ============================================================================
// V2 entry points: accept handle by value (uint64_t) to avoid LabVIEW // V2 entry points: accept handle by value (uint64_t) to avoid LabVIEW

View File

@@ -36,6 +36,7 @@ namespace ANSCENTER
int _imageWidth, _imageHeight; int _imageWidth, _imageHeight;
int64_t _pts; int64_t _pts;
bool _isPlaying; bool _isPlaying;
bool _useNV12FastPath = false;
std::recursive_mutex _mutex; std::recursive_mutex _mutex;
public: public:
ANSRTMPClient(); ANSRTMPClient();
@@ -71,6 +72,9 @@ namespace ANSCENTER
int GetHWDecodingGpuIndex(); int GetHWDecodingGpuIndex();
void SetDisplayResolution(int width, int height); // Set display output size; 0,0 = original (no resize) void SetDisplayResolution(int width, int height); // Set display output size; 0,0 = original (no resize)
void SetImageQuality(int mode); // 0=fast (AI), 1=quality (display) void SetImageQuality(int mode); // 0=fast (AI), 1=quality (display)
void SetTargetFPS(double intervalMs); // Set min interval between processed frames in ms (0 = no limit, 100 = ~10 FPS, 200 = ~5 FPS)
void SetNV12FastPath(bool enable); // true = NV12 GPU fast path, false = original CPU path (stable)
bool IsNV12FastPath() const { return _useNV12FastPath; }
AVFrame* GetNV12Frame(); // Returns cloned NV12 frame for GPU fast-path (caller must av_frame_free) AVFrame* GetNV12Frame(); // Returns cloned NV12 frame for GPU fast-path (caller must av_frame_free)
AVFrame* GetCudaHWFrame(); // Returns CUDA HW frame (device ptrs) for zero-copy inference AVFrame* GetCudaHWFrame(); // Returns CUDA HW frame (device ptrs) for zero-copy inference
bool IsCudaHWAccel(); // true when decoder uses CUDA (NV12 stays in GPU VRAM) bool IsCudaHWAccel(); // true when decoder uses CUDA (NV12 stays in GPU VRAM)
@@ -107,4 +111,6 @@ extern "C" __declspec(dllexport) int IsRTMPHWDecodingActive(ANSCENTER::ANSRTMPC
extern "C" __declspec(dllexport) int GetRTMPHWDecodingGpuIndex(ANSCENTER::ANSRTMPClient** Handle); extern "C" __declspec(dllexport) int GetRTMPHWDecodingGpuIndex(ANSCENTER::ANSRTMPClient** Handle);
extern "C" __declspec(dllexport) void SetRTMPImageQuality(ANSCENTER::ANSRTMPClient** Handle, int mode); extern "C" __declspec(dllexport) void SetRTMPImageQuality(ANSCENTER::ANSRTMPClient** Handle, int mode);
extern "C" __declspec(dllexport) void SetRTMPDisplayResolution(ANSCENTER::ANSRTMPClient** Handle, int width, int height); extern "C" __declspec(dllexport) void SetRTMPDisplayResolution(ANSCENTER::ANSRTMPClient** Handle, int width, int height);
extern "C" __declspec(dllexport) void SetRTMPTargetFPS(ANSCENTER::ANSRTMPClient** Handle, double intervalMs);
extern "C" __declspec(dllexport) void SetRTMPNV12FastPath(ANSCENTER::ANSRTMPClient** Handle, int enable);
#endif #endif

View File

@@ -213,44 +213,44 @@ namespace ANSCENTER {
bool ANSRTSPClient::Reconnect() { bool ANSRTSPClient::Reconnect() {
// 1. Mark as not-playing under the mutex FIRST. This makes GetImage() // 1. Mark as not-playing under the mutex FIRST. This makes GetImage()
// return the cached _pLastFrame instead of calling into the player, // return the cached _pLastFrame instead of calling into the player,
// and blocks new TryIncrementInFlight calls. // and blocks new TryIncrementInFlight calls (no new NV12 attachments).
{ {
std::unique_lock<std::recursive_mutex> lock(_mutex); std::unique_lock<std::recursive_mutex> lock(_mutex);
_isPlaying = false; _isPlaying = false;
// --- Inference guard: wait for in-flight D2D copies to finish --- // --- Inference guard: wait for ALL in-flight inference to finish ---
// With synchronous D2D copy, in-flight means "currently inside // _inFlightFrames tracks frames from GetRTSPCVImage through to the
// GetRTSPCVImage between TryIncrementInFlight and attach_cuda". // end of inference (DecrementInFlight fires when last clone is released).
// This is typically <1ms, so the wait is very fast. // We MUST wait for this to reach 0 before calling close(), because
// inference may still be reading NV12 pool buffer data that depends
// on the NVDEC decoder context being alive.
//
// DO NOT force-reset _inFlightFrames or invalidate onReleaseFn —
// let inference finish naturally so DecrementInFlight fires correctly.
int inFlight = _inFlightFrames.load(std::memory_order_acquire); int inFlight = _inFlightFrames.load(std::memory_order_acquire);
if (inFlight > 0) { if (inFlight > 0) {
_logger.LogInfo("ANSRTSPClient::Reconnect", _logger.LogInfo("ANSRTSPClient::Reconnect",
std::format("waiting for {} in-flight frame(s)...", inFlight), std::format("waiting for {} in-flight inference(s) to complete...", inFlight),
__FILE__, __LINE__); __FILE__, __LINE__);
bool done = _inFlightDone.wait_for(lock, std::chrono::seconds(5), [this] { bool done = _inFlightDone.wait_for(lock, std::chrono::seconds(10), [this] {
return _inFlightFrames.load(std::memory_order_acquire) <= 0; return _inFlightFrames.load(std::memory_order_acquire) <= 0;
}); });
if (!done) { if (!done) {
_logger.LogWarn("ANSRTSPClient::Reconnect", _logger.LogWarn("ANSRTSPClient::Reconnect",
std::format("timed out — still {} in-flight", _inFlightFrames.load()), std::format("timed out — still {} in-flight, proceeding with close()",
_inFlightFrames.load()),
__FILE__, __LINE__); __FILE__, __LINE__);
// Force-reset only on timeout as last resort
ANSGpuFrameRegistry::instance().invalidateOwner(this);
_inFlightFrames.store(0, std::memory_order_release);
} }
} }
// Invalidate owner callbacks — prevents stale DecrementInFlight
// calls after Reconnect re-creates the decoder.
// Frames and their global pool slots remain alive for inference.
ANSGpuFrameRegistry::instance().invalidateOwner(this);
_inFlightFrames.store(0, std::memory_order_release);
// NO forceReleaseByOwner — frames survive reconnect.
// NO cudaDeviceSynchronize — no GPU buffers to free.
// NO DestroyGpuPool — per-camera pool has been removed.
} }
// 2. close() destroys NVDEC decoder ONLY — run outside _mutex to // 2. close() destroys NVDEC decoder ONLY — run outside _mutex to
// avoid deadlocking with nvcuda64 SRW lock held by inference. // avoid deadlocking with nvcuda64 SRW lock held by other cameras.
// Pool slot buffers are global and untouched. // At this point, all inference using this camera's NV12 data has
// completed (or timed out), so close() is safe.
_logger.LogInfo("ANSRTSPClient::Reconnect", _logger.LogInfo("ANSRTSPClient::Reconnect",
"calling close() — NVDEC decoder will be destroyed", __FILE__, __LINE__); "calling close() — NVDEC decoder will be destroyed", __FILE__, __LINE__);
RTSP_DBG("[Reconnect] BEFORE close() this=%p", (void*)this); RTSP_DBG("[Reconnect] BEFORE close() this=%p", (void*)this);
@@ -883,6 +883,14 @@ namespace ANSCENTER {
std::lock_guard<std::recursive_mutex> lock(_mutex); std::lock_guard<std::recursive_mutex> lock(_mutex);
_playerClient->setImageQuality(mode); // 0=fast (AI), 1=quality (display) _playerClient->setImageQuality(mode); // 0=fast (AI), 1=quality (display)
} }
void ANSRTSPClient::SetTargetFPS(double intervalMs) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
_playerClient->setTargetFPS(intervalMs); // 0=no limit, 100=~10FPS, 200=~5FPS
}
void ANSRTSPClient::SetNV12FastPath(bool enable) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
_useNV12FastPath = enable;
}
AVFrame* ANSRTSPClient::GetNV12Frame() { AVFrame* ANSRTSPClient::GetNV12Frame() {
std::lock_guard<std::recursive_mutex> lock(_mutex); std::lock_guard<std::recursive_mutex> lock(_mutex);
if (!_isPlaying) return nullptr; // Player may be mid-reconnect (CUDA resources freed) if (!_isPlaying) return nullptr; // Player may be mid-reconnect (CUDA resources freed)
@@ -1045,67 +1053,60 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(
auto t1 = std::chrono::steady_clock::now(); auto t1 = std::chrono::steady_clock::now();
// Attach NV12 frame for GPU fast-path inference (side-table registry) // NV12 GPU fast path: attach NV12 frame data for zero-copy inference.
// attach() takes ownership — do NOT av_frame_free here // When disabled (_useNV12FastPath=false), the original stable CPU path is used:
// // GetImage() returns BGR cv::Mat in CPU RAM → no CUDA calls → no SRW lock contention.
// CRITICAL: TryIncrementInFlight() MUST be called BEFORE GetCudaHWFrame(). // When enabled, D2D copies NV12 from NVDEC to pool buffers for GPU inference.
// It atomically checks _isPlaying and increments _inFlightFrames under if ((*Handle)->IsNV12FastPath()) {
// the same mutex, so Reconnect() cannot call close() while we're doing int gpuIdx = (*Handle)->GetHWDecodingGpuIndex();
// the D2D copy from NVDEC surfaces inside gpu_frame_attach_cuda(). bool inFlightGuardHeld = (*Handle)->TryIncrementInFlight();
int gpuIdx = (*Handle)->GetHWDecodingGpuIndex(); RTSP_DBG("[GetRTSPCVImage] mat=%p gpuIdx=%d inFlightGuard=%d",
bool inFlightGuardHeld = (*Handle)->TryIncrementInFlight(); (void*)*image, gpuIdx, (int)inFlightGuardHeld);
RTSP_DBG("[GetRTSPCVImage] mat=%p gpuIdx=%d inFlightGuard=%d",
(void*)*image, gpuIdx, (int)inFlightGuardHeld);
if (inFlightGuardHeld) { if (inFlightGuardHeld) {
AVFrame* cudaHW = (*Handle)->GetCudaHWFrame(); AVFrame* cudaHW = (*Handle)->GetCudaHWFrame();
if (cudaHW) { if (cudaHW) {
RTSP_DBG("[GetRTSPCVImage] cudaHW: %dx%d data[0]=%p data[1]=%p", RTSP_DBG("[GetRTSPCVImage] cudaHW: %dx%d data[0]=%p data[1]=%p",
cudaHW->width, cudaHW->height, cudaHW->width, cudaHW->height,
(void*)cudaHW->data[0], (void*)cudaHW->data[1]); (void*)cudaHW->data[0], (void*)cudaHW->data[1]);
// Acquire a slot from the global pool — survives camera Destroy. // Acquire a slot from the global pool — survives camera Destroy.
GpuNV12Slot* slot = GpuNV12SlotPool::instance().acquire( GpuNV12Slot* slot = GpuNV12SlotPool::instance().acquire(
gpuIdx, cudaHW->width, cudaHW->height); gpuIdx, cudaHW->width, cudaHW->height);
// Only fetch CPU NV12 if pool slot unavailable (cross-GPU fallback). // Only fetch CPU NV12 if pool slot unavailable (cross-GPU fallback).
// When slot is valid, the D2D copy goes GPU→GPU and CPU NV12 is never used. AVFrame* cpuNV12 = slot ? nullptr : (*Handle)->GetNV12Frame();
// Skipping av_frame_clone + av_frame_free saves ~0.1ms per frame. gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12, slot);
AVFrame* cpuNV12 = slot ? nullptr : (*Handle)->GetNV12Frame(); } else {
gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12, slot); // HW decode not active — try CPU NV12
} else { AVFrame* nv12 = (*Handle)->GetNV12Frame();
// HW decode not active — try CPU NV12 if (nv12) {
AVFrame* nv12 = (*Handle)->GetNV12Frame(); gpu_frame_attach(*image, nv12, gpuIdx, timeStamp);
if (nv12) { }
gpu_frame_attach(*image, nv12, gpuIdx, timeStamp);
} }
}
// Wire up the registry callback to release the in-flight guard. // Wire up the registry callback to release the in-flight guard.
// TryIncrementInFlight already incremented; DecrementInFlight fires auto* gpuData = ANSGpuFrameRegistry::instance().lookup(*image);
// when the last clone of this frame is released after inference. RTSP_DBG("[GetRTSPCVImage] after attach: gpuData=%p yPlane=%p isCuda=%d poolSlot=%p",
auto* gpuData = ANSGpuFrameRegistry::instance().lookup(*image); (void*)gpuData,
RTSP_DBG("[GetRTSPCVImage] after attach: gpuData=%p yPlane=%p isCuda=%d poolSlot=%p", gpuData ? (void*)gpuData->yPlane : nullptr,
(void*)gpuData, gpuData ? (int)gpuData->isCudaDevicePtr : -1,
gpuData ? (void*)gpuData->yPlane : nullptr, gpuData ? (void*)gpuData->poolSlot : nullptr);
gpuData ? (int)gpuData->isCudaDevicePtr : -1, if (gpuData) {
gpuData ? (void*)gpuData->poolSlot : nullptr); gpuData->ownerClient = *Handle;
if (gpuData) { gpuData->onReleaseFn = [](void* client) {
gpuData->ownerClient = *Handle; static_cast<ANSCENTER::ANSRTSPClient*>(client)->DecrementInFlight();
gpuData->onReleaseFn = [](void* client) { };
static_cast<ANSCENTER::ANSRTSPClient*>(client)->DecrementInFlight(); } else {
}; (*Handle)->DecrementInFlight();
// NOTE: Do NOT call IncrementInFlight() again here — }
// TryIncrementInFlight() already did it above.
} else { } else {
// No gpuData registered (attach failed?) — release the guard RTSP_DBG("[GetRTSPCVImage] SKIP CUDA — player not playing (reconnecting?)");
(*Handle)->DecrementInFlight();
} }
} else {
// Player is stopping/reconnecting — skip CUDA path entirely.
// GetImage() already returned a cached BGR frame, which is safe.
RTSP_DBG("[GetRTSPCVImage] SKIP CUDA — player not playing (reconnecting?)");
} }
// else: original CPU path — cv::Mat** contains BGR data in CPU RAM.
// No CUDA calls, no pool slots, no GPU frame registry.
// Inference uses cv::Mat directly (upload to GPU in engine).
// Lightweight timing — logs only when frame grab + D2D exceeds 50ms. // Lightweight timing — logs only when frame grab + D2D exceeds 50ms.
// Goes to both spdlog (console/file) AND OutputDebugString (DebugView) // Goes to both spdlog (console/file) AND OutputDebugString (DebugView)
@@ -1115,7 +1116,7 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(
double getImageMs = std::chrono::duration<double, std::milli>(t1 - t0).count(); double getImageMs = std::chrono::duration<double, std::milli>(t1 - t0).count();
double cudaMs = std::chrono::duration<double, std::milli>(t2 - t1).count(); double cudaMs = std::chrono::duration<double, std::milli>(t2 - t1).count();
double totalMs = getImageMs + cudaMs; double totalMs = getImageMs + cudaMs;
if (totalMs > 50.0) { if (totalMs > 500.0) {
auto msg = std::format("SLOW FRAME: total={:.1f}ms (getImage={:.1f}ms cuda={:.1f}ms) {}x{}", auto msg = std::format("SLOW FRAME: total={:.1f}ms (getImage={:.1f}ms cuda={:.1f}ms) {}x{}",
totalMs, getImageMs, cudaMs, width, height); totalMs, getImageMs, cudaMs, width, height);
(*Handle)->_logger.LogWarn("GetRTSPCVImage", msg, __FILE__, __LINE__); (*Handle)->_logger.LogWarn("GetRTSPCVImage", msg, __FILE__, __LINE__);
@@ -1452,6 +1453,18 @@ extern "C" __declspec(dllexport) void SetRTSPDisplayResolution(ANSCENTER::ANSRTS
(*Handle)->SetDisplayResolution(width, height); (*Handle)->SetDisplayResolution(width, height);
} catch (...) { } } catch (...) { }
} }
extern "C" __declspec(dllexport) void SetRTSPTargetFPS(ANSCENTER::ANSRTSPClient** Handle, double intervalMs) {
if (Handle == nullptr || *Handle == nullptr) return;
try {
(*Handle)->SetTargetFPS(intervalMs); // 0=no limit, 100=~10FPS, 200=~5FPS
} catch (...) { }
}
extern "C" __declspec(dllexport) void SetRTSPNV12FastPath(ANSCENTER::ANSRTSPClient** Handle, int enable) {
if (Handle == nullptr || *Handle == nullptr) return;
try {
(*Handle)->SetNV12FastPath(enable != 0); // 0=original CPU path (stable), 1=NV12 GPU fast path
} catch (...) { }
}
extern "C" __declspec(dllexport) int SetCropFlagRTSP(ANSCENTER::ANSRTSPClient** Handle, int cropFlag) { extern "C" __declspec(dllexport) int SetCropFlagRTSP(ANSCENTER::ANSRTSPClient** Handle, int cropFlag) {
if (Handle == nullptr || *Handle == nullptr) return -1; if (Handle == nullptr || *Handle == nullptr) return -1;
try { try {

View File

@@ -38,6 +38,7 @@ namespace ANSCENTER
int _imageWidth,_imageHeight; int _imageWidth,_imageHeight;
int64_t _pts; int64_t _pts;
bool _isPlaying; bool _isPlaying;
bool _useNV12FastPath = false; // false = original stable CPU path, true = NV12 GPU fast path
std::recursive_mutex _mutex; std::recursive_mutex _mutex;
// --- Per-client inference guard --- // --- Per-client inference guard ---
@@ -102,6 +103,9 @@ namespace ANSCENTER
int GetHWDecodingGpuIndex(); int GetHWDecodingGpuIndex();
void SetDisplayResolution(int width, int height); // Set display output size; 0,0 = original (no resize) void SetDisplayResolution(int width, int height); // Set display output size; 0,0 = original (no resize)
void SetImageQuality(int mode); // 0=fast (AI), 1=quality (display) void SetImageQuality(int mode); // 0=fast (AI), 1=quality (display)
void SetTargetFPS(double intervalMs); // Set min interval between processed frames in ms (0 = no limit, 100 = ~10 FPS, 200 = ~5 FPS)
void SetNV12FastPath(bool enable); // true = NV12 GPU fast path (zero-copy inference), false = original CPU path (stable)
bool IsNV12FastPath() const { return _useNV12FastPath; }
AVFrame* GetNV12Frame(); // Returns cloned NV12 frame for GPU fast-path (caller must av_frame_free) AVFrame* GetNV12Frame(); // Returns cloned NV12 frame for GPU fast-path (caller must av_frame_free)
AVFrame* GetCudaHWFrame(); // Returns CUDA HW frame (device ptrs) for zero-copy inference AVFrame* GetCudaHWFrame(); // Returns CUDA HW frame (device ptrs) for zero-copy inference
bool IsCudaHWAccel(); // true when decoder uses CUDA (NV12 stays in GPU VRAM) bool IsCudaHWAccel(); // true when decoder uses CUDA (NV12 stays in GPU VRAM)
@@ -139,4 +143,6 @@ extern "C" __declspec(dllexport) int IsRTSPHWDecodingActive(ANSCENTER::ANSRTSPC
extern "C" __declspec(dllexport) int GetRTSPHWDecodingGpuIndex(ANSCENTER::ANSRTSPClient** Handle); extern "C" __declspec(dllexport) int GetRTSPHWDecodingGpuIndex(ANSCENTER::ANSRTSPClient** Handle);
extern "C" __declspec(dllexport) void SetRTSPImageQuality(ANSCENTER::ANSRTSPClient** Handle, int mode); extern "C" __declspec(dllexport) void SetRTSPImageQuality(ANSCENTER::ANSRTSPClient** Handle, int mode);
extern "C" __declspec(dllexport) void SetRTSPDisplayResolution(ANSCENTER::ANSRTSPClient** Handle, int width, int height); extern "C" __declspec(dllexport) void SetRTSPDisplayResolution(ANSCENTER::ANSRTSPClient** Handle, int width, int height);
extern "C" __declspec(dllexport) void SetRTSPTargetFPS(ANSCENTER::ANSRTSPClient** Handle, double intervalMs);
extern "C" __declspec(dllexport) void SetRTSPNV12FastPath(ANSCENTER::ANSRTSPClient** Handle, int enable);
#endif #endif

View File

@@ -652,6 +652,14 @@ namespace ANSCENTER {
std::lock_guard<std::recursive_mutex> lock(_mutex); std::lock_guard<std::recursive_mutex> lock(_mutex);
_playerClient->setImageQuality(mode); // 0=fast (AI), 1=quality (display) _playerClient->setImageQuality(mode); // 0=fast (AI), 1=quality (display)
} }
void ANSSRTClient::SetTargetFPS(double intervalMs) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
_playerClient->setTargetFPS(intervalMs); // 0=no limit, 100=~10FPS, 200=~5FPS
}
void ANSSRTClient::SetNV12FastPath(bool enable) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
_useNV12FastPath = enable;
}
AVFrame* ANSSRTClient::GetNV12Frame() { AVFrame* ANSSRTClient::GetNV12Frame() {
std::lock_guard<std::recursive_mutex> lock(_mutex); std::lock_guard<std::recursive_mutex> lock(_mutex);
return _playerClient->getNV12Frame(); // Returns clone, caller must av_frame_free return _playerClient->getNV12Frame(); // Returns clone, caller must av_frame_free
@@ -809,20 +817,18 @@ extern "C" __declspec(dllexport) int GetSRTCVImage(ANSCENTER::ANSSRTClient** Han
// Thread-safe Mat pointer swap (anscv_mat_replace has its own internal lock) // Thread-safe Mat pointer swap (anscv_mat_replace has its own internal lock)
anscv_mat_replace(image, std::move(img)); anscv_mat_replace(image, std::move(img));
// Attach NV12 frame for GPU fast-path inference (side-table registry) // NV12 GPU fast path (optional — disabled by default for stability)
// attach() takes ownership — do NOT av_frame_free here if ((*Handle)->IsNV12FastPath()) {
int gpuIdx = (*Handle)->GetHWDecodingGpuIndex(); int gpuIdx = (*Handle)->GetHWDecodingGpuIndex();
AVFrame* cudaHW = (*Handle)->GetCudaHWFrame(); AVFrame* cudaHW = (*Handle)->GetCudaHWFrame();
if (cudaHW) { if (cudaHW) {
// CUDA zero-copy: frame data[0]/data[1] are CUDA device pointers. AVFrame* cpuNV12 = (*Handle)->GetNV12Frame();
// Also attach CPU NV12 as fallback for cross-GPU inference gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12);
// (when decode GPU != inference GPU, CUDA ptrs aren't accessible). } else {
AVFrame* cpuNV12 = (*Handle)->GetNV12Frame(); AVFrame* nv12 = (*Handle)->GetNV12Frame();
gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12); if (nv12) {
} else { gpu_frame_attach(*image, nv12, gpuIdx, timeStamp);
AVFrame* nv12 = (*Handle)->GetNV12Frame(); }
if (nv12) {
gpu_frame_attach(*image, nv12, gpuIdx, timeStamp);
} }
} }
@@ -994,6 +1000,18 @@ extern "C" __declspec(dllexport) void SetSRTDisplayResolution(ANSCENTER::ANSSRTC
(*Handle)->SetDisplayResolution(width, height); (*Handle)->SetDisplayResolution(width, height);
} catch (...) { } } catch (...) { }
} }
extern "C" __declspec(dllexport) void SetSRTTargetFPS(ANSCENTER::ANSSRTClient** Handle, double intervalMs) {
if (Handle == nullptr || *Handle == nullptr) return;
try {
(*Handle)->SetTargetFPS(intervalMs);
} catch (...) { }
}
extern "C" __declspec(dllexport) void SetSRTNV12FastPath(ANSCENTER::ANSSRTClient** Handle, int enable) {
if (Handle == nullptr || *Handle == nullptr) return;
try {
(*Handle)->SetNV12FastPath(enable != 0);
} catch (...) { }
}
// ============================================================================ // ============================================================================
// V2 entry points: accept uint64_t handleVal by value instead of Handle** // V2 entry points: accept uint64_t handleVal by value instead of Handle**

View File

@@ -35,6 +35,7 @@ namespace ANSCENTER
int _imageWidth, _imageHeight; int _imageWidth, _imageHeight;
int64_t _pts; int64_t _pts;
bool _isPlaying; bool _isPlaying;
bool _useNV12FastPath = false;
std::recursive_mutex _mutex; std::recursive_mutex _mutex;
public: public:
ANSSRTClient(); ANSSRTClient();
@@ -70,6 +71,9 @@ namespace ANSCENTER
int GetHWDecodingGpuIndex(); int GetHWDecodingGpuIndex();
void SetDisplayResolution(int width, int height); // Set display output size; 0,0 = original (no resize) void SetDisplayResolution(int width, int height); // Set display output size; 0,0 = original (no resize)
void SetImageQuality(int mode); // 0=fast (AI), 1=quality (display) void SetImageQuality(int mode); // 0=fast (AI), 1=quality (display)
void SetTargetFPS(double intervalMs); // Set min interval between processed frames in ms (0 = no limit, 100 = ~10 FPS, 200 = ~5 FPS)
void SetNV12FastPath(bool enable); // true = NV12 GPU fast path, false = original CPU path (stable)
bool IsNV12FastPath() const { return _useNV12FastPath; }
AVFrame* GetNV12Frame(); // Returns cloned NV12 frame for GPU fast-path (caller must av_frame_free) AVFrame* GetNV12Frame(); // Returns cloned NV12 frame for GPU fast-path (caller must av_frame_free)
AVFrame* GetCudaHWFrame(); // Returns CUDA HW frame (device ptrs) for zero-copy inference AVFrame* GetCudaHWFrame(); // Returns CUDA HW frame (device ptrs) for zero-copy inference
bool IsCudaHWAccel(); // true when decoder uses CUDA (NV12 stays in GPU VRAM) bool IsCudaHWAccel(); // true when decoder uses CUDA (NV12 stays in GPU VRAM)
@@ -107,4 +111,6 @@ extern "C" __declspec(dllexport) int IsSRTHWDecodingActive(ANSCENTER::ANSSRTCli
extern "C" __declspec(dllexport) int GetSRTHWDecodingGpuIndex(ANSCENTER::ANSSRTClient** Handle); extern "C" __declspec(dllexport) int GetSRTHWDecodingGpuIndex(ANSCENTER::ANSSRTClient** Handle);
extern "C" __declspec(dllexport) void SetSRTImageQuality(ANSCENTER::ANSSRTClient** Handle, int mode); extern "C" __declspec(dllexport) void SetSRTImageQuality(ANSCENTER::ANSSRTClient** Handle, int mode);
extern "C" __declspec(dllexport) void SetSRTDisplayResolution(ANSCENTER::ANSSRTClient** Handle, int width, int height); extern "C" __declspec(dllexport) void SetSRTDisplayResolution(ANSCENTER::ANSSRTClient** Handle, int width, int height);
extern "C" __declspec(dllexport) void SetSRTTargetFPS(ANSCENTER::ANSSRTClient** Handle, double intervalMs);
extern "C" __declspec(dllexport) void SetSRTNV12FastPath(ANSCENTER::ANSSRTClient** Handle, int enable);
#endif #endif

View File

@@ -23,6 +23,7 @@ GpuNV12SlotPool* GpuNV12SlotPool_GetInstance() {
} }
// Transition all COOLING slots past the cooldown threshold to FREE. // Transition all COOLING slots past the cooldown threshold to FREE.
// Collects pending AVFrames for the caller to av_frame_free.
void GpuNV12SlotPool::drainCooledSlots_locked() { void GpuNV12SlotPool::drainCooledSlots_locked() {
auto now = std::chrono::steady_clock::now(); auto now = std::chrono::steady_clock::now();
auto threshold = std::chrono::milliseconds(SLOT_COOLDOWN_MS); auto threshold = std::chrono::milliseconds(SLOT_COOLDOWN_MS);
@@ -67,7 +68,7 @@ GpuNV12Slot* GpuNV12SlotPool::acquire(int gpuIdx, int w, int h) {
return nullptr; return nullptr;
} }
// Allocate CUDA buffers on the target GPU // Allocate CUDA buffers + stream + event on the target GPU
int prevDev = -1; int prevDev = -1;
cudaGetDevice(&prevDev); cudaGetDevice(&prevDev);
if (gpuIdx >= 0) cudaSetDevice(gpuIdx); if (gpuIdx >= 0) cudaSetDevice(gpuIdx);
@@ -76,10 +77,7 @@ GpuNV12Slot* GpuNV12SlotPool::acquire(int gpuIdx, int w, int h) {
cudaError_t e1 = cudaMallocPitch(&slot->bufY, &slot->pitchY, w, h); cudaError_t e1 = cudaMallocPitch(&slot->bufY, &slot->pitchY, w, h);
cudaError_t e2 = cudaMallocPitch(&slot->bufUV, &slot->pitchUV, w, h / 2); cudaError_t e2 = cudaMallocPitch(&slot->bufUV, &slot->pitchUV, w, h / 2);
// Non-blocking stream avoids NULL-stream implicit sync with inference. // Non-blocking stream: avoids NULL-stream implicit sync with inference.
// On WDDM, the NULL stream must wait for ALL other streams to finish
// before executing — this caused 1-2 second stalls when inference
// kernels were running. A non-blocking stream runs independently.
cudaStream_t stream = nullptr; cudaStream_t stream = nullptr;
cudaError_t e3 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); cudaError_t e3 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
@@ -88,7 +86,6 @@ GpuNV12Slot* GpuNV12SlotPool::acquire(int gpuIdx, int w, int h) {
if (e1 != cudaSuccess || e2 != cudaSuccess) { if (e1 != cudaSuccess || e2 != cudaSuccess) {
NV12POOL_DBG("acquire: cudaMallocPitch FAILED %dx%d gpu=%d e1=%d e2=%d", NV12POOL_DBG("acquire: cudaMallocPitch FAILED %dx%d gpu=%d e1=%d e2=%d",
w, h, gpuIdx, (int)e1, (int)e2); w, h, gpuIdx, (int)e1, (int)e2);
// Clean up partial allocation
int prev2 = -1; cudaGetDevice(&prev2); int prev2 = -1; cudaGetDevice(&prev2);
if (gpuIdx >= 0) cudaSetDevice(gpuIdx); if (gpuIdx >= 0) cudaSetDevice(gpuIdx);
if (e1 == cudaSuccess && slot->bufY) cudaFree(slot->bufY); if (e1 == cudaSuccess && slot->bufY) cudaFree(slot->bufY);
@@ -107,21 +104,18 @@ GpuNV12Slot* GpuNV12SlotPool::acquire(int gpuIdx, int w, int h) {
GpuNV12Slot* raw = slot.get(); GpuNV12Slot* raw = slot.get();
m_slots.push_back(std::move(slot)); m_slots.push_back(std::move(slot));
// Always log new slot allocation to DebugView (rare event — once per resolution per camera). // Always log new slot allocation to DebugView (rare event).
{ {
char _buf[256]; char _buf[256];
snprintf(_buf, sizeof(_buf), snprintf(_buf, sizeof(_buf),
"[NV12Pool] NEW slot #%zu: %dx%d gpu=%d Y=%p UV=%p pitchY=%zu stream=%p\n", "[NV12Pool] NEW slot #%zu: %dx%d gpu=%d Y=%p UV=%p pitchY=%zu stream=%p\n",
m_slots.size(), w, h, gpuIdx, raw->bufY, raw->bufUV, raw->pitchY, raw->copyStream); m_slots.size(), w, h, gpuIdx, raw->bufY, raw->bufUV, raw->pitchY,
raw->copyStream);
#ifdef _WIN32 #ifdef _WIN32
OutputDebugStringA(_buf); OutputDebugStringA(_buf);
#endif #endif
fprintf(stderr, "%s", _buf); fprintf(stderr, "%s", _buf);
} }
// Also log POOL FULL to DebugView (important diagnostic).
NV12POOL_DBG("acquire: NEW slot Y=%p UV=%p pitchY=%zu pitchUV=%zu %dx%d gpu=%d stream=%p (total=%zu)",
raw->bufY, raw->bufUV, raw->pitchY, raw->pitchUV,
w, h, gpuIdx, raw->copyStream, m_slots.size());
return raw; return raw;
} }

View File

@@ -269,6 +269,15 @@ namespace ANSCENTER {
return result; return result;
} }
// Ensure async D2D copy (NVDEC → pool buffer) has completed before
// reading yPlane/uvPlane. The copy was queued in gpu_frame_attach_cuda()
// on a non-blocking stream. By the time inference runs (~50-200ms later),
// the copy (~0.3ms) has long finished, so this sync returns immediately.
if (gpuData->d2dCopyStream) {
cudaStreamSynchronize(static_cast<cudaStream_t>(gpuData->d2dCopyStream));
gpuData->d2dCopyStream = nullptr; // Only sync once per frame
}
const bool isCudaDevice = gpuData->isCudaDevicePtr; const bool isCudaDevice = gpuData->isCudaDevicePtr;
const bool gpuMatch = !isCudaDevice || const bool gpuMatch = !isCudaDevice ||
gpuData->gpuIndex < 0 || gpuData->gpuIndex < 0 ||
@@ -367,7 +376,6 @@ namespace ANSCENTER {
cv::cuda::GpuMat gpuY, gpuUV; cv::cuda::GpuMat gpuY, gpuUV;
if (useZeroCopy) { if (useZeroCopy) {
// CUDA zero-copy: wrap pool buffer device pointers directly
gpuY = cv::cuda::GpuMat(frameH, frameW, CV_8UC1, gpuY = cv::cuda::GpuMat(frameH, frameW, CV_8UC1,
effYPlane, static_cast<size_t>(effYLinesize)); effYPlane, static_cast<size_t>(effYLinesize));
gpuUV = cv::cuda::GpuMat(frameH / 2, frameW, CV_8UC1, gpuUV = cv::cuda::GpuMat(frameH / 2, frameW, CV_8UC1,
@@ -641,6 +649,12 @@ namespace ANSCENTER {
return result; return result;
} }
// Ensure async D2D copy has completed before reading NV12 buffers
if (gpuData->d2dCopyStream) {
cudaStreamSynchronize(static_cast<cudaStream_t>(gpuData->d2dCopyStream));
gpuData->d2dCopyStream = nullptr;
}
const bool isCudaDevice = gpuData->isCudaDevicePtr; const bool isCudaDevice = gpuData->isCudaDevicePtr;
const bool gpuMatch = !isCudaDevice || const bool gpuMatch = !isCudaDevice ||
gpuData->gpuIndex < 0 || gpuData->gpuIndex < 0 ||
@@ -775,6 +789,12 @@ namespace ANSCENTER {
if (!gpuData->isCudaDevicePtr || !gpuData->yPlane || !gpuData->uvPlane) if (!gpuData->isCudaDevicePtr || !gpuData->yPlane || !gpuData->uvPlane)
return result; // NV12 not on GPU return result; // NV12 not on GPU
// Ensure async D2D copy has completed before reading NV12 buffers
if (gpuData->d2dCopyStream) {
cudaStreamSynchronize(static_cast<cudaStream_t>(gpuData->d2dCopyStream));
gpuData->d2dCopyStream = nullptr;
}
const int frameW = gpuData->width; const int frameW = gpuData->width;
const int frameH = gpuData->height; const int frameH = gpuData->height;
@@ -890,6 +910,12 @@ namespace ANSCENTER {
return result; return result;
} }
// Ensure async D2D copy has completed before reading NV12 buffers
if (gpuData->d2dCopyStream) {
cudaStreamSynchronize(static_cast<cudaStream_t>(gpuData->d2dCopyStream));
gpuData->d2dCopyStream = nullptr;
}
const bool isCudaDevice = gpuData->isCudaDevicePtr; const bool isCudaDevice = gpuData->isCudaDevicePtr;
const bool gpuMatch = !isCudaDevice || const bool gpuMatch = !isCudaDevice ||
gpuData->gpuIndex < 0 || gpuData->gpuIndex < 0 ||