Disable NV12 path for ANSCV by default. Currently uses cv::Mat** directly
This commit is contained in:
@@ -621,6 +621,14 @@ namespace ANSCENTER {
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
_playerClient->setImageQuality(mode); // 0=fast (AI), 1=quality (display)
|
||||
}
|
||||
void ANSFLVClient::SetTargetFPS(double intervalMs) {
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
_playerClient->setTargetFPS(intervalMs); // 0=no limit, 100=~10FPS, 200=~5FPS
|
||||
}
|
||||
void ANSFLVClient::SetNV12FastPath(bool enable) {
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
_useNV12FastPath = enable;
|
||||
}
|
||||
AVFrame* ANSFLVClient::GetNV12Frame() {
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
return _playerClient->getNV12Frame(); // Returns clone, caller must av_frame_free
|
||||
@@ -767,17 +775,18 @@ extern "C" __declspec(dllexport) int GetFLVCVImage(ANSCENTER::ANSFLVClient** Han
|
||||
// Thread-safe Mat pointer swap (anscv_mat_replace has its own internal lock)
|
||||
anscv_mat_replace(image, std::move(img));
|
||||
|
||||
// Attach NV12 frame for GPU fast-path inference (side-table registry)
|
||||
// attach() takes ownership — do NOT av_frame_free here
|
||||
int gpuIdx = (*Handle)->GetHWDecodingGpuIndex();
|
||||
AVFrame* cudaHW = (*Handle)->GetCudaHWFrame();
|
||||
if (cudaHW) {
|
||||
AVFrame* cpuNV12 = (*Handle)->GetNV12Frame();
|
||||
gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12);
|
||||
} else {
|
||||
AVFrame* nv12 = (*Handle)->GetNV12Frame();
|
||||
if (nv12) {
|
||||
gpu_frame_attach(*image, nv12, gpuIdx, timeStamp);
|
||||
// NV12 GPU fast path (optional — disabled by default for stability)
|
||||
if ((*Handle)->IsNV12FastPath()) {
|
||||
int gpuIdx = (*Handle)->GetHWDecodingGpuIndex();
|
||||
AVFrame* cudaHW = (*Handle)->GetCudaHWFrame();
|
||||
if (cudaHW) {
|
||||
AVFrame* cpuNV12 = (*Handle)->GetNV12Frame();
|
||||
gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12);
|
||||
} else {
|
||||
AVFrame* nv12 = (*Handle)->GetNV12Frame();
|
||||
if (nv12) {
|
||||
gpu_frame_attach(*image, nv12, gpuIdx, timeStamp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -952,6 +961,18 @@ extern "C" __declspec(dllexport) void SetFLVDisplayResolution(ANSCENTER::ANSFLVC
|
||||
(*Handle)->SetDisplayResolution(width, height);
|
||||
} catch (...) { }
|
||||
}
|
||||
// Exported C entry point: forwards intervalMs to ANSFLVClient::SetTargetFPS.
// Does nothing when Handle or *Handle is null; any exception thrown by the
// call is caught and discarded.
extern "C" __declspec(dllexport) void SetFLVTargetFPS(ANSCENTER::ANSFLVClient** Handle, double intervalMs) {
    if (!Handle || !*Handle) {
        return;
    }
    try {
        ANSCENTER::ANSFLVClient* client = *Handle;
        client->SetTargetFPS(intervalMs);
    } catch (...) {
        // Failure is ignored.
    }
}
|
||||
// Exported C entry point: converts the integer flag to bool (non-zero = true)
// and forwards it to ANSFLVClient::SetNV12FastPath.
// Does nothing when Handle or *Handle is null; any exception thrown by the
// call is caught and discarded.
extern "C" __declspec(dllexport) void SetFLVNV12FastPath(ANSCENTER::ANSFLVClient** Handle, int enable) {
    if (!Handle || !*Handle) {
        return;
    }
    try {
        const bool fastPathOn = (enable != 0);
        (*Handle)->SetNV12FastPath(fastPathOn);
    } catch (...) {
        // Failure is ignored.
    }
}
|
||||
|
||||
// ============================================================================
|
||||
// V2 entry points — accept handle by value (uint64_t) instead of Handle**
|
||||
|
||||
@@ -36,6 +36,7 @@ namespace ANSCENTER
|
||||
int _imageWidth, _imageHeight;
|
||||
int64_t _pts;
|
||||
bool _isPlaying;
|
||||
bool _useNV12FastPath = false; // false = original stable CPU path, true = NV12 GPU fast path
|
||||
std::recursive_mutex _mutex;
|
||||
public:
|
||||
ANSFLVClient();
|
||||
@@ -71,6 +72,9 @@ namespace ANSCENTER
|
||||
int GetHWDecodingGpuIndex();
|
||||
void SetDisplayResolution(int width, int height); // Set display output size; 0,0 = original (no resize)
|
||||
void SetImageQuality(int mode); // 0=fast (AI), 1=quality (display)
|
||||
void SetTargetFPS(double intervalMs); // Set min interval between processed frames in ms (0 = no limit, 100 = ~10 FPS, 200 = ~5 FPS)
|
||||
void SetNV12FastPath(bool enable); // true = NV12 GPU fast path, false = original CPU path (stable)
|
||||
// Returns the current value of the NV12 fast-path flag.
// NOTE(review): this reads _useNV12FastPath without taking _mutex, while
// SetNV12FastPath writes it under the lock — confirm the unsynchronized read
// is acceptable or make the flag atomic.
bool IsNV12FastPath() const { return _useNV12FastPath; }
|
||||
AVFrame* GetNV12Frame(); // Returns cloned NV12 frame for GPU fast-path (caller must av_frame_free)
|
||||
AVFrame* GetCudaHWFrame(); // Returns CUDA HW frame (device ptrs) for zero-copy inference
|
||||
bool IsCudaHWAccel(); // true when decoder uses CUDA (NV12 stays in GPU VRAM)
|
||||
@@ -108,4 +112,6 @@ extern "C" __declspec(dllexport) int IsFLVHWDecodingActive(ANSCENTER::ANSFLVCli
|
||||
extern "C" __declspec(dllexport) int GetFLVHWDecodingGpuIndex(ANSCENTER::ANSFLVClient** Handle);
|
||||
extern "C" __declspec(dllexport) void SetFLVImageQuality(ANSCENTER::ANSFLVClient** Handle, int mode);
|
||||
extern "C" __declspec(dllexport) void SetFLVDisplayResolution(ANSCENTER::ANSFLVClient** Handle, int width, int height);
|
||||
extern "C" __declspec(dllexport) void SetFLVTargetFPS(ANSCENTER::ANSFLVClient** Handle, double intervalMs);
|
||||
extern "C" __declspec(dllexport) void SetFLVNV12FastPath(ANSCENTER::ANSFLVClient** Handle, int enable);
|
||||
#endif
|
||||
@@ -23,6 +23,8 @@ extern "C" {
|
||||
#include <cuda_runtime.h>
|
||||
#include <cstring>
|
||||
#include <cstdlib>
|
||||
#include <thread>
|
||||
#include <mutex>
|
||||
#include <cstdio>
|
||||
|
||||
#ifdef _WIN32
|
||||
@@ -166,16 +168,13 @@ inline void gpu_frame_attach(cv::Mat* mat, AVFrame* nv12, int gpuIdx, int64_t pt
|
||||
|
||||
void* old = ANSGpuFrameRegistry::instance().attach(mat, std::move(data));
|
||||
if (old) {
|
||||
AVFrame* oldFrame = static_cast<AVFrame*>(old);
|
||||
av_frame_free(&oldFrame);
|
||||
// Defer old frame's AVFrame free
|
||||
auto& reg = ANSGpuFrameRegistry::instance();
|
||||
auto lk = reg.acquire_lock();
|
||||
reg.pushPendingFree_locked(old);
|
||||
}
|
||||
|
||||
// Free stale entries evicted by TTL or previous attach
|
||||
auto pending = ANSGpuFrameRegistry::instance().drain_pending();
|
||||
for (void* p : pending) {
|
||||
AVFrame* stale = static_cast<AVFrame*>(p);
|
||||
av_frame_free(&stale);
|
||||
}
|
||||
// NOTE: No drain_pending() here (hot path). Freed by evict_stale.
|
||||
}
|
||||
|
||||
// Attach CUDA HW frame — copies NV12 from NVDEC surfaces to owned GPU memory.
|
||||
@@ -226,13 +225,10 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
|
||||
|
||||
if (slot && slot->bufY && slot->bufUV && slot->pitchY > 0 && slot->pitchUV > 0) {
|
||||
// --- Global pool path: D2D copy on per-slot non-blocking stream ---
|
||||
// CRITICAL: Using the NULL stream (cudaMemcpy2D without stream) causes
|
||||
// 1-2 second stalls on WDDM because it implicitly synchronizes with
|
||||
// ALL other streams before executing. By using cudaMemcpy2DAsync on
|
||||
// the slot's own non-blocking stream + cudaStreamSynchronize, we:
|
||||
// 1. Submit the copy immediately (no wait for inference kernels)
|
||||
// 2. Wait ONLY for this copy to finish (~0.3ms 1080p, ~1.2ms 4K)
|
||||
// 3. Data is valid after sync — av_frame_free is safe
|
||||
// cudaMemcpy2DAsync + cudaStreamSynchronize(slotStream):
|
||||
// - Non-blocking stream avoids NULL-stream implicit sync with inference
|
||||
// - Sync waits ONLY for the 2 copies (~1.5ms for 4K, ~0.3ms for 1080p)
|
||||
// - Data valid after sync — av_frame_free is safe
|
||||
int prevDev = -1;
|
||||
cudaGetDevice(&prevDev);
|
||||
if (gpuIdx >= 0) cudaSetDevice(gpuIdx);
|
||||
@@ -247,13 +243,13 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
|
||||
e4 = cudaMemcpy2DAsync(slot->bufUV, slot->pitchUV,
|
||||
cudaFrame->data[1], cudaFrame->linesize[1],
|
||||
w, h / 2, cudaMemcpyDeviceToDevice, copyStream);
|
||||
if (e3 == cudaSuccess && e4 == cudaSuccess) {
|
||||
// Wait ONLY for this stream's 2 copies (~0.3-1.2ms).
|
||||
// Does NOT wait for inference kernels on other streams.
|
||||
cudaStreamSynchronize(copyStream);
|
||||
}
|
||||
// NO cudaStreamSynchronize here — let the copy run asynchronously.
|
||||
// The camera thread is NOT blocked by the WDDM SRW lock.
|
||||
// Inference will call cudaStreamSynchronize(d2dCopyStream) in tryNV12()
|
||||
// before reading the buffer. By that time (~50-200ms later), the copy
|
||||
// (~0.3ms for 1080p, ~1.5ms for 4K) has long completed, so the sync
|
||||
// returns immediately with zero blocking.
|
||||
} else {
|
||||
// Fallback if stream creation failed — NULL stream (may stall)
|
||||
e3 = cudaMemcpy2D(slot->bufY, slot->pitchY,
|
||||
cudaFrame->data[0], cudaFrame->linesize[0],
|
||||
w, h, cudaMemcpyDeviceToDevice);
|
||||
@@ -270,15 +266,14 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
|
||||
data.uvPlane = static_cast<uint8_t*>(slot->bufUV);
|
||||
data.yLinesize = static_cast<int>(slot->pitchY);
|
||||
data.uvLinesize = static_cast<int>(slot->pitchUV);
|
||||
data.poolSlot = slot; // Track for deferred release
|
||||
// gpuCacheY/UV stay nullptr — global pool owns the buffers
|
||||
data.poolSlot = slot;
|
||||
data.d2dCopyStream = copyStream; // Inference syncs on this before reading
|
||||
d2dOk = true;
|
||||
GPU_FRAME_DBG("attach_cuda: D2D OK (global pool) Y=%p UV=%p yPitch=%zu uvPitch=%zu",
|
||||
slot->bufY, slot->bufUV, slot->pitchY, slot->pitchUV);
|
||||
GPU_FRAME_DBG("attach_cuda: D2D OK (global pool, async) Y=%p UV=%p yPitch=%zu uvPitch=%zu stream=%p",
|
||||
slot->bufY, slot->bufUV, slot->pitchY, slot->pitchUV, copyStream);
|
||||
} else {
|
||||
GPU_FRAME_DBG("attach_cuda: D2D COPY FAILED (pool) e3=%d e4=%d — fallback",
|
||||
(int)e3, (int)e4);
|
||||
// Release slot back to pool on failure (immediate, no cooldown needed)
|
||||
slot->state.store(GpuNV12Slot::STATE_FREE, std::memory_order_release);
|
||||
}
|
||||
}
|
||||
@@ -364,13 +359,34 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
|
||||
data.uvLinesize = data.cpuUvLinesize;
|
||||
}
|
||||
|
||||
// Free AVFrames immediately — synchronous D2D copy has completed,
|
||||
// so NVDEC surfaces can be returned to the decoder's surface pool.
|
||||
GPU_FRAME_DBG("attach_cuda: freeing AVFrames cudaFrame=%p cpuNV12=%p",
|
||||
(void*)cudaFrame, (void*)cpuNV12);
|
||||
av_frame_free(&cudaFrame);
|
||||
if (cpuNV12) av_frame_free(&cpuNV12);
|
||||
data.avframe = nullptr;
|
||||
// AVFrame lifetime management:
|
||||
// - If D2D was ASYNC (d2dCopyStream != null): keep cudaFrame alive in
|
||||
// GpuFrameData.avframe so the NVDEC surface (copy source) remains valid
|
||||
// until the async copy completes. The AVFrame is freed when GpuFrameData
|
||||
// is released (after inference), by which time the 0.3ms copy is long done.
|
||||
// - If D2D was SYNC or failed: push to pending free immediately (old behavior).
|
||||
if (data.d2dCopyStream && cudaFrame) {
|
||||
// Async D2D — keep AVFrame alive, inference will outlive the copy
|
||||
data.avframe = cudaFrame;
|
||||
GPU_FRAME_DBG("attach_cuda: keeping AVFrame alive for async D2D cudaFrame=%p",
|
||||
(void*)cudaFrame);
|
||||
} else {
|
||||
// Sync D2D or fallback — safe to defer free now
|
||||
GPU_FRAME_DBG("attach_cuda: deferring AVFrame free cudaFrame=%p",
|
||||
(void*)cudaFrame);
|
||||
if (cudaFrame) {
|
||||
auto& reg = ANSGpuFrameRegistry::instance();
|
||||
auto lk = reg.acquire_lock();
|
||||
reg.pushPendingFree_locked(cudaFrame);
|
||||
}
|
||||
data.avframe = nullptr;
|
||||
}
|
||||
// cpuNV12 is always safe to defer — CPU snapshot (if taken) is already copied
|
||||
if (cpuNV12) {
|
||||
auto& reg = ANSGpuFrameRegistry::instance();
|
||||
auto lk = reg.acquire_lock();
|
||||
reg.pushPendingFree_locked(cpuNV12);
|
||||
}
|
||||
data.cpuAvframe = nullptr;
|
||||
|
||||
GPU_FRAME_DBG("attach_cuda: FINAL yPlane=%p uvPlane=%p isCuda=%d poolSlot=%p",
|
||||
@@ -379,16 +395,16 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
|
||||
|
||||
void* old = ANSGpuFrameRegistry::instance().attach(mat, std::move(data));
|
||||
if (old) {
|
||||
AVFrame* oldFrame = static_cast<AVFrame*>(old);
|
||||
av_frame_free(&oldFrame);
|
||||
// Old frame's AVFrame returned — defer its free too
|
||||
auto& reg = ANSGpuFrameRegistry::instance();
|
||||
auto lk = reg.acquire_lock();
|
||||
reg.pushPendingFree_locked(old);
|
||||
}
|
||||
|
||||
// Free stale AVFrames evicted by TTL or previous attach
|
||||
auto pending = ANSGpuFrameRegistry::instance().drain_pending();
|
||||
for (void* p : pending) {
|
||||
AVFrame* stale = static_cast<AVFrame*>(p);
|
||||
av_frame_free(&stale);
|
||||
}
|
||||
// NOTE: No drain_pending() here (hot path). AVFrames accumulate in
|
||||
// m_pendingFree and are freed by gpu_frame_evict_stale() which runs
|
||||
// every 500ms from anscv_mat_replace. This removes av_frame_free
|
||||
// (5-20ms SRW lock per call) from the camera frame-grabbing path.
|
||||
}
|
||||
|
||||
// Release entry by cv::Mat* and free any returned AVFrames.
|
||||
@@ -400,14 +416,7 @@ inline void gpu_frame_remove(cv::Mat* mat) {
|
||||
GPU_FRAME_DBG("gpu_frame_remove: mat=%p", (void*)mat);
|
||||
ANSGpuFrameRegistry::instance().release(mat);
|
||||
|
||||
// Free any AVFrames that became pending from this release or prior eviction
|
||||
auto pending = ANSGpuFrameRegistry::instance().drain_pending();
|
||||
for (void* p : pending) {
|
||||
AVFrame* stale = static_cast<AVFrame*>(p);
|
||||
av_frame_free(&stale);
|
||||
}
|
||||
|
||||
// GPU device pointers deferred — see gpu_frame_evict_stale() / Destroy()
|
||||
// NOTE: No drain_pending() here (hot path). AVFrames freed by evict_stale.
|
||||
}
|
||||
|
||||
// Alias for remove — used in ANSCV mutating functions to drop stale GPU data.
|
||||
@@ -425,10 +434,39 @@ inline void gpu_frame_invalidate(cv::Mat* mat) {
|
||||
inline void gpu_frame_evict_stale() {
|
||||
ANSGpuFrameRegistry::instance().evictStaleFrames();
|
||||
|
||||
auto pending = ANSGpuFrameRegistry::instance().drain_pending();
|
||||
for (void* p : pending) {
|
||||
AVFrame* stale = static_cast<AVFrame*>(p);
|
||||
av_frame_free(&stale);
|
||||
// Drain and free AVFrames on a background thread to avoid blocking the
|
||||
// camera hot path. av_frame_free on CUDA-mapped frames can take 5-20ms
|
||||
// per call due to nvcuda64 SRW lock. The background thread frees them
|
||||
// periodically (every 50ms) in batches.
|
||||
{
|
||||
static std::once_flag s_initOnce;
|
||||
static std::mutex s_avFreeMutex;
|
||||
static std::vector<void*> s_avFreeQueue;
|
||||
|
||||
// Move pending AVFrames to the background queue
|
||||
auto pending = ANSGpuFrameRegistry::instance().drain_pending();
|
||||
if (!pending.empty()) {
|
||||
std::lock_guard<std::mutex> lock(s_avFreeMutex);
|
||||
s_avFreeQueue.insert(s_avFreeQueue.end(), pending.begin(), pending.end());
|
||||
}
|
||||
|
||||
// Start background free thread on first call
|
||||
std::call_once(s_initOnce, []() {
|
||||
std::thread([]() {
|
||||
while (true) {
|
||||
std::vector<void*> batch;
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(s_avFreeMutex);
|
||||
batch.swap(s_avFreeQueue);
|
||||
}
|
||||
for (void* p : batch) {
|
||||
AVFrame* f = static_cast<AVFrame*>(p);
|
||||
av_frame_free(&f);
|
||||
}
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(50));
|
||||
}
|
||||
}).detach();
|
||||
});
|
||||
}
|
||||
|
||||
// Free GPU device pointers from evicted/released frames (legacy path).
|
||||
|
||||
@@ -621,6 +621,14 @@ namespace ANSCENTER {
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
_playerClient->setImageQuality(mode); // 0=fast (AI), 1=quality (display)
|
||||
}
|
||||
void ANSMJPEGClient::SetTargetFPS(double intervalMs) {
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
_playerClient->setTargetFPS(intervalMs); // 0=no limit, 100=~10FPS, 200=~5FPS
|
||||
}
|
||||
void ANSMJPEGClient::SetNV12FastPath(bool enable) {
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
_useNV12FastPath = enable;
|
||||
}
|
||||
AVFrame* ANSMJPEGClient::GetNV12Frame() {
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
return _playerClient->getNV12Frame(); // Returns clone, caller must av_frame_free
|
||||
@@ -768,20 +776,18 @@ extern "C" __declspec(dllexport) int GetMJPEGCVImage(ANSCENTER::ANSMJPEGClient**
|
||||
// Thread-safe Mat pointer swap (anscv_mat_replace has its own internal lock)
|
||||
anscv_mat_replace(image, std::move(img));
|
||||
|
||||
// Attach NV12 frame for GPU fast-path inference (side-table registry)
|
||||
// attach() takes ownership — do NOT av_frame_free here
|
||||
int gpuIdx = (*Handle)->GetHWDecodingGpuIndex();
|
||||
AVFrame* cudaHW = (*Handle)->GetCudaHWFrame();
|
||||
if (cudaHW) {
|
||||
// CUDA zero-copy: frame data[0]/data[1] are CUDA device pointers.
|
||||
// Also attach CPU NV12 as fallback for cross-GPU inference
|
||||
// (when decode GPU != inference GPU, CUDA ptrs aren't accessible).
|
||||
AVFrame* cpuNV12 = (*Handle)->GetNV12Frame();
|
||||
gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12);
|
||||
} else {
|
||||
AVFrame* nv12 = (*Handle)->GetNV12Frame();
|
||||
if (nv12) {
|
||||
gpu_frame_attach(*image, nv12, gpuIdx, timeStamp);
|
||||
// NV12 GPU fast path (optional — disabled by default for stability)
|
||||
if ((*Handle)->IsNV12FastPath()) {
|
||||
int gpuIdx = (*Handle)->GetHWDecodingGpuIndex();
|
||||
AVFrame* cudaHW = (*Handle)->GetCudaHWFrame();
|
||||
if (cudaHW) {
|
||||
AVFrame* cpuNV12 = (*Handle)->GetNV12Frame();
|
||||
gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12);
|
||||
} else {
|
||||
AVFrame* nv12 = (*Handle)->GetNV12Frame();
|
||||
if (nv12) {
|
||||
gpu_frame_attach(*image, nv12, gpuIdx, timeStamp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -956,6 +962,18 @@ extern "C" __declspec(dllexport) void SetMJPEGDisplayResolution(ANSCENTER::ANSMJ
|
||||
(*Handle)->SetDisplayResolution(width, height);
|
||||
} catch (...) { }
|
||||
}
|
||||
// Exported C entry point: forwards intervalMs to ANSMJPEGClient::SetTargetFPS.
// Does nothing when Handle or *Handle is null; any exception thrown by the
// call is caught and discarded.
extern "C" __declspec(dllexport) void SetMJPEGTargetFPS(ANSCENTER::ANSMJPEGClient** Handle, double intervalMs) {
    if (!Handle || !*Handle) {
        return;
    }
    try {
        ANSCENTER::ANSMJPEGClient* client = *Handle;
        client->SetTargetFPS(intervalMs);
    } catch (...) {
        // Failure is ignored.
    }
}
|
||||
// Exported C entry point: converts the integer flag to bool (non-zero = true)
// and forwards it to ANSMJPEGClient::SetNV12FastPath.
// Does nothing when Handle or *Handle is null; any exception thrown by the
// call is caught and discarded.
extern "C" __declspec(dllexport) void SetMJPEGNV12FastPath(ANSCENTER::ANSMJPEGClient** Handle, int enable) {
    if (!Handle || !*Handle) {
        return;
    }
    try {
        const bool fastPathOn = (enable != 0);
        (*Handle)->SetNV12FastPath(fastPathOn);
    } catch (...) {
        // Failure is ignored.
    }
}
|
||||
|
||||
// ============================================================================
|
||||
// V2 entry points — accept handle as uint64_t by value (LabVIEW safe)
|
||||
|
||||
@@ -35,6 +35,7 @@ namespace ANSCENTER
|
||||
int _imageWidth, _imageHeight;
|
||||
int64_t _pts;
|
||||
bool _isPlaying;
|
||||
bool _useNV12FastPath = false;
|
||||
std::recursive_mutex _mutex;
|
||||
public:
|
||||
ANSMJPEGClient();
|
||||
@@ -70,6 +71,9 @@ namespace ANSCENTER
|
||||
int GetHWDecodingGpuIndex();
|
||||
void SetDisplayResolution(int width, int height); // Set display output size; 0,0 = original (no resize)
|
||||
void SetImageQuality(int mode); // 0=fast (AI), 1=quality (display)
|
||||
void SetTargetFPS(double intervalMs); // Set min interval between processed frames in ms (0 = no limit, 100 = ~10 FPS, 200 = ~5 FPS)
|
||||
void SetNV12FastPath(bool enable); // true = NV12 GPU fast path, false = original CPU path (stable)
|
||||
// Returns the current value of the NV12 fast-path flag.
// NOTE(review): this reads _useNV12FastPath without taking _mutex, while
// SetNV12FastPath writes it under the lock — confirm the unsynchronized read
// is acceptable or make the flag atomic.
bool IsNV12FastPath() const { return _useNV12FastPath; }
|
||||
AVFrame* GetNV12Frame(); // Returns cloned NV12 frame for GPU fast-path (caller must av_frame_free)
|
||||
AVFrame* GetCudaHWFrame(); // Returns CUDA HW frame (device ptrs) for zero-copy inference
|
||||
bool IsCudaHWAccel(); // true when decoder uses CUDA (NV12 stays in GPU VRAM)
|
||||
@@ -108,4 +112,6 @@ extern "C" __declspec(dllexport) int IsMJPEGHWDecodingActive(ANSCENTER::ANSMJPE
|
||||
extern "C" __declspec(dllexport) int GetMJPEGHWDecodingGpuIndex(ANSCENTER::ANSMJPEGClient** Handle);
|
||||
extern "C" __declspec(dllexport) void SetMJPEGImageQuality(ANSCENTER::ANSMJPEGClient** Handle, int mode);
|
||||
extern "C" __declspec(dllexport) void SetMJPEGDisplayResolution(ANSCENTER::ANSMJPEGClient** Handle, int width, int height);
|
||||
extern "C" __declspec(dllexport) void SetMJPEGTargetFPS(ANSCENTER::ANSMJPEGClient** Handle, double intervalMs);
|
||||
extern "C" __declspec(dllexport) void SetMJPEGNV12FastPath(ANSCENTER::ANSMJPEGClient** Handle, int enable);
|
||||
#endif
|
||||
@@ -635,6 +635,14 @@ namespace ANSCENTER {
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
_playerClient->setImageQuality(mode); // 0=fast (AI), 1=quality (display)
|
||||
}
|
||||
void ANSRTMPClient::SetTargetFPS(double intervalMs) {
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
_playerClient->setTargetFPS(intervalMs); // 0=no limit, 100=~10FPS, 200=~5FPS
|
||||
}
|
||||
void ANSRTMPClient::SetNV12FastPath(bool enable) {
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
_useNV12FastPath = enable;
|
||||
}
|
||||
AVFrame* ANSRTMPClient::GetNV12Frame() {
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
return _playerClient->getNV12Frame(); // Returns clone, caller must av_frame_free
|
||||
@@ -792,20 +800,18 @@ extern "C" __declspec(dllexport) int GetRTMPCVImage(ANSCENTER::ANSRTMPClient** H
|
||||
// Thread-safe Mat pointer swap (anscv_mat_replace has its own internal lock)
|
||||
anscv_mat_replace(image, std::move(img));
|
||||
|
||||
// Attach NV12 frame for GPU fast-path inference (side-table registry)
|
||||
// attach() takes ownership — do NOT av_frame_free here
|
||||
int gpuIdx = (*Handle)->GetHWDecodingGpuIndex();
|
||||
AVFrame* cudaHW = (*Handle)->GetCudaHWFrame();
|
||||
if (cudaHW) {
|
||||
// CUDA zero-copy: frame data[0]/data[1] are CUDA device pointers.
|
||||
// Also attach CPU NV12 as fallback for cross-GPU inference
|
||||
// (when decode GPU != inference GPU, CUDA ptrs aren't accessible).
|
||||
AVFrame* cpuNV12 = (*Handle)->GetNV12Frame();
|
||||
gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12);
|
||||
} else {
|
||||
AVFrame* nv12 = (*Handle)->GetNV12Frame();
|
||||
if (nv12) {
|
||||
gpu_frame_attach(*image, nv12, gpuIdx, timeStamp);
|
||||
// NV12 GPU fast path (optional — disabled by default for stability)
|
||||
if ((*Handle)->IsNV12FastPath()) {
|
||||
int gpuIdx = (*Handle)->GetHWDecodingGpuIndex();
|
||||
AVFrame* cudaHW = (*Handle)->GetCudaHWFrame();
|
||||
if (cudaHW) {
|
||||
AVFrame* cpuNV12 = (*Handle)->GetNV12Frame();
|
||||
gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12);
|
||||
} else {
|
||||
AVFrame* nv12 = (*Handle)->GetNV12Frame();
|
||||
if (nv12) {
|
||||
gpu_frame_attach(*image, nv12, gpuIdx, timeStamp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -978,6 +984,18 @@ extern "C" __declspec(dllexport) void SetRTMPDisplayResolution(ANSCENTER::ANSRTM
|
||||
(*Handle)->SetDisplayResolution(width, height);
|
||||
} catch (...) { }
|
||||
}
|
||||
// Exported C entry point: forwards intervalMs to ANSRTMPClient::SetTargetFPS.
// Does nothing when Handle or *Handle is null; any exception thrown by the
// call is caught and discarded.
extern "C" __declspec(dllexport) void SetRTMPTargetFPS(ANSCENTER::ANSRTMPClient** Handle, double intervalMs) {
    if (!Handle || !*Handle) {
        return;
    }
    try {
        ANSCENTER::ANSRTMPClient* client = *Handle;
        client->SetTargetFPS(intervalMs);
    } catch (...) {
        // Failure is ignored.
    }
}
|
||||
// Exported C entry point: converts the integer flag to bool (non-zero = true)
// and forwards it to ANSRTMPClient::SetNV12FastPath.
// Does nothing when Handle or *Handle is null; any exception thrown by the
// call is caught and discarded.
extern "C" __declspec(dllexport) void SetRTMPNV12FastPath(ANSCENTER::ANSRTMPClient** Handle, int enable) {
    if (!Handle || !*Handle) {
        return;
    }
    try {
        const bool fastPathOn = (enable != 0);
        (*Handle)->SetNV12FastPath(fastPathOn);
    } catch (...) {
        // Failure is ignored.
    }
}
|
||||
|
||||
// ============================================================================
|
||||
// V2 entry points: accept handle by value (uint64_t) to avoid LabVIEW
|
||||
|
||||
@@ -36,6 +36,7 @@ namespace ANSCENTER
|
||||
int _imageWidth, _imageHeight;
|
||||
int64_t _pts;
|
||||
bool _isPlaying;
|
||||
bool _useNV12FastPath = false;
|
||||
std::recursive_mutex _mutex;
|
||||
public:
|
||||
ANSRTMPClient();
|
||||
@@ -71,6 +72,9 @@ namespace ANSCENTER
|
||||
int GetHWDecodingGpuIndex();
|
||||
void SetDisplayResolution(int width, int height); // Set display output size; 0,0 = original (no resize)
|
||||
void SetImageQuality(int mode); // 0=fast (AI), 1=quality (display)
|
||||
void SetTargetFPS(double intervalMs); // Set min interval between processed frames in ms (0 = no limit, 100 = ~10 FPS, 200 = ~5 FPS)
|
||||
void SetNV12FastPath(bool enable); // true = NV12 GPU fast path, false = original CPU path (stable)
|
||||
// Returns the current value of the NV12 fast-path flag.
// NOTE(review): this reads _useNV12FastPath without taking _mutex, while
// SetNV12FastPath writes it under the lock — confirm the unsynchronized read
// is acceptable or make the flag atomic.
bool IsNV12FastPath() const { return _useNV12FastPath; }
|
||||
AVFrame* GetNV12Frame(); // Returns cloned NV12 frame for GPU fast-path (caller must av_frame_free)
|
||||
AVFrame* GetCudaHWFrame(); // Returns CUDA HW frame (device ptrs) for zero-copy inference
|
||||
bool IsCudaHWAccel(); // true when decoder uses CUDA (NV12 stays in GPU VRAM)
|
||||
@@ -107,4 +111,6 @@ extern "C" __declspec(dllexport) int IsRTMPHWDecodingActive(ANSCENTER::ANSRTMPC
|
||||
extern "C" __declspec(dllexport) int GetRTMPHWDecodingGpuIndex(ANSCENTER::ANSRTMPClient** Handle);
|
||||
extern "C" __declspec(dllexport) void SetRTMPImageQuality(ANSCENTER::ANSRTMPClient** Handle, int mode);
|
||||
extern "C" __declspec(dllexport) void SetRTMPDisplayResolution(ANSCENTER::ANSRTMPClient** Handle, int width, int height);
|
||||
extern "C" __declspec(dllexport) void SetRTMPTargetFPS(ANSCENTER::ANSRTMPClient** Handle, double intervalMs);
|
||||
extern "C" __declspec(dllexport) void SetRTMPNV12FastPath(ANSCENTER::ANSRTMPClient** Handle, int enable);
|
||||
#endif
|
||||
@@ -213,44 +213,44 @@ namespace ANSCENTER {
|
||||
bool ANSRTSPClient::Reconnect() {
|
||||
// 1. Mark as not-playing under the mutex FIRST. This makes GetImage()
|
||||
// return the cached _pLastFrame instead of calling into the player,
|
||||
// and blocks new TryIncrementInFlight calls.
|
||||
// and blocks new TryIncrementInFlight calls (no new NV12 attachments).
|
||||
{
|
||||
std::unique_lock<std::recursive_mutex> lock(_mutex);
|
||||
_isPlaying = false;
|
||||
|
||||
// --- Inference guard: wait for in-flight D2D copies to finish ---
|
||||
// With synchronous D2D copy, in-flight means "currently inside
|
||||
// GetRTSPCVImage between TryIncrementInFlight and attach_cuda".
|
||||
// This is typically <1ms, so the wait is very fast.
|
||||
// --- Inference guard: wait for ALL in-flight inference to finish ---
|
||||
// _inFlightFrames tracks frames from GetRTSPCVImage through to the
|
||||
// end of inference (DecrementInFlight fires when last clone is released).
|
||||
// We MUST wait for this to reach 0 before calling close(), because
|
||||
// inference may still be reading NV12 pool buffer data that depends
|
||||
// on the NVDEC decoder context being alive.
|
||||
//
|
||||
// DO NOT force-reset _inFlightFrames or invalidate onReleaseFn —
|
||||
// let inference finish naturally so DecrementInFlight fires correctly.
|
||||
int inFlight = _inFlightFrames.load(std::memory_order_acquire);
|
||||
if (inFlight > 0) {
|
||||
_logger.LogInfo("ANSRTSPClient::Reconnect",
|
||||
std::format("waiting for {} in-flight frame(s)...", inFlight),
|
||||
std::format("waiting for {} in-flight inference(s) to complete...", inFlight),
|
||||
__FILE__, __LINE__);
|
||||
bool done = _inFlightDone.wait_for(lock, std::chrono::seconds(5), [this] {
|
||||
bool done = _inFlightDone.wait_for(lock, std::chrono::seconds(10), [this] {
|
||||
return _inFlightFrames.load(std::memory_order_acquire) <= 0;
|
||||
});
|
||||
if (!done) {
|
||||
_logger.LogWarn("ANSRTSPClient::Reconnect",
|
||||
std::format("timed out — still {} in-flight", _inFlightFrames.load()),
|
||||
std::format("timed out — still {} in-flight, proceeding with close()",
|
||||
_inFlightFrames.load()),
|
||||
__FILE__, __LINE__);
|
||||
// Force-reset only on timeout as last resort
|
||||
ANSGpuFrameRegistry::instance().invalidateOwner(this);
|
||||
_inFlightFrames.store(0, std::memory_order_release);
|
||||
}
|
||||
}
|
||||
|
||||
// Invalidate owner callbacks — prevents stale DecrementInFlight
|
||||
// calls after Reconnect re-creates the decoder.
|
||||
// Frames and their global pool slots remain alive for inference.
|
||||
ANSGpuFrameRegistry::instance().invalidateOwner(this);
|
||||
_inFlightFrames.store(0, std::memory_order_release);
|
||||
|
||||
// NO forceReleaseByOwner — frames survive reconnect.
|
||||
// NO cudaDeviceSynchronize — no GPU buffers to free.
|
||||
// NO DestroyGpuPool — per-camera pool has been removed.
|
||||
}
|
||||
|
||||
// 2. close() destroys NVDEC decoder ONLY — run outside _mutex to
|
||||
// avoid deadlocking with nvcuda64 SRW lock held by inference.
|
||||
// Pool slot buffers are global and untouched.
|
||||
// avoid deadlocking with nvcuda64 SRW lock held by other cameras.
|
||||
// At this point, all inference using this camera's NV12 data has
|
||||
// completed (or timed out), so close() is safe.
|
||||
_logger.LogInfo("ANSRTSPClient::Reconnect",
|
||||
"calling close() — NVDEC decoder will be destroyed", __FILE__, __LINE__);
|
||||
RTSP_DBG("[Reconnect] BEFORE close() this=%p", (void*)this);
|
||||
@@ -883,6 +883,14 @@ namespace ANSCENTER {
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
_playerClient->setImageQuality(mode); // 0=fast (AI), 1=quality (display)
|
||||
}
|
||||
void ANSRTSPClient::SetTargetFPS(double intervalMs) {
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
_playerClient->setTargetFPS(intervalMs); // 0=no limit, 100=~10FPS, 200=~5FPS
|
||||
}
|
||||
void ANSRTSPClient::SetNV12FastPath(bool enable) {
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
_useNV12FastPath = enable;
|
||||
}
|
||||
AVFrame* ANSRTSPClient::GetNV12Frame() {
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
if (!_isPlaying) return nullptr; // Player may be mid-reconnect (CUDA resources freed)
|
||||
@@ -1045,67 +1053,60 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(
|
||||
|
||||
auto t1 = std::chrono::steady_clock::now();
|
||||
|
||||
// Attach NV12 frame for GPU fast-path inference (side-table registry)
|
||||
// attach() takes ownership — do NOT av_frame_free here
|
||||
//
|
||||
// CRITICAL: TryIncrementInFlight() MUST be called BEFORE GetCudaHWFrame().
|
||||
// It atomically checks _isPlaying and increments _inFlightFrames under
|
||||
// the same mutex, so Reconnect() cannot call close() while we're doing
|
||||
// the D2D copy from NVDEC surfaces inside gpu_frame_attach_cuda().
|
||||
int gpuIdx = (*Handle)->GetHWDecodingGpuIndex();
|
||||
bool inFlightGuardHeld = (*Handle)->TryIncrementInFlight();
|
||||
RTSP_DBG("[GetRTSPCVImage] mat=%p gpuIdx=%d inFlightGuard=%d",
|
||||
(void*)*image, gpuIdx, (int)inFlightGuardHeld);
|
||||
// NV12 GPU fast path: attach NV12 frame data for zero-copy inference.
|
||||
// When disabled (_useNV12FastPath=false), the original stable CPU path is used:
|
||||
// GetImage() returns BGR cv::Mat in CPU RAM → no CUDA calls → no SRW lock contention.
|
||||
// When enabled, D2D copies NV12 from NVDEC to pool buffers for GPU inference.
|
||||
if ((*Handle)->IsNV12FastPath()) {
|
||||
int gpuIdx = (*Handle)->GetHWDecodingGpuIndex();
|
||||
bool inFlightGuardHeld = (*Handle)->TryIncrementInFlight();
|
||||
RTSP_DBG("[GetRTSPCVImage] mat=%p gpuIdx=%d inFlightGuard=%d",
|
||||
(void*)*image, gpuIdx, (int)inFlightGuardHeld);
|
||||
|
||||
if (inFlightGuardHeld) {
|
||||
AVFrame* cudaHW = (*Handle)->GetCudaHWFrame();
|
||||
if (cudaHW) {
|
||||
RTSP_DBG("[GetRTSPCVImage] cudaHW: %dx%d data[0]=%p data[1]=%p",
|
||||
cudaHW->width, cudaHW->height,
|
||||
(void*)cudaHW->data[0], (void*)cudaHW->data[1]);
|
||||
if (inFlightGuardHeld) {
|
||||
AVFrame* cudaHW = (*Handle)->GetCudaHWFrame();
|
||||
if (cudaHW) {
|
||||
RTSP_DBG("[GetRTSPCVImage] cudaHW: %dx%d data[0]=%p data[1]=%p",
|
||||
cudaHW->width, cudaHW->height,
|
||||
(void*)cudaHW->data[0], (void*)cudaHW->data[1]);
|
||||
|
||||
// Acquire a slot from the global pool — survives camera Destroy.
|
||||
GpuNV12Slot* slot = GpuNV12SlotPool::instance().acquire(
|
||||
gpuIdx, cudaHW->width, cudaHW->height);
|
||||
// Acquire a slot from the global pool — survives camera Destroy.
|
||||
GpuNV12Slot* slot = GpuNV12SlotPool::instance().acquire(
|
||||
gpuIdx, cudaHW->width, cudaHW->height);
|
||||
|
||||
// Only fetch CPU NV12 if pool slot unavailable (cross-GPU fallback).
|
||||
// When slot is valid, the D2D copy goes GPU→GPU and CPU NV12 is never used.
|
||||
// Skipping av_frame_clone + av_frame_free saves ~0.1ms per frame.
|
||||
AVFrame* cpuNV12 = slot ? nullptr : (*Handle)->GetNV12Frame();
|
||||
gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12, slot);
|
||||
} else {
|
||||
// HW decode not active — try CPU NV12
|
||||
AVFrame* nv12 = (*Handle)->GetNV12Frame();
|
||||
if (nv12) {
|
||||
gpu_frame_attach(*image, nv12, gpuIdx, timeStamp);
|
||||
// Only fetch CPU NV12 if pool slot unavailable (cross-GPU fallback).
|
||||
AVFrame* cpuNV12 = slot ? nullptr : (*Handle)->GetNV12Frame();
|
||||
gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12, slot);
|
||||
} else {
|
||||
// HW decode not active — try CPU NV12
|
||||
AVFrame* nv12 = (*Handle)->GetNV12Frame();
|
||||
if (nv12) {
|
||||
gpu_frame_attach(*image, nv12, gpuIdx, timeStamp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Wire up the registry callback to release the in-flight guard.
|
||||
// TryIncrementInFlight already incremented; DecrementInFlight fires
|
||||
// when the last clone of this frame is released after inference.
|
||||
auto* gpuData = ANSGpuFrameRegistry::instance().lookup(*image);
|
||||
RTSP_DBG("[GetRTSPCVImage] after attach: gpuData=%p yPlane=%p isCuda=%d poolSlot=%p",
|
||||
(void*)gpuData,
|
||||
gpuData ? (void*)gpuData->yPlane : nullptr,
|
||||
gpuData ? (int)gpuData->isCudaDevicePtr : -1,
|
||||
gpuData ? (void*)gpuData->poolSlot : nullptr);
|
||||
if (gpuData) {
|
||||
gpuData->ownerClient = *Handle;
|
||||
gpuData->onReleaseFn = [](void* client) {
|
||||
static_cast<ANSCENTER::ANSRTSPClient*>(client)->DecrementInFlight();
|
||||
};
|
||||
// NOTE: Do NOT call IncrementInFlight() again here —
|
||||
// TryIncrementInFlight() already did it above.
|
||||
// Wire up the registry callback to release the in-flight guard.
|
||||
auto* gpuData = ANSGpuFrameRegistry::instance().lookup(*image);
|
||||
RTSP_DBG("[GetRTSPCVImage] after attach: gpuData=%p yPlane=%p isCuda=%d poolSlot=%p",
|
||||
(void*)gpuData,
|
||||
gpuData ? (void*)gpuData->yPlane : nullptr,
|
||||
gpuData ? (int)gpuData->isCudaDevicePtr : -1,
|
||||
gpuData ? (void*)gpuData->poolSlot : nullptr);
|
||||
if (gpuData) {
|
||||
gpuData->ownerClient = *Handle;
|
||||
gpuData->onReleaseFn = [](void* client) {
|
||||
static_cast<ANSCENTER::ANSRTSPClient*>(client)->DecrementInFlight();
|
||||
};
|
||||
} else {
|
||||
(*Handle)->DecrementInFlight();
|
||||
}
|
||||
} else {
|
||||
// No gpuData registered (attach failed?) — release the guard
|
||||
(*Handle)->DecrementInFlight();
|
||||
RTSP_DBG("[GetRTSPCVImage] SKIP CUDA — player not playing (reconnecting?)");
|
||||
}
|
||||
} else {
|
||||
// Player is stopping/reconnecting — skip CUDA path entirely.
|
||||
// GetImage() already returned a cached BGR frame, which is safe.
|
||||
RTSP_DBG("[GetRTSPCVImage] SKIP CUDA — player not playing (reconnecting?)");
|
||||
}
|
||||
// else: original CPU path — cv::Mat** contains BGR data in CPU RAM.
|
||||
// No CUDA calls, no pool slots, no GPU frame registry.
|
||||
// Inference uses cv::Mat directly (upload to GPU in engine).
|
||||
|
||||
// Lightweight timing — logs only when frame grab + D2D exceeds 50ms.
|
||||
// Goes to both spdlog (console/file) AND OutputDebugString (DebugView)
|
||||
@@ -1115,7 +1116,7 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(
|
||||
double getImageMs = std::chrono::duration<double, std::milli>(t1 - t0).count();
|
||||
double cudaMs = std::chrono::duration<double, std::milli>(t2 - t1).count();
|
||||
double totalMs = getImageMs + cudaMs;
|
||||
if (totalMs > 50.0) {
|
||||
if (totalMs > 500.0) {
|
||||
auto msg = std::format("SLOW FRAME: total={:.1f}ms (getImage={:.1f}ms cuda={:.1f}ms) {}x{}",
|
||||
totalMs, getImageMs, cudaMs, width, height);
|
||||
(*Handle)->_logger.LogWarn("GetRTSPCVImage", msg, __FILE__, __LINE__);
|
||||
@@ -1452,6 +1453,18 @@ extern "C" __declspec(dllexport) void SetRTSPDisplayResolution(ANSCENTER::ANSRTS
|
||||
(*Handle)->SetDisplayResolution(width, height);
|
||||
} catch (...) { }
|
||||
}
|
||||
// C export: forwards a frame-interval throttle to the RTSP client behind Handle.
//   Handle     - pointer to the client pointer; the call is a no-op when either
//                level is null.
//   intervalMs - passed unchanged to ANSRTSPClient::SetTargetFPS
//                (0 = no limit, 100 = ~10 FPS, 200 = ~5 FPS).
// Anything thrown by the client is swallowed, so no exception leaves this export.
extern "C" __declspec(dllexport) void SetRTSPTargetFPS(ANSCENTER::ANSRTSPClient** Handle, double intervalMs) {
    const bool hasClient = (Handle != nullptr) && (*Handle != nullptr);
    if (!hasClient) {
        return;
    }
    try {
        ANSCENTER::ANSRTSPClient* const client = *Handle;
        client->SetTargetFPS(intervalMs);
    }
    catch (...) {
        // Best-effort setter: failures are intentionally ignored.
    }
}
|
||||
// C export: switches the RTSP client's NV12 fast-path flag.
//   Handle - pointer to the client pointer; the call is a no-op when either
//            level is null.
//   enable - 0 selects the original CPU path (stable); any non-zero value
//            selects the NV12 GPU fast path.
// Anything thrown by the client is swallowed, so no exception leaves this export.
extern "C" __declspec(dllexport) void SetRTSPNV12FastPath(ANSCENTER::ANSRTSPClient** Handle, int enable) {
    const bool hasClient = (Handle != nullptr) && (*Handle != nullptr);
    if (!hasClient) {
        return;
    }
    try {
        const bool useFastPath = (enable != 0);
        ANSCENTER::ANSRTSPClient* const client = *Handle;
        client->SetNV12FastPath(useFastPath);
    }
    catch (...) {
        // Best-effort setter: failures are intentionally ignored.
    }
}
|
||||
extern "C" __declspec(dllexport) int SetCropFlagRTSP(ANSCENTER::ANSRTSPClient** Handle, int cropFlag) {
|
||||
if (Handle == nullptr || *Handle == nullptr) return -1;
|
||||
try {
|
||||
|
||||
@@ -38,6 +38,7 @@ namespace ANSCENTER
|
||||
int _imageWidth,_imageHeight;
|
||||
int64_t _pts;
|
||||
bool _isPlaying;
|
||||
bool _useNV12FastPath = false; // false = original stable CPU path, true = NV12 GPU fast path
|
||||
std::recursive_mutex _mutex;
|
||||
|
||||
// --- Per-client inference guard ---
|
||||
@@ -102,6 +103,9 @@ namespace ANSCENTER
|
||||
int GetHWDecodingGpuIndex();
|
||||
void SetDisplayResolution(int width, int height); // Set display output size; 0,0 = original (no resize)
|
||||
void SetImageQuality(int mode); // 0=fast (AI), 1=quality (display)
|
||||
void SetTargetFPS(double intervalMs); // Set min interval between processed frames in ms (0 = no limit, 100 = ~10 FPS, 200 = ~5 FPS)
|
||||
void SetNV12FastPath(bool enable); // true = NV12 GPU fast path (zero-copy inference), false = original CPU path (stable)
|
||||
bool IsNV12FastPath() const { return _useNV12FastPath; }
|
||||
AVFrame* GetNV12Frame(); // Returns cloned NV12 frame for GPU fast-path (caller must av_frame_free)
|
||||
AVFrame* GetCudaHWFrame(); // Returns CUDA HW frame (device ptrs) for zero-copy inference
|
||||
bool IsCudaHWAccel(); // true when decoder uses CUDA (NV12 stays in GPU VRAM)
|
||||
@@ -139,4 +143,6 @@ extern "C" __declspec(dllexport) int IsRTSPHWDecodingActive(ANSCENTER::ANSRTSPC
|
||||
extern "C" __declspec(dllexport) int GetRTSPHWDecodingGpuIndex(ANSCENTER::ANSRTSPClient** Handle);
|
||||
extern "C" __declspec(dllexport) void SetRTSPImageQuality(ANSCENTER::ANSRTSPClient** Handle, int mode);
|
||||
extern "C" __declspec(dllexport) void SetRTSPDisplayResolution(ANSCENTER::ANSRTSPClient** Handle, int width, int height);
|
||||
extern "C" __declspec(dllexport) void SetRTSPTargetFPS(ANSCENTER::ANSRTSPClient** Handle, double intervalMs);
|
||||
extern "C" __declspec(dllexport) void SetRTSPNV12FastPath(ANSCENTER::ANSRTSPClient** Handle, int enable);
|
||||
#endif
|
||||
@@ -652,6 +652,14 @@ namespace ANSCENTER {
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
_playerClient->setImageQuality(mode); // 0=fast (AI), 1=quality (display)
|
||||
}
|
||||
void ANSSRTClient::SetTargetFPS(double intervalMs) {
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
_playerClient->setTargetFPS(intervalMs); // 0=no limit, 100=~10FPS, 200=~5FPS
|
||||
}
|
||||
void ANSSRTClient::SetNV12FastPath(bool enable) {
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
_useNV12FastPath = enable;
|
||||
}
|
||||
AVFrame* ANSSRTClient::GetNV12Frame() {
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
return _playerClient->getNV12Frame(); // Returns clone, caller must av_frame_free
|
||||
@@ -809,20 +817,18 @@ extern "C" __declspec(dllexport) int GetSRTCVImage(ANSCENTER::ANSSRTClient** Han
|
||||
// Thread-safe Mat pointer swap (anscv_mat_replace has its own internal lock)
|
||||
anscv_mat_replace(image, std::move(img));
|
||||
|
||||
// Attach NV12 frame for GPU fast-path inference (side-table registry)
|
||||
// attach() takes ownership — do NOT av_frame_free here
|
||||
int gpuIdx = (*Handle)->GetHWDecodingGpuIndex();
|
||||
AVFrame* cudaHW = (*Handle)->GetCudaHWFrame();
|
||||
if (cudaHW) {
|
||||
// CUDA zero-copy: frame data[0]/data[1] are CUDA device pointers.
|
||||
// Also attach CPU NV12 as fallback for cross-GPU inference
|
||||
// (when decode GPU != inference GPU, CUDA ptrs aren't accessible).
|
||||
AVFrame* cpuNV12 = (*Handle)->GetNV12Frame();
|
||||
gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12);
|
||||
} else {
|
||||
AVFrame* nv12 = (*Handle)->GetNV12Frame();
|
||||
if (nv12) {
|
||||
gpu_frame_attach(*image, nv12, gpuIdx, timeStamp);
|
||||
// NV12 GPU fast path (optional — disabled by default for stability)
|
||||
if ((*Handle)->IsNV12FastPath()) {
|
||||
int gpuIdx = (*Handle)->GetHWDecodingGpuIndex();
|
||||
AVFrame* cudaHW = (*Handle)->GetCudaHWFrame();
|
||||
if (cudaHW) {
|
||||
AVFrame* cpuNV12 = (*Handle)->GetNV12Frame();
|
||||
gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12);
|
||||
} else {
|
||||
AVFrame* nv12 = (*Handle)->GetNV12Frame();
|
||||
if (nv12) {
|
||||
gpu_frame_attach(*image, nv12, gpuIdx, timeStamp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -994,6 +1000,18 @@ extern "C" __declspec(dllexport) void SetSRTDisplayResolution(ANSCENTER::ANSSRTC
|
||||
(*Handle)->SetDisplayResolution(width, height);
|
||||
} catch (...) { }
|
||||
}
|
||||
// C export: forwards a frame-interval throttle to the SRT client behind Handle.
//   Handle     - pointer to the client pointer; the call is a no-op when either
//                level is null.
//   intervalMs - passed unchanged to ANSSRTClient::SetTargetFPS.
// Anything thrown by the client is swallowed, so no exception leaves this export.
extern "C" __declspec(dllexport) void SetSRTTargetFPS(ANSCENTER::ANSSRTClient** Handle, double intervalMs) {
    const bool hasClient = (Handle != nullptr) && (*Handle != nullptr);
    if (!hasClient) {
        return;
    }
    try {
        ANSCENTER::ANSSRTClient* const client = *Handle;
        client->SetTargetFPS(intervalMs);
    }
    catch (...) {
        // Best-effort setter: failures are intentionally ignored.
    }
}
|
||||
// C export: switches the SRT client's NV12 fast-path flag.
//   Handle - pointer to the client pointer; the call is a no-op when either
//            level is null.
//   enable - 0 disables the fast path; any non-zero value enables it.
// Anything thrown by the client is swallowed, so no exception leaves this export.
extern "C" __declspec(dllexport) void SetSRTNV12FastPath(ANSCENTER::ANSSRTClient** Handle, int enable) {
    const bool hasClient = (Handle != nullptr) && (*Handle != nullptr);
    if (!hasClient) {
        return;
    }
    try {
        const bool useFastPath = (enable != 0);
        ANSCENTER::ANSSRTClient* const client = *Handle;
        client->SetNV12FastPath(useFastPath);
    }
    catch (...) {
        // Best-effort setter: failures are intentionally ignored.
    }
}
|
||||
|
||||
// ============================================================================
|
||||
// V2 entry points: accept uint64_t handleVal by value instead of Handle**
|
||||
|
||||
@@ -35,6 +35,7 @@ namespace ANSCENTER
|
||||
int _imageWidth, _imageHeight;
|
||||
int64_t _pts;
|
||||
bool _isPlaying;
|
||||
bool _useNV12FastPath = false;
|
||||
std::recursive_mutex _mutex;
|
||||
public:
|
||||
ANSSRTClient();
|
||||
@@ -70,6 +71,9 @@ namespace ANSCENTER
|
||||
int GetHWDecodingGpuIndex();
|
||||
void SetDisplayResolution(int width, int height); // Set display output size; 0,0 = original (no resize)
|
||||
void SetImageQuality(int mode); // 0=fast (AI), 1=quality (display)
|
||||
void SetTargetFPS(double intervalMs); // Set min interval between processed frames in ms (0 = no limit, 100 = ~10 FPS, 200 = ~5 FPS)
|
||||
void SetNV12FastPath(bool enable); // true = NV12 GPU fast path, false = original CPU path (stable)
|
||||
bool IsNV12FastPath() const { return _useNV12FastPath; }
|
||||
AVFrame* GetNV12Frame(); // Returns cloned NV12 frame for GPU fast-path (caller must av_frame_free)
|
||||
AVFrame* GetCudaHWFrame(); // Returns CUDA HW frame (device ptrs) for zero-copy inference
|
||||
bool IsCudaHWAccel(); // true when decoder uses CUDA (NV12 stays in GPU VRAM)
|
||||
@@ -107,4 +111,6 @@ extern "C" __declspec(dllexport) int IsSRTHWDecodingActive(ANSCENTER::ANSSRTCli
|
||||
extern "C" __declspec(dllexport) int GetSRTHWDecodingGpuIndex(ANSCENTER::ANSSRTClient** Handle);
|
||||
extern "C" __declspec(dllexport) void SetSRTImageQuality(ANSCENTER::ANSSRTClient** Handle, int mode);
|
||||
extern "C" __declspec(dllexport) void SetSRTDisplayResolution(ANSCENTER::ANSSRTClient** Handle, int width, int height);
|
||||
extern "C" __declspec(dllexport) void SetSRTTargetFPS(ANSCENTER::ANSSRTClient** Handle, double intervalMs);
|
||||
extern "C" __declspec(dllexport) void SetSRTNV12FastPath(ANSCENTER::ANSSRTClient** Handle, int enable);
|
||||
#endif
|
||||
@@ -23,6 +23,7 @@ GpuNV12SlotPool* GpuNV12SlotPool_GetInstance() {
|
||||
}
|
||||
|
||||
// Transition all COOLING slots past the cooldown threshold to FREE.
|
||||
// Collects pending AVFrames for the caller to av_frame_free.
|
||||
void GpuNV12SlotPool::drainCooledSlots_locked() {
|
||||
auto now = std::chrono::steady_clock::now();
|
||||
auto threshold = std::chrono::milliseconds(SLOT_COOLDOWN_MS);
|
||||
@@ -67,7 +68,7 @@ GpuNV12Slot* GpuNV12SlotPool::acquire(int gpuIdx, int w, int h) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Allocate CUDA buffers on the target GPU
|
||||
// Allocate CUDA buffers + stream + event on the target GPU
|
||||
int prevDev = -1;
|
||||
cudaGetDevice(&prevDev);
|
||||
if (gpuIdx >= 0) cudaSetDevice(gpuIdx);
|
||||
@@ -76,10 +77,7 @@ GpuNV12Slot* GpuNV12SlotPool::acquire(int gpuIdx, int w, int h) {
|
||||
cudaError_t e1 = cudaMallocPitch(&slot->bufY, &slot->pitchY, w, h);
|
||||
cudaError_t e2 = cudaMallocPitch(&slot->bufUV, &slot->pitchUV, w, h / 2);
|
||||
|
||||
// Non-blocking stream avoids NULL-stream implicit sync with inference.
|
||||
// On WDDM, the NULL stream must wait for ALL other streams to finish
|
||||
// before executing — this caused 1-2 second stalls when inference
|
||||
// kernels were running. A non-blocking stream runs independently.
|
||||
// Non-blocking stream: avoids NULL-stream implicit sync with inference.
|
||||
cudaStream_t stream = nullptr;
|
||||
cudaError_t e3 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
|
||||
|
||||
@@ -88,7 +86,6 @@ GpuNV12Slot* GpuNV12SlotPool::acquire(int gpuIdx, int w, int h) {
|
||||
if (e1 != cudaSuccess || e2 != cudaSuccess) {
|
||||
NV12POOL_DBG("acquire: cudaMallocPitch FAILED %dx%d gpu=%d e1=%d e2=%d",
|
||||
w, h, gpuIdx, (int)e1, (int)e2);
|
||||
// Clean up partial allocation
|
||||
int prev2 = -1; cudaGetDevice(&prev2);
|
||||
if (gpuIdx >= 0) cudaSetDevice(gpuIdx);
|
||||
if (e1 == cudaSuccess && slot->bufY) cudaFree(slot->bufY);
|
||||
@@ -107,21 +104,18 @@ GpuNV12Slot* GpuNV12SlotPool::acquire(int gpuIdx, int w, int h) {
|
||||
GpuNV12Slot* raw = slot.get();
|
||||
m_slots.push_back(std::move(slot));
|
||||
|
||||
// Always log new slot allocation to DebugView (rare event — once per resolution per camera).
|
||||
// Always log new slot allocation to DebugView (rare event).
|
||||
{
|
||||
char _buf[256];
|
||||
snprintf(_buf, sizeof(_buf),
|
||||
"[NV12Pool] NEW slot #%zu: %dx%d gpu=%d Y=%p UV=%p pitchY=%zu stream=%p\n",
|
||||
m_slots.size(), w, h, gpuIdx, raw->bufY, raw->bufUV, raw->pitchY, raw->copyStream);
|
||||
m_slots.size(), w, h, gpuIdx, raw->bufY, raw->bufUV, raw->pitchY,
|
||||
raw->copyStream);
|
||||
#ifdef _WIN32
|
||||
OutputDebugStringA(_buf);
|
||||
#endif
|
||||
fprintf(stderr, "%s", _buf);
|
||||
}
|
||||
|
||||
// Also log POOL FULL to DebugView (important diagnostic).
|
||||
NV12POOL_DBG("acquire: NEW slot Y=%p UV=%p pitchY=%zu pitchUV=%zu %dx%d gpu=%d stream=%p (total=%zu)",
|
||||
raw->bufY, raw->bufUV, raw->pitchY, raw->pitchUV,
|
||||
w, h, gpuIdx, raw->copyStream, m_slots.size());
|
||||
return raw;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user