When the decoder hasn't produced a frame in 5s, skip the call to _playerClient->getImage() entirely and return the cached frame with unchanged _pts. LabVIEW sees STALE PTS one poll earlier and can trigger reconnect sooner. Threshold matches the existing checks on the duplicate-PTS branch and in areImagesIdentical() so all three stale paths agree. Near-zero cost: one getLastFrameAgeMs() call before the main path. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1869 lines
76 KiB
C++
1869 lines
76 KiB
C++
#include "ANSRTSP.h"
|
|
#include "ANSMatRegistry.h"
|
|
#include "ANSGpuFrameOps.h"
|
|
#include "GpuNV12SlotPool.h"
|
|
#include "ANSLicense.h" // ANS_DBG macro
|
|
#include "ANSCVVendorGate.h" // anscv_vendor_gate::IsNvidiaGpuAvailable()
|
|
#include <memory>
|
|
#include <chrono>
|
|
#include <format>
|
|
#include "media_codec.h"
|
|
#include <cstdint>
|
|
#include <cuda_runtime.h>
|
|
#if defined(_WIN32)
|
|
#include <dxgi1_2.h>
|
|
#pragma comment(lib, "dxgi.lib")
|
|
#elif defined(__linux__)
|
|
#include <dirent.h>
|
|
#include <fstream>
|
|
#include <sstream>
|
|
#endif
|
|
extern "C"
|
|
{
|
|
#include <libswscale/swscale.h>
|
|
#include <libavutil/imgutils.h>
|
|
#include <libavutil/frame.h>
|
|
}
|
|
// Note: per-instance thread safety is handled by ANSRTSPClient::_mutex
|
|
// Mat registry thread safety is handled by anscv_mat_replace's internal registry_mutex
|
|
|
|
// Debug logging. Define ANSCORE_GPU_DEBUG=1 to enable verbose per-frame logging.
|
|
#ifndef RTSP_DBG
|
|
#if defined(ANSCORE_GPU_DEBUG) && ANSCORE_GPU_DEBUG
|
|
#ifdef _WIN32
|
|
#define RTSP_DBG(fmt, ...) do { \
|
|
char _rtsp_buf[512]; \
|
|
snprintf(_rtsp_buf, sizeof(_rtsp_buf), fmt "\n", ##__VA_ARGS__); \
|
|
OutputDebugStringA(_rtsp_buf); \
|
|
fprintf(stderr, "%s", _rtsp_buf); \
|
|
} while(0)
|
|
#else
|
|
#define RTSP_DBG(fmt, ...) fprintf(stderr, fmt "\n", ##__VA_ARGS__)
|
|
#endif
|
|
#else
|
|
#define RTSP_DBG(fmt, ...) ((void)0)
|
|
#endif
|
|
#endif
|
|
static bool ansrtspLicenceValid = false;
|
|
// Global once_flag to protect license checking
|
|
static std::once_flag ansrtspLicenseOnceFlag;
|
|
static std::once_flag hwDecoderAutoConfigOnceFlag;
|
|
namespace ANSCENTER {
|
|
ANSRTSPClient::ANSRTSPClient() {
    // Start from a clean stream identity: no credentials, no URL, and
    // full-URL mode off until Init() decides otherwise.
    _useFullURL = false;
    _username.clear();
    _password.clear();
    _url.clear();

    // Cached-frame state: nothing decoded yet.
    _imageWidth = 0;
    _imageHeight = 0;
    _pts = 0;
    _lastJpegImage.clear();
    _isPlaying = false;

    // Process-wide, one-time HW decoder auto-configuration. Detects GPUs
    // and sets per-GPU NVDEC session limits; callers (LabVIEW or any
    // third-party host) never need to invoke this themselves.
    std::call_once(hwDecoderAutoConfigOnceFlag, []() { AutoConfigureHWDecoders(0); });
}
|
|
// Destructor: delegates the full teardown (stop playback, wait for in-flight
// frames, close the NVDEC decoder) to Destroy(). Destroy() is safe to run
// again here even if the owner already called it explicitly — after the first
// run _playerClient is null and the second pass does nothing.
ANSRTSPClient::~ANSRTSPClient() noexcept {
    Destroy();
}
|
|
|
|
// Full teardown of this client: stop playback, wait briefly for in-flight
// frames, invalidate registry callbacks that point back at this object, and
// close the NVDEC decoder OUTSIDE the instance mutex. On NVIDIA hardware it
// then trims the CUDA default memory pool so VRAM is actually returned to
// the OS instead of accumulating across destroy/create cycles.
void ANSRTSPClient::Destroy() {
    ANS_DBG("RTSP_Lifecycle", "DESTROY called: url=%s playing=%d", _url.c_str(), (int)_isPlaying);
    // Move the player client pointer out of the lock scope, then
    // close it OUTSIDE the mutex. close() calls cuArrayDestroy /
    // cuMemFree which acquire an EXCLUSIVE SRW lock inside nvcuda64.
    // If we hold _mutex during close(), and another thread holds
    // the nvcuda64 SRW lock (e.g. cuStreamSynchronize during
    // inference), we get a deadlock: Stop() → _mutex → nvcuda64
    // vs inference → nvcuda64 → (blocked by exclusive waiter).
    decltype(_playerClient) clientToClose;
    {
        std::unique_lock<std::recursive_mutex> lock(_mutex);
        if (_playerClient) {
            if (_isPlaying) {
                _playerClient->stop();
                _isPlaying = false;
            }
        }

        // --- Inference guard: wait for in-flight D2D copies to finish ---
        // With synchronous D2D copy, in-flight means "currently inside
        // GetRTSPCVImage between TryIncrementInFlight and attach_cuda".
        // This is typically <1ms, so the wait is very fast.
        int inFlight = _inFlightFrames.load(std::memory_order_acquire);
        if (inFlight > 0) {
            _logger.LogInfo("ANSRTSPClient::Destroy",
                std::format("waiting for {} in-flight frame(s)...", inFlight),
                __FILE__, __LINE__);
            // 5s cap: destruction must not hang indefinitely on a stuck copy.
            bool done = _inFlightDone.wait_for(lock, std::chrono::seconds(5), [this] {
                return _inFlightFrames.load(std::memory_order_acquire) <= 0;
            });
            if (!done) {
                _logger.LogWarn("ANSRTSPClient::Destroy",
                    std::format("timed out — still {} in-flight", _inFlightFrames.load()),
                    __FILE__, __LINE__);
            }
        }

        // Invalidate owner callbacks so stale GpuFrameData don't try to
        // call DecrementInFlight on this (soon-to-be-deleted) object.
        // The GpuFrameData and their global pool slots remain alive —
        // inference engines can safely keep reading from them.
        ANSGpuFrameRegistry::instance().invalidateOwner(this);
        _inFlightFrames.store(0, std::memory_order_release);

        // NO forceReleaseByOwner — frames survive camera deletion.
        // Pool slot buffers are global (GpuNV12SlotPool) — NOT owned
        // by this camera. They are recycled when inference finishes
        // (GpuFrameData refcount → 0 → slot.inUse = false).
        // NO cudaDeviceSynchronize — no GPU buffers to free here.
        // NO DestroyGpuPool — per-camera pool has been removed.
        clientToClose = std::move(_playerClient);
    }
    // close() destroys the NVDEC decoder ONLY. Pool slot buffers
    // (regular cudaMallocPitch allocations) are untouched — they
    // belong to the global GpuNV12SlotPool, not the decoder.
    if (clientToClose) {
        clientToClose->close();

        // Force CUDA runtime to release all cached memory from the destroyed
        // NVDEC decoder. Without this, cuMemFree returns memory to the CUDA
        // driver's internal cache, and the next camera creation allocates fresh
        // memory → VRAM grows by ~200-300MB per destroy/create cycle.
        // cudaDeviceSynchronize ensures all pending GPU ops are done, then
        // cudaMemPool trim releases the freed blocks back to the OS.
        //
        // AMD/Intel/CPU gate: this entire block is a no-op on non-NVIDIA
        // machines because NVDEC never ran, the CUDA memory pool is empty,
        // and calling cuda*() here would wake up cudart_static for nothing
        // (and on AMD can destabilise amdkmdag when DirectML is active).
        if (anscv_vendor_gate::IsNvidiaGpuAvailable()) {
            cudaDeviceSynchronize();
            cudaMemPool_t memPool = nullptr;
            int currentDev = 0;
            cudaGetDevice(&currentDev);
            if (cudaDeviceGetDefaultMemPool(&memPool, currentDev) == cudaSuccess && memPool) {
                cudaMemPoolTrimTo(memPool, 0); // Release all unused memory
            }
            size_t vramFree = 0, vramTotal = 0;
            cudaMemGetInfo(&vramFree, &vramTotal);
            // Logs used/free MB for post-teardown VRAM accounting.
            ANS_DBG("RTSP_Destroy", "NVDEC closed + memPool trimmed GPU%d VRAM=%zuMB/%zuMB",
                currentDev, (vramTotal - vramFree) / (1024*1024), vramFree / (1024*1024));
        } else {
            ANS_DBG("RTSP_Destroy", "non-NVIDIA hardware — skipped CUDA memory pool trim");
        }
    }
}
|
|
static void VerifyGlobalANSRTSPLicense(const std::string& licenseKey) {
|
|
try {
|
|
ansrtspLicenceValid = ANSCENTER::ANSLicenseHelper::LicenseVerification(licenseKey, 1007, "ANSCV");//Default productId=1005
|
|
if (!ansrtspLicenceValid) { // we also support ANSTS license
|
|
ansrtspLicenceValid = ANSCENTER::ANSLicenseHelper::LicenseVerification(licenseKey, 1003, "ANSVIS");//Default productId=1003 (ANSVIS)
|
|
}
|
|
if (!ansrtspLicenceValid) { // we also support ANSTS license
|
|
ansrtspLicenceValid = ANSCENTER::ANSLicenseHelper::LicenseVerification(licenseKey, 1008, "ANSTS");//Default productId=1008 (ANSTS)
|
|
}
|
|
}
|
|
catch (std::exception& e) {
|
|
ansrtspLicenceValid = false;
|
|
}
|
|
}
|
|
void ANSRTSPClient::CheckLicense() {
|
|
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
|
try {
|
|
// Check once globally
|
|
std::call_once(ansrtspLicenseOnceFlag, [this]() {
|
|
VerifyGlobalANSRTSPLicense(_licenseKey);
|
|
});
|
|
|
|
// Update this instance's local license flag
|
|
_licenseValid = ansrtspLicenceValid;
|
|
}
|
|
catch (const std::exception& e) {
|
|
this->_logger.LogFatal("ANSODBase::CheckLicense. Error:", e.what(), __FILE__, __LINE__);
|
|
}
|
|
}
|
|
|
|
bool ANSRTSPClient::Init(std::string licenseKey, std::string url) {
|
|
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
|
CheckLicense();
|
|
if (!_licenseValid) {
|
|
this->_logger.LogError("ANSRTSPClient::Init.", "Invalid license", __FILE__, __LINE__);
|
|
return false;
|
|
}
|
|
/* network_init();
|
|
sys_buf_init(200);
|
|
rtsp_parse_buf_init(200);
|
|
http_msg_buf_init(200);*/
|
|
_url = url;
|
|
_useFullURL = true;
|
|
return Setup();
|
|
}
|
|
|
|
bool ANSRTSPClient::Init(std::string licenseKey, std::string username, std::string password, std::string url) {
|
|
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
|
CheckLicense();
|
|
if (!_licenseValid) {
|
|
this->_logger.LogError("ANSRTSPClient::Init.", "Invalid license", __FILE__, __LINE__);
|
|
return false;
|
|
}
|
|
//network_init();
|
|
//sys_buf_init(200);
|
|
//rtsp_parse_buf_init(200);
|
|
//http_msg_buf_init(200);
|
|
_url = url;
|
|
_username = username;
|
|
_password = password;
|
|
_useFullURL = false;
|
|
_isPlaying = false;
|
|
return Setup();
|
|
}
|
|
|
|
bool ANSRTSPClient::Setup() {
|
|
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
|
if(_useFullURL){
|
|
return _playerClient->open(_url);
|
|
}
|
|
else {
|
|
return _playerClient->open(_username, _password, _url);
|
|
}
|
|
}
|
|
// Forward the crop rectangle to the underlying player under the instance lock.
void ANSRTSPClient::SetBBox(cv::Rect bbox) {
    std::scoped_lock<std::recursive_mutex> guard(_mutex);
    _playerClient->setBbox(bbox);
}
|
|
void ANSRTSPClient::SetCrop(bool crop) {
|
|
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
|
_playerClient->setCrop(crop);
|
|
}
|
|
// Tear down and re-establish the RTSP session in three ordered phases:
//   1. under _mutex: mark not-playing and drain in-flight inference,
//   2. outside _mutex: close() the NVDEC decoder (deadlock avoidance) and,
//      on NVIDIA, trim the CUDA memory pool,
//   3. under _mutex again: Setup() + play().
// Returns the new playing state.
bool ANSRTSPClient::Reconnect() {
    ANS_DBG("RTSP_Lifecycle", "RECONNECT called: url=%s playing=%d", _url.c_str(), (int)_isPlaying);
    // 1. Mark as not-playing under the mutex FIRST. This makes GetImage()
    //    return the cached _pLastFrame instead of calling into the player,
    //    and blocks new TryIncrementInFlight calls (no new NV12 attachments).
    {
        std::unique_lock<std::recursive_mutex> lock(_mutex);
        _isPlaying = false;

        // --- Inference guard: wait for ALL in-flight inference to finish ---
        // _inFlightFrames tracks frames from GetRTSPCVImage through to the
        // end of inference (DecrementInFlight fires when last clone is released).
        // We MUST wait for this to reach 0 before calling close(), because
        // inference may still be reading NV12 pool buffer data that depends
        // on the NVDEC decoder context being alive.
        //
        // DO NOT force-reset _inFlightFrames or invalidate onReleaseFn —
        // let inference finish naturally so DecrementInFlight fires correctly.
        int inFlight = _inFlightFrames.load(std::memory_order_acquire);
        if (inFlight > 0) {
            _logger.LogInfo("ANSRTSPClient::Reconnect",
                std::format("waiting for {} in-flight inference(s) to complete...", inFlight),
                __FILE__, __LINE__);
            // 10s cap (longer than Destroy's 5s): a reconnect can afford to
            // wait out a slow inference pass before resorting to force-reset.
            bool done = _inFlightDone.wait_for(lock, std::chrono::seconds(10), [this] {
                return _inFlightFrames.load(std::memory_order_acquire) <= 0;
            });
            if (!done) {
                _logger.LogWarn("ANSRTSPClient::Reconnect",
                    std::format("timed out — still {} in-flight, proceeding with close()",
                        _inFlightFrames.load()),
                    __FILE__, __LINE__);
                // Force-reset only on timeout as last resort
                ANSGpuFrameRegistry::instance().invalidateOwner(this);
                _inFlightFrames.store(0, std::memory_order_release);
            }
        }
    }

    // 2. close() destroys NVDEC decoder ONLY — run outside _mutex to
    //    avoid deadlocking with nvcuda64 SRW lock held by other cameras.
    //    At this point, all inference using this camera's NV12 data has
    //    completed (or timed out), so close() is safe.
    _logger.LogInfo("ANSRTSPClient::Reconnect",
        "calling close() — NVDEC decoder will be destroyed", __FILE__, __LINE__);
    auto _rc0 = std::chrono::steady_clock::now();
    RTSP_DBG("[Reconnect] BEFORE close() this=%p", (void*)this);
    _playerClient->close();
    auto _rc1 = std::chrono::steady_clock::now();

    // Force CUDA runtime to release cached memory from the destroyed NVDEC decoder.
    // Gated on NVIDIA: on AMD/Intel/CPU there was no NVDEC decoder and no
    // CUDA memory pool to trim, so calling into cudart is pure overhead
    // (and combined with DirectML on AMD has been observed to destabilise
    // amdkmdag). See ANSCVVendorGate.h for the rationale.
    if (anscv_vendor_gate::IsNvidiaGpuAvailable()) {
        cudaDeviceSynchronize();
        auto _rc2 = std::chrono::steady_clock::now();
        cudaMemPool_t memPool = nullptr;
        int currentDev = 0;
        cudaGetDevice(&currentDev);
        if (cudaDeviceGetDefaultMemPool(&memPool, currentDev) == cudaSuccess && memPool) {
            cudaMemPoolTrimTo(memPool, 0);
        }
        auto _rc3 = std::chrono::steady_clock::now();
        {
            // Timing breakdown of the teardown phases plus resulting VRAM usage.
            size_t vf = 0, vt = 0;
            cudaMemGetInfo(&vf, &vt);
            double closeMs = std::chrono::duration<double, std::milli>(_rc1 - _rc0).count();
            double syncMs = std::chrono::duration<double, std::milli>(_rc2 - _rc1).count();
            double trimMs = std::chrono::duration<double, std::milli>(_rc3 - _rc2).count();
            ANS_DBG("RTSP_Reconnect", "close=%.1fms sync=%.1fms trim=%.1fms VRAM=%zuMB/%zuMB",
                closeMs, syncMs, trimMs, (vt - vf) / (1024*1024), vf / (1024*1024));
        }
    } else {
        double closeMs = std::chrono::duration<double, std::milli>(_rc1 - _rc0).count();
        ANS_DBG("RTSP_Reconnect", "close=%.1fms (non-NVIDIA — CUDA memory pool trim skipped)", closeMs);
    }
    RTSP_DBG("[Reconnect] AFTER close() this=%p", (void*)this);

    // 3. Re-setup and play under the mutex.
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    _logger.LogInfo("ANSRTSPClient::Reconnect",
        "calling Setup() + play()", __FILE__, __LINE__);
    Setup();
    _isPlaying = _playerClient->play();
    RTSP_DBG("[Reconnect] DONE isPlaying=%d this=%p", (int)_isPlaying, (void*)this);
    return _isPlaying;
}
|
|
void ANSRTSPClient::EnableAudio(bool status) {
|
|
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
|
_playerClient->enableAudio(status);
|
|
}
|
|
void ANSRTSPClient::SetAudioVolume(int volume) {
|
|
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
|
_playerClient->setVolume(volume);
|
|
|
|
}
|
|
bool ANSRTSPClient::Start() {
|
|
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
|
Setup();
|
|
_isPlaying= _playerClient->play();
|
|
return _isPlaying;
|
|
}
|
|
|
|
bool ANSRTSPClient::Stop() {
|
|
// Stop playback but keep the RTSP connection and NVDEC decoder alive.
|
|
// LabVIEW uses Stop/Start to pause cameras when no AI task is subscribed.
|
|
// The camera resumes instantly on Start() without re-connecting.
|
|
CRtspPlayer* player = nullptr;
|
|
{
|
|
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
|
if (_isPlaying) {
|
|
_isPlaying = false;
|
|
player = _playerClient.get();
|
|
}
|
|
}
|
|
if (player) {
|
|
player->stop();
|
|
}
|
|
ANS_DBG("RTSP_Lifecycle", "STOP complete: handle=%p (connection kept alive)", (void*)this);
|
|
return true;
|
|
}
|
|
bool ANSRTSPClient::Pause() {
|
|
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
|
_isPlaying = false;
|
|
return _playerClient->pause();
|
|
}
|
|
bool ANSRTSPClient::IsPaused() {
|
|
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
|
return _playerClient->isPaused();
|
|
}
|
|
bool ANSRTSPClient::IsPlaying() {
|
|
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
|
return _playerClient->isPlaying();
|
|
}
|
|
bool ANSRTSPClient::IsRecording() {
|
|
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
|
return _playerClient->isRecording();
|
|
}
|
|
std::string ANSRTSPClient::GetJpegImage(int& width, int& height, int64_t& pts) {
|
|
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
|
// If the player is playing, process the frame
|
|
if (_isPlaying) {
|
|
// Get a new frame from the player client
|
|
_lastJpegImage = _playerClient->getJpegImage(width, height, pts);
|
|
// Update internal state variables
|
|
_pts = pts;
|
|
_imageWidth = width;
|
|
_imageHeight = height;
|
|
// Return the frame
|
|
return _lastJpegImage;
|
|
}
|
|
// If the player is not playing, return the last known frame
|
|
else {
|
|
width = _imageWidth;
|
|
height = _imageHeight;
|
|
pts = _pts;
|
|
return _lastJpegImage;
|
|
}
|
|
}
|
|
|
|
// Decide whether two frames should be treated as identical ("camera frozen").
// Primary signal is decoder frame age, which is far cheaper and more reliable
// than pixel comparison:
//   age > 5000ms  → truly stale (no decoder output for 5+ seconds) → identical
//   0 < age ≤ 5s  → decoder is alive → not identical
//   age == 0      → nothing decoded yet (startup) → fall back to pixels.
// The 5000ms threshold matches GetImage()'s stale-PTS branches.
// Fix: the fast-reject sampler was commented as "5-point" but only checked
// 4 offsets; the three-quarter point is now sampled too (behavior-equivalent:
// any byte mismatch found early would also make the final memcmp return false).
bool ANSRTSPClient::areImagesIdentical(const cv::Mat& img1, const cv::Mat& img2) {
    double ageMs = _playerClient->getLastFrameAgeMs();

    if (ageMs > 5000.0) {
        ANS_DBG("RTSP_Stale", "FROZEN DETECTED: ageMs=%.1f url=%s playing=%d — camera truly stale",
            ageMs, _url.c_str(), (int)_isPlaying);
        return true; // Truly stale — no decoder output for 5+ seconds
    }
    if (ageMs > 0.0) {
        return false; // Decoder is receiving frames — camera is alive
    }

    // ageMs == 0 means no frame has been decoded yet (startup).
    // Fall back to pixel comparison for backward compatibility.
    if (img1.empty() && img2.empty()) return true;
    if (img1.empty() || img2.empty()) return false;
    if (img1.size() != img2.size() || img1.type() != img2.type()) return false;

    // Same data pointer = same cv::Mat (shallow copy)
    if (img1.data == img2.data) return true;

    // Quick 5-point sampling before the full memcmp (fast reject for
    // frames that differ anywhere near the probe offsets).
    if (img1.isContinuous() && img2.isContinuous()) {
        const size_t totalBytes = img1.total() * img1.elemSize();
        if (totalBytes == 0) return true; // defensive: equal-size zero-byte mats
        const size_t quarter = totalBytes / 4;
        const size_t half = totalBytes / 2;
        const size_t threeQuarter = half + quarter;
        if (img1.data[0] != img2.data[0] ||
            img1.data[quarter] != img2.data[quarter] ||
            img1.data[half] != img2.data[half] ||
            img1.data[threeQuarter] != img2.data[threeQuarter] ||
            img1.data[totalBytes - 1] != img2.data[totalBytes - 1]) {
            return false;
        }
        return std::memcmp(img1.data, img2.data, totalBytes) == 0;
    }

    // Non-continuous mats: compare row by row.
    const size_t rowSize = img1.cols * img1.elemSize();
    for (int i = 0; i < img1.rows; i++) {
        if (std::memcmp(img1.ptr(i), img2.ptr(i), rowSize) != 0) return false;
    }
    return true;
}
|
|
// Return the current decoded frame (optionally rotated/resized), filling
// width/height/pts. Stale handling is driven by decoder frame age with a
// single 5000ms threshold shared with areImagesIdentical():
//   - not playing          → cached frame, unchanged _pts
//   - age ≥ 5s (early out) → cached frame, unchanged _pts (LabVIEW sees a
//                            stale PTS one poll earlier and reconnects)
//   - duplicate PTS, alive → cached frame, _pts artificially advanced so the
//                            host does NOT falsely detect a freeze
//   - duplicate PTS, stale → cached frame, _pts unchanged (freeze detected)
//   - new frame            → rotate/resize as configured, cache, return.
cv::Mat ANSRTSPClient::GetImage(int& width, int& height, int64_t& pts) {
    std::lock_guard<std::recursive_mutex> lock(_mutex);

    // Return last known frame if not playing
    if (!_isPlaying) {
        width = _imageWidth;
        height = _imageHeight;
        pts = _pts;
        return _pLastFrame; // Shallow copy (fast)
    }

    // Early stale-out: if the decoder hasn't produced a frame in 5s the
    // camera is dead. Skip _playerClient->getImage() entirely and return
    // the cached frame with unchanged _pts so LabVIEW sees STALE PTS one
    // poll earlier and triggers reconnect. Matches the 5000ms threshold
    // already used on the duplicate-PTS branch below and in
    // areImagesIdentical(), so all three stale paths agree.
    if (!_pLastFrame.empty()) {
        double ageMs = _playerClient->getLastFrameAgeMs();
        if (ageMs >= 5000.0) {
            ANS_DBG("RTSP_GetImage",
                "EARLY STALE: ageMs=%.1f pts=%lld url=%s — skipping getImage()",
                ageMs, (long long)_pts, _url.c_str());
            width = _imageWidth;
            height = _imageHeight;
            pts = _pts;
            return _pLastFrame;
        }
    }

    int imageW = 0, imageH = 0;
    int64_t currentPts = 0;

    // Get image directly without intermediate assignment
    cv::Mat currentImage = _playerClient->getImage(imageW, imageH, currentPts);

    // Check if image is valid first (cheaper than comparison)
    if (currentImage.empty()) {
        width = _imageWidth;
        height = _imageHeight;
        pts = _pts;
        return _pLastFrame;
    }

    // Use PTS to detect duplicate frames (much faster than pixel comparison).
    // The sequence-based check in getImage() already skips conversion for same
    // frames, so if PTS hasn't changed, it's the same frame — no memcmp needed.
    if (currentPts == _pts && !_pLastFrame.empty()) {
        width = _imageWidth;
        height = _imageHeight;
        // Return timestamp based on decoder frame age so LabVIEW can distinguish
        // "rate-limited duplicate" from "camera truly stale".
        // If decoder is still receiving frames (age < 5s), advance PTS so LabVIEW
        // sees a changing timestamp and doesn't trigger false reconnect.
        // If decoder is stale (age > 5s), return same PTS so LabVIEW detects it.
        double ageMs = _playerClient->getLastFrameAgeMs();
        if (ageMs > 0.0 && ageMs < 5000.0) {
            // Camera alive but rate-limited — advance PTS to prevent false stale detection
            _pts++;
        } else if (ageMs >= 5000.0) {
            // Camera stale — keep same PTS so LabVIEW triggers reconnect
            ANS_DBG("RTSP_GetImage", "STALE PTS: ageMs=%.1f pts=%lld url=%s — not advancing PTS",
                ageMs, (long long)_pts, _url.c_str());
        }
        pts = _pts;
        return _pLastFrame;
    }

    // Fresh frame: update PTS first.
    _pts = currentPts;
    pts = currentPts;

    // Handle non-rotated case
    if (_imageRotateDeg == 0) {
        // Apply display resize if configured
        if (_displayWidth > 0 && _displayHeight > 0 &&
            (currentImage.cols != _displayWidth || currentImage.rows != _displayHeight)) {
            cv::Mat displayResult;
            cv::resize(currentImage, displayResult, cv::Size(_displayWidth, _displayHeight),
                0, 0, cv::INTER_LINEAR);
            currentImage = displayResult;
            imageW = _displayWidth;
            imageH = _displayHeight;
        }
        _imageWidth = imageW;
        _imageHeight = imageH;
        width = imageW;
        height = imageH;
        _pLastFrame = currentImage; // Shallow copy (reference counted)
        return currentImage;
    }

    // Handle rotation case.
    // Calculate proper rotated dimensions for 90/270 degree rotations.
    int rotatedWidth, rotatedHeight;
    double absAngle = std::abs(_imageRotateDeg);

    if (absAngle == 90.0 || absAngle == 270.0) {
        // Swap dimensions for 90/270 degree rotations
        rotatedWidth = imageH;
        rotatedHeight = imageW;
    }
    else {
        rotatedWidth = imageW;
        rotatedHeight = imageH;
    }

    // Calculate rotation matrix
    cv::Point2f center(imageW / 2.0f, imageH / 2.0f);
    cv::Mat rotationMatrix = cv::getRotationMatrix2D(center, _imageRotateDeg, 1.0);

    // Apply rotation with optimized interpolation
    cv::Mat rotatedImage;
    cv::warpAffine(currentImage, rotatedImage, rotationMatrix,
        cv::Size(rotatedWidth, rotatedHeight),
        cv::INTER_LINEAR, // Faster than INTER_CUBIC, still good quality
        cv::BORDER_CONSTANT,
        cv::Scalar(0, 0, 0));

    // Apply display resize if configured
    if (_displayWidth > 0 && _displayHeight > 0 &&
        (rotatedImage.cols != _displayWidth || rotatedImage.rows != _displayHeight)) {
        cv::Mat displayResult;
        cv::resize(rotatedImage, displayResult, cv::Size(_displayWidth, _displayHeight),
            0, 0, cv::INTER_LINEAR);
        rotatedImage = displayResult;
        rotatedWidth = _displayWidth;
        rotatedHeight = _displayHeight;
    }

    // Update dimensions - use calculated dimensions, not cols/rows
    _imageWidth = rotatedWidth;
    _imageHeight = rotatedHeight;
    width = rotatedWidth;
    height = rotatedHeight;

    // Store and return rotated image
    _pLastFrame = rotatedImage;
    return rotatedImage;
}
|
|
//cv::Mat ANSRTSPClient::GetImage(int& width, int& height, int64_t& pts) {
|
|
// std::lock_guard<std::recursive_mutex> lock(_mutex);
|
|
// // Return last known frame if not playing
|
|
// if (!_isPlaying) {
|
|
// width = _imageWidth;
|
|
// height = _imageHeight;
|
|
// pts = _pts;
|
|
// return _pLastFrame;
|
|
// }
|
|
|
|
// int imageW = 0, imageH = 0;
|
|
// int64_t currentPts = 0;
|
|
// cv::Mat currentImage;
|
|
// currentImage = _playerClient->getImage(imageW, imageH, currentPts);
|
|
|
|
// // If we still don't have a valid image, return last frame
|
|
// if (currentImage.empty() || areImagesIdentical(currentImage, _pLastFrame)) {
|
|
// width = _imageWidth;
|
|
// height = _imageHeight;
|
|
// pts = _pts;
|
|
// return _pLastFrame;
|
|
// }
|
|
|
|
// // Update internal state
|
|
// width = imageW;
|
|
// height = imageH;
|
|
// pts = currentPts;
|
|
// _pts = currentPts;
|
|
// _imageWidth = imageW;
|
|
// _imageHeight = imageH;
|
|
|
|
// if (_imageRotateDeg == 0) {
|
|
// _pLastFrame = currentImage;
|
|
// return currentImage;
|
|
// }
|
|
// else {
|
|
// // Rotate image if required
|
|
// cv::Point2f center(width / 2.0f, height / 2.0f);
|
|
// cv::Mat rotationMatrix = cv::getRotationMatrix2D(center, _imageRotateDeg, 1.0);
|
|
// cv::Mat rotatedImage;
|
|
|
|
// cv::warpAffine(currentImage, rotatedImage, rotationMatrix,
|
|
// cv::Size(width, height), cv::INTER_CUBIC,
|
|
// cv::BORDER_CONSTANT, cv::Scalar());
|
|
|
|
// // Update dimensions and store result
|
|
// width = rotatedImage.cols;
|
|
// height = rotatedImage.rows;
|
|
// _imageWidth = width;
|
|
// _imageHeight = height;
|
|
|
|
// _pLastFrame = rotatedImage;
|
|
// return rotatedImage;
|
|
// }
|
|
//}
|
|
void ANSRTSPClient::SetMaxHWDecoders(int maxDecoders) {
|
|
if (maxDecoders >= 0) {
|
|
g_hw_decoder_max = static_cast<uint32>(maxDecoders);
|
|
}
|
|
}
|
|
// Estimate max NVDEC decode sessions from SM count and VRAM.
|
|
// NVDEC has NO driver-enforced session limit (unlike NVENC which caps at 3-5).
|
|
// Per NVIDIA Video Codec SDK 13.0 docs: "NVDEC natively supports multiple
|
|
// hardware decoding contexts with negligible context-switching penalty."
|
|
//
|
|
// Two factors determine the soft cap:
|
|
// 1. SM count — proxy for decode throughput (more SMs = more parallel capacity)
|
|
// 2. VRAM — each 4K HEVC session uses ~80MB (decode surfaces + buffers)
|
|
// We reserve 1GB for OS/display/inference workloads.
|
|
// The lower of the two estimates is used.
|
|
//
|
|
// GPU Example | SMs | VRAM | SM-cap | VRAM-cap | NVDEC | Final
|
|
// -------------------------|-----|-------|--------|----------|-------|------
|
|
// RTX 5090 (Blackwell) | 170 | 32GB | 64 | 393 | 32 | 32
|
|
// RTX 5080 (Blackwell) | 84 | 16GB | 48 | 192 | 32 | 32
|
|
// RTX 4090 (Ada) | 128 | 24GB | 64 | 294 | 32 | 32
|
|
// RTX 4070 Laptop (Ada) | 36 | 8GB | 32 | 89 | 32 | 32
|
|
// RTX 3060 (Ampere) | 28 | 12GB | 16 | 140 | 32 | 16
|
|
// GTX 1650 (Turing) | 14 | 4GB | 8 | 38 | 32 | 8
|
|
// Estimate max decode sessions from SM count and VRAM.
|
|
// Uses the minimum of THREE estimates to avoid overloading any subsystem.
|
|
//
|
|
// SM-based estimate: proxy for decode throughput
|
|
// VRAM-based estimate: each 4K HEVC session uses ~80MB (decode surfaces + context)
|
|
// Reserve 1GB for OS/display/AI inference, rest available for decode sessions.
|
|
// NVDEC hardware cap: NVDEC is a fixed-function ASIC independent of SM count.
|
|
// NVIDIA consumer GPUs (GeForce) have a hardware surface pool limit of
|
|
// ~32 concurrent decode sessions regardless of GPU tier. Exceeding this
|
|
// causes CUVID Error 205 ("Error mapping a picture"), which triggers a
|
|
// sticky CUDA_ERROR_ILLEGAL_ADDRESS that permanently corrupts the CUDA
|
|
// context — crashing ALL GPU operations (inference, display, etc.).
|
|
// Hardware surface-pool ceiling for consumer NVDEC: exceeding ~32 concurrent
// decode sessions triggers CUVID Error 205 and a sticky context corruption
// (see the table and notes above).
static constexpr int NVDEC_HW_SESSION_CAP = 32;

// Estimate the soft cap on concurrent decode sessions from SM count and
// total VRAM, then clamp to the NVDEC hardware ceiling.
//   - SM tier:   proxy for decode throughput (more SMs → more capacity)
//   - VRAM tier: ~80MB per 4K HEVC session after reserving 1GB for
//     OS/display/inference (floor of 4 sessions)
// The lower of the two estimates wins, never exceeding NVDEC_HW_SESSION_CAP.
static int estimateMaxSessions(int smCount, size_t totalVramBytes) {
    // Tiered throughput estimate from the SM count.
    int byThroughput = 8;
    if (smCount >= 80)      byThroughput = 64;
    else if (smCount >= 50) byThroughput = 48;
    else if (smCount >= 30) byThroughput = 32;
    else if (smCount >= 20) byThroughput = 16;

    // Memory estimate: budget after the 1GB reservation, 80MB per session.
    const size_t vramMB = totalVramBytes / (1024 * 1024);
    const size_t reservedMB = 1024;
    const size_t budgetMB = (vramMB > reservedMB) ? (vramMB - reservedMB) : 256;
    int byMemory = static_cast<int>(budgetMB / 80);
    if (byMemory < 4) byMemory = 4;

    // Take the stricter bound, then apply the hardware surface-pool cap.
    int sessions = (byThroughput < byMemory) ? byThroughput : byMemory;
    return (sessions > NVDEC_HW_SESSION_CAP) ? NVDEC_HW_SESSION_CAP : sessions;
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Platform-specific GPU detection for non-NVIDIA systems (Intel/AMD)
|
|
// Estimates max VAAPI/D3D11VA decode sessions based on vendor and VRAM.
|
|
//
|
|
// Vendor IDs: 0x10DE = NVIDIA, 0x8086 = Intel, 0x1002 = AMD
|
|
//
|
|
// Intel Quick Sync (QSV) — via D3D11VA (Windows) or VAAPI (Linux):
|
|
// - Modern Intel (12th gen+, Iris Xe/Arc): ~16 concurrent sessions
|
|
// - Older Intel (8th-11th gen, UHD 630): ~8 concurrent sessions
|
|
//
|
|
// AMD VCN (Video Core Next) — via D3D11VA (Windows) or VAAPI (Linux):
|
|
// - RDNA 2/3 (RX 6000/7000, Ryzen 7000 iGPU): ~16 concurrent sessions
|
|
// - Older Vega/Polaris: ~8 concurrent sessions
|
|
//
|
|
// Neither Intel nor AMD publish hard session limits for decode.
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// Helper: estimate max sessions from vendor ID and available VRAM
|
|
// Estimate concurrent decode sessions for a GPU identified only by PCI
// vendor ID and memory sizes (used when the CUDA probe is unavailable).
//   0x10DE NVIDIA: VRAM-derived (80MB/session after 1GB reserve), clamped 8..64
//   0x8086 Intel:  Quick Sync tiers; iGPUs use shared memory / 4 as a proxy
//   0x1002 AMD:    VCN tiers by dedicated VRAM
//   other:         conservative default of 4
static int estimateSessionsByVendor(uint32_t vendorId, size_t dedicatedMB, size_t sharedMB) {
    switch (vendorId) {
    case 0x10DE: { // NVIDIA — CUDA path failed, estimate from VRAM
        const size_t budgetMB = (dedicatedMB > 1024) ? (dedicatedMB - 1024) : 512;
        int sessions = static_cast<int>(budgetMB / 80);
        if (sessions < 8) sessions = 8;
        if (sessions > 64) sessions = 64;
        return sessions;
    }
    case 0x8086: { // Intel — Quick Sync via D3D11VA/VAAPI
        // iGPUs have little/no dedicated VRAM; use shared memory as proxy.
        const size_t effectiveMB = (dedicatedMB > 0) ? dedicatedMB : (sharedMB / 4);
        if (effectiveMB >= 2048) return 16; // Arc A770/A750 (discrete)
        return (effectiveMB >= 512) ? 12    // Iris Xe, 12th gen+
                                    : 8;    // UHD 630, older
    }
    case 0x1002: // AMD — VCN via D3D11VA/VAAPI
        if (dedicatedMB >= 4096) return 32;      // RX 6000/7000 discrete
        return (dedicatedMB >= 1024) ? 16        // RX 6500, older discrete
                                     : 8;        // Ryzen iGPU (Vega/RDNA)
    default:
        return 4; // Unknown vendor
    }
}
|
|
|
|
// Map a PCI vendor ID to a human-readable GPU vendor name for log output.
static const char* vendorIdToName(uint32_t vendorId) {
    switch (vendorId) {
    case 0x10DE: return "NVIDIA";
    case 0x8086: return "Intel";
    case 0x1002: return "AMD";
    default:     return "Unknown";
    }
}
|
|
|
|
#if defined(_WIN32)
|
|
// ---- Windows: enumerate GPUs via DXGI ----
|
|
// ---- Windows: enumerate GPUs via DXGI ----
// Sums the per-adapter session estimates (estimateSessionsByVendor) across
// all hardware adapters and stores the total in g_hw_decoder_max.
// Falls back to 4 sessions when DXGI is unavailable or no hardware GPU exists.
// Returns the configured session count.
static int AutoConfigureHWDecoders_Platform() {
    IDXGIFactory1* pFactory = nullptr;
    HRESULT hr = CreateDXGIFactory1(__uuidof(IDXGIFactory1), (void**)&pFactory);
    if (FAILED(hr) || !pFactory) {
        g_hw_decoder_max = 4;
        // NOTE(review): g_hw_decoder_max is a uint32 printed with %d — works in
        // practice for small values, but %u would be the matching specifier.
        fprintf(stderr, "[HWDecode] AutoConfigure: DXGI unavailable, defaulting to %d sessions\n",
            g_hw_decoder_max);
        return g_hw_decoder_max;
    }

    IDXGIAdapter1* pAdapter = nullptr;
    int totalSessions = 0;
    bool foundHwGpu = false;

    for (UINT i = 0; pFactory->EnumAdapters1(i, &pAdapter) != DXGI_ERROR_NOT_FOUND; i++) {
        DXGI_ADAPTER_DESC1 desc;
        pAdapter->GetDesc1(&desc);
        // Released immediately: only the description is needed, not the adapter.
        pAdapter->Release();

        // Skip software/remote adapters
        if (desc.Flags & DXGI_ADAPTER_FLAG_SOFTWARE) continue;

        // Narrow the wide adapter name for logging.
        // NOTE(review): wcstombs is locale-dependent and can fail on non-ASCII
        // names (returns (size_t)-1, buffer left as-is) — log-only impact.
        char gpuName[128] = {};
        wcstombs(gpuName, desc.Description, sizeof(gpuName) - 1);

        uint32_t vendorId = desc.VendorId;
        size_t dedicatedMB = desc.DedicatedVideoMemory / (1024 * 1024);
        size_t sharedMB = desc.SharedSystemMemory / (1024 * 1024);

        int maxSessions = estimateSessionsByVendor(vendorId, dedicatedMB, sharedMB);

        fprintf(stderr, "[HWDecode] AutoConfigure: GPU[%d] \"%s\" Vendor=%s(0x%04X) VRAM=%zuMB Shared=%zuMB -> max %d decode sessions\n",
            i, gpuName, vendorIdToName(vendorId), vendorId, dedicatedMB, sharedMB, maxSessions);

        totalSessions += maxSessions;
        foundHwGpu = true;
    }

    pFactory->Release();

    if (!foundHwGpu) {
        g_hw_decoder_max = 4;
        fprintf(stderr, "[HWDecode] AutoConfigure: No hardware GPU found, defaulting to %d sessions (software fallback)\n",
            g_hw_decoder_max);
        return g_hw_decoder_max;
    }

    g_hw_decoder_max = static_cast<uint32>(totalSessions);
    fprintf(stderr, "[HWDecode] AutoConfigure: Total %d decode sessions across all GPUs\n", totalSessions);
    return totalSessions;
}
|
|
|
|
#elif defined(__linux__)
|
|
// ---- Linux: enumerate GPUs via /sys/class/drm and /proc/driver ----
// Reads vendor ID from sysfs and estimates VRAM from DRM memory info.
// Returns the total estimated VAAPI decode-session budget across all
// detected GPUs and stores it in g_hw_decoder_max; falls back to 4
// sessions when no hardware GPU is found under /sys/class/drm.
static int AutoConfigureHWDecoders_Platform() {
    int totalSessions = 0;
    bool foundHwGpu = false;
    int gpuIndex = 0;

    // Scan /sys/class/drm/card0, card1, ... for GPU info.
    // All 16 slots are probed because DRM card numbering can be sparse;
    // a missing card simply fails the ifstream open below and is skipped.
    for (int cardNum = 0; cardNum < 16; cardNum++) {
        // Read vendor ID from PCI device info
        char vendorPath[256];
        snprintf(vendorPath, sizeof(vendorPath),
            "/sys/class/drm/card%d/device/vendor", cardNum);

        std::ifstream vendorFile(vendorPath);
        if (!vendorFile.is_open()) continue;

        uint32_t vendorId = 0;
        // File contains a hex value like "0x8086"
        vendorFile >> std::hex >> vendorId;
        vendorFile.close();

        // Skip non-GPU devices (vendor 0 = invalid)
        if (vendorId == 0) continue;

        // Try to get GPU name from /sys/class/drm/cardN/device/label or
        // fall back to /sys/class/drm/cardN/device/uevent — in practice only
        // the PCI slot name is available, which is used as a display label.
        char gpuName[128] = "Unknown GPU";
        char ueventPath[256];
        snprintf(ueventPath, sizeof(ueventPath),
            "/sys/class/drm/card%d/device/uevent", cardNum);
        std::ifstream ueventFile(ueventPath);
        if (ueventFile.is_open()) {
            std::string line;
            while (std::getline(ueventFile, line)) {
                if (line.find("PCI_SLOT_NAME=") == 0) {
                    // 14 == strlen("PCI_SLOT_NAME=")
                    snprintf(gpuName, sizeof(gpuName), "PCI %s", line.substr(14).c_str());
                    break;
                }
            }
            ueventFile.close();
        }

        // Try to read VRAM size from DRM memory info
        // Intel iGPU: /sys/class/drm/card0/device/resource reports BAR sizes
        // Discrete GPU: reported via resource file (BAR 0 size)
        size_t dedicatedMB = 0;
        size_t sharedMB = 0;

        // Read BAR 0 from PCI resource file to estimate VRAM
        // NOTE(review): BAR 0 is only a rough proxy for VRAM — on GPUs
        // without Resizable BAR it understates the real VRAM size.
        char resourcePath[256];
        snprintf(resourcePath, sizeof(resourcePath),
            "/sys/class/drm/card%d/device/resource", cardNum);
        std::ifstream resourceFile(resourcePath);
        if (resourceFile.is_open()) {
            std::string line;
            if (std::getline(resourceFile, line)) {
                // Format: "start end flags" — BAR 0 (usually VRAM)
                unsigned long long start = 0, end = 0;
                if (sscanf(line.c_str(), "%llx %llx", &start, &end) == 2 && end > start) {
                    dedicatedMB = (end - start + 1) / (1024 * 1024);
                }
            }
            resourceFile.close();
        }

        // For Intel iGPU, BAR size is small (~256MB) but system memory is
        // shared. Use total system RAM (from /proc/meminfo) as the
        // shared-memory estimate; estimateSessionsByVendor scales it.
        if (vendorId == 0x8086 && dedicatedMB < 512) {
            std::ifstream meminfo("/proc/meminfo");
            if (meminfo.is_open()) {
                std::string line;
                while (std::getline(meminfo, line)) {
                    if (line.find("MemTotal:") == 0) {
                        unsigned long totalKB = 0;
                        // If parsing fails, totalKB stays 0 → sharedMB = 0 (safe)
                        sscanf(line.c_str(), "MemTotal: %lu", &totalKB);
                        sharedMB = totalKB / 1024; // Total system RAM in MB
                        break;
                    }
                }
                meminfo.close();
            }
        }

        int maxSessions = estimateSessionsByVendor(vendorId, dedicatedMB, sharedMB);

        fprintf(stderr, "[HWDecode] AutoConfigure: GPU[%d] \"%s\" Vendor=%s(0x%04X) VRAM=%zuMB Shared=%zuMB -> max %d VAAPI decode sessions\n",
            gpuIndex, gpuName, vendorIdToName(vendorId), vendorId, dedicatedMB, sharedMB, maxSessions);

        totalSessions += maxSessions;
        foundHwGpu = true;
        gpuIndex++;
    }

    if (!foundHwGpu) {
        g_hw_decoder_max = 4;
        fprintf(stderr, "[HWDecode] AutoConfigure: No hardware GPU found in /sys/class/drm, defaulting to %d sessions (software fallback)\n",
            g_hw_decoder_max);
        return g_hw_decoder_max;
    }

    g_hw_decoder_max = static_cast<uint32>(totalSessions);
    fprintf(stderr, "[HWDecode] AutoConfigure: Total %d VAAPI decode sessions across all GPUs\n", totalSessions);
    return totalSessions;
}
|
|
|
|
#else
|
|
// ---- Unsupported platform: conservative default ----
|
|
static int AutoConfigureHWDecoders_Platform() {
|
|
g_hw_decoder_max = 4;
|
|
fprintf(stderr, "[HWDecode] AutoConfigure: Unsupported platform, defaulting to %d sessions\n",
|
|
g_hw_decoder_max);
|
|
return g_hw_decoder_max;
|
|
}
|
|
#endif
|
|
|
|
int ANSRTSPClient::AutoConfigureHWDecoders(int maxPerGpuOverride) {
|
|
// Skip the CUDA probe entirely on non-NVIDIA hardware — the Platform
|
|
// fallback (DXGI on Windows, sysfs on Linux) handles Intel/AMD auto
|
|
// configuration, and calling cudaGetDeviceCount() on AMD wakes up
|
|
// cudart_static for no benefit. See ANSCVVendorGate.h.
|
|
if (!anscv_vendor_gate::IsNvidiaGpuAvailable()) {
|
|
return AutoConfigureHWDecoders_Platform();
|
|
}
|
|
|
|
int gpuCount = 0;
|
|
cudaError_t err = cudaGetDeviceCount(&gpuCount);
|
|
if (err != cudaSuccess || gpuCount <= 0) {
|
|
// No NVIDIA GPU — detect Intel/AMD via platform API (DXGI on Windows, sysfs on Linux)
|
|
return AutoConfigureHWDecoders_Platform();
|
|
}
|
|
|
|
// Detect each GPU's SM count and set per-GPU session limits
|
|
std::vector<int> maxPerGpuList(gpuCount);
|
|
int total = 0;
|
|
|
|
for (int i = 0; i < gpuCount; i++) {
|
|
cudaDeviceProp prop;
|
|
cudaGetDeviceProperties(&prop, i);
|
|
int smCount = prop.multiProcessorCount;
|
|
|
|
if (maxPerGpuOverride > 0) {
|
|
// Manual override: same limit for all GPUs
|
|
maxPerGpuList[i] = maxPerGpuOverride;
|
|
} else {
|
|
// Auto-detect based on SM count + VRAM
|
|
maxPerGpuList[i] = estimateMaxSessions(smCount, prop.totalGlobalMem);
|
|
}
|
|
|
|
total += maxPerGpuList[i];
|
|
size_t vramMB = prop.totalGlobalMem / (1024 * 1024);
|
|
fprintf(stderr, "[NVDEC] AutoConfigure: GPU[%d] \"%s\" SM=%d VRAM=%zuMB -> max %d decode sessions\n",
|
|
i, prop.name, smCount, vramMB, maxPerGpuList[i]);
|
|
}
|
|
|
|
// Configure the per-GPU pool with individual limits
|
|
HWDecoderPool::instance().configure(maxPerGpuList);
|
|
|
|
return total;
|
|
}
|
|
void ANSRTSPClient::SetHWDecoding(int hwMode, int preferredGpu) {
|
|
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
|
_playerClient->setHWDecoding(hwMode, preferredGpu);
|
|
}
|
|
bool ANSRTSPClient::IsHWDecodingActive() {
|
|
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
|
return _playerClient->isHWDecodingActive();
|
|
}
|
|
int ANSRTSPClient::GetHWDecodingGpuIndex() {
|
|
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
|
return _playerClient->getHWDecodingGpuIndex();
|
|
}
|
|
void ANSRTSPClient::SetDisplayResolution(int width, int height) {
|
|
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
|
_displayWidth = width;
|
|
_displayHeight = height;
|
|
}
|
|
void ANSRTSPClient::SetImageQuality(int mode) {
|
|
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
|
_playerClient->setImageQuality(mode); // 0=fast (AI), 1=quality (display)
|
|
}
|
|
void ANSRTSPClient::SetTargetFPS(double intervalMs) {
|
|
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
|
_playerClient->setTargetFPS(intervalMs); // 0=no limit, 100=~10FPS, 200=~5FPS
|
|
}
|
|
void ANSRTSPClient::SetNV12FastPath(bool enable) {
|
|
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
|
_useNV12FastPath = enable;
|
|
}
|
|
double ANSRTSPClient::GetLastFrameAgeMs() {
|
|
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
|
return _playerClient->getLastFrameAgeMs();
|
|
}
|
|
AVFrame* ANSRTSPClient::GetNV12Frame() {
|
|
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
|
if (!_isPlaying) return nullptr; // Player may be mid-reconnect (CUDA resources freed)
|
|
return _playerClient->getNV12Frame(); // Returns clone, caller must av_frame_free
|
|
}
|
|
AVFrame* ANSRTSPClient::GetCudaHWFrame() {
|
|
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
|
if (!_isPlaying) return nullptr; // Player may be mid-reconnect (CUDA resources freed)
|
|
return _playerClient->getCudaHWFrame();
|
|
}
|
|
bool ANSRTSPClient::IsCudaHWAccel() {
|
|
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
|
return _playerClient->isCudaHWAccel();
|
|
}
|
|
// JPEG-encodes a BGR cv::Mat into a binary std::string.
// Returns "" for empty/detached Mats, on encode failure, or on exception.
std::string ANSRTSPClient::MatToBinaryData(const cv::Mat& image) {
    std::scoped_lock guard(_mutex);

    // Guard clauses: reject empty or detached Mats up front
    // (original nested if/else returned "" for the same cases).
    if (image.empty() || image.data == nullptr || image.u == nullptr) {
        return "";
    }

    try {
        // Encode the image into an in-memory JPEG buffer
        std::vector<uchar> encoded;
        if (!cv::imencode(".jpg", image, encoded)) {
            this->_logger.LogError("ANSRTSPClient::MatToBinaryData. Error:", "Failed to encode the image.", __FILE__, __LINE__);
            return "";
        }
        return std::string(encoded.begin(), encoded.end());
    }
    catch (const std::exception& e) {
        this->_logger.LogFatal("ANSRTSPClient::MatToBinaryData. Error:", e.what(), __FILE__, __LINE__);
        return "";
    }
    catch (...) {
        this->_logger.LogFatal("ANSRTSPClient::MatToBinaryData. Error:", "Caught unknown exception!", __FILE__, __LINE__);
        return "";
    }
}
|
|
|
|
}
|
|
|
|
// Creates and initializes an ANSRTSPClient, registers it with the global
// handle registry (so leaked handles can be force-released), and returns
// it via *Handle. Returns 1 on success, 0 on Init failure, -1 on bad args
// or exception.
extern "C" __declspec(dllexport) int CreateANSRTSPHandle(ANSCENTER::ANSRTSPClient * *Handle, const char* licenseKey, const char* username, const char* password, const char* url) {
    ANS_DBG("RTSP_Lifecycle", "CREATE: url=%s", url ? url : "null");
    if (!Handle || !licenseKey || !url) return -1;
    try {
        auto ptr = std::make_unique<ANSCENTER::ANSRTSPClient>();
        std::string _username = username ? username : "";
        std::string _password = password ? password : "";
        bool result = false;
        if (_username.empty() && _password.empty()) result = ptr->Init(licenseKey, url);
        // BUGFIX: pass the sanitized copies, not the raw pointers — if only
        // one of username/password was supplied, the other raw pointer may
        // be null and was previously forwarded to Init unchecked.
        else result = ptr->Init(licenseKey, _username.c_str(), _password.c_str(), url);
        if (result) {
            // Software decode by default — saves VRAM (no NVDEC DPB surfaces).
            // With 100 cameras, HW decode would consume ~5-21 GB VRAM for idle decoders.
            // User can enable HW decode per-camera via SetRTSPHWDecoding(handle, 7).
            // ptr->SetHWDecoding(7); // Disabled — was HW_DECODING_CUDA
            *Handle = ptr.release();
            extern void anscv_unregister_handle(void*);
            extern void anscv_register_handle(void*, void(*)(void*));
            // Registry teardown callback mirrors ReleaseANSRTSPHandle:
            // Stop -> Destroy -> delete, each best-effort.
            anscv_register_handle(*Handle, [](void* p) {
                auto* h = static_cast<ANSCENTER::ANSRTSPClient*>(p);
                try { h->Stop(); } catch (...) {}
                try { h->Destroy(); } catch (...) {}
                try { delete h; } catch (...) {}
            });
            return 1;
        }
        *Handle = nullptr;
        return 0;
    } catch (...) { return -1; }
}
|
|
// Tears down a camera handle created by CreateANSRTSPHandle.
// Order is deliberate and must not change: unregister from the global
// registry, null the caller's handle, then Stop -> Destroy -> delete
// synchronously. Returns 0 on success, -1 on bad handle or exception
// (the caller's handle is nulled in either case).
extern "C" __declspec(dllexport) int ReleaseANSRTSPHandle(ANSCENTER::ANSRTSPClient * *Handle) {
    ANS_DBG("RTSP_Lifecycle", "RELEASE: handle=%p", Handle ? (void*)*Handle : nullptr);
    if (Handle == nullptr || *Handle == nullptr) return -1;
    try {
        // Unregister first so the registry's teardown callback can never
        // race with (or double-free after) this explicit release.
        extern void anscv_unregister_handle(void*);
        anscv_unregister_handle(*Handle);

        // Grab the raw pointer and NULL the caller's handle immediately.
        // This prevents the caller (LabVIEW) from issuing new calls.
        ANSCENTER::ANSRTSPClient* raw = *Handle;
        *Handle = nullptr;

        // Mark as not-playing under _mutex ONLY. This makes
        // GetImage()/GetNV12Frame()/GetCudaHWFrame() return empty/null
        // on any subsequent call, and prevents NEW NV12 GPU surface
        // pointers from being handed out.
        //
        // Synchronous cleanup — ensures all GPU resources (NVDEC surfaces, VRAM)
        // are fully released BEFORE LabVIEW creates a new camera.
        // Previously deferred to a background thread, but that caused the old
        // camera's resources to overlap with the new camera's allocations,
        // leading to temporary VRAM doubling (~240MB per camera) and eventual
        // VRAM exhaustion on cameras with frequent reconnects.
        {
            // Per-phase timing so slow teardowns show up in the debug log.
            auto t0 = std::chrono::steady_clock::now();
            raw->Stop();
            auto t1 = std::chrono::steady_clock::now();
            raw->Destroy();
            auto t2 = std::chrono::steady_clock::now();
            delete raw;
            auto t3 = std::chrono::steady_clock::now();

            double stopMs = std::chrono::duration<double, std::milli>(t1 - t0).count();
            double destroyMs = std::chrono::duration<double, std::milli>(t2 - t1).count();
            double deleteMs = std::chrono::duration<double, std::milli>(t3 - t2).count();
            ANS_DBG("RTSP_Lifecycle", "RELEASE complete: stop=%.1fms destroy=%.1fms delete=%.1fms total=%.1fms",
                stopMs, destroyMs, deleteMs, stopMs + destroyMs + deleteMs);
        }

        return 0;
    } catch (...) {
        // Even on failure, never hand the (possibly half-destroyed)
        // pointer back to the caller.
        if (Handle) *Handle = nullptr;
        return -1;
    }
}
|
|
// Fetches the latest frame as a JPEG byte string (std::string out-param).
// Returns 1 when a frame was produced, 0 when none/error, -1 on bad handle.
extern "C" __declspec(dllexport) int GetRTSPStrImage(ANSCENTER::ANSRTSPClient * *Handle, int& width, int& height, int64_t & timeStamp, std::string & jpegImage) {
    if (Handle == nullptr || *Handle == nullptr) return -1;
    try {
        jpegImage = (*Handle)->GetJpegImage(width, height, timeStamp);
        return jpegImage.empty() ? 0 : 1;
    }
    catch (const std::exception& e) {
        std::cerr << "Error getting RTSP image: " << e.what() << std::endl;
        return 0;
    }
    catch (...) {
        std::cerr << "Error getting RTSP image: Unknown exception." << std::endl;
        return 0;
    }
}
|
|
// Fetches the latest decoded frame into a caller-owned cv::Mat* slot and,
// when the NV12 fast path is enabled, attaches GPU-side NV12 frame data to
// that Mat for zero-copy inference. Returns 1 on success, 0 when no frame
// is available, -1 on bad args, -2 on exception.
//
// In-flight guard contract: TryIncrementInFlight() is balanced either by
// the registry's onReleaseFn callback (when GPU data was attached) or by
// the explicit DecrementInFlight() on the no-attach branch below.
extern "C" __declspec(dllexport) int GetRTSPCVImage(
    ANSCENTER::ANSRTSPClient** Handle,
    int& width,
    int& height,
    int64_t& timeStamp,
    cv::Mat** image)
{
    // Validate input parameters
    if (!Handle || !*Handle || !image) {
        std::cerr << "Error: Invalid input parameters in GetRTSPCVImage" << std::endl;
        return -1;
    }

    try {
        auto t0 = std::chrono::steady_clock::now();

        // Get image (shallow copy - reference counted, fast)
        cv::Mat img = (*Handle)->GetImage(width, height, timeStamp);

        // Check if valid image was retrieved
        if (img.empty()) {
            return 0; // No valid image available
        }

        // Thread-safe Mat pointer swap (anscv_mat_replace has its own internal lock)
        anscv_mat_replace(image, std::move(img));

        auto t1 = std::chrono::steady_clock::now();

        // NV12 GPU fast path: attach NV12 frame data for zero-copy inference.
        // When disabled (_useNV12FastPath=false), the original stable CPU path is used:
        // GetImage() returns BGR cv::Mat in CPU RAM → no CUDA calls → no SRW lock contention.
        // When enabled, D2D copies NV12 from NVDEC to pool buffers for GPU inference.
        if ((*Handle)->IsNV12FastPath()) {
            int gpuIdx = (*Handle)->GetHWDecodingGpuIndex();
            bool inFlightGuardHeld = (*Handle)->TryIncrementInFlight();
            RTSP_DBG("[GetRTSPCVImage] mat=%p gpuIdx=%d inFlightGuard=%d",
                (void*)*image, gpuIdx, (int)inFlightGuardHeld);

            if (inFlightGuardHeld) {
                AVFrame* cudaHW = (*Handle)->GetCudaHWFrame();
                if (cudaHW) {
                    RTSP_DBG("[GetRTSPCVImage] cudaHW: %dx%d data[0]=%p data[1]=%p",
                        cudaHW->width, cudaHW->height,
                        (void*)cudaHW->data[0], (void*)cudaHW->data[1]);

                    // Acquire a slot from the global pool — survives camera Destroy.
                    GpuNV12Slot* slot = GpuNV12SlotPool::instance().acquire(
                        gpuIdx, cudaHW->width, cudaHW->height);

                    // Only fetch CPU NV12 if pool slot unavailable (cross-GPU fallback).
                    AVFrame* cpuNV12 = slot ? nullptr : (*Handle)->GetNV12Frame();
                    gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12, slot);
                } else {
                    // HW decode not active — try CPU NV12
                    AVFrame* nv12 = (*Handle)->GetNV12Frame();
                    if (nv12) {
                        gpu_frame_attach(*image, nv12, gpuIdx, timeStamp);
                    }
                }

                // Wire up the registry callback to release the in-flight guard.
                auto* gpuData = ANSGpuFrameRegistry::instance().lookup(*image);
                RTSP_DBG("[GetRTSPCVImage] after attach: gpuData=%p yPlane=%p isCuda=%d poolSlot=%p",
                    (void*)gpuData,
                    gpuData ? (void*)gpuData->yPlane : nullptr,
                    gpuData ? (int)gpuData->isCudaDevicePtr : -1,
                    gpuData ? (void*)gpuData->poolSlot : nullptr);
                if (gpuData) {
                    gpuData->ownerClient = *Handle;
                    gpuData->onReleaseFn = [](void* client) {
                        static_cast<ANSCENTER::ANSRTSPClient*>(client)->DecrementInFlight();
                    };
                } else {
                    // No GPU data was attached — release the guard here so
                    // the in-flight counter stays balanced.
                    (*Handle)->DecrementInFlight();
                }
            } else {
                RTSP_DBG("[GetRTSPCVImage] SKIP CUDA — player not playing (reconnecting?)");
            }
        }
        // else: original CPU path — cv::Mat** contains BGR data in CPU RAM.
        // No CUDA calls, no pool slots, no GPU frame registry.
        // Inference uses cv::Mat directly (upload to GPU in engine).

        // Lightweight timing — logs only when frame grab + D2D exceeds 500ms
        // (the threshold checked below).
        // Goes to both spdlog (console/file) AND OutputDebugString (DebugView)
        // but ONLY for slow frames, so the overhead is negligible (<1 call/sec
        // under normal conditions vs 500+/sec with full debug logging).
        auto t2 = std::chrono::steady_clock::now();
        double getImageMs = std::chrono::duration<double, std::milli>(t1 - t0).count();
        double cudaMs = std::chrono::duration<double, std::milli>(t2 - t1).count();
        double totalMs = getImageMs + cudaMs;
        if (totalMs > 500.0) {
            auto msg = std::format("SLOW FRAME: total={:.1f}ms (getImage={:.1f}ms cuda={:.1f}ms) {}x{}",
                totalMs, getImageMs, cudaMs, width, height);
            (*Handle)->_logger.LogWarn("GetRTSPCVImage", msg, __FILE__, __LINE__);
#ifdef _WIN32
            auto dbg = std::format("[GetRTSPCVImage] {}\n", msg);
            OutputDebugStringA(dbg.c_str());
#endif
        }

        return 1; // Success
    }
    catch (const cv::Exception& e) {
        std::cerr << "OpenCV exception in GetRTSPCVImage: " << e.what() << std::endl;
        return -2;
    }
    catch (const std::exception& e) {
        std::cerr << "Exception in GetRTSPCVImage: " << e.what() << std::endl;
        return -2;
    }
    catch (...) {
        std::cerr << "Unknown exception in GetRTSPCVImage" << std::endl;
        return -2;
    }
}
|
|
//extern "C" __declspec(dllexport) int GetRTSPCVImage(
|
|
// ANSCENTER::ANSRTSPClient** Handle,
|
|
// int& width,
|
|
// int& height,
|
|
// int64_t& timeStamp,
|
|
// cv::Mat** image)
|
|
//{
|
|
// // Validate input parameters
|
|
// if (!Handle || !(*Handle) || !image) {
|
|
// std::cerr << "Error: Invalid input parameters in GetRTSPCVImage." << std::endl;
|
|
// return -1; // Error code for invalid parameters
|
|
// }
|
|
//
|
|
// try {
|
|
// // Ensure thread safety before calling GetImage
|
|
// std::lock_guard<std::mutex> lock(rtspMutex);
|
|
//
|
|
// // Get image (shallow copy - fast, reference counted)
|
|
// cv::Mat img = (*Handle)->GetImage(width, height, timeStamp);
|
|
//
|
|
// // Check if valid image was retrieved
|
|
// if (img.empty()) {
|
|
// return 0; // No valid image available
|
|
// }
|
|
//
|
|
// // Exception-safe allocation: create new image BEFORE deleting old one
|
|
// // This ensures if clone() throws, we don't lose the old image pointer
|
|
// cv::Mat* newImage = new cv::Mat(img.clone()); // Single deep copy at DLL boundary
|
|
//
|
|
// // Now safe to delete old image
|
|
// if (*image) {
|
|
// delete* image;
|
|
// }
|
|
//
|
|
// // Assign new image
|
|
// *image = newImage;
|
|
//
|
|
// return 1; // Success
|
|
// }
|
|
// catch (const cv::Exception& e) {
|
|
// std::cerr << "OpenCV exception in GetRTSPCVImage: " << e.what() << std::endl;
|
|
// return -2; // Error code for OpenCV exceptions
|
|
// }
|
|
// catch (const std::exception& e) {
|
|
// std::cerr << "Exception in GetRTSPCVImage: " << e.what() << std::endl;
|
|
// return -2; // Error code for standard exceptions
|
|
// }
|
|
// catch (...) {
|
|
// std::cerr << "Unknown exception in GetRTSPCVImage." << std::endl;
|
|
// return -2; // Error code for unknown exceptions
|
|
// }
|
|
//}
|
|
//extern "C" __declspec(dllexport) int GetRTSPCVImage(ANSCENTER::ANSRTSPClient** Handle, int& width, int& height, int64_t& timeStamp, cv::Mat** image) {
|
|
// if (!Handle || !(*Handle) || !image) {
|
|
// std::cerr << "Error: Invalid input parameters in GetRTSPCVImage." << std::endl;
|
|
// return -1; // Error code for invalid parameters
|
|
// }
|
|
//
|
|
// try {
|
|
// std::lock_guard<std::mutex> lock(rtspMutex); // Ensure thread safety before calling GetImage
|
|
//
|
|
// cv::Mat img = (*Handle)->GetImage(width, height, timeStamp);
|
|
//
|
|
// if (img.empty()) {
|
|
// return 0; // No valid image retrieved
|
|
// }
|
|
//
|
|
// // Properly release previous memory
|
|
// if (*image) {
|
|
// delete* image;
|
|
// *image = nullptr;
|
|
// }
|
|
//
|
|
// // Allocate new cv::Mat and ensure memory is properly managed
|
|
// *image = new cv::Mat(img.clone()); // Avoid std::move to prevent releasing shared data
|
|
//
|
|
// return 1; // Success
|
|
// }
|
|
// catch (const std::exception& e) {
|
|
// std::cerr << "Exception in GetRTSPCVImage: " << e.what() << std::endl;
|
|
// return -2; // Error code for exceptions
|
|
// }
|
|
// catch (...) {
|
|
// std::cerr << "Exception in GetRTSPCVImage: Unknown exception." << std::endl;
|
|
// return -2; // Generic error code for exceptions
|
|
// }
|
|
//}
|
|
|
|
// LabVIEW entry point: fetches the latest frame as JPEG into an LStrHandle.
// Returns 1 on success, 0 when no frame / handle resize failed, -1 on bad args.
extern "C" __declspec(dllexport) int GetRTSPImage(ANSCENTER::ANSRTSPClient** Handle, int& width, int& height, int64_t& timeStamp, LStrHandle jpegImage) {
    if (Handle == nullptr || *Handle == nullptr) return -1;
    // ROBUSTNESS: a null LabVIEW string handle would be dereferenced below.
    if (jpegImage == nullptr) return -1;
    try {
        std::string jpegString = (*Handle)->GetJpegImage(width, height, timeStamp);
        int size = jpegString.length();
        if (size > 0) {
            MgErr error;
            // Resize the jpegImage handle to hold the image data
            error = DSSetHandleSize(jpegImage, sizeof(int32) + size * sizeof(uChar));
            // Check if resizing the handle was successful
            if (error == noErr) {
                // Set the size of the image in the handle
                (*jpegImage)->cnt = size;
                // Use memcpy to copy the data from the std::string to the LStrHandle's str buffer
                memcpy((*jpegImage)->str, jpegString.c_str(), size);
                // Return success
                return 1;
            }
            else {
                // Return failure if there was an error in resizing the handle
                std::cerr << "Error resizing jpegImage handle: " << error << std::endl;
                return 0;
            }
        }
        else {
            // BUGFIX: error message previously said "FLV client" (copy-paste
            // from another module); this is the RTSP client.
            std::cerr << "No image data retrieved from RTSP client." << std::endl;
            return 0;
        }
    }
    catch (const std::exception& e) {
        std::cerr << "Error getting RTSP image: " << e.what() << std::endl;
        return 0;
    }
    catch (...) {
        std::cerr << "Error getting RTSP image: Unknown exception." << std::endl;
        return 0;
    }
}
|
|
// ---- Playback control exports ----
// Common contract: return 1 on true, 0 on false or exception, -1 on bad handle.

extern "C" __declspec(dllexport) int StartRTSP(ANSCENTER::ANSRTSPClient **Handle) {
    ANS_DBG("RTSP_Lifecycle", "START: handle=%p", Handle ? (void*)*Handle : nullptr);
    if (Handle == nullptr || *Handle == nullptr) return -1;
    try {
        return (*Handle)->Start() ? 1 : 0;
    }
    catch (const std::exception& e) {
        std::cerr << "Error starting RTSP client: " << e.what() << std::endl;
        return 0;
    }
    catch (...) {
        std::cerr << "Error starting RTSP client: Unknown exception." << std::endl;
        return 0;
    }
}

extern "C" __declspec(dllexport) int ReconnectRTSP(ANSCENTER::ANSRTSPClient * *Handle) {
    if (Handle == nullptr || *Handle == nullptr) return -1;
    try {
        return (*Handle)->Reconnect() ? 1 : 0;
    }
    catch (const std::exception& e) {
        std::cerr << "Error reconnecting RTSP client: " << e.what() << std::endl;
        return 0;
    }
    catch (...) {
        std::cerr << "Error reconnecting RTSP client: Unknown exception." << std::endl;
        return 0;
    }
}

extern "C" __declspec(dllexport) int StopRTSP(ANSCENTER::ANSRTSPClient * *Handle) {
    ANS_DBG("RTSP_Lifecycle", "STOP: handle=%p", Handle ? (void*)*Handle : nullptr);
    if (Handle == nullptr || *Handle == nullptr) return -1;
    try {
        return (*Handle)->Stop() ? 1 : 0;
    }
    catch (const std::exception& e) {
        std::cerr << "Error stopping RTSP client: " << e.what() << std::endl;
        return 0;
    }
    catch (...) {
        std::cerr << "Error stopping RTSP client: Unknown exception." << std::endl;
        return 0;
    }
}

extern "C" __declspec(dllexport) int PauseRTSP(ANSCENTER::ANSRTSPClient** Handle) {
    if (Handle == nullptr || *Handle == nullptr) return -1;
    try {
        return (*Handle)->Pause() ? 1 : 0;
    }
    catch (const std::exception& e) {
        std::cerr << "Error pausing RTSP client: " << e.what() << std::endl;
        return 0;
    }
    catch (...) {
        std::cerr << "Error pausing RTSP client: Unknown exception." << std::endl;
        return 0;
    }
}

extern "C" __declspec(dllexport) int IsRTSPPaused(ANSCENTER::ANSRTSPClient** Handle) {
    if (Handle == nullptr || *Handle == nullptr) return -1;
    try {
        return (*Handle)->IsPaused() ? 1 : 0;
    }
    catch (const std::exception& e) {
        std::cerr << "Error checking if RTSP client is paused: " << e.what() << std::endl;
        return 0;
    }
    catch (...) {
        std::cerr << "Error checking if RTSP client is paused: Unknown exception." << std::endl;
        return 0;
    }
}

extern "C" __declspec(dllexport) int IsRTSPRunning(ANSCENTER::ANSRTSPClient **Handle) {
    if (Handle == nullptr || *Handle == nullptr) return -1;
    try {
        return (*Handle)->IsPlaying() ? 1 : 0;
    }
    catch (const std::exception& e) {
        std::cerr << "Error checking if RTSP client is running: " << e.what() << std::endl;
        return 0;
    }
    catch (...) {
        std::cerr << "Error checking if RTSP client is running: Unknown exception." << std::endl;
        return 0;
    }
}

extern "C" __declspec(dllexport) int IsRTSPRecording(ANSCENTER::ANSRTSPClient **Handle) {
    if (Handle == nullptr || *Handle == nullptr) return -1;
    try {
        return (*Handle)->IsRecording() ? 1 : 0;
    }
    catch (const std::exception& e) {
        std::cerr << "Error checking if RTSP client is recording: " << e.what() << std::endl;
        return 0;
    }
    catch (...) {
        std::cerr << "Error checking if RTSP client is recording: Unknown exception." << std::endl;
        return 0;
    }
}
|
|
// ---- Fire-and-forget setters: no-op on bad handle, exceptions swallowed ----

extern "C" __declspec(dllexport) void SetRTSPAudioVolume(ANSCENTER::ANSRTSPClient * *Handle, int volume) {
    if (Handle == nullptr || *Handle == nullptr) return;
    try { (*Handle)->SetAudioVolume(volume); } catch (...) {}
}

extern "C" __declspec(dllexport) void EnableRTSPAudioVolume(ANSCENTER::ANSRTSPClient * *Handle, int status) {
    if (Handle == nullptr || *Handle == nullptr) return;
    // Anything other than 1 disables audio (original semantics).
    try { (*Handle)->EnableAudio(status == 1); } catch (...) {}
}

extern "C" __declspec(dllexport) void SetRTSPImageRotation(ANSCENTER::ANSRTSPClient * *Handle, double rotationAngle) {
    if (Handle == nullptr || *Handle == nullptr) return;
    try { (*Handle)->SetImageRotate(rotationAngle); } catch (...) {}
}
|
|
// Sets the crop/analysis bounding box. Returns 1 on success, 0 on
// exception, -1 on bad handle.
extern "C" __declspec(dllexport) int SetBBoxRTSP(ANSCENTER::ANSRTSPClient** Handle, int x, int y, int width, int height) {
    if (Handle == nullptr || *Handle == nullptr) return -1;
    try {
        (*Handle)->SetBBox(cv::Rect(x, y, width, height));
        return 1;
    }
    catch (const std::exception& e) {
        std::cerr << "Error setting RTSP bounding box: " << e.what() << std::endl;
        return 0;
    }
    catch (...) {
        std::cerr << "Error setting RTSP bounding box: Unknown exception." << std::endl;
        return 0;
    }
}
|
|
// ---- Decoder/quality configuration exports ----

// Static (no-handle) forwarders to the class-level decoder pool config.
extern "C" __declspec(dllexport) void SetMaxHWDecoders(int maxDecoders) {
    ANSCENTER::ANSRTSPClient::SetMaxHWDecoders(maxDecoders);
}

extern "C" __declspec(dllexport) int AutoConfigureHWDecoders(int maxPerGpuOverride) {
    return ANSCENTER::ANSRTSPClient::AutoConfigureHWDecoders(maxPerGpuOverride);
}

extern "C" __declspec(dllexport) void SetRTSPHWDecoding(ANSCENTER::ANSRTSPClient** Handle, int hwMode, int preferredGpu) {
    if (Handle == nullptr || *Handle == nullptr) return;
    try { (*Handle)->SetHWDecoding(hwMode, preferredGpu); } catch (...) {}
}

extern "C" __declspec(dllexport) int IsRTSPHWDecodingActive(ANSCENTER::ANSRTSPClient** Handle) {
    if (Handle == nullptr || *Handle == nullptr) return 0;
    try { return (*Handle)->IsHWDecodingActive() ? 1 : 0; } catch (...) { return 0; }
}

extern "C" __declspec(dllexport) int GetRTSPHWDecodingGpuIndex(ANSCENTER::ANSRTSPClient** Handle) {
    if (Handle == nullptr || *Handle == nullptr) return -1;
    try { return (*Handle)->GetHWDecodingGpuIndex(); } catch (...) { return -1; }
}

extern "C" __declspec(dllexport) void SetRTSPImageQuality(ANSCENTER::ANSRTSPClient** Handle, int mode) {
    if (Handle == nullptr || *Handle == nullptr) return;
    // 0=fast (AI, default), 1=quality (display BT.709)
    try { (*Handle)->SetImageQuality(mode); } catch (...) {}
}

extern "C" __declspec(dllexport) void SetRTSPDisplayResolution(ANSCENTER::ANSRTSPClient** Handle, int width, int height) {
    if (Handle == nullptr || *Handle == nullptr) return;
    try { (*Handle)->SetDisplayResolution(width, height); } catch (...) {}
}

extern "C" __declspec(dllexport) void SetRTSPTargetFPS(ANSCENTER::ANSRTSPClient** Handle, double intervalMs) {
    if (Handle == nullptr || *Handle == nullptr) return;
    // 0=no limit, 100=~10FPS, 200=~5FPS
    try { (*Handle)->SetTargetFPS(intervalMs); } catch (...) {}
}

extern "C" __declspec(dllexport) void SetRTSPNV12FastPath(ANSCENTER::ANSRTSPClient** Handle, int enable) {
    if (Handle == nullptr || *Handle == nullptr) return;
    try { (*Handle)->SetNV12FastPath(enable != 0); } catch (...) {}
}

extern "C" __declspec(dllexport) double GetRTSPLastFrameAgeMs(ANSCENTER::ANSRTSPClient** Handle) {
    if (Handle == nullptr || *Handle == nullptr) return -1.0;
    try { return (*Handle)->GetLastFrameAgeMs(); } catch (...) { return -1.0; }
}

extern "C" __declspec(dllexport) int SetCropFlagRTSP(ANSCENTER::ANSRTSPClient** Handle, int cropFlag) {
    if (Handle == nullptr || *Handle == nullptr) return -1;
    try {
        // Anything other than 1 disables cropping (original semantics).
        (*Handle)->SetCrop(cropFlag == 1);
        return 1;
    }
    catch (const std::exception& e) {
        std::cerr << "Error setting RTSP crop flag: " << e.what() << std::endl;
        return 0;
    }
    catch (...) {
        std::cerr << "Error setting RTSP crop flag: Unknown exception." << std::endl;
        return 0;
    }
}
|
|
|
|
// ============================================================================
|
|
// V2 entry points: accept uint64_t handle-by-value instead of Handle**
|
|
// This eliminates a LabVIEW buffer reuse bug when concurrent calls share the
|
|
// same Handle** buffer address.
|
|
// ============================================================================
|
|
|
|
// V2 (handle-by-value) variant of GetRTSPImage — avoids the LabVIEW
// Handle** buffer-reuse bug. Returns 1 on success, 0 on no-frame/error,
// -1 on bad handle.
extern "C" __declspec(dllexport) int GetRTSPImage_V2(uint64_t handleVal, int& width, int& height, int64_t& timeStamp, LStrHandle jpegImage) {
    auto* h = reinterpret_cast<ANSCENTER::ANSRTSPClient*>(handleVal); if (!h) return -1;
    // ROBUSTNESS: a null LabVIEW string handle would be dereferenced below.
    if (jpegImage == nullptr) return -1;
    try {
        std::string jpegString = h->GetJpegImage(width, height, timeStamp);
        int size = jpegString.length();
        if (size > 0) {
            MgErr error;
            // Resize the LabVIEW handle to fit length prefix + payload.
            error = DSSetHandleSize(jpegImage, sizeof(int32) + size * sizeof(uChar));
            if (error == noErr) {
                (*jpegImage)->cnt = size;
                memcpy((*jpegImage)->str, jpegString.c_str(), size);
                return 1;
            }
            else {
                std::cerr << "Error resizing jpegImage handle: " << error << std::endl;
                return 0;
            }
        }
        else {
            // BUGFIX: error message previously said "FLV client" (copy-paste
            // from another module); this is the RTSP client.
            std::cerr << "No image data retrieved from RTSP client." << std::endl;
            return 0;
        }
    }
    catch (const std::exception& e) {
        std::cerr << "Error getting RTSP image: " << e.what() << std::endl;
        return 0;
    }
    catch (...) {
        std::cerr << "Error getting RTSP image: Unknown exception." << std::endl;
        return 0;
    }
}
|
|
|
|
// V2 (handle-by-value) CPU-only frame fetch: swaps the latest BGR frame
// into the caller-owned cv::Mat* slot. Returns 1 on success, 0 when no
// frame, -1 on bad args, -2 on exception.
extern "C" __declspec(dllexport) int GetRTSPCVImage_V2(
    uint64_t handleVal,
    int& width,
    int& height,
    int64_t& timeStamp,
    cv::Mat** image)
{
    auto* h = reinterpret_cast<ANSCENTER::ANSRTSPClient*>(handleVal);
    if (!h) return -1;
    if (!image) {
        std::cerr << "Error: Invalid input parameters in GetRTSPCVImage_V2" << std::endl;
        return -1;
    }
    try {
        // Shallow, reference-counted grab of the latest decoded frame.
        cv::Mat frame = h->GetImage(width, height, timeStamp);
        if (frame.empty()) return 0;
        // Thread-safe swap into the caller's Mat slot (internally locked).
        anscv_mat_replace(image, std::move(frame));
        return 1;
    }
    catch (const cv::Exception& e) {
        std::cerr << "OpenCV exception in GetRTSPCVImage_V2: " << e.what() << std::endl;
        return -2;
    }
    catch (const std::exception& e) {
        std::cerr << "Exception in GetRTSPCVImage_V2: " << e.what() << std::endl;
        return -2;
    }
    catch (...) {
        std::cerr << "Unknown exception in GetRTSPCVImage_V2" << std::endl;
        return -2;
    }
}
|
|
|
|
// ---- V2 playback controls (handle-by-value) ----
// Same contract as the V1 exports: 1 on true, 0 on false/exception, -1 on
// a null handle value.

extern "C" __declspec(dllexport) int StartRTSP_V2(uint64_t handleVal) {
    auto* h = reinterpret_cast<ANSCENTER::ANSRTSPClient*>(handleVal);
    if (!h) return -1;
    try {
        return h->Start() ? 1 : 0;
    }
    catch (const std::exception& e) {
        std::cerr << "Error starting RTSP client: " << e.what() << std::endl;
        return 0;
    }
    catch (...) {
        std::cerr << "Error starting RTSP client: Unknown exception." << std::endl;
        return 0;
    }
}

extern "C" __declspec(dllexport) int ReconnectRTSP_V2(uint64_t handleVal) {
    auto* h = reinterpret_cast<ANSCENTER::ANSRTSPClient*>(handleVal);
    if (!h) return -1;
    try {
        return h->Reconnect() ? 1 : 0;
    }
    catch (const std::exception& e) {
        std::cerr << "Error reconnecting RTSP client: " << e.what() << std::endl;
        return 0;
    }
    catch (...) {
        std::cerr << "Error reconnecting RTSP client: Unknown exception." << std::endl;
        return 0;
    }
}

extern "C" __declspec(dllexport) int StopRTSP_V2(uint64_t handleVal) {
    auto* h = reinterpret_cast<ANSCENTER::ANSRTSPClient*>(handleVal);
    if (!h) return -1;
    try {
        return h->Stop() ? 1 : 0;
    }
    catch (const std::exception& e) {
        std::cerr << "Error stopping RTSP client: " << e.what() << std::endl;
        return 0;
    }
    catch (...) {
        std::cerr << "Error stopping RTSP client: Unknown exception." << std::endl;
        return 0;
    }
}

extern "C" __declspec(dllexport) int PauseRTSP_V2(uint64_t handleVal) {
    auto* h = reinterpret_cast<ANSCENTER::ANSRTSPClient*>(handleVal);
    if (!h) return -1;
    try {
        return h->Pause() ? 1 : 0;
    }
    catch (const std::exception& e) {
        std::cerr << "Error pausing RTSP client: " << e.what() << std::endl;
        return 0;
    }
    catch (...) {
        std::cerr << "Error pausing RTSP client: Unknown exception." << std::endl;
        return 0;
    }
}
|
|
|
|
extern "C" __declspec(dllexport) int IsRTSPPaused_V2(uint64_t handleVal) {
|
|
auto* h = reinterpret_cast<ANSCENTER::ANSRTSPClient*>(handleVal); if (!h) return -1;
|
|
try {
|
|
bool result = h->IsPaused();
|
|
if (result) return 1;
|
|
else return 0;
|
|
}
|
|
catch (const std::exception& e) {
|
|
std::cerr << "Error checking if RTSP client is paused: " << e.what() << std::endl;
|
|
return 0;
|
|
}
|
|
catch (...) {
|
|
std::cerr << "Error checking if RTSP client is paused: Unknown exception." << std::endl;
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
extern "C" __declspec(dllexport) int IsRTSPRunning_V2(uint64_t handleVal) {
|
|
auto* h = reinterpret_cast<ANSCENTER::ANSRTSPClient*>(handleVal); if (!h) return -1;
|
|
try {
|
|
bool result = h->IsPlaying();
|
|
if (result) return 1;
|
|
else return 0;
|
|
}
|
|
catch (const std::exception& e) {
|
|
std::cerr << "Error checking if RTSP client is running: " << e.what() << std::endl;
|
|
return 0;
|
|
}
|
|
catch (...) {
|
|
std::cerr << "Error checking if RTSP client is running: Unknown exception." << std::endl;
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
extern "C" __declspec(dllexport) int IsRTSPRecording_V2(uint64_t handleVal) {
|
|
auto* h = reinterpret_cast<ANSCENTER::ANSRTSPClient*>(handleVal); if (!h) return -1;
|
|
try {
|
|
bool result = h->IsRecording();
|
|
if (result) return 1;
|
|
else return 0;
|
|
}
|
|
catch (const std::exception& e) {
|
|
std::cerr << "Error checking if RTSP client is recording: " << e.what() << std::endl;
|
|
return 0;
|
|
}
|
|
catch (...) {
|
|
std::cerr << "Error checking if RTSP client is recording: Unknown exception." << std::endl;
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
extern "C" __declspec(dllexport) int SetRTSPAudioVolume_V2(uint64_t handleVal, int volume) {
|
|
auto* h = reinterpret_cast<ANSCENTER::ANSRTSPClient*>(handleVal); if (!h) return -1;
|
|
try {
|
|
h->SetAudioVolume(volume);
|
|
return 1;
|
|
} catch (...) { return 0; }
|
|
}
|
|
|
|
extern "C" __declspec(dllexport) int EnableRTSPAudioVolume_V2(uint64_t handleVal, int status) {
|
|
auto* h = reinterpret_cast<ANSCENTER::ANSRTSPClient*>(handleVal); if (!h) return -1;
|
|
try {
|
|
bool audioStatus = false;
|
|
if (status == 1) audioStatus = true;
|
|
h->EnableAudio(audioStatus);
|
|
return 1;
|
|
} catch (...) { return 0; }
|
|
}
|
|
|
|
extern "C" __declspec(dllexport) int SetRTSPImageRotation_V2(uint64_t handleVal, double rotationAngle) {
|
|
auto* h = reinterpret_cast<ANSCENTER::ANSRTSPClient*>(handleVal); if (!h) return -1;
|
|
try {
|
|
h->SetImageRotate(rotationAngle);
|
|
return 1;
|
|
} catch (...) { return 0; }
|
|
}
|
|
|
|
extern "C" __declspec(dllexport) int SetBBoxRTSP_V2(uint64_t handleVal, int x, int y, int width, int height) {
|
|
auto* h = reinterpret_cast<ANSCENTER::ANSRTSPClient*>(handleVal); if (!h) return -1;
|
|
try {
|
|
cv::Rect bbox(x, y, width, height);
|
|
h->SetBBox(bbox);
|
|
return 1;
|
|
}
|
|
catch (const std::exception& e) {
|
|
std::cerr << "Error setting RTSP bounding box: " << e.what() << std::endl;
|
|
return 0;
|
|
}
|
|
catch (...) {
|
|
std::cerr << "Error setting RTSP bounding box: Unknown exception." << std::endl;
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
extern "C" __declspec(dllexport) int SetCropFlagRTSP_V2(uint64_t handleVal, int cropFlag) {
|
|
auto* h = reinterpret_cast<ANSCENTER::ANSRTSPClient*>(handleVal); if (!h) return -1;
|
|
try {
|
|
bool crop = false;
|
|
if (cropFlag == 1) crop = true;
|
|
h->SetCrop(crop);
|
|
return 1;
|
|
}
|
|
catch (const std::exception& e) {
|
|
std::cerr << "Error setting RTSP crop flag: " << e.what() << std::endl;
|
|
return 0;
|
|
}
|
|
catch (...) {
|
|
std::cerr << "Error setting RTSP crop flag: Unknown exception." << std::endl;
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
extern "C" __declspec(dllexport) int SetRTSPHWDecoding_V2(uint64_t handleVal, int hwMode, int preferredGpu) {
|
|
auto* h = reinterpret_cast<ANSCENTER::ANSRTSPClient*>(handleVal); if (!h) return -1;
|
|
try {
|
|
h->SetHWDecoding(hwMode, preferredGpu);
|
|
return 1;
|
|
} catch (...) { return 0; }
|
|
}
|
|
|
|
extern "C" __declspec(dllexport) int SetRTSPImageQuality_V2(uint64_t handleVal, int mode) {
|
|
auto* h = reinterpret_cast<ANSCENTER::ANSRTSPClient*>(handleVal); if (!h) return -1;
|
|
try {
|
|
h->SetImageQuality(mode); // 0=fast (AI, default), 1=quality (display BT.709)
|
|
return 1;
|
|
} catch (...) { return 0; }
|
|
} |