#include "ANSSAM3.h"
#include "ANSCLIPTokenizer.h"
#include "Utility.h"
#include <NvInferPlugin.h>
#include <opencv2/opencv.hpp>
#include <fstream>
#include <filesystem>
#include <cmath>
#include <cstring>   // std::memcpy in the FP16 helpers below
#include <cctype>    // std::isspace in EngineFileName
#include <algorithm>
#include <numeric>
namespace ANSCENTER
{
// =========================================================================
// Helpers
// =========================================================================
// Portable FP16 ↔ FP32 conversion (works in plain C++ without NVCC)
static float fp16ToFloat(uint16_t h)
{
uint32_t sign = static_cast<uint32_t>(h >> 15) << 31;
uint32_t expo = (h >> 10) & 0x1Fu;
uint32_t mant = h & 0x3FFu;
if (expo == 0) {
if (mant == 0) { float f; std::memcpy(&f, &sign, 4); return f; } // ±0
// Subnormal: normalize the mantissa. expo (unsigned) wraps below zero
// here; the wraparound cancels out in the biased-exponent sum below.
while (!(mant & 0x400u)) { mant <<= 1; expo--; }
expo++; mant &= 0x3FFu;
} else if (expo == 31) {
// Inf/NaN: the FP32 exponent field must be all ones (255), not 31 + 112,
// so build the word directly instead of falling through.
uint32_t bits = sign | 0x7F800000u | (mant << 13);
float f; std::memcpy(&f, &bits, 4); return f;
}
uint32_t bits = sign | ((expo + 127u - 15u) << 23) | (mant << 13);
float f; std::memcpy(&f, &bits, 4); return f;
}
static uint16_t floatToFp16(float val)
{
uint32_t bits; std::memcpy(&bits, &val, 4);
uint16_t sign = static_cast<uint16_t>((bits >> 16) & 0x8000u);
int32_t expo = ((bits >> 23) & 0xFF) - 127 + 15;
uint32_t mant = bits & 0x7FFFFFu;
if (expo <= 0) return sign; // underflow → ±0
if (expo >= 31) return sign | 0x7C00u; // overflow (and NaN) → ±inf
return sign | static_cast<uint16_t>(expo << 10) | static_cast<uint16_t>(mant >> 13);
}
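// Round-trip example: floatToFp16(1.0f) == 0x3C00 and fp16ToFloat(0x3C00) == 1.0f;
// values beyond half range saturate, e.g. floatToFp16(1e6f) == 0x7C00 (+inf).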
size_t ANSSAM3::DataTypeSize(nvinfer1::DataType dtype)
{
switch (dtype) {
case nvinfer1::DataType::kFLOAT: return 4;
case nvinfer1::DataType::kHALF: return 2;
case nvinfer1::DataType::kINT32: return 4;
case nvinfer1::DataType::kINT64: return 8;
case nvinfer1::DataType::kINT8: return 1;
case nvinfer1::DataType::kBOOL: return 1;
#if NV_TENSORRT_MAJOR >= 10
case nvinfer1::DataType::kUINT8: return 1;
case nvinfer1::DataType::kFP8: return 1;
case nvinfer1::DataType::kBF16: return 2;
case nvinfer1::DataType::kINT4: return 1; // conservative — 4-bit packed
#endif
default: return 4;
}
}
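// Example: a [256, 4] kFLOAT tensor occupies 256 * 4 * DataTypeSize(kFLOAT)
// = 4096 bytes of buffer space.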
void ANSSAM3::TRTBundle::destroy()
{
context.reset();
engine.reset();
runtime.reset();
for (int i = 0; i < (int)gpuBuffers.size(); ++i) {
if (!gpuBuffers[i]) continue;
if (hostBufferIdx.count(i))
free(gpuBuffers[i]); // host-allocated (shape tensor)
else
cudaFree(gpuBuffers[i]); // device-allocated
gpuBuffers[i] = nullptr;
}
gpuBuffers.clear();
gpuBufferSizes.clear();
hostBufferIdx.clear();
nameToIdx.clear();
}
// (CreateOrtDecoderSession removed — decoder now runs under TRT)
// -----------------------------------------------------------------
// PassthroughOutputAllocator — lightweight IOutputAllocator that simply
// returns the pre-allocated gpuBuffer. Defined here (not in the header)
// so the vtable and CUDA symbols stay inside the engine DLL.
// Created on the stack in Detect() — no persistent class members needed.
// -----------------------------------------------------------------
#if NV_TENSORRT_MAJOR >= 10
struct PassthroughOutputAllocator : public nvinfer1::IOutputAllocator
{
void* preAllocBuf; // existing gpuBuffers[idx]
size_t preAllocSize; // existing gpuBufferSizes[idx]
void* grownBuf = nullptr; // fallback allocation owned (and freed) by this object
nvinfer1::Dims actualDims{};
bool shapeKnown = false;
PassthroughOutputAllocator(void* buf, size_t sz)
: preAllocBuf(buf), preAllocSize(sz) {}
~PassthroughOutputAllocator() { if (grownBuf) cudaFree(grownBuf); }
void* reallocateOutput(
char const* /*tensorName*/, void* /*currentMemory*/,
uint64_t size, uint64_t /*alignment*/) noexcept override
{
if (size <= preAllocSize) return preAllocBuf;
// Fallback: grow (should not happen with generous pre-allocation).
// Track the allocation so the destructor can release it.
void* newBuf = nullptr;
if (cudaMalloc(&newBuf, size) == cudaSuccess) {
if (grownBuf) cudaFree(grownBuf);
grownBuf = newBuf;
preAllocBuf = newBuf;
preAllocSize = size;
}
return preAllocBuf;
}
void notifyShape(char const* /*tensorName*/,
nvinfer1::Dims const& dims) noexcept override
{
actualDims = dims;
shapeKnown = true;
}
};
#endif
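// Usage: Detect() creates one stack-local allocator per data-dependent decoder
// output and registers it via context->setOutputAllocator() before enqueueV3;
// notifyShape() then reports the true output dims once inference completes.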
// =========================================================================
// EngineFileName — generate cache path: <stem>.engine.<GPUName>.<fp16|fp32>
// =========================================================================
std::string ANSSAM3::EngineFileName(const std::string& onnxPath, TrtPrecision precision) const
{
// Extract stem from ONNX path
std::filesystem::path p(onnxPath);
std::string stem = p.stem().string();
// Get GPU name
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
std::string gpuName(prop.name);
// Cast to unsigned char: passing a plain (possibly negative) char to isspace is UB.
gpuName.erase(std::remove_if(gpuName.begin(), gpuName.end(),
[](unsigned char c) { return std::isspace(c) != 0; }), gpuName.end());
// BF16 requires Ampere+ (compute capability >= 8.0).
// On older GPUs BuildAndLoadEngine silently falls back to FP32,
// so the filename must match to avoid a .bf16 / .fp32 mismatch.
TrtPrecision effective = precision;
if (precision == TrtPrecision::BF16 && prop.major < 8)
effective = TrtPrecision::FP32;
std::string precStr;
switch (effective) {
case TrtPrecision::FP16: precStr = "fp16"; break;
case TrtPrecision::BF16: precStr = "bf16"; break;
case TrtPrecision::FP32: precStr = "fp32"; break;
}
std::string dir = p.parent_path().string();
return dir + "\\" + stem + ".engine." + gpuName + "." + precStr;
}
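// Example (hypothetical GPU): "sam3_decoder.onnx" built at FP16 on an
// "NVIDIA GeForce RTX 3080" caches as
// "<model dir>\sam3_decoder.engine.NVIDIAGeForceRTX3080.fp16".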
// =========================================================================
// BuildAndLoadEngine — build TRT engine from ONNX + load for inference
// =========================================================================
bool ANSSAM3::BuildAndLoadEngine(TRTBundle& bundle, const std::string& onnxPath,
const std::string& label, TrtPrecision precision)
{
// Register TRT built-in plugins (needed for RoiAlign in decoder, etc.)
// Safe to call multiple times — idempotent.
initLibNvInferPlugins(&m_trtLogger, "");
std::string enginePath = EngineFileName(onnxPath, precision);
// Check for cached engine
if (FileExist(enginePath)) {
std::cout << "[ANSSAM3] " << label << ": cached engine found: " << enginePath << std::endl;
return LoadTRTEngineBundle(bundle, enginePath, label);
}
// --- Build from ONNX ---
std::cout << "[ANSSAM3] " << label << ": building TRT engine from " << onnxPath << std::endl;
if (!FileExist(onnxPath)) {
_logger.LogError("ANSSAM3::BuildAndLoadEngine", label + ": ONNX file not found: " + onnxPath, __FILE__, __LINE__);
return false;
}
auto builder = std::unique_ptr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(m_trtLogger));
if (!builder) {
_logger.LogError("ANSSAM3::BuildAndLoadEngine", label + ": createInferBuilder failed", __FILE__, __LINE__);
return false;
}
auto network = std::unique_ptr<nvinfer1::INetworkDefinition>(TRT_CREATE_NETWORK(builder));
if (!network) {
_logger.LogError("ANSSAM3::BuildAndLoadEngine", label + ": createNetworkV2 failed", __FILE__, __LINE__);
return false;
}
auto parser = std::unique_ptr<nvonnxparser::IParser>(nvonnxparser::createParser(*network, m_trtLogger));
if (!parser) {
_logger.LogError("ANSSAM3::BuildAndLoadEngine", label + ": createParser failed", __FILE__, __LINE__);
return false;
}
// parseFromFile resolves .onnx_data external files relative to the ONNX directory
if (!parser->parseFromFile(onnxPath.c_str(), static_cast<int>(nvinfer1::ILogger::Severity::kWARNING))) {
for (int i = 0; i < parser->getNbErrors(); ++i)
std::cerr << "[ANSSAM3] " << label << " parse error: " << parser->getError(i)->desc() << std::endl;
_logger.LogError("ANSSAM3::BuildAndLoadEngine", label + ": parseFromFile failed", __FILE__, __LINE__);
return false;
}
std::cout << "[ANSSAM3] " << label << ": ONNX parsed successfully." << std::endl;
// --- Log and configure input tensors ---
auto config = std::unique_ptr<nvinfer1::IBuilderConfig>(builder->createBuilderConfig());
// Workspace governs max scratch memory TRT can use at runtime.
// ImageEncoder FP32 needs huge scratch (3.5 GiB at 4 GiB workspace) which
// causes OOM on 8 GiB GPUs. Cap it to 2 GiB — TRT picks leaner tactics
// with minimal quality loss. LangEncoder/Decoder are small; keep 4 GiB.
const bool isImageEncoder = (label.find("ImageEncoder") != std::string::npos);
size_t workspaceBytes = isImageEncoder
? 2048ULL * 1024 * 1024 // 2 GiB for ImageEncoder (prevents OOM)
: 4096ULL * 1024 * 1024; // 4 GiB for LangEncoder / Decoder
#if NV_TENSORRT_MAJOR >= 10
config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, workspaceBytes);
#else
config->setMaxWorkspaceSize(workspaceBytes);
#endif
// Set precision flags
switch (precision) {
case TrtPrecision::BF16:
{
// BF16 requires Ampere+ (compute capability >= 8.0) and TRT 8.6+.
// Check GPU capability, then attempt to set the flag if available.
cudaDeviceProp devProp;
cudaGetDeviceProperties(&devProp, 0);
#if NV_TENSORRT_MAJOR > 8 || (NV_TENSORRT_MAJOR == 8 && NV_TENSORRT_MINOR >= 6)
if (devProp.major >= 8) {
config->setFlag(nvinfer1::BuilderFlag::kBF16);
std::cout << "[ANSSAM3] " << label << ": BF16 precision enabled." << std::endl;
} else {
std::cout << "[ANSSAM3] " << label << ": GPU CC " << devProp.major << "." << devProp.minor
<< " does not support BF16, falling back to FP32." << std::endl;
}
#else
(void)devProp; // suppress unused warning
std::cout << "[ANSSAM3] " << label << ": TensorRT version does not support BF16, falling back to FP32." << std::endl;
#endif
break;
}
case TrtPrecision::FP16:
if (builder->platformHasFastFp16()) {
config->setFlag(nvinfer1::BuilderFlag::kFP16);
std::cout << "[ANSSAM3] " << label << ": FP16 precision enabled." << std::endl;
}
break;
case TrtPrecision::FP32:
// No precision flags = FP32
break;
}
// Create optimization profile with actual ONNX dimensions
auto profile = builder->createOptimizationProfile();
int numInputs = network->getNbInputs();
for (int i = 0; i < numInputs; ++i) {
auto input = network->getInput(i);
const char* name = input->getName();
auto dims = input->getDimensions();
std::cout << "[ANSSAM3] " << label << " input[" << i << "] '" << name << "': [";
for (int d = 0; d < dims.nbDims; ++d) {
if (d > 0) std::cout << ", ";
std::cout << (dims.d[d] == -1 ? "dyn" : std::to_string(dims.d[d]));
}
std::cout << "]" << (input->isShapeTensor() ? " (shape tensor)" : "") << std::endl;
// Shape tensors: scalar int64 inputs whose VALUES determine output shapes
// (e.g. original_height, original_width). Use setShapeValues() not setDimensions().
if (input->isShapeTensor()) {
// nbValues = product of dims; for a scalar (nbDims==0) that is 1
int nbValues = 1;
for (int d = 0; d < dims.nbDims; ++d) {
if (dims.d[d] > 0) nbValues *= dims.d[d];
}
std::vector<int32_t> minV(nbValues, 1);
std::vector<int32_t> optV(nbValues, 1024);
std::vector<int32_t> maxV(nbValues, 4096);
profile->setShapeValues(name, nvinfer1::OptProfileSelector::kMIN, minV.data(), nbValues);
profile->setShapeValues(name, nvinfer1::OptProfileSelector::kOPT, optV.data(), nbValues);
profile->setShapeValues(name, nvinfer1::OptProfileSelector::kMAX, maxV.data(), nbValues);
continue;
}
// Regular execution tensors: replace dynamic dims with concrete values
bool hasDynamic = false;
nvinfer1::Dims fixedDims = dims;
for (int d = 0; d < dims.nbDims; ++d) {
if (dims.d[d] == -1) {
hasDynamic = true;
fixedDims.d[d] = 1; // default batch or sequence
}
}
if (hasDynamic) {
profile->setDimensions(name, nvinfer1::OptProfileSelector::kMIN, fixedDims);
profile->setDimensions(name, nvinfer1::OptProfileSelector::kOPT, fixedDims);
profile->setDimensions(name, nvinfer1::OptProfileSelector::kMAX, fixedDims);
}
}
config->addOptimizationProfile(profile);
// --- Mixed precision for Decoder: keep FP16 for bulk ops but force
// score/NMS/comparison layers to FP32 so that the internal
// thresholding doesn't lose detections due to half-precision
// rounding. We mark layers whose names contain NMS-related
// keywords, plus all layers of types that perform comparisons
// or index-selection (which are part of the NMS pipeline).
if (precision == TrtPrecision::FP16 && label == std::string("Decoder")) {
config->setFlag(nvinfer1::BuilderFlag::kPREFER_PRECISION_CONSTRAINTS);
int numLayers = network->getNbLayers();
int markedCount = 0;
for (int li = 0; li < numLayers; ++li) {
auto* layer = network->getLayer(li);
if (!layer) continue;
std::string lname(layer->getName());
auto ltype = layer->getType();
// Force FP32 on layers involved in score thresholding / NMS:
// - Comparison ops (Greater, Less, Equal)
// - Sigmoid (final score activation)
// - TopK, NonZero, Gather, Select (index-selection in NMS)
// - Any layer whose name hints at score/nms/threshold
bool needFP32 = false;
// By layer type
switch (ltype) {
case nvinfer1::LayerType::kTOPK:
case nvinfer1::LayerType::kGATHER:
case nvinfer1::LayerType::kSELECT:
case nvinfer1::LayerType::kNON_ZERO:
case nvinfer1::LayerType::kSCATTER:
needFP32 = true;
break;
default:
break;
}
// By layer name (ONNX op names often preserved by parser)
if (!needFP32) {
// Convert to lowercase for matching
std::string lower = lname;
std::transform(lower.begin(), lower.end(), lower.begin(), ::tolower);
if (lower.find("score") != std::string::npos ||
lower.find("nms") != std::string::npos ||
lower.find("sigmoid") != std::string::npos ||
lower.find("threshold") != std::string::npos ||
lower.find("greater") != std::string::npos ||
lower.find("less") != std::string::npos ||
lower.find("where") != std::string::npos ||
lower.find("nonzero") != std::string::npos ||
lower.find("topk") != std::string::npos) {
needFP32 = true;
}
}
// TensorRT forbids setPrecision(kFLOAT) on layers that
// produce non-float types (booleans, indices/int32, int64).
// Only force FP32 when ALL outputs are floating-point.
if (needFP32) {
bool allFloat = true;
for (int oi = 0; oi < layer->getNbOutputs(); ++oi) {
auto dt = layer->getOutputType(oi);
if (dt != nvinfer1::DataType::kFLOAT &&
dt != nvinfer1::DataType::kHALF) {
allFloat = false;
break;
}
}
if (allFloat) {
layer->setPrecision(nvinfer1::DataType::kFLOAT);
for (int oi = 0; oi < layer->getNbOutputs(); ++oi)
layer->setOutputType(oi, nvinfer1::DataType::kFLOAT);
++markedCount;
}
}
}
std::cout << "[ANSSAM3] " << label << ": mixed precision — "
<< markedCount << "/" << numLayers
<< " layers forced to FP32 (score/NMS ops)." << std::endl;
}
// --- Build serialized engine ---
std::cout << "[ANSSAM3] " << label << ": building engine (this may take a few minutes)..." << std::endl;
unsigned long sehCode = 0;
auto plan = std::unique_ptr<nvinfer1::IHostMemory>(
buildSerializedNetworkSafe(builder.get(), *network, *config, &sehCode));
if (sehCode != 0) {
_logger.LogError("ANSSAM3::BuildAndLoadEngine",
label + ": engine build crashed: " + formatCrashCode(sehCode), __FILE__, __LINE__);
return false;
}
if (!plan) {
_logger.LogError("ANSSAM3::BuildAndLoadEngine", label + ": buildSerializedNetwork returned null", __FILE__, __LINE__);
return false;
}
// --- Save to disk ---
std::ofstream outFile(enginePath, std::ios::binary);
if (!outFile.is_open()) {
_logger.LogError("ANSSAM3::BuildAndLoadEngine", label + ": cannot write engine file: " + enginePath, __FILE__, __LINE__);
return false;
}
outFile.write(reinterpret_cast<const char*>(plan->data()), plan->size());
outFile.close();
std::cout << "[ANSSAM3] " << label << ": engine saved to " << enginePath << std::endl;
plan.reset();
// --- Load the just-built engine ---
return LoadTRTEngineBundle(bundle, enginePath, label);
}
// =========================================================================
// LoadTRTEngineBundle — deserialize engine, allocate GPU buffers, bind
// =========================================================================
bool ANSSAM3::LoadTRTEngineBundle(TRTBundle& bundle, const std::string& enginePath, const std::string& label)
{
// Read engine file
std::ifstream file(enginePath, std::ios::binary | std::ios::ate);
if (!file.is_open()) {
_logger.LogError("ANSSAM3::LoadTRTEngineBundle", label + ": cannot open: " + enginePath, __FILE__, __LINE__);
return false;
}
std::streamsize fileSize = file.tellg();
file.seekg(0, std::ios::beg);
std::vector<char> engineData(fileSize);
if (!file.read(engineData.data(), fileSize)) {
_logger.LogError("ANSSAM3::LoadTRTEngineBundle", label + ": read failed", __FILE__, __LINE__);
return false;
}
file.close();
// Deserialize
bundle.runtime = std::unique_ptr<nvinfer1::IRuntime>(nvinfer1::createInferRuntime(m_trtLogger));
if (!bundle.runtime) {
_logger.LogError("ANSSAM3::LoadTRTEngineBundle", label + ": createInferRuntime failed", __FILE__, __LINE__);
return false;
}
unsigned long sehCode = 0;
bundle.engine = std::unique_ptr<nvinfer1::ICudaEngine>(
deserializeCudaEngineSafe(bundle.runtime.get(), engineData.data(), engineData.size(), &sehCode));
if (sehCode != 0) {
_logger.LogError("ANSSAM3::LoadTRTEngineBundle",
label + ": deserialize crashed: " + formatCrashCode(sehCode), __FILE__, __LINE__);
return false;
}
if (!bundle.engine) {
_logger.LogError("ANSSAM3::LoadTRTEngineBundle", label + ": deserialize returned null", __FILE__, __LINE__);
return false;
}
// --- Weight streaming (TRT 10+): keep only a budget of weights on GPU,
// stream the rest from CPU pinned memory on demand.
// Saves ~1.3 GiB VRAM for ImageEncoder (1.8 GiB weights → 512 MiB on GPU).
#if NV_TENSORRT_MAJOR >= 10
{
int64_t streamableBytes = bundle.engine->getStreamableWeightsSize();
if (streamableBytes > 0 && label.find("ImageEncoder") != std::string::npos) {
// Budget = how much weight memory stays on GPU.
// 512 MiB keeps hot layers cached; rest streamed via PCIe.
const int64_t budgetBytes = 512LL * 1024 * 1024;
int64_t actualBudget = std::min(budgetBytes, streamableBytes);
bundle.engine->setWeightStreamingBudgetV2(actualBudget);
std::cout << "[ANSSAM3] " << label
<< ": weight streaming enabled (streamable="
<< (streamableBytes / (1024*1024)) << " MiB, budget="
<< (actualBudget / (1024*1024)) << " MiB)" << std::endl;
}
}
#endif
bundle.context = std::unique_ptr<nvinfer1::IExecutionContext>(bundle.engine->createExecutionContext());
if (!bundle.context) {
_logger.LogError("ANSSAM3::LoadTRTEngineBundle", label + ": createExecutionContext failed", __FILE__, __LINE__);
return false;
}
// Set optimization profile
int numProfiles = bundle.engine->getNbOptimizationProfiles();
if (numProfiles > 0) {
bundle.context->setOptimizationProfileAsync(0, m_cudaStream);
cudaStreamSynchronize(m_cudaStream);
}
// Allocate buffers (device for execution tensors, host for shape tensors)
const int numTensors = bundle.engine->getNbIOTensors();
bundle.gpuBuffers.resize(numTensors, nullptr);
bundle.gpuBufferSizes.resize(numTensors, 0);
bundle.hostBufferIdx.clear();
bundle.nameToIdx.clear();
for (int i = 0; i < numTensors; ++i) {
const char* name = bundle.engine->getIOTensorName(i);
auto mode = bundle.engine->getTensorIOMode(name);
auto shape = bundle.engine->getTensorShape(name);
auto dtype = bundle.engine->getTensorDataType(name);
auto loc = bundle.engine->getTensorLocation(name);
bool isHost = (loc == nvinfer1::TensorLocation::kHOST);
// Check if any dimension is dynamic (-1)
bool hasDynamic = false;
int64_t numElements = 1;
for (int d = 0; d < shape.nbDims; ++d) {
int64_t v = shape.d[d];
if (v <= 0) { hasDynamic = true; v = 1; }
numElements *= v;
}
// Scalars (0-dim) still need at least 1 element
if (numElements < 1) numElements = 1;
// For output tensors with ANY dynamic dim, pre-allocate a generous buffer.
// The decoder outputs (boxes [-1,4], scores [-1], masks [-1,-1,-1,-1])
// all have data-dependent first dimension from NonZero/NMS.
if (mode == nvinfer1::TensorIOMode::kOUTPUT && hasDynamic) {
// Pre-allocate for up to 256 detections with generous mask size
numElements = 256 * 1 * 256 * 256;
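// (256*1*256*256 = 16,777,216 elements, ≈64 MiB at FP32 per output; the
// PassthroughOutputAllocator can still grow the buffer if a frame needs more.)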
}
size_t bufSize = numElements * DataTypeSize(dtype);
bundle.gpuBufferSizes[i] = bufSize;
if (isHost) {
// Shape tensor — allocate host memory
bundle.gpuBuffers[i] = calloc(numElements, DataTypeSize(dtype));
if (!bundle.gpuBuffers[i]) {
_logger.LogError("ANSSAM3::LoadTRTEngineBundle",
label + ": host alloc failed for " + std::string(name), __FILE__, __LINE__);
return false;
}
bundle.hostBufferIdx.insert(i);
} else {
// Execution tensor — allocate device memory
cudaError_t err = cudaMalloc(&bundle.gpuBuffers[i], bufSize);
if (err != cudaSuccess) {
_logger.LogError("ANSSAM3::LoadTRTEngineBundle",
label + ": cudaMalloc failed for " + std::string(name) + ": " + cudaGetErrorString(err),
__FILE__, __LINE__);
return false;
}
cudaMemset(bundle.gpuBuffers[i], 0, bufSize);
}
// Bind tensor address (host ptr for shape tensors, device ptr for execution tensors)
if (!bundle.context->setTensorAddress(name, bundle.gpuBuffers[i])) {
_logger.LogError("ANSSAM3::LoadTRTEngineBundle",
label + ": setTensorAddress failed for " + std::string(name), __FILE__, __LINE__);
return false;
}
bundle.nameToIdx[std::string(name)] = i;
std::cout << "[ANSSAM3] " << label << " tensor[" << i << "] '"
<< name << "' "
<< (mode == nvinfer1::TensorIOMode::kINPUT ? "INPUT" : "OUTPUT")
<< (isHost ? " HOST" : " DEVICE")
<< " dtype=" << static_cast<int>(dtype)
<< " bufSize=" << bufSize << std::endl;
}
// Set input shapes (replace dynamic dims with concrete values)
for (int i = 0; i < numTensors; ++i) {
const char* name = bundle.engine->getIOTensorName(i);
if (bundle.engine->getTensorIOMode(name) != nvinfer1::TensorIOMode::kINPUT)
continue;
// Shape tensors (scalar, host memory): TRT reads the value directly
// from the host buffer. setInputShape for scalars uses Dims{0, {}}.
// Write a default value (1024) into the host buffer at load time.
if (bundle.hostBufferIdx.count(i)) {
auto dtype = bundle.engine->getTensorDataType(name);
if (dtype == nvinfer1::DataType::kINT64)
*reinterpret_cast<int64_t*>(bundle.gpuBuffers[i]) = 1024;
else
*reinterpret_cast<int32_t*>(bundle.gpuBuffers[i]) = 1024;
nvinfer1::Dims scalarDims;
scalarDims.nbDims = 0;
bundle.context->setInputShape(name, scalarDims);
continue;
}
auto dims = bundle.engine->getTensorShape(name);
nvinfer1::Dims inputDims = dims;
for (int d = 0; d < inputDims.nbDims; ++d) {
if (inputDims.d[d] == -1)
inputDims.d[d] = 1;
}
if (!bundle.context->setInputShape(name, inputDims)) {
_logger.LogError("ANSSAM3::LoadTRTEngineBundle",
label + ": setInputShape failed for " + std::string(name), __FILE__, __LINE__);
return false;
}
}
std::cout << "[ANSSAM3] " << label << ": loaded successfully (" << numTensors << " tensors)." << std::endl;
return true;
}
// =========================================================================
// EnsureEnginesBuilt — pre-build uncached engines one at a time
// Avoids GPU OOM when building one engine while others are already loaded.
// =========================================================================
bool ANSSAM3::EnsureEnginesBuilt(const std::string& imgOnnx, const std::string& langOnnx, const std::string& decOnnx)
{
struct Job { const std::string* onnx; const char* label; TrtPrecision prec; };
Job jobs[] = {
{&langOnnx, "LangEncoder", TrtPrecision::FP16}, // FP16 — verified identical to FP32
{&decOnnx, "Decoder", TrtPrecision::FP16}, // FP16 decoder
{&imgOnnx, "ImageEncoder", TrtPrecision::FP32}, // FP32 — FP16/BF16 both corrupt backbone FPN
};
for (auto& j : jobs) {
if (!FileExist(EngineFileName(*j.onnx, j.prec))) {
TRTBundle tmp;
if (!BuildAndLoadEngine(tmp, *j.onnx, j.label, j.prec)) {
_logger.LogError("ANSSAM3::EnsureEnginesBuilt",
std::string("Failed to pre-build engine: ") + j.label, __FILE__, __LINE__);
tmp.destroy();
return false;
}
tmp.destroy(); // free GPU memory before next build
}
}
return true;
}
// =========================================================================
// OptimizeModel
// =========================================================================
bool ANSSAM3::OptimizeModel(bool fp16, std::string& optimizedModelFolder)
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (!ANSODBase::OptimizeModel(fp16, optimizedModelFolder))
return false;
_fp16 = fp16;
optimizedModelFolder = _modelFolder;
std::string imgOnnx = CreateFilePath(_modelFolder, "sam3_image_encoder.onnx");
std::string langOnnx = CreateFilePath(_modelFolder, "sam3_language_encoder.onnx");
std::string decOnnx = CreateFilePath(_modelFolder, "sam3_decoder.onnx");
cudaStreamCreateWithFlags(&m_cudaStream, cudaStreamNonBlocking);
// Build engines one at a time, destroying each to free GPU memory
TRTBundle tmp;
bool ok = true;
ok = BuildAndLoadEngine(tmp, imgOnnx, "ImageEncoder", TrtPrecision::FP32);
tmp.destroy();
if (ok) { ok = BuildAndLoadEngine(tmp, langOnnx, "LangEncoder", TrtPrecision::FP16); tmp.destroy(); }
if (ok) { ok = BuildAndLoadEngine(tmp, decOnnx, "Decoder", TrtPrecision::FP16); tmp.destroy(); }
if (m_cudaStream) { cudaStreamDestroy(m_cudaStream); m_cudaStream = nullptr; }
return ok;
}
// =========================================================================
// Initialize
// =========================================================================
bool ANSSAM3::Initialize(std::string licenseKey, ModelConfig modelConfig,
const std::string& modelZipFilePath, const std::string& modelZipPassword,
std::string& labelMap)
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
ModelLoadingGuard mlg(_modelLoading);
try {
bool result = ANSODBase::Initialize(licenseKey, modelConfig, modelZipFilePath, modelZipPassword, labelMap);
if (!result) return false;
_modelConfig.detectionType = DetectionType::SEGMENTATION;
if (_modelConfig.modelConfThreshold < 0.1f)
_modelConfig.modelConfThreshold = 0.5f;
m_segThreshold = _modelConfig.modelConfThreshold;
_fp16 = true;
// Create CUDA stream
cudaSetDevice(_modelConfig.gpuDeviceIndex);
cudaStreamCreateWithFlags(&m_cudaStream, cudaStreamNonBlocking);
// Build/load TRT engines for image + language encoders
std::string imgOnnx = CreateFilePath(_modelFolder, "sam3_image_encoder.onnx");
std::string langOnnx = CreateFilePath(_modelFolder, "sam3_language_encoder.onnx");
std::string decOnnx = CreateFilePath(_modelFolder, "sam3_decoder.onnx");
// Pre-build uncached TRT engines (avoids GPU OOM during build)
if (!EnsureEnginesBuilt(imgOnnx, langOnnx, decOnnx)) {
_modelLoadValid = false;
return false;
}
if (!BuildAndLoadEngine(m_imgEncoder, imgOnnx, "ImageEncoder", TrtPrecision::FP32) ||
!BuildAndLoadEngine(m_langEncoder, langOnnx, "LangEncoder", TrtPrecision::FP16) ||
!BuildAndLoadEngine(m_decoder, decOnnx, "Decoder", TrtPrecision::FP16)) {
_logger.LogError("ANSSAM3::Initialize", "Failed to build/load TRT engines", __FILE__, __LINE__);
_modelLoadValid = false;
return false;
}
_modelLoadValid = true;
_isInitialized = true;
// Load tokenizer
m_tokenizer = std::make_unique<ANSCLIPTokenizer>();
std::string tokenizerPath = CreateFilePath(_modelFolder, "merges.txt");
if (FileExist(tokenizerPath)) {
m_tokenizer->Load(tokenizerPath);
_logger.LogDebug("ANSSAM3::Initialize", "CLIP tokenizer loaded", __FILE__, __LINE__);
}
return true;
}
catch (const std::exception& e) {
_logger.LogFatal("ANSSAM3::Initialize", e.what(), __FILE__, __LINE__);
return false;
}
}
// =========================================================================
// LoadModel
// =========================================================================
bool ANSSAM3::LoadModel(const std::string& modelZipFilePath, const std::string& modelZipPassword)
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
ModelLoadingGuard mlg(_modelLoading);
try {
bool result = ANSODBase::LoadModel(modelZipFilePath, modelZipPassword);
if (!result) return false;
_modelConfig.detectionType = DetectionType::SEGMENTATION;
if (_modelConfig.modelConfThreshold < 0.1f)
_modelConfig.modelConfThreshold = 0.5f;
m_segThreshold = _modelConfig.modelConfThreshold;
_fp16 = true;
cudaSetDevice(_modelConfig.gpuDeviceIndex);
cudaStreamCreateWithFlags(&m_cudaStream, cudaStreamNonBlocking);
std::string imgOnnx = CreateFilePath(_modelFolder, "sam3_image_encoder.onnx");
std::string langOnnx = CreateFilePath(_modelFolder, "sam3_language_encoder.onnx");
std::string decOnnx = CreateFilePath(_modelFolder, "sam3_decoder.onnx");
// Pre-build uncached TRT engines (avoids GPU OOM during build)
if (!EnsureEnginesBuilt(imgOnnx, langOnnx, decOnnx)) {
_modelLoadValid = false;
return false;
}
if (!BuildAndLoadEngine(m_imgEncoder, imgOnnx, "ImageEncoder", TrtPrecision::FP32) ||
!BuildAndLoadEngine(m_langEncoder, langOnnx, "LangEncoder", TrtPrecision::FP16) ||
!BuildAndLoadEngine(m_decoder, decOnnx, "Decoder", TrtPrecision::FP16)) {
_logger.LogError("ANSSAM3::LoadModel", "Failed to build/load TRT engines", __FILE__, __LINE__);
_modelLoadValid = false;
return false;
}
_modelLoadValid = true;
_isInitialized = true;
m_tokenizer = std::make_unique<ANSCLIPTokenizer>();
std::string tokenizerPath = CreateFilePath(_modelFolder, "merges.txt");
if (FileExist(tokenizerPath)) {
m_tokenizer->Load(tokenizerPath);
}
return true;
}
catch (const std::exception& e) {
_logger.LogFatal("ANSSAM3::LoadModel", e.what(), __FILE__, __LINE__);
return false;
}
}
// =========================================================================
// LoadModelFromFolder
// =========================================================================
bool ANSSAM3::LoadModelFromFolder(std::string licenseKey, ModelConfig modelConfig,
std::string modelName, std::string className,
const std::string& modelFolder, std::string& labelMap)
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
ModelLoadingGuard mlg(_modelLoading);
try {
bool result = ANSODBase::LoadModelFromFolder(licenseKey, modelConfig, modelName, className, modelFolder, labelMap);
if (!result) return false;
_modelConfig = modelConfig;
_modelConfig.detectionType = DetectionType::SEGMENTATION;
if (_modelConfig.modelConfThreshold < 0.1f)
_modelConfig.modelConfThreshold = 0.5f;
m_segThreshold = _modelConfig.modelConfThreshold;
_fp16 = true;
cudaSetDevice(_modelConfig.gpuDeviceIndex);
cudaStreamCreateWithFlags(&m_cudaStream, cudaStreamNonBlocking);
std::string imgOnnx = CreateFilePath(modelFolder, "sam3_image_encoder.onnx");
std::string langOnnx = CreateFilePath(modelFolder, "sam3_language_encoder.onnx");
std::string decOnnx = CreateFilePath(modelFolder, "sam3_decoder.onnx");
// Pre-build uncached TRT engines (avoids GPU OOM during build)
if (!EnsureEnginesBuilt(imgOnnx, langOnnx, decOnnx)) {
_modelLoadValid = false;
return false;
}
if (!BuildAndLoadEngine(m_imgEncoder, imgOnnx, "ImageEncoder", TrtPrecision::FP32) ||
!BuildAndLoadEngine(m_langEncoder, langOnnx, "LangEncoder", TrtPrecision::FP16) ||
!BuildAndLoadEngine(m_decoder, decOnnx, "Decoder", TrtPrecision::FP16)) {
_logger.LogError("ANSSAM3::LoadModelFromFolder", "Failed to build/load TRT engines", __FILE__, __LINE__);
_modelLoadValid = false;
return false;
}
_modelLoadValid = true;
_isInitialized = true;
m_tokenizer = std::make_unique<ANSCLIPTokenizer>();
std::string tokenizerPath = CreateFilePath(modelFolder, "merges.txt");
if (FileExist(tokenizerPath)) {
m_tokenizer->Load(tokenizerPath);
_logger.LogDebug("ANSSAM3::LoadModelFromFolder", "CLIP tokenizer loaded", __FILE__, __LINE__);
}
return true;
}
catch (const std::exception& e) {
_logger.LogFatal("ANSSAM3::LoadModelFromFolder", e.what(), __FILE__, __LINE__);
return false;
}
}
// =========================================================================
// SetPrompt — run language encoder, cache outputs on GPU
// =========================================================================
bool ANSSAM3::SetPrompt(const std::vector<int64_t>& inputIds, const std::vector<int64_t>& attentionMask)
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (!m_langEncoder.context) {
_logger.LogError("ANSSAM3::SetPrompt", "Language encoder not loaded", __FILE__, __LINE__);
return false;
}
// Language encoder input: "tokens" [1, 32] int64
// Find the tokens input tensor
auto it = m_langEncoder.nameToIdx.find("tokens");
if (it == m_langEncoder.nameToIdx.end()) {
// Try first input
const char* firstName = m_langEncoder.engine->getIOTensorName(0);
it = m_langEncoder.nameToIdx.find(firstName);
}
if (it == m_langEncoder.nameToIdx.end()) {
_logger.LogError("ANSSAM3::SetPrompt", "Cannot find tokens input tensor", __FILE__, __LINE__);
return false;
}
int tokIdx = it->second;
const char* tokName = m_langEncoder.engine->getIOTensorName(tokIdx);
auto tokDtype = m_langEncoder.engine->getTensorDataType(tokName);
// Upload tokens — handle int64 vs int32 data type
if (tokDtype == nvinfer1::DataType::kINT64) {
cudaMemcpyAsync(m_langEncoder.gpuBuffers[tokIdx], inputIds.data(),
inputIds.size() * sizeof(int64_t),
cudaMemcpyHostToDevice, m_cudaStream);
}
else if (tokDtype == nvinfer1::DataType::kINT32) {
// TRT may have converted int64 to int32 at build time
std::vector<int32_t> tokens32(inputIds.size());
for (size_t i = 0; i < inputIds.size(); ++i)
tokens32[i] = static_cast<int32_t>(inputIds[i]);
cudaMemcpyAsync(m_langEncoder.gpuBuffers[tokIdx], tokens32.data(),
tokens32.size() * sizeof(int32_t),
cudaMemcpyHostToDevice, m_cudaStream);
}
// Set input shape
nvinfer1::Dims tokenDims;
tokenDims.nbDims = 2;
tokenDims.d[0] = 1;
tokenDims.d[1] = static_cast<int>(inputIds.size());
m_langEncoder.context->setInputShape(tokName, tokenDims);
// Run language encoder
#if NV_TENSORRT_MAJOR >= 10
bool ok = m_langEncoder.context->enqueueV3(m_cudaStream);
#else
bool ok = m_langEncoder.context->enqueueV2(
reinterpret_cast<void**>(m_langEncoder.gpuBuffers.data()), m_cudaStream, nullptr);
#endif
if (!ok) {
_logger.LogError("ANSSAM3::SetPrompt", "Language encoder enqueue failed", __FILE__, __LINE__);
return false;
}
cudaStreamSynchronize(m_cudaStream);
// Find language encoder outputs: text_attention_mask and text_memory
// output[0]: text_attention_mask [1, 32] bool → cached as m_cachedLangMask
// output[1]: text_memory [32, 1, 256] float32 → cached as m_cachedLangFeats
// output[2]: text_embeds [32, 1, 1024] float32 → NOT used
int maskOutIdx = -1, featsOutIdx = -1;
const int numTensors = m_langEncoder.engine->getNbIOTensors();
for (int i = 0; i < numTensors; ++i) {
const char* name = m_langEncoder.engine->getIOTensorName(i);
if (m_langEncoder.engine->getTensorIOMode(name) != nvinfer1::TensorIOMode::kOUTPUT)
continue;
std::string sname(name);
if (sname.find("attention_mask") != std::string::npos ||
sname.find("text_attention") != std::string::npos) {
maskOutIdx = i;
}
else if (sname.find("text_memory") != std::string::npos ||
sname.find("memory") != std::string::npos) {
featsOutIdx = i;
}
}
// Fallback: first 2 outputs in order
if (maskOutIdx < 0 || featsOutIdx < 0) {
int outCount = 0;
for (int i = 0; i < numTensors; ++i) {
const char* name = m_langEncoder.engine->getIOTensorName(i);
if (m_langEncoder.engine->getTensorIOMode(name) != nvinfer1::TensorIOMode::kOUTPUT)
continue;
if (outCount == 0 && maskOutIdx < 0) maskOutIdx = i;
else if (outCount == 1 && featsOutIdx < 0) featsOutIdx = i;
outCount++;
}
}
// Cache mask on GPU
if (maskOutIdx >= 0) {
size_t bytes = m_langEncoder.gpuBufferSizes[maskOutIdx];
if (m_cachedLangMask && m_cachedLangMaskBytes < bytes) {
cudaFree(m_cachedLangMask); m_cachedLangMask = nullptr;
}
if (!m_cachedLangMask) {
cudaMalloc(&m_cachedLangMask, bytes);
}
m_cachedLangMaskBytes = bytes;
cudaMemcpyAsync(m_cachedLangMask, m_langEncoder.gpuBuffers[maskOutIdx],
bytes, cudaMemcpyDeviceToDevice, m_cudaStream);
}
// Cache features on GPU
if (featsOutIdx >= 0) {
size_t bytes = m_langEncoder.gpuBufferSizes[featsOutIdx];
if (m_cachedLangFeats && m_cachedLangFeatsBytes < bytes) {
cudaFree(m_cachedLangFeats); m_cachedLangFeats = nullptr;
}
if (!m_cachedLangFeats) {
cudaMalloc(&m_cachedLangFeats, bytes);
}
m_cachedLangFeatsBytes = bytes;
cudaMemcpyAsync(m_cachedLangFeats, m_langEncoder.gpuBuffers[featsOutIdx],
bytes, cudaMemcpyDeviceToDevice, m_cudaStream);
}
cudaStreamSynchronize(m_cudaStream);
m_promptSet = true;
return true;
}
bool ANSSAM3::SetPrompt(const std::string& text)
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (!m_tokenizer || !m_tokenizer->IsLoaded()) {
_logger.LogError("ANSSAM3::SetPrompt",
"Tokenizer not loaded. Place merges.txt in model folder.", __FILE__, __LINE__);
return false;
}
auto result = m_tokenizer->Tokenize(text, m_tokenLength);
return SetPrompt(result.inputIds, result.attentionMask);
}
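// Typical call sequence (illustrative): set the prompt once, then detect per frame:
//   sam3.SetPrompt("person");                 // tokenize + cache language features on GPU
//   auto objects = sam3.RunInference(frame, cameraId);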
// =========================================================================
// RunInference
// =========================================================================
std::vector<Object> ANSSAM3::RunInference(const cv::Mat& input)
{
return RunInference(input, "");
}
std::vector<Object> ANSSAM3::RunInference(const cv::Mat& input, const std::string& camera_id)
{
if (!PreInferenceCheck("ANSSAM3::RunInference")) return {};
try {
return Detect(input, camera_id);
}
catch (const std::exception& e) {
_logger.LogFatal("ANSSAM3::RunInference", e.what(), __FILE__, __LINE__);
return {};
}
}
// =========================================================================
// Detect — image encoder + decoder pipeline
// =========================================================================
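// Steps: (1) preprocess + upload image → (2) run image encoder →
// (3) bind encoder outputs and cached language features as decoder inputs →
// (4) run decoder → (5) download boxes/scores/masks and postprocess.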
std::vector<Object> ANSSAM3::Detect(const cv::Mat& input, const std::string& camera_id)
{
if (!_modelLoadValid || !m_imgEncoder.context || !m_cudaStream) {
return {};
}
const int origW = input.cols;
const int origH = input.rows;
// ---- 1) Find image encoder input tensor and determine dtype ----
auto imgIt = m_imgEncoder.nameToIdx.find("image");
if (imgIt == m_imgEncoder.nameToIdx.end()) {
const char* firstName = m_imgEncoder.engine->getIOTensorName(0);
imgIt = m_imgEncoder.nameToIdx.find(firstName);
}
if (imgIt == m_imgEncoder.nameToIdx.end()) {
_logger.LogError("ANSSAM3::Detect", "Cannot find image input tensor", __FILE__, __LINE__);
return {};
}
int imgInputIdx = imgIt->second;
const char* imgInputName = m_imgEncoder.engine->getIOTensorName(imgInputIdx);
auto imgDtype = m_imgEncoder.engine->getTensorDataType(imgInputName);
bool isUint8Input = (imgDtype == nvinfer1::DataType::kINT8 || imgDtype == nvinfer1::DataType::kBOOL);
#if NV_TENSORRT_MAJOR >= 10
isUint8Input = isUint8Input || (imgDtype == nvinfer1::DataType::kUINT8);
#endif
// ---- 1b) Try NV12 fast path — fused NV12→RGB resize CHW directly into TRT buffer ----
bool usedNV12 = false;
{
auto nv12 = m_nv12Helper.tryNV12DirectToBuffer(
input, 0 /*inferenceGpu*/,
m_imgEncoder.gpuBuffers[imgInputIdx],
m_inputSize, m_inputSize,
!isUint8Input, // float32 if not uint8
m_cudaStream,
_logger, "ANSSAM3");
usedNV12 = nv12.succeeded;
m_nv12Helper.tickInference();
}
// ---- 1c) CPU fallback: BGR → RGB, resize to 1008, HWC→CHW, upload ----
if (!usedNV12) {
cv::Mat resized;
cv::resize(input, resized, cv::Size(m_inputSize, m_inputSize));
cv::Mat rgb;
cv::cvtColor(resized, rgb, cv::COLOR_BGR2RGB);
const size_t planeSize = static_cast<size_t>(m_inputSize) * m_inputSize;
std::vector<uint8_t> imgBuffer(3 * planeSize);
cv::Mat channels[3];
cv::split(rgb, channels);
for (int c = 0; c < 3; ++c)
std::memcpy(imgBuffer.data() + c * planeSize, channels[c].data, planeSize);
if (isUint8Input) {
cudaMemcpyAsync(m_imgEncoder.gpuBuffers[imgInputIdx], imgBuffer.data(),
imgBuffer.size(), cudaMemcpyHostToDevice, m_cudaStream);
}
else {
std::vector<float> imgFloat(imgBuffer.size());
for (size_t i = 0; i < imgBuffer.size(); ++i)
imgFloat[i] = static_cast<float>(imgBuffer[i]);
cudaMemcpyAsync(m_imgEncoder.gpuBuffers[imgInputIdx], imgFloat.data(),
imgFloat.size() * sizeof(float), cudaMemcpyHostToDevice, m_cudaStream);
}
}
// Set image input shape
nvinfer1::Dims imgDims;
imgDims.nbDims = 3;
imgDims.d[0] = 3;
imgDims.d[1] = m_inputSize;
imgDims.d[2] = m_inputSize;
m_imgEncoder.context->setInputShape(imgInputName, imgDims);
// ---- 2) Run image encoder ----
// Check for prior CUDA errors (e.g. OOM from memcpy) before enqueue
{
cudaError_t preErr = cudaGetLastError();
if (preErr != cudaSuccess) {
_logger.LogError("ANSSAM3::Detect",
std::string("CUDA error before enqueue: ") + cudaGetErrorString(preErr),
__FILE__, __LINE__);
return {};
}
}
#if NV_TENSORRT_MAJOR >= 10
bool okImg = m_imgEncoder.context->enqueueV3(m_cudaStream);
#else
bool okImg = m_imgEncoder.context->enqueueV2(
reinterpret_cast<void**>(m_imgEncoder.gpuBuffers.data()), m_cudaStream, nullptr);
#endif
if (!okImg) {
cudaError_t postErr = cudaGetLastError();
_logger.LogError("ANSSAM3::Detect",
std::string("Image encoder enqueue failed") +
(postErr != cudaSuccess ? std::string(": ") + cudaGetErrorString(postErr) : ""),
__FILE__, __LINE__);
return {};
}
cudaStreamSynchronize(m_cudaStream);
// ---- 3) Feed encoder outputs into TRT decoder (zero-copy via setTensorAddress) ----
// Helper: bind an encoder output buffer directly as the matching decoder input (no copy)
auto feedImgToDec = [&](const std::string& tensorName) {
auto srcIt = m_imgEncoder.nameToIdx.find(tensorName);
auto dstIt = m_decoder.nameToIdx.find(tensorName);
if (srcIt == m_imgEncoder.nameToIdx.end() || dstIt == m_decoder.nameToIdx.end()) return;
int srcIdx = srcIt->second;
int dstIdx = dstIt->second;
// Point decoder input directly at encoder output buffer (true zero-copy)
const char* dstName = m_decoder.engine->getIOTensorName(dstIdx);
m_decoder.context->setTensorAddress(dstName, m_imgEncoder.gpuBuffers[srcIdx]);
// Set input shape from encoder's actual output shape
const char* srcName = m_imgEncoder.engine->getIOTensorName(srcIdx);
auto shape = m_imgEncoder.context->getTensorShape(srcName);
m_decoder.context->setInputShape(dstName, shape);
};
feedImgToDec("vision_pos_enc_2");
feedImgToDec("backbone_fpn_0");
feedImgToDec("backbone_fpn_1");
feedImgToDec("backbone_fpn_2");
// Language features — point decoder inputs at cached GPU buffers
{
auto it = m_decoder.nameToIdx.find("language_mask");
if (it != m_decoder.nameToIdx.end()) {
const char* name = m_decoder.engine->getIOTensorName(it->second);
m_decoder.context->setTensorAddress(name, m_cachedLangMask);
nvinfer1::Dims d; d.nbDims = 2; d.d[0] = 1; d.d[1] = m_tokenLength;
m_decoder.context->setInputShape(name, d);
}
}
{
auto it = m_decoder.nameToIdx.find("language_features");
if (it != m_decoder.nameToIdx.end()) {
const char* name = m_decoder.engine->getIOTensorName(it->second);
m_decoder.context->setTensorAddress(name, m_cachedLangFeats);
nvinfer1::Dims d; d.nbDims = 3; d.d[0] = m_tokenLength; d.d[1] = 1; d.d[2] = 256;
m_decoder.context->setInputShape(name, d);
}
}
// Scalar inputs (original_height, original_width) — host-memory shape tensors
{
auto it = m_decoder.nameToIdx.find("original_height");
if (it != m_decoder.nameToIdx.end()) {
int idx = it->second;
auto dtype = m_decoder.engine->getTensorDataType(m_decoder.engine->getIOTensorName(idx));
if (dtype == nvinfer1::DataType::kINT64)
*reinterpret_cast<int64_t*>(m_decoder.gpuBuffers[idx]) = static_cast<int64_t>(origH);
else
*reinterpret_cast<int32_t*>(m_decoder.gpuBuffers[idx]) = origH;
}
}
{
auto it = m_decoder.nameToIdx.find("original_width");
if (it != m_decoder.nameToIdx.end()) {
int idx = it->second;
auto dtype = m_decoder.engine->getTensorDataType(m_decoder.engine->getIOTensorName(idx));
if (dtype == nvinfer1::DataType::kINT64)
*reinterpret_cast<int64_t*>(m_decoder.gpuBuffers[idx]) = static_cast<int64_t>(origW);
else
*reinterpret_cast<int32_t*>(m_decoder.gpuBuffers[idx]) = origW;
}
}
// Prompt inputs: box_coords [1,1,4], box_labels [1,1], box_masks [1,1]
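// A zeroed box with label -1 and mask flag 0 presumably encodes "no box prompt"
// (SAM-style padding), leaving the cached text prompt as the only query.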
{
auto it = m_decoder.nameToIdx.find("box_coords");
if (it != m_decoder.nameToIdx.end()) {
int idx = it->second;
float boxCoords[4] = { 0.f, 0.f, 0.f, 0.f };
cudaMemcpyAsync(m_decoder.gpuBuffers[idx], boxCoords, sizeof(boxCoords),
cudaMemcpyHostToDevice, m_cudaStream);
const char* name = m_decoder.engine->getIOTensorName(idx);
nvinfer1::Dims d; d.nbDims = 3; d.d[0] = 1; d.d[1] = 1; d.d[2] = 4;
m_decoder.context->setInputShape(name, d);
}
}
{
auto it = m_decoder.nameToIdx.find("box_labels");
if (it != m_decoder.nameToIdx.end()) {
int idx = it->second;
const char* name = m_decoder.engine->getIOTensorName(idx);
auto dtype = m_decoder.engine->getTensorDataType(name);
if (dtype == nvinfer1::DataType::kINT64) {
int64_t val = -1;
cudaMemcpyAsync(m_decoder.gpuBuffers[idx], &val, sizeof(val),
cudaMemcpyHostToDevice, m_cudaStream);
} else {
int32_t val = -1;
cudaMemcpyAsync(m_decoder.gpuBuffers[idx], &val, sizeof(val),
cudaMemcpyHostToDevice, m_cudaStream);
}
nvinfer1::Dims d; d.nbDims = 2; d.d[0] = 1; d.d[1] = 1;
m_decoder.context->setInputShape(name, d);
}
}
{
auto it = m_decoder.nameToIdx.find("box_masks");
if (it != m_decoder.nameToIdx.end()) {
int idx = it->second;
const char* name = m_decoder.engine->getIOTensorName(idx);
auto dtype = m_decoder.engine->getTensorDataType(name);
if (dtype == nvinfer1::DataType::kBOOL) {
bool val = false;
cudaMemcpyAsync(m_decoder.gpuBuffers[idx], &val, sizeof(val),
cudaMemcpyHostToDevice, m_cudaStream);
} else {
int32_t val = 0;
cudaMemcpyAsync(m_decoder.gpuBuffers[idx], &val, sizeof(val),
cudaMemcpyHostToDevice, m_cudaStream);
}
nvinfer1::Dims d; d.nbDims = 2; d.d[0] = 1; d.d[1] = 1;
m_decoder.context->setInputShape(name, d);
}
}
// ---- 4) Run TRT decoder ----
// Find output tensor indices for boxes/scores/masks
std::string boxesTName, scoresTName, masksTName;
int boxesIdx = -1, scoresIdx = -1, masksIdx = -1;
for (auto& kv : m_decoder.nameToIdx) {
auto mode = m_decoder.engine->getTensorIOMode(kv.first.c_str());
if (mode != nvinfer1::TensorIOMode::kOUTPUT) continue;
if (kv.first.find("box") != std::string::npos && kv.first.find("mask") == std::string::npos)
{ boxesTName = kv.first; boxesIdx = kv.second; }
else if (kv.first.find("score") != std::string::npos)
{ scoresTName = kv.first; scoresIdx = kv.second; }
else if (kv.first.find("mask") != std::string::npos)
{ masksTName = kv.first; masksIdx = kv.second; }
}
if (boxesIdx < 0 || scoresIdx < 0 || masksIdx < 0) {
_logger.LogError("ANSSAM3::Detect", "Cannot find decoder output tensors", __FILE__, __LINE__);
return {};
}
#if NV_TENSORRT_MAJOR >= 10
// Stack-local allocators that return the pre-allocated gpuBuffers.
// Registering these enables getTensorShape() to return actual
// (not -1) dimensions for data-dependent outputs after enqueueV3.
PassthroughOutputAllocator boxAlloc (m_decoder.gpuBuffers[boxesIdx], m_decoder.gpuBufferSizes[boxesIdx]);
PassthroughOutputAllocator scoreAlloc(m_decoder.gpuBuffers[scoresIdx], m_decoder.gpuBufferSizes[scoresIdx]);
PassthroughOutputAllocator maskAlloc (m_decoder.gpuBuffers[masksIdx], m_decoder.gpuBufferSizes[masksIdx]);
m_decoder.context->setOutputAllocator(boxesTName.c_str(), &boxAlloc);
m_decoder.context->setOutputAllocator(scoresTName.c_str(), &scoreAlloc);
m_decoder.context->setOutputAllocator(masksTName.c_str(), &maskAlloc);
#endif
#if NV_TENSORRT_MAJOR >= 10
bool okDec = m_decoder.context->enqueueV3(m_cudaStream);
#else
bool okDec = m_decoder.context->enqueueV2(
reinterpret_cast<void**>(m_decoder.gpuBuffers.data()), m_cudaStream, nullptr);
#endif
cudaStreamSynchronize(m_cudaStream);
if (!okDec) {
_logger.LogError("ANSSAM3::Detect", "Decoder enqueue failed", __FILE__, __LINE__);
return {};
}
// ---- 5) Parse TRT decoder outputs: boxes [N,4], scores [N], masks [N,1,H,W] ----
int numBoxes = 0, maskH = 0, maskW = 0;
void* boxesGpu = nullptr;
void* scoresGpu = nullptr;
void* masksGpu = nullptr;
#if NV_TENSORRT_MAJOR >= 10
// Read actual shapes from notifyShape() callback
if (boxAlloc.shapeKnown) {
auto& s = boxAlloc.actualDims;
numBoxes = (s.nbDims >= 1) ? static_cast<int>(s.d[0]) : 0;
boxesGpu = boxAlloc.preAllocBuf;
}
if (scoreAlloc.shapeKnown) {
scoresGpu = scoreAlloc.preAllocBuf;
}
if (maskAlloc.shapeKnown) {
auto& s = maskAlloc.actualDims;
maskH = (s.nbDims >= 3) ? static_cast<int>(s.d[2]) : 0;
maskW = (s.nbDims >= 4) ? static_cast<int>(s.d[3]) : 0;
masksGpu = maskAlloc.preAllocBuf;
}
#else
{
auto boxShape = m_decoder.context->getTensorShape(boxesTName.c_str());
auto maskShape = m_decoder.context->getTensorShape(masksTName.c_str());
numBoxes = (boxShape.nbDims >= 1) ? static_cast<int>(boxShape.d[0]) : 0;
maskH = (maskShape.nbDims >= 3) ? static_cast<int>(maskShape.d[2]) : 0;
maskW = (maskShape.nbDims >= 4) ? static_cast<int>(maskShape.d[3]) : 0;
boxesGpu = m_decoder.gpuBuffers[boxesIdx];
scoresGpu = m_decoder.gpuBuffers[scoresIdx];
masksGpu = m_decoder.gpuBuffers[masksIdx];
}
#endif
if (numBoxes <= 0 || !boxesGpu || !scoresGpu) return {};
// Download decoder outputs from GPU → CPU for postprocessing
size_t boxesBytes = static_cast<size_t>(numBoxes) * 4 * sizeof(float);
size_t scoresBytes = static_cast<size_t>(numBoxes) * sizeof(float);
size_t masksBytes = static_cast<size_t>(numBoxes) * 1 * maskH * maskW * sizeof(bool);
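// NOTE: assumes sizeof(bool) == 1 (true for MSVC/GCC/Clang targets), so the
// bool mask tensor maps 1:1 onto the uint8 staging buffer below.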
std::vector<float> boxesCpu(numBoxes * 4);
std::vector<float> scoresCpu(numBoxes);
cudaMemcpy(boxesCpu.data(), boxesGpu, boxesBytes, cudaMemcpyDeviceToHost);
cudaMemcpy(scoresCpu.data(), scoresGpu, scoresBytes, cudaMemcpyDeviceToHost);
// Masks may be bool or float depending on TRT's internal optimization.
// Download to a raw buffer and convert to bool.
auto maskDtype = m_decoder.engine->getTensorDataType(masksTName.c_str());
std::vector<uint8_t> masksRaw(masksBytes);
if (masksGpu && maskH > 0 && maskW > 0) {
if (maskDtype == nvinfer1::DataType::kBOOL) {
cudaMemcpy(masksRaw.data(), masksGpu, masksBytes, cudaMemcpyDeviceToHost);
} else {
// Float masks — download and threshold
size_t floatBytes = static_cast<size_t>(numBoxes) * 1 * maskH * maskW * sizeof(float);
std::vector<float> masksFloat(static_cast<size_t>(numBoxes) * maskH * maskW);
cudaMemcpy(masksFloat.data(), masksGpu, floatBytes, cudaMemcpyDeviceToHost);
for (size_t j = 0; j < masksFloat.size(); ++j)
masksRaw[j] = masksFloat[j] > m_segThreshold ? 1 : 0;
}
}
// Convert raw buffer to bool pointer for PostprocessInstances
const bool* masksData = reinterpret_cast<const bool*>(masksRaw.data());
auto ret = PostprocessInstances(boxesCpu.data(), numBoxes, scoresCpu.data(),
masksData, maskH, maskW,
origW, origH, camera_id);
if (_trackerEnabled) {
ret = ApplyTracking(ret, camera_id);
if (_stabilizationEnabled) ret = StabilizeDetections(ret, camera_id);
}
return ret;
}
// =========================================================================
// PostprocessInstances — same logic as ONNXSAM3::postprocessResults
// =========================================================================
std::vector<Object> ANSSAM3::PostprocessInstances(
const float* boxesData, int numBoxes,
const float* scoresData,
const bool* masksData,
int maskH, int maskW,
int origWidth, int origHeight,
const std::string& camera_id)
{
std::vector<Object> results;
for (int i = 0; i < numBoxes; ++i) {
float score = scoresData[i];
if (score < m_segThreshold)
continue;
// Box: [x1, y1, x2, y2] in original image coordinates
float x1 = std::max(0.0f, std::min(boxesData[i * 4 + 0], static_cast<float>(origWidth)));
float y1 = std::max(0.0f, std::min(boxesData[i * 4 + 1], static_cast<float>(origHeight)));
float x2 = std::max(0.0f, std::min(boxesData[i * 4 + 2], static_cast<float>(origWidth)));
float y2 = std::max(0.0f, std::min(boxesData[i * 4 + 3], static_cast<float>(origHeight)));
cv::Rect box(static_cast<int>(x1), static_cast<int>(y1),
static_cast<int>(x2 - x1), static_cast<int>(y2 - y1));
if (box.width <= 0 || box.height <= 0)
continue;
// Extract this instance's mask: [1, H, W] at index i.
// masksData holds one byte per pixel (bool output, or float already
// thresholded during download) — convert to 0/255 uint8 for OpenCV.
cv::Mat boolMask(maskH, maskW, CV_8UC1);
size_t maskOffset = static_cast<size_t>(i) * 1 * maskH * maskW;
const bool* src = masksData + maskOffset;
for (int y = 0; y < maskH; ++y)
for (int x = 0; x < maskW; ++x)
boolMask.at<uint8_t>(y, x) = src[y * maskW + x] ? 255 : 0;
// Resize mask to original resolution
cv::Mat fullMask;
cv::resize(boolMask, fullMask, cv::Size(origWidth, origHeight), 0, 0, cv::INTER_LINEAR);
cv::threshold(fullMask, fullMask, 127, 255, cv::THRESH_BINARY);
// Crop to bounding box
cv::Mat roiMask = fullMask(box).clone();
Object obj;
obj.box = box;
obj.confidence = score;
obj.classId = 0;
obj.className = "object";
obj.cameraId = camera_id;
obj.mask = roiMask;
// Create normalized polygon from mask (closed, maxPoints-limited)
obj.polygon = ANSUtilityHelper::MaskToNormalizedPolygon(
roiMask, box,
static_cast<float>(origWidth), static_cast<float>(origHeight));
// Fallback: normalized box corners if mask polygon failed
if (obj.polygon.empty()) {
obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(
box, static_cast<float>(origWidth), static_cast<float>(origHeight));
}
results.push_back(std::move(obj));
}
return results;
}
// =========================================================================
// Destroy
// =========================================================================
bool ANSSAM3::Destroy()
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
try {
// TRT engine bundles
m_imgEncoder.destroy();
m_langEncoder.destroy();
m_decoder.destroy();
// Cached language encoder GPU buffers
if (m_cachedLangMask) { cudaFree(m_cachedLangMask); m_cachedLangMask = nullptr; }
m_cachedLangMaskBytes = 0;
if (m_cachedLangFeats) { cudaFree(m_cachedLangFeats); m_cachedLangFeats = nullptr; }
m_cachedLangFeatsBytes = 0;
if (m_cudaStream) { cudaStreamDestroy(m_cudaStream); m_cudaStream = nullptr; }
m_tokenizer.reset();
m_promptSet = false;
_modelLoadValid = false;
_isInitialized = false;
return true;
}
catch (const std::exception& e) {
_logger.LogFatal("ANSSAM3::Destroy", e.what(), __FILE__, __LINE__);
return false;
}
}
ANSSAM3::~ANSSAM3()
{
Destroy();
}
}