// File: ANSCORE/engines/ONNXEngine/ONNXSAM3.cpp
// (source listing header: 835 lines, 36 KiB, C++)
#include "ONNXSAM3.h"
#include "ONNXEngine.h" // OrtCompatiableGetInputName/OutputName helpers
#include <iostream>
#include <fstream>
#include <filesystem>
#include <cmath>
#include <algorithm>
#include <unordered_map>
namespace ANSCENTER
{
// ====================================================================
// SessionBundle destructor
// ====================================================================
ONNXSAM3::SessionBundle::~SessionBundle()
{
    // delete on a null pointer is a no-op, so no explicit check is needed.
    delete session;
    session = nullptr;
}
// ====================================================================
// EP helpers (same logic as BasicOrtHandler)
// ====================================================================
/// @brief Attempts to attach the CUDA execution provider to the given
///        session options.
/// @param session_options  options object the EP is appended to on success.
/// @return true if the CUDA EP was attached, false on any ORT failure.
bool ONNXSAM3::TryAppendCUDA(Ort::SessionOptions& session_options)
{
    // Declared outside the try so the catch handler can release it.
    // BUGFIX: the original leaked the provider-options object when
    // UpdateCUDAProviderOptions or AppendExecutionProvider_CUDA_V2 threw,
    // because ReleaseCUDAProviderOptions was only reached on success.
    OrtCUDAProviderOptionsV2* cuda_options = nullptr;
    try {
        Ort::GetApi().CreateCUDAProviderOptions(&cuda_options);
        const char* keys[] = {
            "device_id",
            "arena_extend_strategy",
            "cudnn_conv_algo_search",
            "cudnn_conv_use_max_workspace", // reduce cuDNN temp memory
            "do_copy_in_default_stream",    // allow async copies
        };
        const char* values[] = {
            "0",
            "kSameAsRequested",
            "HEURISTIC",
            "0", // 0 = minimal workspace
            "0", // 0 = use separate stream
        };
        // Derive the option count from the arrays instead of a magic literal
        // so adding a key/value pair cannot silently be dropped.
        constexpr size_t numOptions = sizeof(keys) / sizeof(keys[0]);
        static_assert(numOptions == sizeof(values) / sizeof(values[0]),
                      "CUDA EP keys/values arrays must stay in sync");
        Ort::GetApi().UpdateCUDAProviderOptions(cuda_options, keys, values, numOptions);
        session_options.AppendExecutionProvider_CUDA_V2(*cuda_options);
        Ort::GetApi().ReleaseCUDAProviderOptions(cuda_options);
        std::cout << "[ONNXSAM3] CUDA EP attached." << std::endl;
        return true;
    }
    catch (const Ort::Exception& e) {
        if (cuda_options)
            Ort::GetApi().ReleaseCUDAProviderOptions(cuda_options);
        std::cerr << "[ONNXSAM3] CUDA EP failed: " << e.what() << std::endl;
        return false;
    }
}
/// @brief Attempts to attach the DirectML execution provider (adapter 0).
/// @return true on success, false if ORT rejects the EP.
bool ONNXSAM3::TryAppendDirectML(Ort::SessionOptions& session_options)
{
    const std::unordered_map<std::string, std::string> dml_options{
        { "device_id", "0" }
    };
    try {
        session_options.AppendExecutionProvider("DML", dml_options);
    }
    catch (const Ort::Exception& e) {
        std::cerr << "[ONNXSAM3] DirectML EP failed: " << e.what() << std::endl;
        return false;
    }
    std::cout << "[ONNXSAM3] DirectML EP attached." << std::endl;
    return true;
}
/// @brief Attempts to attach the OpenVINO execution provider, trying device
///        targets in priority order (NPU+GPU, then discrete GPU, then
///        GPU+CPU auto) until one succeeds.
/// @return true if any configuration attached, false if all failed.
bool ONNXSAM3::TryAppendOpenVINO(Ort::SessionOptions& session_options)
{
    using OVConfig = std::unordered_map<std::string, std::string>;
    // Settings shared by every attempt; only device_type varies.
    const OVConfig base = {
        { "precision",      "FP16" },
        { "num_of_threads", "4" },
        { "num_streams",    "4" },
    };
    const char* deviceOrder[] = { "AUTO:NPU,GPU", "GPU.0", "AUTO:GPU,CPU" };
    for (const char* device : deviceOrder) {
        OVConfig config = base;
        config["device_type"] = device;
        try {
            session_options.AppendExecutionProvider_OpenVINO_V2(config);
            std::cout << "[ONNXSAM3] OpenVINO EP attached (" << device << ")." << std::endl;
            return true;
        }
        catch (const Ort::Exception&) {
            // This device target is unavailable — fall through to the next.
        }
    }
    return false;
}
// ====================================================================
// createSessionBundle — create one ORT session with EP + external data
// ====================================================================
/// @brief Creates one ORT session (with EP selection, external-data
///        pre-loading, and GPU→CPU fallback) and fills the bundle's
///        session pointer and cached input/output name tables.
/// @param bundle    destination bundle; its session pointer is overwritten.
/// @param onnxPath  path to the .onnx model file.
/// @param label     human-readable tag used in log messages.
/// @param forceCPU  when true, skip the configured GPU EP entirely.
/// @param optLevel  graph optimization level for this session.
/// @throws Ort::Exception if session creation fails on the CPU path too.
void ONNXSAM3::createSessionBundle(SessionBundle& bundle,
                                   const std::string& onnxPath,
                                   const std::string& label,
                                   bool forceCPU,
                                   GraphOptimizationLevel optLevel)
{
    std::cout << "[ONNXSAM3] Creating " << label << " session..." << std::endl;
    Ort::SessionOptions opts;
    opts.SetIntraOpNumThreads(m_numThreads);
    opts.SetGraphOptimizationLevel(optLevel);
    opts.SetLogSeverityLevel(4);
    // Determine effective engine type (forceCPU overrides the configured EP).
    EngineType engine = forceCPU ? EngineType::CPU : m_engineType;
    if (forceCPU)
        std::cout << "[ONNXSAM3] " << label << ": forced to CPU to save GPU memory." << std::endl;
    std::vector<std::string> available = Ort::GetAvailableProviders();
    auto hasProvider = [&](const std::string& name) {
        return std::find(available.begin(), available.end(), name) != available.end();
    };
    bool epAttached = false;
    switch (engine)
    {
    case EngineType::NVIDIA_GPU:
        if (hasProvider("CUDAExecutionProvider"))
            epAttached = TryAppendCUDA(opts);
        break;
    case EngineType::AMD_GPU:
        if (hasProvider("DmlExecutionProvider"))
            epAttached = TryAppendDirectML(opts);
        break;
    case EngineType::OPENVINO_GPU:
        if (hasProvider("OpenVINOExecutionProvider"))
            epAttached = TryAppendOpenVINO(opts);
        break;
    case EngineType::CPU:
    default:
        epAttached = true;
        break;
    }
    if (!epAttached)
        std::cout << "[ONNXSAM3] " << label << ": using CPU EP." << std::endl;
    // -- CWD workaround for external data resolution --
    // BUGFIX: the working-directory change is now restored by RAII, so the
    // process does not remain inside the model directory when session
    // creation throws. The previous code only restored the CWD on the
    // success path and skipped it when the Ort::Exception was rethrown.
    struct CwdGuard {
        std::filesystem::path prev;
        explicit CwdGuard(std::filesystem::path p) : prev(std::move(p)) {}
        CwdGuard(const CwdGuard&) = delete;
        CwdGuard& operator=(const CwdGuard&) = delete;
        ~CwdGuard() {
            std::error_code ec;                      // never throw from a dtor
            std::filesystem::current_path(prev, ec); // best-effort restore
        }
    };
    std::filesystem::path modelFsPath(onnxPath);
    std::filesystem::path modelDir = modelFsPath.parent_path();
    CwdGuard cwdGuard(std::filesystem::current_path());
    if (!modelDir.empty() && std::filesystem::is_directory(modelDir))
        std::filesystem::current_path(modelDir);
    // -- Pre-load external data file if one matches the model stem --
    // The external data filename stored inside the .onnx protobuf may
    // differ from the .onnx filename on disk (e.g. anssam3_image_encoder.onnx
    // internally references sam3_image_encoder.onnx.data). We only
    // pre-load when a stem-based candidate exists on disk. If no match
    // is found, we load the model from its FILE PATH (not memory buffer)
    // so that ORT resolves external data relative to the model directory.
    std::vector<char> extDataBuffer;
    std::filesystem::path extDataPath;
    {
        std::wstring stem = modelFsPath.stem().wstring();
        std::vector<std::filesystem::path> candidates = {
            modelDir / (stem + L".onnx_data"), // monolithic convention
            modelDir / (stem + L".onnx.data"), // split-model convention
        };
        for (auto& c : candidates) {
            if (std::filesystem::exists(c)) {
                extDataPath = c;
                break;
            }
        }
        if (extDataPath.empty()) {
            std::cout << "[ONNXSAM3] " << label
                      << ": no stem-matched external data; "
                      << "ORT will resolve from model directory." << std::endl;
        }
    }
    if (!extDataPath.empty()) {
        auto fileSize = std::filesystem::file_size(extDataPath);
        std::cout << "[ONNXSAM3] " << label << ": external data "
                  << extDataPath.filename().string()
                  << " (" << (fileSize / (1024 * 1024)) << " MB)" << std::endl;
        try {
            std::ifstream ifs(extDataPath, std::ios::binary);
            if (ifs) {
                extDataBuffer.resize(static_cast<size_t>(fileSize));
                ifs.read(extDataBuffer.data(), static_cast<std::streamsize>(fileSize));
                ifs.close();
                // Register the in-memory blob under the filename the protobuf
                // references, so ORT never touches the file again.
                std::vector<std::basic_string<ORTCHAR_T>> extFileNames = {
                    extDataPath.filename().wstring()
                };
                std::vector<char*> extBuffers = { extDataBuffer.data() };
                std::vector<size_t> extLengths = { extDataBuffer.size() };
                opts.AddExternalInitializersFromFilesInMemory(
                    extFileNames, extBuffers, extLengths);
            }
        }
        catch (const std::bad_alloc&) {
            std::cerr << "[ONNXSAM3] " << label
                      << ": could not allocate memory for external data. "
                      << "ORT will use file mapping." << std::endl;
            extDataBuffer.clear();
            extDataBuffer.shrink_to_fit();
        }
    }
    // -- Load .onnx proto into memory --
    std::vector<char> modelBuffer;
    bool useModelBuffer = false;
    try {
        auto modelFileSize = std::filesystem::file_size(modelFsPath);
        modelBuffer.resize(static_cast<size_t>(modelFileSize));
        std::ifstream mifs(modelFsPath, std::ios::binary);
        if (mifs) {
            mifs.read(modelBuffer.data(), static_cast<std::streamsize>(modelFileSize));
            mifs.close();
            useModelBuffer = true;
        }
    }
    catch (const std::exception& e) {
        std::cerr << "[ONNXSAM3] " << label
                  << ": could not read model file: " << e.what() << std::endl;
    }
    // -- Create session (with GPU → CPU fallback) --
    // BUGFIX: convert through std::filesystem::path so non-ASCII paths use
    // the native narrow encoding; the previous
    // std::wstring(onnxPath.begin(), onnxPath.end()) widened bytes one at a
    // time and produced garbage for any non-ASCII character.
    std::wstring onnxPathW = modelFsPath.wstring();
    auto doCreate = [&](Ort::SessionOptions& sopts, const char* tag) {
        // Use memory-buffer loading when external data has been pre-loaded;
        // otherwise use file-path loading so ORT can resolve external data
        // relative to the model's directory on disk.
        if (useModelBuffer && !extDataBuffer.empty())
            bundle.session = new Ort::Session(*m_env, modelBuffer.data(), modelBuffer.size(), sopts);
        else
            bundle.session = new Ort::Session(*m_env, onnxPathW.c_str(), sopts);
        std::cout << "[ONNXSAM3] " << label << " session created (" << tag << ")." << std::endl;
    };
    try {
        doCreate(opts, "primary EP");
    }
    catch (const Ort::Exception& e) {
        std::cerr << "[ONNXSAM3] " << label << " session FAILED: " << e.what() << std::endl;
        if (engine != EngineType::CPU && epAttached) {
            std::cerr << "[ONNXSAM3] " << label << ": retrying with CPU..." << std::endl;
            Ort::SessionOptions cpuOpts;
            cpuOpts.SetIntraOpNumThreads(m_numThreads);
            cpuOpts.SetGraphOptimizationLevel(optLevel);
            cpuOpts.SetLogSeverityLevel(4);
            // Re-register the external data blob: options objects do not
            // share state, so the CPU options need their own registration.
            if (!extDataBuffer.empty()) {
                std::vector<std::basic_string<ORTCHAR_T>> extFileNames = {
                    extDataPath.filename().wstring()
                };
                std::vector<char*> extBuffers = { extDataBuffer.data() };
                std::vector<size_t> extLengths = { extDataBuffer.size() };
                cpuOpts.AddExternalInitializersFromFilesInMemory(
                    extFileNames, extBuffers, extLengths);
            }
            doCreate(cpuOpts, "CPU fallback");
        } else {
            throw; // CWD restored by cwdGuard even on this path
        }
    }
    // Free staging buffers (session holds its own copies by now).
    extDataBuffer.clear(); extDataBuffer.shrink_to_fit();
    modelBuffer.clear(); modelBuffer.shrink_to_fit();
    // -- Read input/output names --
    // inputNames_/outputNames_ own the strings; inputNames/outputNames are
    // the const char* views ORT's Run() API consumes.
    Ort::Allocator allocator(*bundle.session, *m_memInfo);
    size_t numInputs = bundle.session->GetInputCount();
    bundle.inputNames_.resize(numInputs);
    bundle.inputNames.resize(numInputs);
    for (size_t i = 0; i < numInputs; ++i) {
        bundle.inputNames_[i] = OrtCompatiableGetInputName(i, allocator, bundle.session);
        bundle.inputNames[i] = bundle.inputNames_[i].c_str();
    }
    size_t numOutputs = bundle.session->GetOutputCount();
    bundle.outputNames_.resize(numOutputs);
    bundle.outputNames.resize(numOutputs);
    for (size_t i = 0; i < numOutputs; ++i) {
        bundle.outputNames_[i] = OrtCompatiableGetOutputName(i, allocator, bundle.session);
        bundle.outputNames[i] = bundle.outputNames_[i].c_str();
    }
    // Log I/O info (dynamic dims print as -1).
    for (size_t i = 0; i < numInputs; ++i) {
        auto info = bundle.session->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo();
        auto shape = info.GetShape();
        std::cout << "[ONNXSAM3] " << label << " input[" << i << "]: "
                  << bundle.inputNames_[i] << " shape=[";
        for (size_t d = 0; d < shape.size(); ++d) {
            if (d > 0) std::cout << ",";
            std::cout << shape[d];
        }
        std::cout << "]" << std::endl;
    }
    for (size_t i = 0; i < numOutputs; ++i) {
        auto info = bundle.session->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo();
        auto shape = info.GetShape();
        std::cout << "[ONNXSAM3] " << label << " output[" << i << "]: "
                  << bundle.outputNames_[i] << " shape=[";
        for (size_t d = 0; d < shape.size(); ++d) {
            if (d > 0) std::cout << ",";
            std::cout << shape[d];
        }
        std::cout << "]" << std::endl;
    }
    // cwdGuard restores the original working directory on scope exit.
}
// ====================================================================
// Constructor
// ====================================================================
/// @brief Builds the three-session SAM3 pipeline (image encoder, language
///        encoder, decoder) from the given model folder.
/// @param modelFolder  directory containing the three sam3_*.onnx files.
/// @param engineType   preferred execution provider for GPU-eligible sessions.
/// @param num_threads  intra-op thread count for every session.
ONNXSAM3::ONNXSAM3(const std::string& modelFolder,
                   EngineType engineType,
                   unsigned int num_threads)
    : m_engineType(engineType),
      m_numThreads(num_threads),
      m_modelFolder(modelFolder)
{
    // Bind the ORT C API before constructing any Ort:: object.
    const auto& epInfo = EPLoader::Current();
    (void)epInfo; // touched only for its loading side effect
    if (Ort::Global<void>::api_ == nullptr)
        Ort::InitApi(static_cast<const OrtApi*>(EPLoader::GetOrtApiRaw()));
    m_env = new Ort::Env(ORT_LOGGING_LEVEL_ERROR, "ONNXSAM3");
    m_memInfo = new Ort::MemoryInfo(
        Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault));
    // Model file locations inside the folder.
    const std::string encoderPath  = modelFolder + "\\sam3_image_encoder.onnx";
    const std::string languagePath = modelFolder + "\\sam3_language_encoder.onnx";
    const std::string decoderPath  = modelFolder + "\\sam3_decoder.onnx";
    // Create the three sessions.
    // Language encoder runs on CPU: it is only called once per prompt
    // change and keeping it on GPU wastes ~1.5 GB of VRAM that the
    // image encoder and decoder need for their activation tensors.
    // Image encoder uses ORT_ENABLE_BASIC: enables constant folding
    // and redundant-node elimination without the complex fusions
    // (MatMulScaleFusion) that are slower on this model and risk OOM
    // on 8 GB GPUs. Benchmarked: BASIC=4.6s vs ALL=5.2s.
    createSessionBundle(m_imageEncoder, encoderPath, "ImageEncoder", false,
                        GraphOptimizationLevel::ORT_ENABLE_BASIC);
    createSessionBundle(m_langEncoder, languagePath, "LangEncoder", true,
                        GraphOptimizationLevel::ORT_ENABLE_ALL);
    createSessionBundle(m_decoder, decoderPath, "Decoder");
    std::cout << "[ONNXSAM3] All 3 sessions created successfully." << std::endl;
}
// ====================================================================
// Destructor
// ====================================================================
ONNXSAM3::~ONNXSAM3()
{
    // Teardown order matters: sessions must die before the Ort::Env that
    // created them, and member destructors only run after this body — so
    // release everything explicitly, newest-created first.
    // (delete on nullptr is a no-op, so no null checks are required.)
    delete m_decoder.session;      m_decoder.session = nullptr;
    delete m_langEncoder.session;  m_langEncoder.session = nullptr;
    delete m_imageEncoder.session; m_imageEncoder.session = nullptr;
    delete m_memInfo;              m_memInfo = nullptr;
    delete m_env;                  m_env = nullptr;
}
// ====================================================================
// preprocessImage — BGR → RGB, resize to 1008, HWC→CHW, uint8
// ====================================================================
/// @brief Converts a BGR frame into the encoder's uint8 CHW input layout.
/// @param mat     input image (OpenCV BGR, any resolution).
/// @param buffer  out-parameter filled with [3, m_inputSize, m_inputSize]
///                RGB planes, channel-major.
void ONNXSAM3::preprocessImage(const cv::Mat& mat, std::vector<uint8_t>& buffer)
{
    // 3-model image encoder expects uint8 [3, 1008, 1008].
    cv::Mat resized;
    cv::resize(mat, resized, cv::Size(m_inputSize, m_inputSize));
    cv::Mat rgb;
    cv::cvtColor(resized, rgb, cv::COLOR_BGR2RGB);
    const size_t planeSize = static_cast<size_t>(m_inputSize) * m_inputSize;
    buffer.resize(3 * planeSize);
    // HWC → CHW via cv::split + bulk copy (much faster than a per-pixel
    // loop). Each plane produced by cv::split is freshly allocated and
    // therefore continuous, so a flat copy of planeSize bytes is valid.
    // BUGFIX: std::copy_n (from <algorithm>, already included) replaces
    // std::memcpy, which was used without including <cstring>.
    cv::Mat channels[3];
    cv::split(rgb, channels);
    for (int c = 0; c < 3; ++c)
        std::copy_n(channels[c].ptr<uint8_t>(0), planeSize,
                    buffer.data() + c * planeSize);
}
// ====================================================================
// setPrompt — run language encoder, cache results
// ====================================================================
/// @brief Runs the language encoder once for a tokenized prompt and caches
///        the mask/feature tensors that detect() feeds to the decoder.
/// @param inputIds       prompt token ids, shape [1, N] (N typically 32).
/// @param attentionMask  accepted for interface compatibility; unused —
///                       the exported encoder takes only "tokens" and
///                       emits its own text_attention_mask output.
void ONNXSAM3::setPrompt(const std::vector<int64_t>& inputIds,
                         const std::vector<int64_t>& attentionMask)
{
    if (!m_langEncoder.session) {
        std::cerr << "[ONNXSAM3] Language encoder not initialized." << std::endl;
        return;
    }
    // Explicitly discard the unused parameter (kept for ABI/API stability).
    (void)attentionMask;
    // Language encoder input: "tokens" [1, 32] int64
    std::vector<int64_t> tokenShape = { 1, static_cast<int64_t>(inputIds.size()) };
    m_tokenLength = static_cast<int>(inputIds.size());
    // Non-const copy: CreateTensor wraps (does not copy) the buffer, so it
    // must be mutable and stay alive until Run() returns.
    std::vector<int64_t> tokenData = inputIds;
    Ort::Value tokenTensor = Ort::Value::CreateTensor<int64_t>(
        *m_memInfo, tokenData.data(), tokenData.size(),
        tokenShape.data(), tokenShape.size());
    std::vector<Ort::Value> inputs;
    inputs.push_back(std::move(tokenTensor));
    // Run language encoder
    std::cout << "[ONNXSAM3] Running language encoder..." << std::endl;
    auto outputs = m_langEncoder.session->Run(
        Ort::RunOptions{nullptr},
        m_langEncoder.inputNames.data(),
        inputs.data(),
        inputs.size(),
        m_langEncoder.outputNames.data(),
        m_langEncoder.outputNames.size());
    // Language encoder outputs (from Python analysis):
    //   output[0]: text_attention_mask [1, 32] bool     → "language_mask"
    //   output[1]: text_memory [32, 1, 256] float32     → "language_features"
    //   output[2]: text_embeds [32, 1, 1024] float32    → NOT used by decoder
    // BUGFIX: guard the positional fallback below — with fewer than two
    // outputs, the old featIdx = 1 default indexed past the end of outputs.
    if (outputs.size() < 2) {
        std::cerr << "[ONNXSAM3] Language encoder returned only "
                  << outputs.size() << " outputs (expected >= 2)." << std::endl;
        return;
    }
    // Find outputs by name or fall back to positional convention.
    int maskIdx = -1, featIdx = -1;
    for (size_t i = 0; i < m_langEncoder.outputNames_.size(); ++i) {
        const auto& name = m_langEncoder.outputNames_[i];
        if (name.find("attention_mask") != std::string::npos ||
            name.find("text_attention") != std::string::npos) {
            maskIdx = static_cast<int>(i);
        }
        else if (name.find("text_memory") != std::string::npos ||
                 name.find("memory") != std::string::npos) {
            featIdx = static_cast<int>(i);
        }
    }
    // Fallback: first output is mask, second is features.
    if (maskIdx < 0) maskIdx = 0;
    if (featIdx < 0) featIdx = 1;
    // Cache language mask (bool → one byte per element).
    {
        auto info = outputs[maskIdx].GetTensorTypeAndShapeInfo();
        m_cachedLangMaskShape = info.GetShape();
        size_t count = info.GetElementCount();
        const bool* data = outputs[maskIdx].GetTensorData<bool>();
        m_cachedLangMask.resize(count);
        for (size_t i = 0; i < count; ++i)
            m_cachedLangMask[i] = data[i] ? 1 : 0;
    }
    // Cache language features (float32).
    {
        auto info = outputs[featIdx].GetTensorTypeAndShapeInfo();
        m_cachedLangFeaturesShape = info.GetShape();
        size_t count = info.GetElementCount();
        const float* data = outputs[featIdx].GetTensorData<float>();
        m_cachedLangFeatures.assign(data, data + count);
    }
    m_promptSet = true;
    std::cout << "[ONNXSAM3] Language encoder done. Mask shape=[";
    for (size_t i = 0; i < m_cachedLangMaskShape.size(); ++i) {
        if (i > 0) std::cout << ",";
        std::cout << m_cachedLangMaskShape[i];
    }
    std::cout << "] Features shape=[";
    for (size_t i = 0; i < m_cachedLangFeaturesShape.size(); ++i) {
        if (i > 0) std::cout << ",";
        std::cout << m_cachedLangFeaturesShape[i];
    }
    std::cout << "]" << std::endl;
}
// ====================================================================
// detect — image encoder + decoder pipeline
// ====================================================================
/// @brief Runs the full per-frame pipeline: image encoder → decoder, using
///        the language prompt cached by setPrompt().
/// @param mat           BGR input frame (any resolution; resized internally).
/// @param segThreshold  minimum decoder score forwarded to postprocessing.
/// @return one SAM3Result per surviving detection; empty on error or when
///         no prompt has been set.
std::vector<SAM3Result> ONNXSAM3::detect(const cv::Mat& mat, float segThreshold)
{
if (mat.empty()) return {};
if (!m_promptSet) {
std::cerr << "[ONNXSAM3] No prompt set. Call setPrompt() first." << std::endl;
return {};
}
const int origW = mat.cols;
const int origH = mat.rows;
// ---- 1) Image Encoder ----
// NOTE: imgBuffer must stay alive until Run() returns — CreateTensor
// wraps the caller's buffer rather than copying it.
std::vector<uint8_t> imgBuffer;
preprocessImage(mat, imgBuffer);
std::vector<int64_t> imgShape = { 3, m_inputSize, m_inputSize };
Ort::Value imgTensor = Ort::Value::CreateTensor<uint8_t>(
*m_memInfo, imgBuffer.data(), imgBuffer.size(),
imgShape.data(), imgShape.size());
std::vector<Ort::Value> imgInputs;
imgInputs.push_back(std::move(imgTensor));
auto imgOutputs = m_imageEncoder.session->Run(
Ort::RunOptions{nullptr},
m_imageEncoder.inputNames.data(),
imgInputs.data(),
imgInputs.size(),
m_imageEncoder.outputNames.data(),
m_imageEncoder.outputNames.size());
// Image encoder outputs (6 total, matched by name):
// vision_pos_enc_0/1/2 — only _2 used by decoder
// backbone_fpn_0/1/2 — all 3 used by decoder
// Build a map from image encoder output names to indices
std::unordered_map<std::string, int> imgOutputMap;
for (size_t i = 0; i < m_imageEncoder.outputNames_.size(); ++i)
imgOutputMap[m_imageEncoder.outputNames_[i]] = static_cast<int>(i);
// Release unused outputs (vision_pos_enc_0, vision_pos_enc_1) to free
// GPU memory before running the decoder. These are ~105 MB on CUDA.
// Overwriting with an empty Ort::Value drops the underlying OrtValue.
for (size_t i = 0; i < m_imageEncoder.outputNames_.size(); ++i) {
const auto& name = m_imageEncoder.outputNames_[i];
if (name == "vision_pos_enc_0" || name == "vision_pos_enc_1") {
imgOutputs[i] = Ort::Value(nullptr);
}
}
// ---- 2) Build decoder inputs ----
size_t numDecInputs = m_decoder.inputNames.size();
std::vector<Ort::Value> decInputs;
decInputs.reserve(numDecInputs);
// Prepare scalar and prompt tensors.
// NOTE: these stack buffers are wrapped, not copied, by CreateTensor —
// they must outlive the decoder Run() below (they do: same scope).
int64_t origHeightVal = static_cast<int64_t>(origH);
int64_t origWidthVal = static_cast<int64_t>(origW);
std::vector<int64_t> scalarShape = {}; // scalar = 0-dim tensor
float boxCoordsData[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
std::vector<int64_t> boxCoordsShape = { 1, 1, 4 };
int64_t boxLabelsData[1] = { -1 }; // no box prompt
std::vector<int64_t> boxLabelsShape = { 1, 1 };
bool boxMasksData[1] = { false }; // no box prompt (language-only grounding)
std::vector<int64_t> boxMasksShape = { 1, 1 };
// Build input tensors in the order expected by the decoder (matched by
// the decoder's own input names, so graph input order is respected).
for (size_t i = 0; i < numDecInputs; ++i) {
const std::string& name = m_decoder.inputNames_[i];
if (name == "original_height") {
decInputs.push_back(Ort::Value::CreateTensor<int64_t>(
*m_memInfo, &origHeightVal, 1, scalarShape.data(), scalarShape.size()));
}
else if (name == "original_width") {
decInputs.push_back(Ort::Value::CreateTensor<int64_t>(
*m_memInfo, &origWidthVal, 1, scalarShape.data(), scalarShape.size()));
}
else if (name == "backbone_fpn_0" || name == "backbone_fpn_1" ||
name == "backbone_fpn_2" || name == "vision_pos_enc_2") {
// Find matching image encoder output by name and MOVE it into the
// decoder input list (avoids copying the large feature tensors).
auto it = imgOutputMap.find(name);
if (it != imgOutputMap.end()) {
decInputs.push_back(std::move(imgOutputs[it->second]));
} else {
std::cerr << "[ONNXSAM3] Image encoder output not found: " << name << std::endl;
float dummy = 0.0f;
std::vector<int64_t> dummyShape = { 1 };
decInputs.push_back(Ort::Value::CreateTensor<float>(
*m_memInfo, &dummy, 1, dummyShape.data(), dummyShape.size()));
}
}
else if (name == "language_mask") {
// NOTE(review): assumes m_cachedLangMask elements are exactly one
// byte each so the reinterpret_cast to bool* is layout-compatible —
// confirm the member's element type in the header.
decInputs.push_back(Ort::Value::CreateTensor<bool>(
*m_memInfo,
reinterpret_cast<bool*>(m_cachedLangMask.data()),
m_cachedLangMask.size(),
m_cachedLangMaskShape.data(),
m_cachedLangMaskShape.size()));
}
else if (name == "language_features") {
decInputs.push_back(Ort::Value::CreateTensor<float>(
*m_memInfo,
m_cachedLangFeatures.data(),
m_cachedLangFeatures.size(),
m_cachedLangFeaturesShape.data(),
m_cachedLangFeaturesShape.size()));
}
else if (name == "box_coords") {
decInputs.push_back(Ort::Value::CreateTensor<float>(
*m_memInfo, boxCoordsData, 4,
boxCoordsShape.data(), boxCoordsShape.size()));
}
else if (name == "box_labels") {
decInputs.push_back(Ort::Value::CreateTensor<int64_t>(
*m_memInfo, boxLabelsData, 1,
boxLabelsShape.data(), boxLabelsShape.size()));
}
else if (name == "box_masks") {
decInputs.push_back(Ort::Value::CreateTensor<bool>(
*m_memInfo, boxMasksData, 1,
boxMasksShape.data(), boxMasksShape.size()));
}
else {
std::cerr << "[ONNXSAM3] Unknown decoder input: " << name << std::endl;
// Create a dummy scalar float tensor
float dummy = 0.0f;
std::vector<int64_t> dummyShape = { 1 };
decInputs.push_back(Ort::Value::CreateTensor<float>(
*m_memInfo, &dummy, 1, dummyShape.data(), dummyShape.size()));
}
}
// ---- Debug: print decoder input stats for comparison with ANSSAM3 ----
// NOTE: this block makes a full pass over every float input tensor on
// every call — it is diagnostic output, not needed for inference.
{
std::cout << "[ONNXSAM3] Decoder inputs before Run:" << std::endl;
for (size_t di = 0; di < numDecInputs; ++di) {
const std::string& dname = m_decoder.inputNames_[di];
auto info = decInputs[di].GetTensorTypeAndShapeInfo();
auto shape = info.GetShape();
auto elemType = info.GetElementType();
std::cout << " " << dname << " type=" << elemType << " shape=[";
for (size_t d = 0; d < shape.size(); ++d) {
if (d > 0) std::cout << ",";
std::cout << shape[d];
}
std::cout << "]";
// Print mean/first5 for float tensors
if (elemType == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT && !shape.empty()) {
size_t numElems = info.GetElementCount();
if (numElems > 0 && numElems < 100000000) {
const float* data = decInputs[di].GetTensorData<float>();
double sum = 0;
for (size_t k = 0; k < numElems; ++k) sum += data[k];
double mean = sum / numElems;
std::cout << " mean=" << mean << " first5:";
for (size_t k = 0; k < std::min(numElems, (size_t)5); ++k)
std::cout << " " << data[k];
}
}
// Print bool tensor values (for language_mask)
else if (elemType == ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL && !shape.empty()) {
size_t numElems = info.GetElementCount();
const bool* data = decInputs[di].GetTensorData<bool>();
std::cout << " vals:";
for (size_t k = 0; k < std::min(numElems, (size_t)32); ++k)
std::cout << " " << (int)data[k];
}
// Print int64 scalar value
else if (elemType == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64 && shape.empty()) {
const int64_t* data = decInputs[di].GetTensorData<int64_t>();
std::cout << " value=" << data[0];
}
std::cout << std::endl;
}
}
// ---- 3) Run decoder ----
auto decOutputs = m_decoder.session->Run(
Ort::RunOptions{nullptr},
m_decoder.inputNames.data(),
decInputs.data(),
decInputs.size(),
m_decoder.outputNames.data(),
m_decoder.outputNames.size());
// Decoder outputs (from Python analysis):
// output[0]: boxes [N, 4] float32
// output[1]: scores [N] float32
// output[2]: masks [N, 1, H, W] bool
// Find output indices by name (positional defaults if no name matches).
int boxesIdx = 0, scoresIdx = 1, masksIdx = 2;
for (size_t i = 0; i < m_decoder.outputNames_.size(); ++i) {
const auto& name = m_decoder.outputNames_[i];
if (name.find("box") != std::string::npos) boxesIdx = static_cast<int>(i);
else if (name.find("score") != std::string::npos) scoresIdx = static_cast<int>(i);
else if (name.find("mask") != std::string::npos) masksIdx = static_cast<int>(i);
}
// Get boxes
auto boxInfo = decOutputs[boxesIdx].GetTensorTypeAndShapeInfo();
auto boxShape = boxInfo.GetShape();
int numBoxes = (boxShape.size() >= 1) ? static_cast<int>(boxShape[0]) : 0;
const float* boxesData = decOutputs[boxesIdx].GetTensorData<float>();
// Get scores
const float* scoresData = decOutputs[scoresIdx].GetTensorData<float>();
// Get masks
auto maskInfo = decOutputs[masksIdx].GetTensorTypeAndShapeInfo();
auto maskShape = maskInfo.GetShape();
// masks shape: [N, 1, H, W]
int maskH = (maskShape.size() >= 3) ? static_cast<int>(maskShape[2]) : 0;
int maskW = (maskShape.size() >= 4) ? static_cast<int>(maskShape[3]) : 0;
const bool* masksData = decOutputs[masksIdx].GetTensorData<bool>();
m_maskH = maskH;
m_maskW = maskW;
std::cout << "[ONNXSAM3] Decoder: " << numBoxes << " detections, "
<< "mask=" << maskH << "x" << maskW << std::endl;
// NOTE: the raw output pointers above remain owned by decOutputs, which
// stays alive through this call — postprocessResults must not retain them.
return postprocessResults(boxesData, numBoxes, scoresData,
masksData, maskH, maskW,
origW, origH, segThreshold);
}
// ====================================================================
// postprocessResults — convert decoder outputs to SAM3Result
// ====================================================================
/// @brief Converts raw decoder tensors into SAM3Result objects: threshold
///        by score, clamp boxes, upscale per-instance masks, and polygonize.
/// @param boxesData      [N, 4] float32 boxes, [x1, y1, x2, y2] in original
///                       image coordinates (borrowed pointer, not retained).
/// @param numBoxes       number of detections N.
/// @param scoresData     [N] float32 confidence scores.
/// @param masksData      [N, 1, maskH, maskW] bool instance masks.
/// @param maskH, maskW   decoder mask plane dimensions.
/// @param origWidth, origHeight  original image dimensions for clamping
///                       and mask upscaling.
/// @param scoreThreshold minimum score a detection must reach.
/// @return surviving detections with box, confidence, ROI mask and polygon.
std::vector<SAM3Result> ONNXSAM3::postprocessResults(
    const float* boxesData, int numBoxes,
    const float* scoresData,
    const bool* masksData, int maskH, int maskW,
    int origWidth, int origHeight,
    float scoreThreshold)
{
    std::vector<SAM3Result> results;
    // Robustness: degenerate decoder output (no detections, empty mask
    // plane, or null tensors) yields an empty result set instead of
    // constructing invalid cv::Mat headers or dereferencing null below.
    if (!boxesData || !scoresData || !masksData ||
        numBoxes <= 0 || maskH <= 0 || maskW <= 0 ||
        origWidth <= 0 || origHeight <= 0)
        return results;
    // masksData layout: [N, 1, H, W] — stride between instances in elements.
    const size_t instanceStride = static_cast<size_t>(maskH) * maskW;
    for (int i = 0; i < numBoxes; ++i) {
        float score = scoresData[i];
        if (score < scoreThreshold)
            continue;
        // Box: [x1, y1, x2, y2] in original image coordinates
        float x1 = boxesData[i * 4 + 0];
        float y1 = boxesData[i * 4 + 1];
        float x2 = boxesData[i * 4 + 2];
        float y2 = boxesData[i * 4 + 3];
        // Clamp to image bounds
        x1 = std::max(0.0f, std::min(x1, static_cast<float>(origWidth)));
        y1 = std::max(0.0f, std::min(y1, static_cast<float>(origHeight)));
        x2 = std::max(0.0f, std::min(x2, static_cast<float>(origWidth)));
        y2 = std::max(0.0f, std::min(y2, static_cast<float>(origHeight)));
        SAM3Result obj;
        obj.box = cv::Rect(
            static_cast<int>(x1), static_cast<int>(y1),
            static_cast<int>(x2 - x1), static_cast<int>(y2 - y1));
        obj.confidence = score;
        if (obj.box.width <= 0 || obj.box.height <= 0)
            continue;
        // Expand this instance's bool mask to {0,255} bytes, row by row
        // (row pointers avoid per-element Mat::at index arithmetic).
        const bool* instanceMask = masksData + static_cast<size_t>(i) * instanceStride;
        cv::Mat boolMask(maskH, maskW, CV_8UC1);
        for (int y = 0; y < maskH; ++y) {
            uint8_t* dst = boolMask.ptr<uint8_t>(y);
            const bool* src = instanceMask + static_cast<size_t>(y) * maskW;
            for (int x = 0; x < maskW; ++x)
                dst[x] = src[x] ? 255 : 0;
        }
        // Upscale to original resolution, then re-binarize (INTER_LINEAR
        // produces intermediate grey values at the mask boundary).
        cv::Mat fullMask;
        cv::resize(boolMask, fullMask, cv::Size(origWidth, origHeight),
                   0, 0, cv::INTER_LINEAR);
        cv::threshold(fullMask, fullMask, 127, 255, cv::THRESH_BINARY);
        // Crop mask to the bounding box; stored ROI-relative.
        obj.mask = fullMask(obj.box).clone();
        // Polygonize: largest external contour, simplified to ~1% of its
        // arc length. findContours gets a clone because it may modify input.
        std::vector<std::vector<cv::Point>> contours;
        cv::findContours(obj.mask.clone(), contours, cv::RETR_EXTERNAL,
                         cv::CHAIN_APPROX_SIMPLE);
        if (!contours.empty()) {
            // Use the largest contour
            int largestIdx = 0;
            double largestArea = 0;
            for (int c = 0; c < static_cast<int>(contours.size()); ++c) {
                double area = cv::contourArea(contours[c]);
                if (area > largestArea) {
                    largestArea = area;
                    largestIdx = c;
                }
            }
            std::vector<cv::Point> approx;
            double epsilon = 0.01 * cv::arcLength(contours[largestIdx], true);
            cv::approxPolyDP(contours[largestIdx], approx, epsilon, true);
            // Shift ROI-relative contour points back to image coordinates.
            for (const auto& pt : approx) {
                obj.polygon.emplace_back(
                    static_cast<float>(pt.x + obj.box.x),
                    static_cast<float>(pt.y + obj.box.y));
            }
        }
        // Fallback: degenerate contour → use the box corners as the polygon.
        if (obj.polygon.size() < 3) {
            obj.polygon = {
                cv::Point2f(static_cast<float>(obj.box.x),
                            static_cast<float>(obj.box.y)),
                cv::Point2f(static_cast<float>(obj.box.x + obj.box.width),
                            static_cast<float>(obj.box.y)),
                cv::Point2f(static_cast<float>(obj.box.x + obj.box.width),
                            static_cast<float>(obj.box.y + obj.box.height)),
                cv::Point2f(static_cast<float>(obj.box.x),
                            static_cast<float>(obj.box.y + obj.box.height))
            };
        }
        results.push_back(std::move(obj));
    }
    return results;
}
} // namespace ANSCENTER