#include "ONNXSAM3.h"
#include "ONNXEngine.h" // OrtCompatiableGetInputName/OutputName helpers

// NOTE(review): the original six standard-header includes were stripped by a
// tooling pass; this set covers every std:: name used in this TU — confirm
// against the pre-mangled file if available.
#include <algorithm>
#include <cstring>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <unordered_map>

namespace ANSCENTER {

// ====================================================================
// SessionBundle destructor
// ====================================================================
// Releases the owned Ort::Session. The pointer is raw (declared in the
// header), so it must be deleted exactly once here.
ONNXSAM3::SessionBundle::~SessionBundle() {
    if (session) {
        delete session;
        session = nullptr;
    }
}

// ====================================================================
// EP helpers (same logic as BasicOrtHandler)
// ====================================================================

// Attach the CUDA execution provider to `session_options`.
// Returns true on success, false if the CUDA EP is unavailable/misconfigured.
bool ONNXSAM3::TryAppendCUDA(Ort::SessionOptions& session_options) {
    try {
        OrtCUDAProviderOptionsV2* cuda_options = nullptr;
        Ort::GetApi().CreateCUDAProviderOptions(&cuda_options);
        const char* keys[] = {
            "device_id",
            "arena_extend_strategy",
            "cudnn_conv_algo_search",
            "cudnn_conv_use_max_workspace", // reduce cuDNN temp memory
            "do_copy_in_default_stream",    // allow async copies
        };
        const char* values[] = {
            "0",
            "kSameAsRequested",
            "HEURISTIC",
            "0", // 0 = minimal workspace
            "0", // 0 = use separate stream
        };
        Ort::GetApi().UpdateCUDAProviderOptions(cuda_options, keys, values, 5);
        session_options.AppendExecutionProvider_CUDA_V2(*cuda_options);
        Ort::GetApi().ReleaseCUDAProviderOptions(cuda_options);
        std::cout << "[ONNXSAM3] CUDA EP attached." << std::endl;
        return true;
    } catch (const Ort::Exception& e) {
        std::cerr << "[ONNXSAM3] CUDA EP failed: " << e.what() << std::endl;
        return false;
    }
}

// Attach the DirectML execution provider (device 0).
// Returns true on success, false otherwise.
bool ONNXSAM3::TryAppendDirectML(Ort::SessionOptions& session_options) {
    try {
        std::unordered_map<std::string, std::string> options = { {"device_id","0"} };
        session_options.AppendExecutionProvider("DML", options);
        std::cout << "[ONNXSAM3] DirectML EP attached." << std::endl;
        return true;
    } catch (const Ort::Exception& e) {
        std::cerr << "[ONNXSAM3] DirectML EP failed: " << e.what() << std::endl;
        return false;
    }
}

// Attach the OpenVINO execution provider, trying device configurations in
// order of preference (NPU/GPU auto, discrete GPU, GPU/CPU auto).
// Returns true as soon as one configuration succeeds.
bool ONNXSAM3::TryAppendOpenVINO(Ort::SessionOptions& session_options) {
    std::vector<std::unordered_map<std::string, std::string>> configs = {
        {{"device_type","AUTO:NPU,GPU"},{"precision","FP16"},{"num_of_threads","4"},{"num_streams","4"}},
        {{"device_type","GPU.0"},       {"precision","FP16"},{"num_of_threads","4"},{"num_streams","4"}},
        {{"device_type","AUTO:GPU,CPU"},{"precision","FP16"},{"num_of_threads","4"},{"num_streams","4"}}
    };
    for (const auto& config : configs) {
        try {
            session_options.AppendExecutionProvider_OpenVINO_V2(config);
            std::cout << "[ONNXSAM3] OpenVINO EP attached ("
                      << config.at("device_type") << ")." << std::endl;
            return true;
        } catch (const Ort::Exception&) {
            /* try next */
        }
    }
    return false;
}

// ====================================================================
// createSessionBundle — create one ORT session with EP + external data
// ====================================================================
// Creates the Ort::Session for `onnxPath` into `bundle`, attaching the
// requested execution provider (unless forceCPU), pre-loading any external
// weight file into memory when one matches the model stem, and caching the
// session's input/output names for later Run() calls.
//
// Parameters:
//   bundle    — destination; bundle.session is allocated here (raw new,
//               released in the destructors).
//   onnxPath  — path to the .onnx protobuf on disk.
//   label     — human-readable tag used in log messages.
//   forceCPU  — when true, skip GPU EPs entirely (saves VRAM).
//   optLevel  — ORT graph optimization level for this session.
void ONNXSAM3::createSessionBundle(SessionBundle& bundle,
                                   const std::string& onnxPath,
                                   const std::string& label,
                                   bool forceCPU,
                                   GraphOptimizationLevel optLevel) {
    std::cout << "[ONNXSAM3] Creating " << label << " session..." << std::endl;

    Ort::SessionOptions opts;
    opts.SetIntraOpNumThreads(m_numThreads);
    opts.SetGraphOptimizationLevel(optLevel);
    opts.SetLogSeverityLevel(4);

    // Determine effective engine type
    EngineType engine = forceCPU ? EngineType::CPU : m_engineType;
    if (forceCPU)
        std::cout << "[ONNXSAM3] " << label
                  << ": forced to CPU to save GPU memory." << std::endl;

    std::vector<std::string> available = Ort::GetAvailableProviders();
    auto hasProvider = [&](const std::string& name) {
        return std::find(available.begin(), available.end(), name) != available.end();
    };

    bool epAttached = false;
    {
        switch (engine) {
        case EngineType::NVIDIA_GPU:
            if (hasProvider("CUDAExecutionProvider"))
                epAttached = TryAppendCUDA(opts);
            break;
        case EngineType::AMD_GPU:
            if (hasProvider("DmlExecutionProvider"))
                epAttached = TryAppendDirectML(opts);
            break;
        case EngineType::OPENVINO_GPU:
            if (hasProvider("OpenVINOExecutionProvider"))
                epAttached = TryAppendOpenVINO(opts);
            break;
        case EngineType::CPU:
        default:
            epAttached = true;
            break;
        }
    }
    if (!epAttached)
        std::cout << "[ONNXSAM3] " << label << ": using CPU EP." << std::endl;

    // -- CWD workaround for external data resolution --
    // ORT resolves external-data paths relative to the process CWD in some
    // code paths, so we temporarily chdir into the model directory.
    // FIX(review): the original restored the CWD only on the non-throwing
    // path — the `throw;` below escaped with the CWD still changed. An RAII
    // guard now guarantees restoration on every exit path.
    std::filesystem::path modelFsPath(onnxPath);
    std::filesystem::path modelDir = modelFsPath.parent_path();
    struct CwdGuard {
        std::filesystem::path prev;
        ~CwdGuard() {
            std::error_code ec;               // never throw from a destructor
            std::filesystem::current_path(prev, ec);
        }
    } cwdGuard{ std::filesystem::current_path() };
    if (!modelDir.empty() && std::filesystem::is_directory(modelDir))
        std::filesystem::current_path(modelDir);

    // -- Pre-load external data file if one matches the model stem --
    // The external data filename stored inside the .onnx protobuf may
    // differ from the .onnx filename on disk (e.g. anssam3_image_encoder.onnx
    // internally references sam3_image_encoder.onnx.data). We only
    // pre-load when a stem-based candidate exists on disk. If no match
    // is found, we load the model from its FILE PATH (not memory buffer)
    // so that ORT resolves external data relative to the model directory.
    std::vector<char> extDataBuffer;
    std::filesystem::path extDataPath;
    {
        std::wstring stem = modelFsPath.stem().wstring();
        std::vector<std::filesystem::path> candidates = {
            modelDir / (stem + L".onnx_data"), // monolithic convention
            modelDir / (stem + L".onnx.data"), // split-model convention
        };
        for (auto& c : candidates) {
            if (std::filesystem::exists(c)) {
                extDataPath = c;
                break;
            }
        }
        if (extDataPath.empty()) {
            std::cout << "[ONNXSAM3] " << label
                      << ": no stem-matched external data; "
                      << "ORT will resolve from model directory." << std::endl;
        }
    }

    if (!extDataPath.empty() && std::filesystem::exists(extDataPath)) {
        auto fileSize = std::filesystem::file_size(extDataPath);
        std::cout << "[ONNXSAM3] " << label << ": external data "
                  << extDataPath.filename().string()
                  << " (" << (fileSize / (1024*1024)) << " MB)" << std::endl;
        try {
            std::ifstream ifs(extDataPath, std::ios::binary);
            if (ifs) {
                extDataBuffer.resize(static_cast<size_t>(fileSize));
                ifs.read(extDataBuffer.data(), static_cast<std::streamsize>(fileSize));
                ifs.close();
                std::vector<std::basic_string<ORTCHAR_T>> extFileNames = {
                    extDataPath.filename().wstring()
                };
                std::vector<char*> extBuffers = { extDataBuffer.data() };
                std::vector<size_t> extLengths = { extDataBuffer.size() };
                opts.AddExternalInitializersFromFilesInMemory(
                    extFileNames, extBuffers, extLengths);
            }
        } catch (const std::bad_alloc&) {
            // Weight file too large for RAM: fall back to ORT's own file
            // mapping by loading from the model path instead.
            std::cerr << "[ONNXSAM3] " << label
                      << ": could not allocate memory for external data. "
                      << "ORT will use file mapping." << std::endl;
            extDataBuffer.clear();
            extDataBuffer.shrink_to_fit();
        }
    }

    // -- Load .onnx proto into memory --
    std::vector<char> modelBuffer;
    bool useModelBuffer = false;
    try {
        auto modelFileSize = std::filesystem::file_size(modelFsPath);
        modelBuffer.resize(static_cast<size_t>(modelFileSize));
        std::ifstream mifs(modelFsPath, std::ios::binary);
        if (mifs) {
            mifs.read(modelBuffer.data(), static_cast<std::streamsize>(modelFileSize));
            mifs.close();
            useModelBuffer = true;
        }
    } catch (const std::exception& e) {
        std::cerr << "[ONNXSAM3] " << label
                  << ": could not read model file: " << e.what() << std::endl;
    }

    // -- Create session (with GPU -> CPU fallback) --
    // NOTE(review): byte-wise widening assumes an ASCII-only path; non-ASCII
    // model folders would need a proper UTF-8 -> UTF-16 conversion.
    std::wstring onnxPathW(onnxPath.begin(), onnxPath.end());
    auto doCreate = [&](Ort::SessionOptions& sopts, const char* tag) {
        // Use memory-buffer loading when external data has been pre-loaded;
        // otherwise use file-path loading so ORT can resolve external data
        // relative to the model's directory on disk.
        if (useModelBuffer && !extDataBuffer.empty())
            bundle.session = new Ort::Session(*m_env, modelBuffer.data(),
                                              modelBuffer.size(), sopts);
        else
            bundle.session = new Ort::Session(*m_env, onnxPathW.c_str(), sopts);
        std::cout << "[ONNXSAM3] " << label << " session created ("
                  << tag << ")." << std::endl;
    };

    try {
        doCreate(opts, "primary EP");
    } catch (const Ort::Exception& e) {
        std::cerr << "[ONNXSAM3] " << label << " session FAILED: "
                  << e.what() << std::endl;
        if (engine != EngineType::CPU && epAttached) {
            std::cerr << "[ONNXSAM3] " << label << ": retrying with CPU..."
                      << std::endl;
            Ort::SessionOptions cpuOpts;
            cpuOpts.SetIntraOpNumThreads(m_numThreads);
            cpuOpts.SetGraphOptimizationLevel(optLevel);
            cpuOpts.SetLogSeverityLevel(4);
            if (!extDataBuffer.empty()) {
                std::vector<std::basic_string<ORTCHAR_T>> extFileNames = {
                    extDataPath.filename().wstring()
                };
                std::vector<char*> extBuffers = { extDataBuffer.data() };
                std::vector<size_t> extLengths = { extDataBuffer.size() };
                cpuOpts.AddExternalInitializersFromFilesInMemory(
                    extFileNames, extBuffers, extLengths);
            }
            doCreate(cpuOpts, "CPU fallback");
        } else {
            throw; // cwdGuard restores the working directory
        }
    }

    // Free buffers (CWD is restored by cwdGuard at scope exit)
    extDataBuffer.clear();
    extDataBuffer.shrink_to_fit();
    modelBuffer.clear();
    modelBuffer.shrink_to_fit();

    // -- Read input/output names --
    // Both the owning std::string (inputNames_) and the C-string view
    // (inputNames) are cached so Run() can take the char* array directly.
    Ort::Allocator allocator(*bundle.session, *m_memInfo);
    size_t numInputs = bundle.session->GetInputCount();
    bundle.inputNames_.resize(numInputs);
    bundle.inputNames.resize(numInputs);
    for (size_t i = 0; i < numInputs; ++i) {
        bundle.inputNames_[i] = OrtCompatiableGetInputName(i, allocator, bundle.session);
        bundle.inputNames[i] = bundle.inputNames_[i].c_str();
    }
    size_t numOutputs = bundle.session->GetOutputCount();
    bundle.outputNames_.resize(numOutputs);
    bundle.outputNames.resize(numOutputs);
    for (size_t i = 0; i < numOutputs; ++i) {
        bundle.outputNames_[i] = OrtCompatiableGetOutputName(i, allocator, bundle.session);
        bundle.outputNames[i] = bundle.outputNames_[i].c_str();
    }

    // Log I/O info
    for (size_t i = 0; i < numInputs; ++i) {
        auto info = bundle.session->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo();
        auto shape = info.GetShape();
        std::cout << "[ONNXSAM3] " << label << " input[" << i << "]: "
                  << bundle.inputNames_[i] << " shape=[";
        for (size_t d = 0; d < shape.size(); ++d) {
            if (d > 0) std::cout << ",";
            std::cout << shape[d];
        }
        std::cout << "]" << std::endl;
    }
    for (size_t i = 0; i < numOutputs; ++i) {
        auto info = bundle.session->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo();
        auto shape = info.GetShape();
        std::cout << "[ONNXSAM3] " << label << " output[" << i << "]: "
                  << bundle.outputNames_[i] << " shape=[";
        for (size_t d = 0; d < shape.size(); ++d) {
            if (d > 0) std::cout << ",";
            std::cout << shape[d];
        }
        std::cout << "]" << std::endl;
    }
}

// ====================================================================
// Constructor
// ====================================================================
// Initializes the ORT API (via EPLoader), creates the environment and CPU
// memory info, then builds the three sessions of the SAM3 pipeline.
ONNXSAM3::ONNXSAM3(const std::string& modelFolder, EngineType engineType,
                   unsigned int num_threads)
    : m_engineType(engineType), m_numThreads(num_threads),
      m_modelFolder(modelFolder) {
    // Initialize ORT API (EPLoader::Current() must run first to load the EP
    // runtime; its return value is only needed for that side effect here).
    const auto& epInfo = EPLoader::Current();
    (void)epInfo;
    if (Ort::Global<void>::api_ == nullptr)
        Ort::InitApi(static_cast<const OrtApi*>(EPLoader::GetOrtApiRaw()));

    m_env = new Ort::Env(ORT_LOGGING_LEVEL_ERROR, "ONNXSAM3");
    m_memInfo = new Ort::MemoryInfo(
        Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault));

    // Build paths
    std::string imgPath  = modelFolder + "\\sam3_image_encoder.onnx";
    std::string langPath = modelFolder + "\\sam3_language_encoder.onnx";
    std::string decPath  = modelFolder + "\\sam3_decoder.onnx";

    // Create 3 sessions.
    // Language encoder runs on CPU: it is only called once per prompt
    // change and keeping it on GPU wastes ~1.5 GB of VRAM that the
    // image encoder and decoder need for their activation tensors.
    // Image encoder uses ORT_ENABLE_BASIC: enables constant folding
    // and redundant-node elimination without the complex fusions
    // (MatMulScaleFusion) that are slower on this model and risk OOM
    // on 8 GB GPUs. Benchmarked: BASIC=4.6s vs ALL=5.2s.
    createSessionBundle(m_imageEncoder, imgPath, "ImageEncoder", false,
                        GraphOptimizationLevel::ORT_ENABLE_BASIC);
    createSessionBundle(m_langEncoder, langPath, "LangEncoder", true,
                        GraphOptimizationLevel::ORT_ENABLE_ALL);
    createSessionBundle(m_decoder, decPath, "Decoder");

    std::cout << "[ONNXSAM3] All 3 sessions created successfully." << std::endl;
}

// ====================================================================
// Destructor
// ====================================================================
ONNXSAM3::~ONNXSAM3() {
    // Sessions must be destroyed BEFORE the Ort::Env they were created
    // with. Member destructors run after the destructor body, so we
    // must explicitly release sessions here first.
    if (m_decoder.session)      { delete m_decoder.session;      m_decoder.session = nullptr; }
    if (m_langEncoder.session)  { delete m_langEncoder.session;  m_langEncoder.session = nullptr; }
    if (m_imageEncoder.session) { delete m_imageEncoder.session; m_imageEncoder.session = nullptr; }
    if (m_memInfo) { delete m_memInfo; m_memInfo = nullptr; }
    if (m_env)     { delete m_env;     m_env = nullptr; }
}

// ====================================================================
// preprocessImage — BGR -> RGB, resize to 1008, HWC->CHW, uint8
// ====================================================================
// Fills `buffer` with the CHW uint8 tensor the image encoder expects.
void ONNXSAM3::preprocessImage(const cv::Mat& mat, std::vector<uint8_t>& buffer) {
    // 3-model image encoder expects uint8 [3, 1008, 1008]
    cv::Mat resized;
    cv::resize(mat, resized, cv::Size(m_inputSize, m_inputSize));
    cv::Mat rgb;
    cv::cvtColor(resized, rgb, cv::COLOR_BGR2RGB);

    const size_t planeSize = static_cast<size_t>(m_inputSize) * m_inputSize;
    buffer.resize(3 * planeSize);

    // HWC -> CHW via cv::split + memcpy (much faster than per-pixel loop)
    cv::Mat channels[3];
    cv::split(rgb, channels);
    for (int c = 0; c < 3; ++c)
        std::memcpy(buffer.data() + c * planeSize, channels[c].data, planeSize);
}

// ====================================================================
// setPrompt — run language encoder, cache results
// ====================================================================
// Runs the language encoder on the tokenized prompt and caches the
// resulting mask/features for reuse by every subsequent detect() call.
// NOTE(review): `attentionMask` is accepted but not consumed here —
// presumably the exported language encoder derives it internally; confirm.
void ONNXSAM3::setPrompt(const std::vector<int64_t>& inputIds,
                         const std::vector<int64_t>& attentionMask) {
    if (!m_langEncoder.session) {
        std::cerr << "[ONNXSAM3] Language encoder not initialized." << std::endl;
        return;
    }

    // Language encoder input: "tokens" [1, 32] int64
    std::vector<int64_t> tokenShape = { 1, static_cast<int64_t>(inputIds.size()) };
    m_tokenLength = static_cast<int>(inputIds.size());

    // We need a non-const copy for CreateTensor
    std::vector<int64_t> tokenData = inputIds;
    Ort::Value tokenTensor = Ort::Value::CreateTensor<int64_t>(
        *m_memInfo, tokenData.data(), tokenData.size(),
        tokenShape.data(), tokenShape.size());
    std::vector<Ort::Value> inputs;
    inputs.push_back(std::move(tokenTensor));

    // Run language encoder
    std::cout << "[ONNXSAM3] Running language encoder..." << std::endl;
    auto outputs = m_langEncoder.session->Run(
        Ort::RunOptions{nullptr},
        m_langEncoder.inputNames.data(), inputs.data(), inputs.size(),
        m_langEncoder.outputNames.data(), m_langEncoder.outputNames.size());

    // Language encoder outputs (from Python analysis):
    //   output[0]: text_attention_mask [1, 32] bool     -> "language_mask" for decoder
    //   output[1]: text_memory [32, 1, 256] float32     -> "language_features" for decoder
    //   output[2]: text_embeds [32, 1, 1024] float32    -> NOT used by decoder

    // Find outputs by name or fall back to index
    int maskIdx = -1, featIdx = -1;
    for (size_t i = 0; i < m_langEncoder.outputNames_.size(); ++i) {
        const auto& name = m_langEncoder.outputNames_[i];
        if (name.find("attention_mask") != std::string::npos ||
            name.find("text_attention") != std::string::npos) {
            maskIdx = static_cast<int>(i);
        } else if (name.find("text_memory") != std::string::npos ||
                   name.find("memory") != std::string::npos) {
            featIdx = static_cast<int>(i);
        }
    }
    // Fallback: first output is mask, second is features
    if (maskIdx < 0) maskIdx = 0;
    if (featIdx < 0) featIdx = 1;

    // Cache language mask (bool), stored as 0/1 bytes
    {
        auto info = outputs[maskIdx].GetTensorTypeAndShapeInfo();
        m_cachedLangMaskShape = info.GetShape();
        size_t count = info.GetElementCount();
        const bool* data = outputs[maskIdx].GetTensorData<bool>();
        m_cachedLangMask.resize(count);
        for (size_t i = 0; i < count; ++i)
            m_cachedLangMask[i] = data[i] ? 1 : 0;
    }
    // Cache language features (float32)
    {
        auto info = outputs[featIdx].GetTensorTypeAndShapeInfo();
        m_cachedLangFeaturesShape = info.GetShape();
        size_t count = info.GetElementCount();
        const float* data = outputs[featIdx].GetTensorData<float>();
        m_cachedLangFeatures.assign(data, data + count);
    }

    m_promptSet = true;
    std::cout << "[ONNXSAM3] Language encoder done. Mask shape=[";
    for (size_t i = 0; i < m_cachedLangMaskShape.size(); ++i) {
        if (i > 0) std::cout << ",";
        std::cout << m_cachedLangMaskShape[i];
    }
    std::cout << "] Features shape=[";
    for (size_t i = 0; i < m_cachedLangFeaturesShape.size(); ++i) {
        if (i > 0) std::cout << ",";
        std::cout << m_cachedLangFeaturesShape[i];
    }
    std::cout << "]" << std::endl;
}

// ====================================================================
// detect — image encoder + decoder pipeline
// ====================================================================
// Runs image encoder then decoder for one frame. Requires setPrompt() to
// have cached language features. Returns one SAM3Result per detection
// above `segThreshold`.
std::vector<SAM3Result> ONNXSAM3::detect(const cv::Mat& mat, float segThreshold) {
    if (mat.empty()) return {};
    if (!m_promptSet) {
        std::cerr << "[ONNXSAM3] No prompt set. Call setPrompt() first." << std::endl;
        return {};
    }

    const int origW = mat.cols;
    const int origH = mat.rows;

    // ---- 1) Image Encoder ----
    std::vector<uint8_t> imgBuffer;
    preprocessImage(mat, imgBuffer);
    std::vector<int64_t> imgShape = { 3, m_inputSize, m_inputSize };
    Ort::Value imgTensor = Ort::Value::CreateTensor<uint8_t>(
        *m_memInfo, imgBuffer.data(), imgBuffer.size(),
        imgShape.data(), imgShape.size());
    std::vector<Ort::Value> imgInputs;
    imgInputs.push_back(std::move(imgTensor));

    auto imgOutputs = m_imageEncoder.session->Run(
        Ort::RunOptions{nullptr},
        m_imageEncoder.inputNames.data(), imgInputs.data(), imgInputs.size(),
        m_imageEncoder.outputNames.data(), m_imageEncoder.outputNames.size());

    // Image encoder outputs (6 total, matched by name):
    //   vision_pos_enc_0/1/2 — only _2 used by decoder
    //   backbone_fpn_0/1/2   — all 3 used by decoder

    // Build a map from image encoder output names to indices
    std::unordered_map<std::string, int> imgOutputMap;
    for (size_t i = 0; i < m_imageEncoder.outputNames_.size(); ++i)
        imgOutputMap[m_imageEncoder.outputNames_[i]] = static_cast<int>(i);

    // Release unused outputs (vision_pos_enc_0, vision_pos_enc_1) to free
    // GPU memory before running the decoder. These are ~105 MB on CUDA.
    for (size_t i = 0; i < m_imageEncoder.outputNames_.size(); ++i) {
        const auto& name = m_imageEncoder.outputNames_[i];
        if (name == "vision_pos_enc_0" || name == "vision_pos_enc_1") {
            imgOutputs[i] = Ort::Value(nullptr);
        }
    }

    // ---- 2) Build decoder inputs ----
    size_t numDecInputs = m_decoder.inputNames.size();
    std::vector<Ort::Value> decInputs;
    decInputs.reserve(numDecInputs);

    // Prepare scalar and prompt tensors (must outlive Run(), hence locals)
    int64_t origHeightVal = static_cast<int64_t>(origH);
    int64_t origWidthVal  = static_cast<int64_t>(origW);
    std::vector<int64_t> scalarShape = {}; // scalar = 0-dim tensor
    float boxCoordsData[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
    std::vector<int64_t> boxCoordsShape = { 1, 1, 4 };
    int64_t boxLabelsData[1] = { -1 }; // no box prompt
    std::vector<int64_t> boxLabelsShape = { 1, 1 };
    bool boxMasksData[1] = { false }; // no box prompt (language-only grounding)
    std::vector<int64_t> boxMasksShape = { 1, 1 };

    // Build input tensors in the order expected by the decoder
    for (size_t i = 0; i < numDecInputs; ++i) {
        const std::string& name = m_decoder.inputNames_[i];
        if (name == "original_height") {
            decInputs.push_back(Ort::Value::CreateTensor<int64_t>(
                *m_memInfo, &origHeightVal, 1,
                scalarShape.data(), scalarShape.size()));
        } else if (name == "original_width") {
            decInputs.push_back(Ort::Value::CreateTensor<int64_t>(
                *m_memInfo, &origWidthVal, 1,
                scalarShape.data(), scalarShape.size()));
        } else if (name == "backbone_fpn_0" || name == "backbone_fpn_1" ||
                   name == "backbone_fpn_2" || name == "vision_pos_enc_2") {
            // Find matching image encoder output by name
            auto it = imgOutputMap.find(name);
            if (it != imgOutputMap.end()) {
                decInputs.push_back(std::move(imgOutputs[it->second]));
            } else {
                std::cerr << "[ONNXSAM3] Image encoder output not found: "
                          << name << std::endl;
                float dummy = 0.0f;
                std::vector<int64_t> dummyShape = { 1 };
                decInputs.push_back(Ort::Value::CreateTensor<float>(
                    *m_memInfo, &dummy, 1,
                    dummyShape.data(), dummyShape.size()));
            }
        } else if (name == "language_mask") {
            decInputs.push_back(Ort::Value::CreateTensor<bool>(
                *m_memInfo,
                reinterpret_cast<bool*>(m_cachedLangMask.data()),
                m_cachedLangMask.size(),
                m_cachedLangMaskShape.data(), m_cachedLangMaskShape.size()));
        } else if (name == "language_features") {
            decInputs.push_back(Ort::Value::CreateTensor<float>(
                *m_memInfo, m_cachedLangFeatures.data(), m_cachedLangFeatures.size(),
                m_cachedLangFeaturesShape.data(), m_cachedLangFeaturesShape.size()));
        } else if (name == "box_coords") {
            decInputs.push_back(Ort::Value::CreateTensor<float>(
                *m_memInfo, boxCoordsData, 4,
                boxCoordsShape.data(), boxCoordsShape.size()));
        } else if (name == "box_labels") {
            decInputs.push_back(Ort::Value::CreateTensor<int64_t>(
                *m_memInfo, boxLabelsData, 1,
                boxLabelsShape.data(), boxLabelsShape.size()));
        } else if (name == "box_masks") {
            decInputs.push_back(Ort::Value::CreateTensor<bool>(
                *m_memInfo, boxMasksData, 1,
                boxMasksShape.data(), boxMasksShape.size()));
        } else {
            std::cerr << "[ONNXSAM3] Unknown decoder input: " << name << std::endl;
            // Create a dummy scalar float tensor
            float dummy = 0.0f;
            std::vector<int64_t> dummyShape = { 1 };
            decInputs.push_back(Ort::Value::CreateTensor<float>(
                *m_memInfo, &dummy, 1,
                dummyShape.data(), dummyShape.size()));
        }
    }

    // ---- Debug: print decoder input stats for comparison with ANSSAM3 ----
    {
        std::cout << "[ONNXSAM3] Decoder inputs before Run:" << std::endl;
        for (size_t di = 0; di < numDecInputs; ++di) {
            const std::string& dname = m_decoder.inputNames_[di];
            auto info = decInputs[di].GetTensorTypeAndShapeInfo();
            auto shape = info.GetShape();
            auto elemType = info.GetElementType();
            std::cout << "  " << dname << " type=" << elemType << " shape=[";
            for (size_t d = 0; d < shape.size(); ++d) {
                if (d > 0) std::cout << ",";
                std::cout << shape[d];
            }
            std::cout << "]";
            // Print mean/first5 for float tensors
            if (elemType == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT && !shape.empty()) {
                size_t numElems = info.GetElementCount();
                if (numElems > 0 && numElems < 100000000) {
                    const float* data = decInputs[di].GetTensorData<float>();
                    double sum = 0;
                    for (size_t k = 0; k < numElems; ++k) sum += data[k];
                    double mean = sum / numElems;
                    std::cout << " mean=" << mean << " first5:";
                    for (size_t k = 0; k < std::min(numElems, (size_t)5); ++k)
                        std::cout << " " << data[k];
                }
            }
            // Print bool tensor values (for language_mask)
            else if (elemType == ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL && !shape.empty()) {
                size_t numElems = info.GetElementCount();
                const bool* data = decInputs[di].GetTensorData<bool>();
                std::cout << " vals:";
                for (size_t k = 0; k < std::min(numElems, (size_t)32); ++k)
                    std::cout << " " << (int)data[k];
            }
            // Print int64 scalar value
            else if (elemType == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64 && shape.empty()) {
                const int64_t* data = decInputs[di].GetTensorData<int64_t>();
                std::cout << " value=" << data[0];
            }
            std::cout << std::endl;
        }
    }

    // ---- 3) Run decoder ----
    auto decOutputs = m_decoder.session->Run(
        Ort::RunOptions{nullptr},
        m_decoder.inputNames.data(), decInputs.data(), decInputs.size(),
        m_decoder.outputNames.data(), m_decoder.outputNames.size());

    // Decoder outputs (from Python analysis):
    //   output[0]: boxes  [N, 4] float32
    //   output[1]: scores [N] float32
    //   output[2]: masks  [N, 1, H, W] bool

    // Find output indices by name
    int boxesIdx = 0, scoresIdx = 1, masksIdx = 2;
    for (size_t i = 0; i < m_decoder.outputNames_.size(); ++i) {
        const auto& name = m_decoder.outputNames_[i];
        if (name.find("box") != std::string::npos)
            boxesIdx = static_cast<int>(i);
        else if (name.find("score") != std::string::npos)
            scoresIdx = static_cast<int>(i);
        else if (name.find("mask") != std::string::npos)
            masksIdx = static_cast<int>(i);
    }

    // Get boxes
    auto boxInfo = decOutputs[boxesIdx].GetTensorTypeAndShapeInfo();
    auto boxShape = boxInfo.GetShape();
    int numBoxes = (boxShape.size() >= 1) ? static_cast<int>(boxShape[0]) : 0;
    const float* boxesData = decOutputs[boxesIdx].GetTensorData<float>();
    // Get scores
    const float* scoresData = decOutputs[scoresIdx].GetTensorData<float>();
    // Get masks
    auto maskInfo = decOutputs[masksIdx].GetTensorTypeAndShapeInfo();
    auto maskShape = maskInfo.GetShape();
    // masks shape: [N, 1, H, W]
    int maskH = (maskShape.size() >= 3) ? static_cast<int>(maskShape[2]) : 0;
    int maskW = (maskShape.size() >= 4) ? static_cast<int>(maskShape[3]) : 0;
    const bool* masksData = decOutputs[masksIdx].GetTensorData<bool>();
    m_maskH = maskH;
    m_maskW = maskW;

    std::cout << "[ONNXSAM3] Decoder: " << numBoxes << " detections, "
              << "mask=" << maskH << "x" << maskW << std::endl;

    return postprocessResults(boxesData, numBoxes, scoresData, masksData,
                              maskH, maskW, origW, origH, segThreshold);
}

// ====================================================================
// postprocessResults — convert decoder outputs to SAM3Result
// ====================================================================
// For each detection above `scoreThreshold`: clamps the box to the image,
// resizes the instance mask to original resolution, crops it to the box,
// and extracts a simplified polygon from the largest contour (falling back
// to the box corners when no usable contour exists).
std::vector<SAM3Result> ONNXSAM3::postprocessResults(
    const float* boxesData, int numBoxes, const float* scoresData,
    const bool* masksData, int maskH, int maskW,
    int origWidth, int origHeight, float scoreThreshold) {
    std::vector<SAM3Result> results;
    for (int i = 0; i < numBoxes; ++i) {
        float score = scoresData[i];
        if (score < scoreThreshold) continue;

        // Box: [x1, y1, x2, y2] in original image coordinates
        float x1 = boxesData[i * 4 + 0];
        float y1 = boxesData[i * 4 + 1];
        float x2 = boxesData[i * 4 + 2];
        float y2 = boxesData[i * 4 + 3];
        // Clamp to image bounds
        x1 = std::max(0.0f, std::min(x1, static_cast<float>(origWidth)));
        y1 = std::max(0.0f, std::min(y1, static_cast<float>(origHeight)));
        x2 = std::max(0.0f, std::min(x2, static_cast<float>(origWidth)));
        y2 = std::max(0.0f, std::min(y2, static_cast<float>(origHeight)));

        SAM3Result obj;
        obj.box = cv::Rect(
            static_cast<int>(x1), static_cast<int>(y1),
            static_cast<int>(x2 - x1), static_cast<int>(y2 - y1));
        obj.confidence = score;
        if (obj.box.width <= 0 || obj.box.height <= 0) continue;

        // Extract this instance's mask: [1, H, W] bool at index i
        // masksData layout: [N, 1, H, W]
        const bool* instanceMask =
            masksData + static_cast<size_t>(i) * 1 * maskH * maskW;

        // Create binary mask at original resolution
        cv::Mat boolMask(maskH, maskW, CV_8UC1);
        for (int y = 0; y < maskH; ++y)
            for (int x = 0; x < maskW; ++x)
                boolMask.at<uint8_t>(y, x) = instanceMask[y * maskW + x] ? 255 : 0;

        // Resize mask to original image dimensions
        cv::Mat fullMask;
        cv::resize(boolMask, fullMask, cv::Size(origWidth, origHeight),
                   0, 0, cv::INTER_LINEAR);
        cv::threshold(fullMask, fullMask, 127, 255, cv::THRESH_BINARY);

        // Crop mask to bounding box
        cv::Mat roiMask = fullMask(obj.box).clone();
        obj.mask = roiMask;

        // Create polygon from contour in the ROI region
        std::vector<std::vector<cv::Point>> contours;
        cv::findContours(roiMask.clone(), contours,
                         cv::RETR_EXTERNAL, cv::CHAIN_APPROX_SIMPLE);
        if (!contours.empty()) {
            // Use the largest contour
            int largestIdx = 0;
            double largestArea = 0;
            for (int c = 0; c < static_cast<int>(contours.size()); ++c) {
                double area = cv::contourArea(contours[c]);
                if (area > largestArea) {
                    largestArea = area;
                    largestIdx = c;
                }
            }
            std::vector<cv::Point> approx;
            double epsilon = 0.01 * cv::arcLength(contours[largestIdx], true);
            cv::approxPolyDP(contours[largestIdx], approx, epsilon, true);
            // Convert to absolute coordinates (ROI is relative to box)
            for (const auto& pt : approx) {
                obj.polygon.emplace_back(
                    static_cast<float>(pt.x + obj.box.x),
                    static_cast<float>(pt.y + obj.box.y));
            }
        }
        // Fallback: box corners as polygon
        if (obj.polygon.size() < 3) {
            obj.polygon = {
                cv::Point2f(static_cast<float>(obj.box.x),
                            static_cast<float>(obj.box.y)),
                cv::Point2f(static_cast<float>(obj.box.x + obj.box.width),
                            static_cast<float>(obj.box.y)),
                cv::Point2f(static_cast<float>(obj.box.x + obj.box.width),
                            static_cast<float>(obj.box.y + obj.box.height)),
                cv::Point2f(static_cast<float>(obj.box.x),
                            static_cast<float>(obj.box.y + obj.box.height))
            };
        }

        results.push_back(std::move(obj));
    }
    return results;
}

} // namespace ANSCENTER