// File: ANSCORE/engines/ONNXEngine/ONNXSAM3.cpp
// (source listing header: 835 lines, 36 KiB, C++)
#include "ONNXSAM3.h"
#include "ONNXEngine.h" // OrtCompatiableGetInputName/OutputName helpers
#include <iostream>
#include <fstream>
#include <filesystem>
#include <cmath>
#include <algorithm>
#include <unordered_map>
namespace ANSCENTER
{
// ====================================================================
// SessionBundle destructor
// ====================================================================
ONNXSAM3::SessionBundle::~SessionBundle()
{
    // delete on a null pointer is a no-op, so no explicit check is needed.
    delete session;
    session = nullptr;
}
// ====================================================================
// EP helpers (same logic as BasicOrtHandler)
// ====================================================================
/// @brief Attempts to attach the CUDA execution provider to the given
///        session options.
/// @param session_options  options object the EP is appended to on success.
/// @return true if the CUDA EP was attached, false on any ORT failure.
bool ONNXSAM3::TryAppendCUDA(Ort::SessionOptions& session_options)
{
    // Declared outside the try so the catch handler can release it.
    // BUGFIX: the original leaked the provider-options object when
    // UpdateCUDAProviderOptions or AppendExecutionProvider_CUDA_V2 threw,
    // because ReleaseCUDAProviderOptions was only reached on success.
    OrtCUDAProviderOptionsV2* cuda_options = nullptr;
    try {
        Ort::GetApi().CreateCUDAProviderOptions(&cuda_options);
        const char* keys[] = {
            "device_id",
            "arena_extend_strategy",
            "cudnn_conv_algo_search",
            "cudnn_conv_use_max_workspace", // reduce cuDNN temp memory
            "do_copy_in_default_stream",    // allow async copies
        };
        const char* values[] = {
            "0",
            "kSameAsRequested",
            "HEURISTIC",
            "0", // 0 = minimal workspace
            "0", // 0 = use separate stream
        };
        // Derive the option count from the arrays instead of a magic literal
        // so adding a key/value pair cannot silently be dropped.
        constexpr size_t numOptions = sizeof(keys) / sizeof(keys[0]);
        static_assert(numOptions == sizeof(values) / sizeof(values[0]),
                      "CUDA EP keys/values arrays must stay in sync");
        Ort::GetApi().UpdateCUDAProviderOptions(cuda_options, keys, values, numOptions);
        session_options.AppendExecutionProvider_CUDA_V2(*cuda_options);
        Ort::GetApi().ReleaseCUDAProviderOptions(cuda_options);
        std::cout << "[ONNXSAM3] CUDA EP attached." << std::endl;
        return true;
    }
    catch (const Ort::Exception& e) {
        if (cuda_options)
            Ort::GetApi().ReleaseCUDAProviderOptions(cuda_options);
        std::cerr << "[ONNXSAM3] CUDA EP failed: " << e.what() << std::endl;
        return false;
    }
}
/// @brief Attempts to attach the DirectML execution provider (adapter 0).
/// @return true on success, false if ORT rejects the EP.
bool ONNXSAM3::TryAppendDirectML(Ort::SessionOptions& session_options)
{
    const std::unordered_map<std::string, std::string> dml_options{
        { "device_id", "0" }
    };
    try {
        session_options.AppendExecutionProvider("DML", dml_options);
    }
    catch (const Ort::Exception& e) {
        std::cerr << "[ONNXSAM3] DirectML EP failed: " << e.what() << std::endl;
        return false;
    }
    std::cout << "[ONNXSAM3] DirectML EP attached." << std::endl;
    return true;
}
/// @brief Attempts to attach the OpenVINO execution provider, trying device
///        targets in priority order (NPU+GPU, then discrete GPU, then
///        GPU+CPU auto) until one succeeds.
/// @return true if any configuration attached, false if all failed.
bool ONNXSAM3::TryAppendOpenVINO(Ort::SessionOptions& session_options)
{
    using OVConfig = std::unordered_map<std::string, std::string>;
    // Settings shared by every attempt; only device_type varies.
    const OVConfig base = {
        { "precision",      "FP16" },
        { "num_of_threads", "4" },
        { "num_streams",    "4" },
    };
    const char* deviceOrder[] = { "AUTO:NPU,GPU", "GPU.0", "AUTO:GPU,CPU" };
    for (const char* device : deviceOrder) {
        OVConfig config = base;
        config["device_type"] = device;
        try {
            session_options.AppendExecutionProvider_OpenVINO_V2(config);
            std::cout << "[ONNXSAM3] OpenVINO EP attached (" << device << ")." << std::endl;
            return true;
        }
        catch (const Ort::Exception&) {
            // This device target is unavailable — fall through to the next.
        }
    }
    return false;
}
// ====================================================================
// createSessionBundle — create one ORT session with EP + external data
// ====================================================================
/// @brief Creates one ORT session (with EP selection, external-data
///        pre-loading, and GPU→CPU fallback) and fills the bundle's
///        session pointer and cached input/output name tables.
/// @param bundle    destination bundle; its session pointer is overwritten.
/// @param onnxPath  path to the .onnx model file.
/// @param label     human-readable tag used in log messages.
/// @param forceCPU  when true, skip the configured GPU EP entirely.
/// @param optLevel  graph optimization level for this session.
/// @throws Ort::Exception if session creation fails on the CPU path too.
void ONNXSAM3::createSessionBundle(SessionBundle& bundle,
                                   const std::string& onnxPath,
                                   const std::string& label,
                                   bool forceCPU,
                                   GraphOptimizationLevel optLevel)
{
    std::cout << "[ONNXSAM3] Creating " << label << " session..." << std::endl;
    Ort::SessionOptions opts;
    opts.SetIntraOpNumThreads(m_numThreads);
    opts.SetGraphOptimizationLevel(optLevel);
    opts.SetLogSeverityLevel(4);
    // Determine effective engine type (forceCPU overrides the configured EP).
    EngineType engine = forceCPU ? EngineType::CPU : m_engineType;
    if (forceCPU)
        std::cout << "[ONNXSAM3] " << label << ": forced to CPU to save GPU memory." << std::endl;
    std::vector<std::string> available = Ort::GetAvailableProviders();
    auto hasProvider = [&](const std::string& name) {
        return std::find(available.begin(), available.end(), name) != available.end();
    };
    bool epAttached = false;
    switch (engine)
    {
    case EngineType::NVIDIA_GPU:
        if (hasProvider("CUDAExecutionProvider"))
            epAttached = TryAppendCUDA(opts);
        break;
    case EngineType::AMD_GPU:
        if (hasProvider("DmlExecutionProvider"))
            epAttached = TryAppendDirectML(opts);
        break;
    case EngineType::OPENVINO_GPU:
        if (hasProvider("OpenVINOExecutionProvider"))
            epAttached = TryAppendOpenVINO(opts);
        break;
    case EngineType::CPU:
    default:
        epAttached = true;
        break;
    }
    if (!epAttached)
        std::cout << "[ONNXSAM3] " << label << ": using CPU EP." << std::endl;
    // -- CWD workaround for external data resolution --
    // BUGFIX: the working-directory change is now restored by RAII, so the
    // process does not remain inside the model directory when session
    // creation throws. The previous code only restored the CWD on the
    // success path and skipped it when the Ort::Exception was rethrown.
    struct CwdGuard {
        std::filesystem::path prev;
        explicit CwdGuard(std::filesystem::path p) : prev(std::move(p)) {}
        CwdGuard(const CwdGuard&) = delete;
        CwdGuard& operator=(const CwdGuard&) = delete;
        ~CwdGuard() {
            std::error_code ec;                      // never throw from a dtor
            std::filesystem::current_path(prev, ec); // best-effort restore
        }
    };
    std::filesystem::path modelFsPath(onnxPath);
    std::filesystem::path modelDir = modelFsPath.parent_path();
    CwdGuard cwdGuard(std::filesystem::current_path());
    if (!modelDir.empty() && std::filesystem::is_directory(modelDir))
        std::filesystem::current_path(modelDir);
    // -- Pre-load external data file if one matches the model stem --
    // The external data filename stored inside the .onnx protobuf may
    // differ from the .onnx filename on disk (e.g. anssam3_image_encoder.onnx
    // internally references sam3_image_encoder.onnx.data). We only
    // pre-load when a stem-based candidate exists on disk. If no match
    // is found, we load the model from its FILE PATH (not memory buffer)
    // so that ORT resolves external data relative to the model directory.
    std::vector<char> extDataBuffer;
    std::filesystem::path extDataPath;
    {
        std::wstring stem = modelFsPath.stem().wstring();
        std::vector<std::filesystem::path> candidates = {
            modelDir / (stem + L".onnx_data"), // monolithic convention
            modelDir / (stem + L".onnx.data"), // split-model convention
        };
        for (auto& c : candidates) {
            if (std::filesystem::exists(c)) {
                extDataPath = c;
                break;
            }
        }
        if (extDataPath.empty()) {
            std::cout << "[ONNXSAM3] " << label
                      << ": no stem-matched external data; "
                      << "ORT will resolve from model directory." << std::endl;
        }
    }
    if (!extDataPath.empty()) {
        auto fileSize = std::filesystem::file_size(extDataPath);
        std::cout << "[ONNXSAM3] " << label << ": external data "
                  << extDataPath.filename().string()
                  << " (" << (fileSize / (1024 * 1024)) << " MB)" << std::endl;
        try {
            std::ifstream ifs(extDataPath, std::ios::binary);
            if (ifs) {
                extDataBuffer.resize(static_cast<size_t>(fileSize));
                ifs.read(extDataBuffer.data(), static_cast<std::streamsize>(fileSize));
                ifs.close();
                // Register the in-memory blob under the filename the protobuf
                // references, so ORT never touches the file again.
                std::vector<std::basic_string<ORTCHAR_T>> extFileNames = {
                    extDataPath.filename().wstring()
                };
                std::vector<char*> extBuffers = { extDataBuffer.data() };
                std::vector<size_t> extLengths = { extDataBuffer.size() };
                opts.AddExternalInitializersFromFilesInMemory(
                    extFileNames, extBuffers, extLengths);
            }
        }
        catch (const std::bad_alloc&) {
            std::cerr << "[ONNXSAM3] " << label
                      << ": could not allocate memory for external data. "
                      << "ORT will use file mapping." << std::endl;
            extDataBuffer.clear();
            extDataBuffer.shrink_to_fit();
        }
    }
    // -- Load .onnx proto into memory --
    std::vector<char> modelBuffer;
    bool useModelBuffer = false;
    try {
        auto modelFileSize = std::filesystem::file_size(modelFsPath);
        modelBuffer.resize(static_cast<size_t>(modelFileSize));
        std::ifstream mifs(modelFsPath, std::ios::binary);
        if (mifs) {
            mifs.read(modelBuffer.data(), static_cast<std::streamsize>(modelFileSize));
            mifs.close();
            useModelBuffer = true;
        }
    }
    catch (const std::exception& e) {
        std::cerr << "[ONNXSAM3] " << label
                  << ": could not read model file: " << e.what() << std::endl;
    }
    // -- Create session (with GPU → CPU fallback) --
    // BUGFIX: convert through std::filesystem::path so non-ASCII paths use
    // the native narrow encoding; the previous
    // std::wstring(onnxPath.begin(), onnxPath.end()) widened bytes one at a
    // time and produced garbage for any non-ASCII character.
    std::wstring onnxPathW = modelFsPath.wstring();
    auto doCreate = [&](Ort::SessionOptions& sopts, const char* tag) {
        // Use memory-buffer loading when external data has been pre-loaded;
        // otherwise use file-path loading so ORT can resolve external data
        // relative to the model's directory on disk.
        if (useModelBuffer && !extDataBuffer.empty())
            bundle.session = new Ort::Session(*m_env, modelBuffer.data(), modelBuffer.size(), sopts);
        else
            bundle.session = new Ort::Session(*m_env, onnxPathW.c_str(), sopts);
        std::cout << "[ONNXSAM3] " << label << " session created (" << tag << ")." << std::endl;
    };
    try {
        doCreate(opts, "primary EP");
    }
    catch (const Ort::Exception& e) {
        std::cerr << "[ONNXSAM3] " << label << " session FAILED: " << e.what() << std::endl;
        if (engine != EngineType::CPU && epAttached) {
            std::cerr << "[ONNXSAM3] " << label << ": retrying with CPU..." << std::endl;
            Ort::SessionOptions cpuOpts;
            cpuOpts.SetIntraOpNumThreads(m_numThreads);
            cpuOpts.SetGraphOptimizationLevel(optLevel);
            cpuOpts.SetLogSeverityLevel(4);
            // Re-register the external data blob: options objects do not
            // share state, so the CPU options need their own registration.
            if (!extDataBuffer.empty()) {
                std::vector<std::basic_string<ORTCHAR_T>> extFileNames = {
                    extDataPath.filename().wstring()
                };
                std::vector<char*> extBuffers = { extDataBuffer.data() };
                std::vector<size_t> extLengths = { extDataBuffer.size() };
                cpuOpts.AddExternalInitializersFromFilesInMemory(
                    extFileNames, extBuffers, extLengths);
            }
            doCreate(cpuOpts, "CPU fallback");
        } else {
            throw; // CWD restored by cwdGuard even on this path
        }
    }
    // Free staging buffers (session holds its own copies by now).
    extDataBuffer.clear(); extDataBuffer.shrink_to_fit();
    modelBuffer.clear(); modelBuffer.shrink_to_fit();
    // -- Read input/output names --
    // inputNames_/outputNames_ own the strings; inputNames/outputNames are
    // the const char* views ORT's Run() API consumes.
    Ort::Allocator allocator(*bundle.session, *m_memInfo);
    size_t numInputs = bundle.session->GetInputCount();
    bundle.inputNames_.resize(numInputs);
    bundle.inputNames.resize(numInputs);
    for (size_t i = 0; i < numInputs; ++i) {
        bundle.inputNames_[i] = OrtCompatiableGetInputName(i, allocator, bundle.session);
        bundle.inputNames[i] = bundle.inputNames_[i].c_str();
    }
    size_t numOutputs = bundle.session->GetOutputCount();
    bundle.outputNames_.resize(numOutputs);
    bundle.outputNames.resize(numOutputs);
    for (size_t i = 0; i < numOutputs; ++i) {
        bundle.outputNames_[i] = OrtCompatiableGetOutputName(i, allocator, bundle.session);
        bundle.outputNames[i] = bundle.outputNames_[i].c_str();
    }
    // Log I/O info (dynamic dims print as -1).
    for (size_t i = 0; i < numInputs; ++i) {
        auto info = bundle.session->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo();
        auto shape = info.GetShape();
        std::cout << "[ONNXSAM3] " << label << " input[" << i << "]: "
                  << bundle.inputNames_[i] << " shape=[";
        for (size_t d = 0; d < shape.size(); ++d) {
            if (d > 0) std::cout << ",";
            std::cout << shape[d];
        }
        std::cout << "]" << std::endl;
    }
    for (size_t i = 0; i < numOutputs; ++i) {
        auto info = bundle.session->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo();
        auto shape = info.GetShape();
        std::cout << "[ONNXSAM3] " << label << " output[" << i << "]: "
                  << bundle.outputNames_[i] << " shape=[";
        for (size_t d = 0; d < shape.size(); ++d) {
            if (d > 0) std::cout << ",";
            std::cout << shape[d];
        }
        std::cout << "]" << std::endl;
    }
    // cwdGuard restores the original working directory on scope exit.
}
// ====================================================================
// Constructor
// ====================================================================
/// @brief Builds the three-session SAM3 pipeline (image encoder, language
///        encoder, decoder) from the given model folder.
/// @param modelFolder  directory containing the three sam3_*.onnx files.
/// @param engineType   preferred execution provider for GPU-eligible sessions.
/// @param num_threads  intra-op thread count for every session.
ONNXSAM3::ONNXSAM3(const std::string& modelFolder,
                   EngineType engineType,
                   unsigned int num_threads)
    : m_engineType(engineType),
      m_numThreads(num_threads),
      m_modelFolder(modelFolder)
{
    // Bind the ORT C API before constructing any Ort:: object.
    const auto& epInfo = EPLoader::Current();
    (void)epInfo; // touched only for its loading side effect
    if (Ort::Global<void>::api_ == nullptr)
        Ort::InitApi(static_cast<const OrtApi*>(EPLoader::GetOrtApiRaw()));
    m_env = new Ort::Env(ORT_LOGGING_LEVEL_ERROR, "ONNXSAM3");
    m_memInfo = new Ort::MemoryInfo(
        Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault));
    // Model file locations inside the folder.
    const std::string encoderPath  = modelFolder + "\\sam3_image_encoder.onnx";
    const std::string languagePath = modelFolder + "\\sam3_language_encoder.onnx";
    const std::string decoderPath  = modelFolder + "\\sam3_decoder.onnx";
    // Create the three sessions.
    // Language encoder runs on CPU: it is only called once per prompt
    // change and keeping it on GPU wastes ~1.5 GB of VRAM that the
    // image encoder and decoder need for their activation tensors.
    // Image encoder uses ORT_ENABLE_BASIC: enables constant folding
    // and redundant-node elimination without the complex fusions
    // (MatMulScaleFusion) that are slower on this model and risk OOM
    // on 8 GB GPUs. Benchmarked: BASIC=4.6s vs ALL=5.2s.
    createSessionBundle(m_imageEncoder, encoderPath, "ImageEncoder", false,
                        GraphOptimizationLevel::ORT_ENABLE_BASIC);
    createSessionBundle(m_langEncoder, languagePath, "LangEncoder", true,
                        GraphOptimizationLevel::ORT_ENABLE_ALL);
    createSessionBundle(m_decoder, decoderPath, "Decoder");
    std::cout << "[ONNXSAM3] All 3 sessions created successfully." << std::endl;
}
// ====================================================================
// Destructor
// ====================================================================
ONNXSAM3::~ONNXSAM3()
{
    // Teardown order matters: sessions must die before the Ort::Env that
    // created them, and member destructors only run after this body — so
    // release everything explicitly, newest-created first.
    // (delete on nullptr is a no-op, so no null checks are required.)
    delete m_decoder.session;      m_decoder.session = nullptr;
    delete m_langEncoder.session;  m_langEncoder.session = nullptr;
    delete m_imageEncoder.session; m_imageEncoder.session = nullptr;
    delete m_memInfo;              m_memInfo = nullptr;
    delete m_env;                  m_env = nullptr;
}
// ====================================================================
// preprocessImage — BGR → RGB, resize to 1008, HWC→CHW, uint8
// ====================================================================
/// @brief Converts a BGR frame into the encoder's uint8 CHW input layout.
/// @param mat     input image (OpenCV BGR, any resolution).
/// @param buffer  out-parameter filled with [3, m_inputSize, m_inputSize]
///                RGB planes, channel-major.
void ONNXSAM3::preprocessImage(const cv::Mat& mat, std::vector<uint8_t>& buffer)
{
    // 3-model image encoder expects uint8 [3, 1008, 1008].
    cv::Mat resized;
    cv::resize(mat, resized, cv::Size(m_inputSize, m_inputSize));
    cv::Mat rgb;
    cv::cvtColor(resized, rgb, cv::COLOR_BGR2RGB);
    const size_t planeSize = static_cast<size_t>(m_inputSize) * m_inputSize;
    buffer.resize(3 * planeSize);
    // HWC → CHW via cv::split + bulk copy (much faster than a per-pixel
    // loop). Each plane produced by cv::split is freshly allocated and
    // therefore continuous, so a flat copy of planeSize bytes is valid.
    // BUGFIX: std::copy_n (from <algorithm>, already included) replaces
    // std::memcpy, which was used without including <cstring>.
    cv::Mat channels[3];
    cv::split(rgb, channels);
    for (int c = 0; c < 3; ++c)
        std::copy_n(channels[c].ptr<uint8_t>(0), planeSize,
                    buffer.data() + c * planeSize);
}
// ====================================================================
// setPrompt — run language encoder, cache results
// ====================================================================
/// @brief Runs the language encoder once for a tokenized prompt and caches
///        the mask/feature tensors that detect() feeds to the decoder.
/// @param inputIds       prompt token ids, shape [1, N] (N typically 32).
/// @param attentionMask  accepted for interface compatibility; unused —
///                       the exported encoder takes only "tokens" and
///                       emits its own text_attention_mask output.
void ONNXSAM3::setPrompt(const std::vector<int64_t>& inputIds,
                         const std::vector<int64_t>& attentionMask)
{
    if (!m_langEncoder.session) {
        std::cerr << "[ONNXSAM3] Language encoder not initialized." << std::endl;
        return;
    }
    // Explicitly discard the unused parameter (kept for ABI/API stability).
    (void)attentionMask;
    // Language encoder input: "tokens" [1, 32] int64
    std::vector<int64_t> tokenShape = { 1, static_cast<int64_t>(inputIds.size()) };
    m_tokenLength = static_cast<int>(inputIds.size());
    // Non-const copy: CreateTensor wraps (does not copy) the buffer, so it
    // must be mutable and stay alive until Run() returns.
    std::vector<int64_t> tokenData = inputIds;
    Ort::Value tokenTensor = Ort::Value::CreateTensor<int64_t>(
        *m_memInfo, tokenData.data(), tokenData.size(),
        tokenShape.data(), tokenShape.size());
    std::vector<Ort::Value> inputs;
    inputs.push_back(std::move(tokenTensor));
    // Run language encoder
    std::cout << "[ONNXSAM3] Running language encoder..." << std::endl;
    auto outputs = m_langEncoder.session->Run(
        Ort::RunOptions{nullptr},
        m_langEncoder.inputNames.data(),
        inputs.data(),
        inputs.size(),
        m_langEncoder.outputNames.data(),
        m_langEncoder.outputNames.size());
    // Language encoder outputs (from Python analysis):
    //   output[0]: text_attention_mask [1, 32] bool     → "language_mask"
    //   output[1]: text_memory [32, 1, 256] float32     → "language_features"
    //   output[2]: text_embeds [32, 1, 1024] float32    → NOT used by decoder
    // BUGFIX: guard the positional fallback below — with fewer than two
    // outputs, the old featIdx = 1 default indexed past the end of outputs.
    if (outputs.size() < 2) {
        std::cerr << "[ONNXSAM3] Language encoder returned only "
                  << outputs.size() << " outputs (expected >= 2)." << std::endl;
        return;
    }
    // Find outputs by name or fall back to positional convention.
    int maskIdx = -1, featIdx = -1;
    for (size_t i = 0; i < m_langEncoder.outputNames_.size(); ++i) {
        const auto& name = m_langEncoder.outputNames_[i];
        if (name.find("attention_mask") != std::string::npos ||
            name.find("text_attention") != std::string::npos) {
            maskIdx = static_cast<int>(i);
        }
        else if (name.find("text_memory") != std::string::npos ||
                 name.find("memory") != std::string::npos) {
            featIdx = static_cast<int>(i);
        }
    }
    // Fallback: first output is mask, second is features.
    if (maskIdx < 0) maskIdx = 0;
    if (featIdx < 0) featIdx = 1;
    // Cache language mask (bool → one byte per element).
    {
        auto info = outputs[maskIdx].GetTensorTypeAndShapeInfo();
        m_cachedLangMaskShape = info.GetShape();
        size_t count = info.GetElementCount();
        const bool* data = outputs[maskIdx].GetTensorData<bool>();
        m_cachedLangMask.resize(count);
        for (size_t i = 0; i < count; ++i)
            m_cachedLangMask[i] = data[i] ? 1 : 0;
    }
    // Cache language features (float32).
    {
        auto info = outputs[featIdx].GetTensorTypeAndShapeInfo();
        m_cachedLangFeaturesShape = info.GetShape();
        size_t count = info.GetElementCount();
        const float* data = outputs[featIdx].GetTensorData<float>();
        m_cachedLangFeatures.assign(data, data + count);
    }
    m_promptSet = true;
    std::cout << "[ONNXSAM3] Language encoder done. Mask shape=[";
    for (size_t i = 0; i < m_cachedLangMaskShape.size(); ++i) {
        if (i > 0) std::cout << ",";
        std::cout << m_cachedLangMaskShape[i];
    }
    std::cout << "] Features shape=[";
    for (size_t i = 0; i < m_cachedLangFeaturesShape.size(); ++i) {
        if (i > 0) std::cout << ",";
        std::cout << m_cachedLangFeaturesShape[i];
    }
    std::cout << "]" << std::endl;
}
// ====================================================================
// detect — image encoder + decoder pipeline
// ====================================================================
/// @brief Runs the full per-frame pipeline: image encoder → decoder, using
///        the language prompt cached by setPrompt().
/// @param mat           BGR input frame (any resolution; resized internally).
/// @param segThreshold  minimum decoder score forwarded to postprocessing.
/// @return one SAM3Result per surviving detection; empty on error or when
///         no prompt has been set.
std::vector<SAM3Result> ONNXSAM3::detect(const cv::Mat& mat, float segThreshold)
{
if (mat.empty()) return {};
if (!m_promptSet) {
std::cerr << "[ONNXSAM3] No prompt set. Call setPrompt() first." << std::endl;
return {};
}
const int origW = mat.cols;
const int origH = mat.rows;
// ---- 1) Image Encoder ----
// NOTE: imgBuffer must stay alive until Run() returns — CreateTensor
// wraps the caller's buffer rather than copying it.
std::vector<uint8_t> imgBuffer;
preprocessImage(mat, imgBuffer);
std::vector<int64_t> imgShape = { 3, m_inputSize, m_inputSize };
Ort::Value imgTensor = Ort::Value::CreateTensor<uint8_t>(
*m_memInfo, imgBuffer.data(), imgBuffer.size(),
imgShape.data(), imgShape.size());
std::vector<Ort::Value> imgInputs;
imgInputs.push_back(std::move(imgTensor));
auto imgOutputs = m_imageEncoder.session->Run(
Ort::RunOptions{nullptr},
m_imageEncoder.inputNames.data(),
imgInputs.data(),
imgInputs.size(),
m_imageEncoder.outputNames.data(),
m_imageEncoder.outputNames.size());
// Image encoder outputs (6 total, matched by name):
// vision_pos_enc_0/1/2 — only _2 used by decoder
// backbone_fpn_0/1/2 — all 3 used by decoder
// Build a map from image encoder output names to indices
std::unordered_map<std::string, int> imgOutputMap;
for (size_t i = 0; i < m_imageEncoder.outputNames_.size(); ++i)
imgOutputMap[m_imageEncoder.outputNames_[i]] = static_cast<int>(i);
// Release unused outputs (vision_pos_enc_0, vision_pos_enc_1) to free
// GPU memory before running the decoder. These are ~105 MB on CUDA.
// Overwriting with an empty Ort::Value drops the underlying OrtValue.
for (size_t i = 0; i < m_imageEncoder.outputNames_.size(); ++i) {
const auto& name = m_imageEncoder.outputNames_[i];
if (name == "vision_pos_enc_0" || name == "vision_pos_enc_1") {
imgOutputs[i] = Ort::Value(nullptr);
}
}
// ---- 2) Build decoder inputs ----
size_t numDecInputs = m_decoder.inputNames.size();
std::vector<Ort::Value> decInputs;
decInputs.reserve(numDecInputs);
// Prepare scalar and prompt tensors.
// NOTE: these stack buffers are wrapped, not copied, by CreateTensor —
// they must outlive the decoder Run() below (they do: same scope).
int64_t origHeightVal = static_cast<int64_t>(origH);
int64_t origWidthVal = static_cast<int64_t>(origW);
std::vector<int64_t> scalarShape = {}; // scalar = 0-dim tensor
float boxCoordsData[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
std::vector<int64_t> boxCoordsShape = { 1, 1, 4 };
int64_t boxLabelsData[1] = { -1 }; // no box prompt
std::vector<int64_t> boxLabelsShape = { 1, 1 };
bool boxMasksData[1] = { false }; // no box prompt (language-only grounding)
std::vector<int64_t> boxMasksShape = { 1, 1 };
// Build input tensors in the order expected by the decoder (matched by
// the decoder's own input names, so graph input order is respected).
for (size_t i = 0; i < numDecInputs; ++i) {
const std::string& name = m_decoder.inputNames_[i];
if (name == "original_height") {
decInputs.push_back(Ort::Value::CreateTensor<int64_t>(
*m_memInfo, &origHeightVal, 1, scalarShape.data(), scalarShape.size()));
}
else if (name == "original_width") {
decInputs.push_back(Ort::Value::CreateTensor<int64_t>(
*m_memInfo, &origWidthVal, 1, scalarShape.data(), scalarShape.size()));
}
else if (name == "backbone_fpn_0" || name == "backbone_fpn_1" ||
name == "backbone_fpn_2" || name == "vision_pos_enc_2") {
// Find matching image encoder output by name and MOVE it into the
// decoder input list (avoids copying the large feature tensors).
auto it = imgOutputMap.find(name);
if (it != imgOutputMap.end()) {
decInputs.push_back(std::move(imgOutputs[it->second]));
} else {
std::cerr << "[ONNXSAM3] Image encoder output not found: " << name << std::endl;
float dummy = 0.0f;
std::vector<int64_t> dummyShape = { 1 };
decInputs.push_back(Ort::Value::CreateTensor<float>(
*m_memInfo, &dummy, 1, dummyShape.data(), dummyShape.size()));
}
}
else if (name == "language_mask") {
// NOTE(review): assumes m_cachedLangMask elements are exactly one
// byte each so the reinterpret_cast to bool* is layout-compatible —
// confirm the member's element type in the header.
decInputs.push_back(Ort::Value::CreateTensor<bool>(
*m_memInfo,
reinterpret_cast<bool*>(m_cachedLangMask.data()),
m_cachedLangMask.size(),
m_cachedLangMaskShape.data(),
m_cachedLangMaskShape.size()));
}
else if (name == "language_features") {
decInputs.push_back(Ort::Value::CreateTensor<float>(
*m_memInfo,
m_cachedLangFeatures.data(),
m_cachedLangFeatures.size(),
m_cachedLangFeaturesShape.data(),
m_cachedLangFeaturesShape.size()));
}
else if (name == "box_coords") {
decInputs.push_back(Ort::Value::CreateTensor<float>(
*m_memInfo, boxCoordsData, 4,
boxCoordsShape.data(), boxCoordsShape.size()));
}
else if (name == "box_labels") {
decInputs.push_back(Ort::Value::CreateTensor<int64_t>(
*m_memInfo, boxLabelsData, 1,
boxLabelsShape.data(), boxLabelsShape.size()));
}
else if (name == "box_masks") {
decInputs.push_back(Ort::Value::CreateTensor<bool>(
*m_memInfo, boxMasksData, 1,
boxMasksShape.data(), boxMasksShape.size()));
}
else {
std::cerr << "[ONNXSAM3] Unknown decoder input: " << name << std::endl;
// Create a dummy scalar float tensor
float dummy = 0.0f;
std::vector<int64_t> dummyShape = { 1 };
decInputs.push_back(Ort::Value::CreateTensor<float>(
*m_memInfo, &dummy, 1, dummyShape.data(), dummyShape.size()));
}
}
// ---- Debug: print decoder input stats for comparison with ANSSAM3 ----
// NOTE: this block makes a full pass over every float input tensor on
// every call — it is diagnostic output, not needed for inference.
{
std::cout << "[ONNXSAM3] Decoder inputs before Run:" << std::endl;
for (size_t di = 0; di < numDecInputs; ++di) {
const std::string& dname = m_decoder.inputNames_[di];
auto info = decInputs[di].GetTensorTypeAndShapeInfo();
auto shape = info.GetShape();
auto elemType = info.GetElementType();
std::cout << " " << dname << " type=" << elemType << " shape=[";
for (size_t d = 0; d < shape.size(); ++d) {
if (d > 0) std::cout << ",";
std::cout << shape[d];
}
std::cout << "]";
// Print mean/first5 for float tensors
if (elemType == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT && !shape.empty()) {
size_t numElems = info.GetElementCount();
if (numElems > 0 && numElems < 100000000) {
const float* data = decInputs[di].GetTensorData<float>();
double sum = 0;
for (size_t k = 0; k < numElems; ++k) sum += data[k];
double mean = sum / numElems;
std::cout << " mean=" << mean << " first5:";
for (size_t k = 0; k < std::min(numElems, (size_t)5); ++k)
std::cout << " " << data[k];
}
}
// Print bool tensor values (for language_mask)
else if (elemType == ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL && !shape.empty()) {
size_t numElems = info.GetElementCount();
const bool* data = decInputs[di].GetTensorData<bool>();
std::cout << " vals:";
for (size_t k = 0; k < std::min(numElems, (size_t)32); ++k)
std::cout << " " << (int)data[k];
}
// Print int64 scalar value
else if (elemType == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64 && shape.empty()) {
const int64_t* data = decInputs[di].GetTensorData<int64_t>();
std::cout << " value=" << data[0];
}
std::cout << std::endl;
}
}
// ---- 3) Run decoder ----
auto decOutputs = m_decoder.session->Run(
Ort::RunOptions{nullptr},
m_decoder.inputNames.data(),
decInputs.data(),
decInputs.size(),
m_decoder.outputNames.data(),
m_decoder.outputNames.size());
// Decoder outputs (from Python analysis):
// output[0]: boxes [N, 4] float32
// output[1]: scores [N] float32
// output[2]: masks [N, 1, H, W] bool
// Find output indices by name (positional defaults if no name matches).
int boxesIdx = 0, scoresIdx = 1, masksIdx = 2;
for (size_t i = 0; i < m_decoder.outputNames_.size(); ++i) {
const auto& name = m_decoder.outputNames_[i];
if (name.find("box") != std::string::npos) boxesIdx = static_cast<int>(i);
else if (name.find("score") != std::string::npos) scoresIdx = static_cast<int>(i);
else if (name.find("mask") != std::string::npos) masksIdx = static_cast<int>(i);
}
// Get boxes
auto boxInfo = decOutputs[boxesIdx].GetTensorTypeAndShapeInfo();
auto boxShape = boxInfo.GetShape();
int numBoxes = (boxShape.size() >= 1) ? static_cast<int>(boxShape[0]) : 0;
const float* boxesData = decOutputs[boxesIdx].GetTensorData<float>();
// Get scores
const float* scoresData = decOutputs[scoresIdx].GetTensorData<float>();
// Get masks
auto maskInfo = decOutputs[masksIdx].GetTensorTypeAndShapeInfo();
auto maskShape = maskInfo.GetShape();
// masks shape: [N, 1, H, W]
int maskH = (maskShape.size() >= 3) ? static_cast<int>(maskShape[2]) : 0;
int maskW = (maskShape.size() >= 4) ? static_cast<int>(maskShape[3]) : 0;
const bool* masksData = decOutputs[masksIdx].GetTensorData<bool>();
m_maskH = maskH;
m_maskW = maskW;
std::cout << "[ONNXSAM3] Decoder: " << numBoxes << " detections, "
<< "mask=" << maskH << "x" << maskW << std::endl;
// NOTE: the raw output pointers above remain owned by decOutputs, which
// stays alive through this call — postprocessResults must not retain them.
return postprocessResults(boxesData, numBoxes, scoresData,
masksData, maskH, maskW,
origW, origH, segThreshold);
}
// ====================================================================
// postprocessResults — convert decoder outputs to SAM3Result
// ====================================================================
/// @brief Converts raw decoder tensors into SAM3Result objects: threshold
///        by score, clamp boxes, upscale per-instance masks, and polygonize.
/// @param boxesData      [N, 4] float32 boxes, [x1, y1, x2, y2] in original
///                       image coordinates (borrowed pointer, not retained).
/// @param numBoxes       number of detections N.
/// @param scoresData     [N] float32 confidence scores.
/// @param masksData      [N, 1, maskH, maskW] bool instance masks.
/// @param maskH, maskW   decoder mask plane dimensions.
/// @param origWidth, origHeight  original image dimensions for clamping
///                       and mask upscaling.
/// @param scoreThreshold minimum score a detection must reach.
/// @return surviving detections with box, confidence, ROI mask and polygon.
std::vector<SAM3Result> ONNXSAM3::postprocessResults(
    const float* boxesData, int numBoxes,
    const float* scoresData,
    const bool* masksData, int maskH, int maskW,
    int origWidth, int origHeight,
    float scoreThreshold)
{
    std::vector<SAM3Result> results;
    // Robustness: degenerate decoder output (no detections, empty mask
    // plane, or null tensors) yields an empty result set instead of
    // constructing invalid cv::Mat headers or dereferencing null below.
    if (!boxesData || !scoresData || !masksData ||
        numBoxes <= 0 || maskH <= 0 || maskW <= 0 ||
        origWidth <= 0 || origHeight <= 0)
        return results;
    // masksData layout: [N, 1, H, W] — stride between instances in elements.
    const size_t instanceStride = static_cast<size_t>(maskH) * maskW;
    for (int i = 0; i < numBoxes; ++i) {
        float score = scoresData[i];
        if (score < scoreThreshold)
            continue;
        // Box: [x1, y1, x2, y2] in original image coordinates
        float x1 = boxesData[i * 4 + 0];
        float y1 = boxesData[i * 4 + 1];
        float x2 = boxesData[i * 4 + 2];
        float y2 = boxesData[i * 4 + 3];
        // Clamp to image bounds
        x1 = std::max(0.0f, std::min(x1, static_cast<float>(origWidth)));
        y1 = std::max(0.0f, std::min(y1, static_cast<float>(origHeight)));
        x2 = std::max(0.0f, std::min(x2, static_cast<float>(origWidth)));
        y2 = std::max(0.0f, std::min(y2, static_cast<float>(origHeight)));
        SAM3Result obj;
        obj.box = cv::Rect(
            static_cast<int>(x1), static_cast<int>(y1),
            static_cast<int>(x2 - x1), static_cast<int>(y2 - y1));
        obj.confidence = score;
        if (obj.box.width <= 0 || obj.box.height <= 0)
            continue;
        // Expand this instance's bool mask to {0,255} bytes, row by row
        // (row pointers avoid per-element Mat::at index arithmetic).
        const bool* instanceMask = masksData + static_cast<size_t>(i) * instanceStride;
        cv::Mat boolMask(maskH, maskW, CV_8UC1);
        for (int y = 0; y < maskH; ++y) {
            uint8_t* dst = boolMask.ptr<uint8_t>(y);
            const bool* src = instanceMask + static_cast<size_t>(y) * maskW;
            for (int x = 0; x < maskW; ++x)
                dst[x] = src[x] ? 255 : 0;
        }
        // Upscale to original resolution, then re-binarize (INTER_LINEAR
        // produces intermediate grey values at the mask boundary).
        cv::Mat fullMask;
        cv::resize(boolMask, fullMask, cv::Size(origWidth, origHeight),
                   0, 0, cv::INTER_LINEAR);
        cv::threshold(fullMask, fullMask, 127, 255, cv::THRESH_BINARY);
        // Crop mask to the bounding box; stored ROI-relative.
        obj.mask = fullMask(obj.box).clone();
        // Polygonize: largest external contour, simplified to ~1% of its
        // arc length. findContours gets a clone because it may modify input.
        std::vector<std::vector<cv::Point>> contours;
        cv::findContours(obj.mask.clone(), contours, cv::RETR_EXTERNAL,
                         cv::CHAIN_APPROX_SIMPLE);
        if (!contours.empty()) {
            // Use the largest contour
            int largestIdx = 0;
            double largestArea = 0;
            for (int c = 0; c < static_cast<int>(contours.size()); ++c) {
                double area = cv::contourArea(contours[c]);
                if (area > largestArea) {
                    largestArea = area;
                    largestIdx = c;
                }
            }
            std::vector<cv::Point> approx;
            double epsilon = 0.01 * cv::arcLength(contours[largestIdx], true);
            cv::approxPolyDP(contours[largestIdx], approx, epsilon, true);
            // Shift ROI-relative contour points back to image coordinates.
            for (const auto& pt : approx) {
                obj.polygon.emplace_back(
                    static_cast<float>(pt.x + obj.box.x),
                    static_cast<float>(pt.y + obj.box.y));
            }
        }
        // Fallback: degenerate contour → use the box corners as the polygon.
        if (obj.polygon.size() < 3) {
            obj.polygon = {
                cv::Point2f(static_cast<float>(obj.box.x),
                            static_cast<float>(obj.box.y)),
                cv::Point2f(static_cast<float>(obj.box.x + obj.box.width),
                            static_cast<float>(obj.box.y)),
                cv::Point2f(static_cast<float>(obj.box.x + obj.box.width),
                            static_cast<float>(obj.box.y + obj.box.height)),
                cv::Point2f(static_cast<float>(obj.box.x),
                            static_cast<float>(obj.box.y + obj.box.height))
            };
        }
        results.push_back(std::move(obj));
    }
    return results;
}
} // namespace ANSCENTER