#include "ANSSAM3.h"
#include "ANSCLIPTokenizer.h"
#include "Utility.h"
#include <NvInferPlugin.h>
#include <opencv2/opencv.hpp>
#include <fstream>
#include <filesystem>
#include <cmath>
#include <cstring>   // std::memcpy in the FP16 helpers below
#include <cctype>    // std::isspace in EngineFileName
#include <algorithm>
#include <numeric>
namespace ANSCENTER
{
// =========================================================================
// Helpers
// =========================================================================
// Portable FP16 ↔ FP32 conversion (works in plain C++ without NVCC)
static float fp16ToFloat(uint16_t h)
{
uint32_t sign = static_cast<uint32_t>(h >> 15) << 31;
uint32_t expo = (h >> 10) & 0x1Fu;
uint32_t mant = h & 0x3FFu;
if (expo == 0) {
if (mant == 0) { float f; std::memcpy(&f, &sign, 4); return f; } // ±0
// Subnormal: normalize the mantissa. expo (unsigned) wraps below zero
// here; the wraparound cancels out in the biased-exponent sum below.
while (!(mant & 0x400u)) { mant <<= 1; expo--; }
expo++; mant &= 0x3FFu;
} else if (expo == 31) {
// Inf/NaN: the FP32 exponent field must be all ones (255), not 31 + 112,
// so build the word directly instead of falling through.
uint32_t bits = sign | 0x7F800000u | (mant << 13);
float f; std::memcpy(&f, &bits, 4); return f;
}
uint32_t bits = sign | ((expo + 127u - 15u) << 23) | (mant << 13);
float f; std::memcpy(&f, &bits, 4); return f;
}
static uint16_t floatToFp16(float val)
{
uint32_t bits; std::memcpy(&bits, &val, 4);
uint16_t sign = static_cast<uint16_t>((bits >> 16) & 0x8000u);
int32_t expo = ((bits >> 23) & 0xFF) - 127 + 15;
uint32_t mant = bits & 0x7FFFFFu;
if (expo <= 0) return sign; // underflow → ±0
if (expo >= 31) return sign | 0x7C00u; // overflow (and NaN) → ±inf
return sign | static_cast<uint16_t>(expo << 10) | static_cast<uint16_t>(mant >> 13);
}
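// Round-trip example: floatToFp16(1.0f) == 0x3C00 and fp16ToFloat(0x3C00) == 1.0f;
// values beyond half range saturate, e.g. floatToFp16(1e6f) == 0x7C00 (+inf).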
size_t ANSSAM3::DataTypeSize(nvinfer1::DataType dtype)
{
switch (dtype) {
case nvinfer1::DataType::kFLOAT: return 4;
case nvinfer1::DataType::kHALF: return 2;
case nvinfer1::DataType::kINT32: return 4;
case nvinfer1::DataType::kINT64: return 8;
case nvinfer1::DataType::kINT8: return 1;
case nvinfer1::DataType::kBOOL: return 1;
#if NV_TENSORRT_MAJOR >= 10
case nvinfer1::DataType::kUINT8: return 1;
case nvinfer1::DataType::kFP8: return 1;
case nvinfer1::DataType::kBF16: return 2;
case nvinfer1::DataType::kINT4: return 1; // conservative — 4-bit packed
#endif
default: return 4;
}
}
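// Example: a [256, 4] kFLOAT tensor occupies 256 * 4 * DataTypeSize(kFLOAT)
// = 4096 bytes of buffer space.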
void ANSSAM3::TRTBundle::destroy()
{
context.reset();
engine.reset();
runtime.reset();
for (int i = 0; i < (int)gpuBuffers.size(); ++i) {
if (!gpuBuffers[i]) continue;
if (hostBufferIdx.count(i))
free(gpuBuffers[i]); // host-allocated (shape tensor)
else
cudaFree(gpuBuffers[i]); // device-allocated
gpuBuffers[i] = nullptr;
}
gpuBuffers.clear();
gpuBufferSizes.clear();
hostBufferIdx.clear();
nameToIdx.clear();
}
// (CreateOrtDecoderSession removed — decoder now runs under TRT)
// -----------------------------------------------------------------
// PassthroughOutputAllocator — lightweight IOutputAllocator that simply
// returns the pre-allocated gpuBuffer. Defined here (not in the header)
// so the vtable and CUDA symbols stay inside the engine DLL.
// Created on the stack in Detect() — no persistent class members needed.
// -----------------------------------------------------------------
#if NV_TENSORRT_MAJOR >= 10
struct PassthroughOutputAllocator : public nvinfer1::IOutputAllocator
{
void* preAllocBuf; // existing gpuBuffers[idx]
size_t preAllocSize; // existing gpuBufferSizes[idx]
void* grownBuf = nullptr; // fallback allocation owned (and freed) by this object
nvinfer1::Dims actualDims{};
bool shapeKnown = false;
PassthroughOutputAllocator(void* buf, size_t sz)
: preAllocBuf(buf), preAllocSize(sz) {}
~PassthroughOutputAllocator() { if (grownBuf) cudaFree(grownBuf); }
void* reallocateOutput(
char const* /*tensorName*/, void* /*currentMemory*/,
uint64_t size, uint64_t /*alignment*/) noexcept override
{
if (size <= preAllocSize) return preAllocBuf;
// Fallback: grow (should not happen with generous pre-allocation).
// Track the allocation so the destructor can release it.
void* newBuf = nullptr;
if (cudaMalloc(&newBuf, size) == cudaSuccess) {
if (grownBuf) cudaFree(grownBuf);
grownBuf = newBuf;
preAllocBuf = newBuf;
preAllocSize = size;
}
return preAllocBuf;
}
void notifyShape(char const* /*tensorName*/,
nvinfer1::Dims const& dims) noexcept override
{
actualDims = dims;
shapeKnown = true;
}
};
#endif
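// Usage: Detect() creates one stack-local allocator per data-dependent decoder
// output and registers it via context->setOutputAllocator() before enqueueV3;
// notifyShape() then reports the true output dims once inference completes.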
// =========================================================================
// EngineFileName — generate cache path: <stem>.engine.<GPUName>.<fp16|fp32>
// =========================================================================
std::string ANSSAM3::EngineFileName(const std::string& onnxPath, TrtPrecision precision) const
{
// Extract stem from ONNX path
std::filesystem::path p(onnxPath);
std::string stem = p.stem().string();
// Get GPU name
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
std::string gpuName(prop.name);
// Cast to unsigned char: passing a plain (possibly negative) char to isspace is UB.
gpuName.erase(std::remove_if(gpuName.begin(), gpuName.end(),
[](unsigned char c) { return std::isspace(c) != 0; }), gpuName.end());
// BF16 requires Ampere+ (compute capability >= 8.0).
// On older GPUs BuildAndLoadEngine silently falls back to FP32,
// so the filename must match to avoid a .bf16 / .fp32 mismatch.
TrtPrecision effective = precision;
if (precision == TrtPrecision::BF16 && prop.major < 8)
effective = TrtPrecision::FP32;
std::string precStr;
switch (effective) {
case TrtPrecision::FP16: precStr = "fp16"; break;
case TrtPrecision::BF16: precStr = "bf16"; break;
case TrtPrecision::FP32: precStr = "fp32"; break;
}
std::string dir = p.parent_path().string();
return dir + "\\" + stem + ".engine." + gpuName + "." + precStr;
}
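// Example (hypothetical GPU): "sam3_decoder.onnx" built at FP16 on an
// "NVIDIA GeForce RTX 3080" caches as
// "<model dir>\sam3_decoder.engine.NVIDIAGeForceRTX3080.fp16".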
// =========================================================================
// BuildAndLoadEngine — build TRT engine from ONNX + load for inference
// =========================================================================
bool ANSSAM3::BuildAndLoadEngine(TRTBundle& bundle, const std::string& onnxPath,
const std::string& label, TrtPrecision precision)
{
// Register TRT built-in plugins (needed for RoiAlign in decoder, etc.)
// Safe to call multiple times — idempotent.
initLibNvInferPlugins(&m_trtLogger, "");
std::string enginePath = EngineFileName(onnxPath, precision);
// Check for cached engine
if (FileExist(enginePath)) {
std::cout << "[ANSSAM3] " << label << ": cached engine found: " << enginePath << std::endl;
return LoadTRTEngineBundle(bundle, enginePath, label);
}
// --- Build from ONNX ---
std::cout << "[ANSSAM3] " << label << ": building TRT engine from " << onnxPath << std::endl;
if (!FileExist(onnxPath)) {
_logger.LogError("ANSSAM3::BuildAndLoadEngine", label + ": ONNX file not found: " + onnxPath, __FILE__, __LINE__);
return false;
}
auto builder = std::unique_ptr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(m_trtLogger));
if (!builder) {
_logger.LogError("ANSSAM3::BuildAndLoadEngine", label + ": createInferBuilder failed", __FILE__, __LINE__);
return false;
}
auto network = std::unique_ptr<nvinfer1::INetworkDefinition>(TRT_CREATE_NETWORK(builder));
if (!network) {
_logger.LogError("ANSSAM3::BuildAndLoadEngine", label + ": createNetworkV2 failed", __FILE__, __LINE__);
return false;
}
auto parser = std::unique_ptr<nvonnxparser::IParser>(nvonnxparser::createParser(*network, m_trtLogger));
if (!parser) {
_logger.LogError("ANSSAM3::BuildAndLoadEngine", label + ": createParser failed", __FILE__, __LINE__);
return false;
}
// parseFromFile resolves .onnx_data external files relative to the ONNX directory
if (!parser->parseFromFile(onnxPath.c_str(), static_cast<int>(nvinfer1::ILogger::Severity::kWARNING))) {
for (int i = 0; i < parser->getNbErrors(); ++i)
std::cerr << "[ANSSAM3] " << label << " parse error: " << parser->getError(i)->desc() << std::endl;
_logger.LogError("ANSSAM3::BuildAndLoadEngine", label + ": parseFromFile failed", __FILE__, __LINE__);
return false;
}
std::cout << "[ANSSAM3] " << label << ": ONNX parsed successfully." << std::endl;
// --- Log and configure input tensors ---
auto config = std::unique_ptr<nvinfer1::IBuilderConfig>(builder->createBuilderConfig());
// Workspace governs max scratch memory TRT can use at runtime.
// ImageEncoder FP32 needs huge scratch (3.5 GiB at 4 GiB workspace) which
// causes OOM on 8 GiB GPUs. Cap it to 2 GiB — TRT picks leaner tactics
// with minimal quality loss. LangEncoder/Decoder are small; keep 4 GiB.
const bool isImageEncoder = (label.find("ImageEncoder") != std::string::npos);
size_t workspaceBytes = isImageEncoder
? 2048ULL * 1024 * 1024 // 2 GiB for ImageEncoder (prevents OOM)
: 4096ULL * 1024 * 1024; // 4 GiB for LangEncoder / Decoder
#if NV_TENSORRT_MAJOR >= 10
config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, workspaceBytes);
#else
config->setMaxWorkspaceSize(workspaceBytes);
#endif
// Set precision flags
switch (precision) {
case TrtPrecision::BF16:
{
// BF16 requires Ampere+ (compute capability >= 8.0) and TRT 8.6+.
// Check GPU capability, then attempt to set the flag if available.
cudaDeviceProp devProp;
cudaGetDeviceProperties(&devProp, 0);
#if NV_TENSORRT_MAJOR > 8 || (NV_TENSORRT_MAJOR == 8 && NV_TENSORRT_MINOR >= 6)
if (devProp.major >= 8) {
config->setFlag(nvinfer1::BuilderFlag::kBF16);
std::cout << "[ANSSAM3] " << label << ": BF16 precision enabled." << std::endl;
} else {
std::cout << "[ANSSAM3] " << label << ": GPU CC " << devProp.major << "." << devProp.minor
<< " does not support BF16, falling back to FP32." << std::endl;
}
#else
(void)devProp; // suppress unused warning
std::cout << "[ANSSAM3] " << label << ": TensorRT version does not support BF16, falling back to FP32." << std::endl;
#endif
break;
}
case TrtPrecision::FP16:
if (builder->platformHasFastFp16()) {
config->setFlag(nvinfer1::BuilderFlag::kFP16);
std::cout << "[ANSSAM3] " << label << ": FP16 precision enabled." << std::endl;
}
break;
case TrtPrecision::FP32:
// No precision flags = FP32
break;
}
// Create optimization profile with actual ONNX dimensions
auto profile = builder->createOptimizationProfile();
int numInputs = network->getNbInputs();
for (int i = 0; i < numInputs; ++i) {
auto input = network->getInput(i);
const char* name = input->getName();
auto dims = input->getDimensions();
std::cout << "[ANSSAM3] " << label << " input[" << i << "] '" << name << "': [";
for (int d = 0; d < dims.nbDims; ++d) {
if (d > 0) std::cout << ", ";
std::cout << (dims.d[d] == -1 ? "dyn" : std::to_string(dims.d[d]));
}
std::cout << "]" << (input->isShapeTensor() ? " (shape tensor)" : "") << std::endl;
// Shape tensors: scalar int64 inputs whose VALUES determine output shapes
// (e.g. original_height, original_width). Use setShapeValues() not setDimensions().
if (input->isShapeTensor()) {
// nbValues = product of dims; for a scalar (nbDims==0) that is 1
int nbValues = 1;
for (int d = 0; d < dims.nbDims; ++d) {
if (dims.d[d] > 0) nbValues *= dims.d[d];
}
std::vector<int32_t> minV(nbValues, 1);
std::vector<int32_t> optV(nbValues, 1024);
std::vector<int32_t> maxV(nbValues, 4096);
profile->setShapeValues(name, nvinfer1::OptProfileSelector::kMIN, minV.data(), nbValues);
profile->setShapeValues(name, nvinfer1::OptProfileSelector::kOPT, optV.data(), nbValues);
profile->setShapeValues(name, nvinfer1::OptProfileSelector::kMAX, maxV.data(), nbValues);
continue;
}
// Regular execution tensors: replace dynamic dims with concrete values
bool hasDynamic = false;
nvinfer1::Dims fixedDims = dims;
for (int d = 0; d < dims.nbDims; ++d) {
if (dims.d[d] == -1) {
hasDynamic = true;
fixedDims.d[d] = 1; // default batch or sequence
}
}
if (hasDynamic) {
profile->setDimensions(name, nvinfer1::OptProfileSelector::kMIN, fixedDims);
profile->setDimensions(name, nvinfer1::OptProfileSelector::kOPT, fixedDims);
profile->setDimensions(name, nvinfer1::OptProfileSelector::kMAX, fixedDims);
}
}
config->addOptimizationProfile(profile);
// --- Mixed precision for Decoder: keep FP16 for bulk ops but force
// score/NMS/comparison layers to FP32 so that the internal
// thresholding doesn't lose detections due to half-precision
// rounding. We mark layers whose names contain NMS-related
// keywords, plus all layers of types that perform comparisons
// or index-selection (which are part of the NMS pipeline).
if (precision == TrtPrecision::FP16 && label == std::string("Decoder")) {
config->setFlag(nvinfer1::BuilderFlag::kPREFER_PRECISION_CONSTRAINTS);
int numLayers = network->getNbLayers();
int markedCount = 0;
for (int li = 0; li < numLayers; ++li) {
auto* layer = network->getLayer(li);
if (!layer) continue;
std::string lname(layer->getName());
auto ltype = layer->getType();
// Force FP32 on layers involved in score thresholding / NMS:
// - Comparison ops (Greater, Less, Equal)
// - Sigmoid (final score activation)
// - TopK, NonZero, Gather, Select (index-selection in NMS)
// - Any layer whose name hints at score/nms/threshold
bool needFP32 = false;
// By layer type
switch (ltype) {
case nvinfer1::LayerType::kTOPK:
case nvinfer1::LayerType::kGATHER:
case nvinfer1::LayerType::kSELECT:
case nvinfer1::LayerType::kNON_ZERO:
case nvinfer1::LayerType::kSCATTER:
needFP32 = true;
break;
default:
break;
}
// By layer name (ONNX op names often preserved by parser)
if (!needFP32) {
// Convert to lowercase for matching
std::string lower = lname;
std::transform(lower.begin(), lower.end(), lower.begin(), ::tolower);
if (lower.find("score") != std::string::npos ||
lower.find("nms") != std::string::npos ||
lower.find("sigmoid") != std::string::npos ||
lower.find("threshold") != std::string::npos ||
lower.find("greater") != std::string::npos ||
lower.find("less") != std::string::npos ||
lower.find("where") != std::string::npos ||
lower.find("nonzero") != std::string::npos ||
lower.find("topk") != std::string::npos) {
needFP32 = true;
}
}
// TensorRT forbids setPrecision(kFLOAT) on layers that
// produce non-float types (booleans, indices/int32, int64).
// Only force FP32 when ALL outputs are floating-point.
if (needFP32) {
bool allFloat = true;
for (int oi = 0; oi < layer->getNbOutputs(); ++oi) {
auto dt = layer->getOutputType(oi);
if (dt != nvinfer1::DataType::kFLOAT &&
dt != nvinfer1::DataType::kHALF) {
allFloat = false;
break;
}
}
if (allFloat) {
layer->setPrecision(nvinfer1::DataType::kFLOAT);
for (int oi = 0; oi < layer->getNbOutputs(); ++oi)
layer->setOutputType(oi, nvinfer1::DataType::kFLOAT);
++markedCount;
}
}
}
std::cout << "[ANSSAM3] " << label << ": mixed precision — "
<< markedCount << "/" << numLayers
<< " layers forced to FP32 (score/NMS ops)." << std::endl;
}
// --- Build serialized engine ---
std::cout << "[ANSSAM3] " << label << ": building engine (this may take a few minutes)..." << std::endl;
unsigned long sehCode = 0;
auto plan = std::unique_ptr<nvinfer1::IHostMemory>(
buildSerializedNetworkSafe(builder.get(), *network, *config, &sehCode));
if (sehCode != 0) {
_logger.LogError("ANSSAM3::BuildAndLoadEngine",
label + ": engine build crashed: " + formatCrashCode(sehCode), __FILE__, __LINE__);
return false;
}
if (!plan) {
_logger.LogError("ANSSAM3::BuildAndLoadEngine", label + ": buildSerializedNetwork returned null", __FILE__, __LINE__);
return false;
}
// --- Save to disk ---
std::ofstream outFile(enginePath, std::ios::binary);
if (!outFile.is_open()) {
_logger.LogError("ANSSAM3::BuildAndLoadEngine", label + ": cannot write engine file: " + enginePath, __FILE__, __LINE__);
return false;
}
outFile.write(reinterpret_cast<const char*>(plan->data()), plan->size());
outFile.close();
std::cout << "[ANSSAM3] " << label << ": engine saved to " << enginePath << std::endl;
plan.reset();
// --- Load the just-built engine ---
return LoadTRTEngineBundle(bundle, enginePath, label);
}
// =========================================================================
// LoadTRTEngineBundle — deserialize engine, allocate GPU buffers, bind
// =========================================================================
bool ANSSAM3::LoadTRTEngineBundle(TRTBundle& bundle, const std::string& enginePath, const std::string& label)
{
// Read engine file
std::ifstream file(enginePath, std::ios::binary | std::ios::ate);
if (!file.is_open()) {
_logger.LogError("ANSSAM3::LoadTRTEngineBundle", label + ": cannot open: " + enginePath, __FILE__, __LINE__);
return false;
}
std::streamsize fileSize = file.tellg();
file.seekg(0, std::ios::beg);
std::vector<char> engineData(fileSize);
if (!file.read(engineData.data(), fileSize)) {
_logger.LogError("ANSSAM3::LoadTRTEngineBundle", label + ": read failed", __FILE__, __LINE__);
return false;
}
file.close();
// Deserialize
bundle.runtime = std::unique_ptr<nvinfer1::IRuntime>(nvinfer1::createInferRuntime(m_trtLogger));
if (!bundle.runtime) {
_logger.LogError("ANSSAM3::LoadTRTEngineBundle", label + ": createInferRuntime failed", __FILE__, __LINE__);
return false;
}
unsigned long sehCode = 0;
bundle.engine = std::unique_ptr<nvinfer1::ICudaEngine>(
deserializeCudaEngineSafe(bundle.runtime.get(), engineData.data(), engineData.size(), &sehCode));
if (sehCode != 0) {
_logger.LogError("ANSSAM3::LoadTRTEngineBundle",
label + ": deserialize crashed: " + formatCrashCode(sehCode), __FILE__, __LINE__);
return false;
}
if (!bundle.engine) {
_logger.LogError("ANSSAM3::LoadTRTEngineBundle", label + ": deserialize returned null", __FILE__, __LINE__);
return false;
}
// --- Weight streaming (TRT 10+): keep only a budget of weights on GPU,
// stream the rest from CPU pinned memory on demand.
// Saves ~1.3 GiB VRAM for ImageEncoder (1.8 GiB weights → 512 MiB on GPU).
#if NV_TENSORRT_MAJOR >= 10
{
int64_t streamableBytes = bundle.engine->getStreamableWeightsSize();
if (streamableBytes > 0 && label.find("ImageEncoder") != std::string::npos) {
// Budget = how much weight memory stays on GPU.
// 512 MiB keeps hot layers cached; rest streamed via PCIe.
const int64_t budgetBytes = 512LL * 1024 * 1024;
int64_t actualBudget = std::min(budgetBytes, streamableBytes);
bundle.engine->setWeightStreamingBudgetV2(actualBudget);
std::cout << "[ANSSAM3] " << label
<< ": weight streaming enabled (streamable="
<< (streamableBytes / (1024*1024)) << " MiB, budget="
<< (actualBudget / (1024*1024)) << " MiB)" << std::endl;
}
}
#endif
bundle.context = std::unique_ptr<nvinfer1::IExecutionContext>(bundle.engine->createExecutionContext());
if (!bundle.context) {
_logger.LogError("ANSSAM3::LoadTRTEngineBundle", label + ": createExecutionContext failed", __FILE__, __LINE__);
return false;
}
// Set optimization profile
int numProfiles = bundle.engine->getNbOptimizationProfiles();
if (numProfiles > 0) {
bundle.context->setOptimizationProfileAsync(0, m_cudaStream);
cudaStreamSynchronize(m_cudaStream);
}
// Allocate buffers (device for execution tensors, host for shape tensors)
const int numTensors = bundle.engine->getNbIOTensors();
bundle.gpuBuffers.resize(numTensors, nullptr);
bundle.gpuBufferSizes.resize(numTensors, 0);
bundle.hostBufferIdx.clear();
bundle.nameToIdx.clear();
for (int i = 0; i < numTensors; ++i) {
const char* name = bundle.engine->getIOTensorName(i);
auto mode = bundle.engine->getTensorIOMode(name);
auto shape = bundle.engine->getTensorShape(name);
auto dtype = bundle.engine->getTensorDataType(name);
auto loc = bundle.engine->getTensorLocation(name);
bool isHost = (loc == nvinfer1::TensorLocation::kHOST);
// Check if any dimension is dynamic (-1)
bool hasDynamic = false;
int64_t numElements = 1;
for (int d = 0; d < shape.nbDims; ++d) {
int64_t v = shape.d[d];
if (v <= 0) { hasDynamic = true; v = 1; }
numElements *= v;
}
// Scalars (0-dim) still need at least 1 element
if (numElements < 1) numElements = 1;
// For output tensors with ANY dynamic dim, pre-allocate a generous buffer.
// The decoder outputs (boxes [-1,4], scores [-1], masks [-1,-1,-1,-1])
// all have data-dependent first dimension from NonZero/NMS.
if (mode == nvinfer1::TensorIOMode::kOUTPUT && hasDynamic) {
// Pre-allocate for up to 256 detections with generous mask size
numElements = 256 * 1 * 256 * 256;
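// (256*1*256*256 = 16,777,216 elements, ≈64 MiB at FP32 per output; the
// PassthroughOutputAllocator can still grow the buffer if a frame needs more.)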
}
size_t bufSize = numElements * DataTypeSize(dtype);
bundle.gpuBufferSizes[i] = bufSize;
if (isHost) {
// Shape tensor — allocate host memory
bundle.gpuBuffers[i] = calloc(numElements, DataTypeSize(dtype));
if (!bundle.gpuBuffers[i]) {
_logger.LogError("ANSSAM3::LoadTRTEngineBundle",
label + ": host alloc failed for " + std::string(name), __FILE__, __LINE__);
return false;
}
bundle.hostBufferIdx.insert(i);
} else {
// Execution tensor — allocate device memory
cudaError_t err = cudaMalloc(&bundle.gpuBuffers[i], bufSize);
if (err != cudaSuccess) {
_logger.LogError("ANSSAM3::LoadTRTEngineBundle",
label + ": cudaMalloc failed for " + std::string(name) + ": " + cudaGetErrorString(err),
__FILE__, __LINE__);
return false;
}
cudaMemset(bundle.gpuBuffers[i], 0, bufSize);
}
// Bind tensor address (host ptr for shape tensors, device ptr for execution tensors)
if (!bundle.context->setTensorAddress(name, bundle.gpuBuffers[i])) {
_logger.LogError("ANSSAM3::LoadTRTEngineBundle",
label + ": setTensorAddress failed for " + std::string(name), __FILE__, __LINE__);
return false;
}
bundle.nameToIdx[std::string(name)] = i;
std::cout << "[ANSSAM3] " << label << " tensor[" << i << "] '"
<< name << "' "
<< (mode == nvinfer1::TensorIOMode::kINPUT ? "INPUT" : "OUTPUT")
<< (isHost ? " HOST" : " DEVICE")
<< " dtype=" << static_cast<int>(dtype)
<< " bufSize=" << bufSize << std::endl;
}
// Set input shapes (replace dynamic dims with concrete values)
for (int i = 0; i < numTensors; ++i) {
const char* name = bundle.engine->getIOTensorName(i);
if (bundle.engine->getTensorIOMode(name) != nvinfer1::TensorIOMode::kINPUT)
continue;
// Shape tensors (scalar, host memory): TRT reads the value directly
// from the host buffer. setInputShape for scalars uses Dims{0, {}}.
// Write a default value (1024) into the host buffer at load time.
if (bundle.hostBufferIdx.count(i)) {
auto dtype = bundle.engine->getTensorDataType(name);
if (dtype == nvinfer1::DataType::kINT64)
*reinterpret_cast<int64_t*>(bundle.gpuBuffers[i]) = 1024;
else
*reinterpret_cast<int32_t*>(bundle.gpuBuffers[i]) = 1024;
nvinfer1::Dims scalarDims;
scalarDims.nbDims = 0;
bundle.context->setInputShape(name, scalarDims);
continue;
}
auto dims = bundle.engine->getTensorShape(name);
nvinfer1::Dims inputDims = dims;
for (int d = 0; d < inputDims.nbDims; ++d) {
if (inputDims.d[d] == -1)
inputDims.d[d] = 1;
}
if (!bundle.context->setInputShape(name, inputDims)) {
_logger.LogError("ANSSAM3::LoadTRTEngineBundle",
label + ": setInputShape failed for " + std::string(name), __FILE__, __LINE__);
return false;
}
}
std::cout << "[ANSSAM3] " << label << ": loaded successfully (" << numTensors << " tensors)." << std::endl;
return true;
}
// =========================================================================
// EnsureEnginesBuilt — pre-build uncached engines one at a time
// Avoids GPU OOM when building one engine while others are already loaded.
// =========================================================================
bool ANSSAM3::EnsureEnginesBuilt(const std::string& imgOnnx, const std::string& langOnnx, const std::string& decOnnx)
{
struct Job { const std::string* onnx; const char* label; TrtPrecision prec; };
Job jobs[] = {
{&langOnnx, "LangEncoder", TrtPrecision::FP16}, // FP16 — verified identical to FP32
{&decOnnx, "Decoder", TrtPrecision::FP16}, // FP16 decoder
{&imgOnnx, "ImageEncoder", TrtPrecision::FP32}, // FP32 — FP16/BF16 both corrupt backbone FPN
};
for (auto& j : jobs) {
if (!FileExist(EngineFileName(*j.onnx, j.prec))) {
TRTBundle tmp;
if (!BuildAndLoadEngine(tmp, *j.onnx, j.label, j.prec)) {
_logger.LogError("ANSSAM3::EnsureEnginesBuilt",
std::string("Failed to pre-build engine: ") + j.label, __FILE__, __LINE__);
tmp.destroy();
return false;
}
tmp.destroy(); // free GPU memory before next build
}
}
return true;
}
// =========================================================================
// OptimizeModel
// =========================================================================
bool ANSSAM3::OptimizeModel(bool fp16, std::string& optimizedModelFolder)
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (!ANSODBase::OptimizeModel(fp16, optimizedModelFolder))
return false;
_fp16 = fp16;
optimizedModelFolder = _modelFolder;
std::string imgOnnx = CreateFilePath(_modelFolder, "sam3_image_encoder.onnx");
std::string langOnnx = CreateFilePath(_modelFolder, "sam3_language_encoder.onnx");
std::string decOnnx = CreateFilePath(_modelFolder, "sam3_decoder.onnx");
cudaStreamCreateWithFlags(&m_cudaStream, cudaStreamNonBlocking);
// Build engines one at a time, destroying each to free GPU memory
TRTBundle tmp;
bool ok = true;
ok = BuildAndLoadEngine(tmp, imgOnnx, "ImageEncoder", TrtPrecision::FP32);
tmp.destroy();
if (ok) { ok = BuildAndLoadEngine(tmp, langOnnx, "LangEncoder", TrtPrecision::FP16); tmp.destroy(); }
if (ok) { ok = BuildAndLoadEngine(tmp, decOnnx, "Decoder", TrtPrecision::FP16); tmp.destroy(); }
if (m_cudaStream) { cudaStreamDestroy(m_cudaStream); m_cudaStream = nullptr; }
return ok;
}
// =========================================================================
// Initialize
// =========================================================================
bool ANSSAM3::Initialize(std::string licenseKey, ModelConfig modelConfig,
const std::string& modelZipFilePath, const std::string& modelZipPassword,
std::string& labelMap)
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
ModelLoadingGuard mlg(_modelLoading);
try {
bool result = ANSODBase::Initialize(licenseKey, modelConfig, modelZipFilePath, modelZipPassword, labelMap);
if (!result) return false;
_modelConfig.detectionType = DetectionType::SEGMENTATION;
if (_modelConfig.modelConfThreshold < 0.1f)
_modelConfig.modelConfThreshold = 0.5f;
m_segThreshold = _modelConfig.modelConfThreshold;
_fp16 = true;
// Create CUDA stream
cudaSetDevice(_modelConfig.gpuDeviceIndex);
cudaStreamCreateWithFlags(&m_cudaStream, cudaStreamNonBlocking);
// Build/load TRT engines for image + language encoders
std::string imgOnnx = CreateFilePath(_modelFolder, "sam3_image_encoder.onnx");
std::string langOnnx = CreateFilePath(_modelFolder, "sam3_language_encoder.onnx");
std::string decOnnx = CreateFilePath(_modelFolder, "sam3_decoder.onnx");
// Pre-build uncached TRT engines (avoids GPU OOM during build)
if (!EnsureEnginesBuilt(imgOnnx, langOnnx, decOnnx)) {
_modelLoadValid = false;
return false;
}
if (!BuildAndLoadEngine(m_imgEncoder, imgOnnx, "ImageEncoder", TrtPrecision::FP32) ||
!BuildAndLoadEngine(m_langEncoder, langOnnx, "LangEncoder", TrtPrecision::FP16) ||
!BuildAndLoadEngine(m_decoder, decOnnx, "Decoder", TrtPrecision::FP16)) {
_logger.LogError("ANSSAM3::Initialize", "Failed to build/load TRT engines", __FILE__, __LINE__);
_modelLoadValid = false;
return false;
}
_modelLoadValid = true;
_isInitialized = true;
// Load tokenizer
m_tokenizer = std::make_unique<ANSCLIPTokenizer>();
std::string tokenizerPath = CreateFilePath(_modelFolder, "merges.txt");
if (FileExist(tokenizerPath)) {
m_tokenizer->Load(tokenizerPath);
_logger.LogDebug("ANSSAM3::Initialize", "CLIP tokenizer loaded", __FILE__, __LINE__);
}
return true;
}
catch (const std::exception& e) {
_logger.LogFatal("ANSSAM3::Initialize", e.what(), __FILE__, __LINE__);
return false;
}
}
// =========================================================================
// LoadModel
// =========================================================================
bool ANSSAM3::LoadModel(const std::string& modelZipFilePath, const std::string& modelZipPassword)
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
ModelLoadingGuard mlg(_modelLoading);
try {
bool result = ANSODBase::LoadModel(modelZipFilePath, modelZipPassword);
if (!result) return false;
_modelConfig.detectionType = DetectionType::SEGMENTATION;
if (_modelConfig.modelConfThreshold < 0.1f)
_modelConfig.modelConfThreshold = 0.5f;
m_segThreshold = _modelConfig.modelConfThreshold;
_fp16 = true;
cudaSetDevice(_modelConfig.gpuDeviceIndex);
cudaStreamCreateWithFlags(&m_cudaStream, cudaStreamNonBlocking);
std::string imgOnnx = CreateFilePath(_modelFolder, "sam3_image_encoder.onnx");
std::string langOnnx = CreateFilePath(_modelFolder, "sam3_language_encoder.onnx");
std::string decOnnx = CreateFilePath(_modelFolder, "sam3_decoder.onnx");
// Pre-build uncached TRT engines (avoids GPU OOM during build)
if (!EnsureEnginesBuilt(imgOnnx, langOnnx, decOnnx)) {
_modelLoadValid = false;
return false;
}
if (!BuildAndLoadEngine(m_imgEncoder, imgOnnx, "ImageEncoder", TrtPrecision::FP32) ||
!BuildAndLoadEngine(m_langEncoder, langOnnx, "LangEncoder", TrtPrecision::FP16) ||
!BuildAndLoadEngine(m_decoder, decOnnx, "Decoder", TrtPrecision::FP16)) {
_logger.LogError("ANSSAM3::LoadModel", "Failed to build/load TRT engines", __FILE__, __LINE__);
_modelLoadValid = false;
return false;
}
_modelLoadValid = true;
_isInitialized = true;
m_tokenizer = std::make_unique<ANSCLIPTokenizer>();
std::string tokenizerPath = CreateFilePath(_modelFolder, "merges.txt");
if (FileExist(tokenizerPath)) {
m_tokenizer->Load(tokenizerPath);
}
return true;
}
catch (const std::exception& e) {
_logger.LogFatal("ANSSAM3::LoadModel", e.what(), __FILE__, __LINE__);
return false;
}
}
// =========================================================================
// LoadModelFromFolder
// =========================================================================
bool ANSSAM3::LoadModelFromFolder(std::string licenseKey, ModelConfig modelConfig,
std::string modelName, std::string className,
const std::string& modelFolder, std::string& labelMap)
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
ModelLoadingGuard mlg(_modelLoading);
try {
bool result = ANSODBase::LoadModelFromFolder(licenseKey, modelConfig, modelName, className, modelFolder, labelMap);
if (!result) return false;
_modelConfig = modelConfig;
_modelConfig.detectionType = DetectionType::SEGMENTATION;
if (_modelConfig.modelConfThreshold < 0.1f)
_modelConfig.modelConfThreshold = 0.5f;
m_segThreshold = _modelConfig.modelConfThreshold;
_fp16 = true;
cudaSetDevice(_modelConfig.gpuDeviceIndex);
cudaStreamCreateWithFlags(&m_cudaStream, cudaStreamNonBlocking);
std::string imgOnnx = CreateFilePath(modelFolder, "sam3_image_encoder.onnx");
std::string langOnnx = CreateFilePath(modelFolder, "sam3_language_encoder.onnx");
std::string decOnnx = CreateFilePath(modelFolder, "sam3_decoder.onnx");
// Pre-build uncached TRT engines (avoids GPU OOM during build)
if (!EnsureEnginesBuilt(imgOnnx, langOnnx, decOnnx)) {
_modelLoadValid = false;
return false;
}
if (!BuildAndLoadEngine(m_imgEncoder, imgOnnx, "ImageEncoder", TrtPrecision::FP32) ||
!BuildAndLoadEngine(m_langEncoder, langOnnx, "LangEncoder", TrtPrecision::FP16) ||
!BuildAndLoadEngine(m_decoder, decOnnx, "Decoder", TrtPrecision::FP16)) {
_logger.LogError("ANSSAM3::LoadModelFromFolder", "Failed to build/load TRT engines", __FILE__, __LINE__);
_modelLoadValid = false;
return false;
}
_modelLoadValid = true;
_isInitialized = true;
m_tokenizer = std::make_unique<ANSCLIPTokenizer>();
std::string tokenizerPath = CreateFilePath(modelFolder, "merges.txt");
if (FileExist(tokenizerPath)) {
m_tokenizer->Load(tokenizerPath);
_logger.LogDebug("ANSSAM3::LoadModelFromFolder", "CLIP tokenizer loaded", __FILE__, __LINE__);
}
return true;
}
catch (const std::exception& e) {
_logger.LogFatal("ANSSAM3::LoadModelFromFolder", e.what(), __FILE__, __LINE__);
return false;
}
}
// =========================================================================
// SetPrompt — run language encoder, cache outputs on GPU
// =========================================================================
bool ANSSAM3::SetPrompt(const std::vector<int64_t>& inputIds, const std::vector<int64_t>& attentionMask)
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (!m_langEncoder.context) {
_logger.LogError("ANSSAM3::SetPrompt", "Language encoder not loaded", __FILE__, __LINE__);
return false;
}
// Language encoder input: "tokens" [1, 32] int64
// Find the tokens input tensor
auto it = m_langEncoder.nameToIdx.find("tokens");
if (it == m_langEncoder.nameToIdx.end()) {
// Try first input
const char* firstName = m_langEncoder.engine->getIOTensorName(0);
it = m_langEncoder.nameToIdx.find(firstName);
}
if (it == m_langEncoder.nameToIdx.end()) {
_logger.LogError("ANSSAM3::SetPrompt", "Cannot find tokens input tensor", __FILE__, __LINE__);
return false;
}
int tokIdx = it->second;
const char* tokName = m_langEncoder.engine->getIOTensorName(tokIdx);
auto tokDtype = m_langEncoder.engine->getTensorDataType(tokName);
// Upload tokens — handle int64 vs int32 data type
if (tokDtype == nvinfer1::DataType::kINT64) {
cudaMemcpyAsync(m_langEncoder.gpuBuffers[tokIdx], inputIds.data(),
inputIds.size() * sizeof(int64_t),
cudaMemcpyHostToDevice, m_cudaStream);
}
else if (tokDtype == nvinfer1::DataType::kINT32) {
// TRT may have converted int64 to int32 at build time
std::vector<int32_t> tokens32(inputIds.size());
for (size_t i = 0; i < inputIds.size(); ++i)
tokens32[i] = static_cast<int32_t>(inputIds[i]);
cudaMemcpyAsync(m_langEncoder.gpuBuffers[tokIdx], tokens32.data(),
tokens32.size() * sizeof(int32_t),
cudaMemcpyHostToDevice, m_cudaStream);
}
// Set input shape
nvinfer1::Dims tokenDims;
tokenDims.nbDims = 2;
tokenDims.d[0] = 1;
tokenDims.d[1] = static_cast<int>(inputIds.size());
m_langEncoder.context->setInputShape(tokName, tokenDims);
// Run language encoder
#if NV_TENSORRT_MAJOR >= 10
bool ok = m_langEncoder.context->enqueueV3(m_cudaStream);
#else
bool ok = m_langEncoder.context->enqueueV2(
reinterpret_cast<void**>(m_langEncoder.gpuBuffers.data()), m_cudaStream, nullptr);
#endif
if (!ok) {
_logger.LogError("ANSSAM3::SetPrompt", "Language encoder enqueue failed", __FILE__, __LINE__);
return false;
}
cudaStreamSynchronize(m_cudaStream);
// Find language encoder outputs: text_attention_mask and text_memory
// output[0]: text_attention_mask [1, 32] bool → cached as m_cachedLangMask
// output[1]: text_memory [32, 1, 256] float32 → cached as m_cachedLangFeats
// output[2]: text_embeds [32, 1, 1024] float32 → NOT used
int maskOutIdx = -1, featsOutIdx = -1;
const int numTensors = m_langEncoder.engine->getNbIOTensors();
for (int i = 0; i < numTensors; ++i) {
const char* name = m_langEncoder.engine->getIOTensorName(i);
if (m_langEncoder.engine->getTensorIOMode(name) != nvinfer1::TensorIOMode::kOUTPUT)
continue;
std::string sname(name);
if (sname.find("attention_mask") != std::string::npos ||
sname.find("text_attention") != std::string::npos) {
maskOutIdx = i;
}
else if (sname.find("text_memory") != std::string::npos ||
sname.find("memory") != std::string::npos) {
featsOutIdx = i;
}
}
// Fallback: first 2 outputs in order
if (maskOutIdx < 0 || featsOutIdx < 0) {
int outCount = 0;
for (int i = 0; i < numTensors; ++i) {
const char* name = m_langEncoder.engine->getIOTensorName(i);
if (m_langEncoder.engine->getTensorIOMode(name) != nvinfer1::TensorIOMode::kOUTPUT)
continue;
if (outCount == 0 && maskOutIdx < 0) maskOutIdx = i;
else if (outCount == 1 && featsOutIdx < 0) featsOutIdx = i;
outCount++;
}
}
// Cache mask on GPU
if (maskOutIdx >= 0) {
size_t bytes = m_langEncoder.gpuBufferSizes[maskOutIdx];
if (m_cachedLangMask && m_cachedLangMaskBytes < bytes) {
cudaFree(m_cachedLangMask); m_cachedLangMask = nullptr;
}
if (!m_cachedLangMask) {
cudaMalloc(&m_cachedLangMask, bytes);
}
m_cachedLangMaskBytes = bytes;
cudaMemcpyAsync(m_cachedLangMask, m_langEncoder.gpuBuffers[maskOutIdx],
bytes, cudaMemcpyDeviceToDevice, m_cudaStream);
}
// Cache features on GPU
if (featsOutIdx >= 0) {
size_t bytes = m_langEncoder.gpuBufferSizes[featsOutIdx];
if (m_cachedLangFeats && m_cachedLangFeatsBytes < bytes) {
cudaFree(m_cachedLangFeats); m_cachedLangFeats = nullptr;
}
if (!m_cachedLangFeats) {
cudaMalloc(&m_cachedLangFeats, bytes);
}
m_cachedLangFeatsBytes = bytes;
cudaMemcpyAsync(m_cachedLangFeats, m_langEncoder.gpuBuffers[featsOutIdx],
bytes, cudaMemcpyDeviceToDevice, m_cudaStream);
}
cudaStreamSynchronize(m_cudaStream);
m_promptSet = true;
return true;
}
bool ANSSAM3::SetPrompt(const std::string& text)
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (!m_tokenizer || !m_tokenizer->IsLoaded()) {
_logger.LogError("ANSSAM3::SetPrompt",
"Tokenizer not loaded. Place merges.txt in model folder.", __FILE__, __LINE__);
return false;
}
auto result = m_tokenizer->Tokenize(text, m_tokenLength);
return SetPrompt(result.inputIds, result.attentionMask);
}
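// Typical call sequence (illustrative): set the prompt once, then detect per frame:
//   sam3.SetPrompt("person");                 // tokenize + cache language features on GPU
//   auto objects = sam3.RunInference(frame, cameraId);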
// =========================================================================
// RunInference
// =========================================================================
std::vector<Object> ANSSAM3::RunInference(const cv::Mat& input)
{
return RunInference(input, "");
}
std::vector<Object> ANSSAM3::RunInference(const cv::Mat& input, const std::string& camera_id)
{
if (!PreInferenceCheck("ANSSAM3::RunInference")) return {};
try {
return Detect(input, camera_id);
}
catch (const std::exception& e) {
_logger.LogFatal("ANSSAM3::RunInference", e.what(), __FILE__, __LINE__);
return {};
}
}
// =========================================================================
// Detect — image encoder + decoder pipeline
// =========================================================================
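// Steps: (1) preprocess + upload image → (2) run image encoder →
// (3) bind encoder outputs and cached language features as decoder inputs →
// (4) run decoder → (5) download boxes/scores/masks and postprocess.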
std::vector<Object> ANSSAM3::Detect(const cv::Mat& input, const std::string& camera_id)
{
if (!_modelLoadValid || !m_imgEncoder.context || !m_cudaStream) {
return {};
}
const int origW = input.cols;
const int origH = input.rows;
// ---- 1) Find image encoder input tensor and determine dtype ----
auto imgIt = m_imgEncoder.nameToIdx.find("image");
if (imgIt == m_imgEncoder.nameToIdx.end()) {
const char* firstName = m_imgEncoder.engine->getIOTensorName(0);
imgIt = m_imgEncoder.nameToIdx.find(firstName);
}
if (imgIt == m_imgEncoder.nameToIdx.end()) {
_logger.LogError("ANSSAM3::Detect", "Cannot find image input tensor", __FILE__, __LINE__);
return {};
}
int imgInputIdx = imgIt->second;
const char* imgInputName = m_imgEncoder.engine->getIOTensorName(imgInputIdx);
auto imgDtype = m_imgEncoder.engine->getTensorDataType(imgInputName);
bool isUint8Input = (imgDtype == nvinfer1::DataType::kINT8 || imgDtype == nvinfer1::DataType::kBOOL);
#if NV_TENSORRT_MAJOR >= 10
isUint8Input = isUint8Input || (imgDtype == nvinfer1::DataType::kUINT8);
#endif
// ---- 1b) Try NV12 fast path — fused NV12→RGB resize CHW directly into TRT buffer ----
bool usedNV12 = false;
{
auto nv12 = m_nv12Helper.tryNV12DirectToBuffer(
input, 0 /*inferenceGpu*/,
m_imgEncoder.gpuBuffers[imgInputIdx],
m_inputSize, m_inputSize,
!isUint8Input, // float32 if not uint8
m_cudaStream,
_logger, "ANSSAM3");
usedNV12 = nv12.succeeded;
m_nv12Helper.tickInference();
}
// ---- 1c) CPU fallback: BGR → RGB, resize to 1008, HWC→CHW, upload ----
if (!usedNV12) {
cv::Mat resized;
cv::resize(input, resized, cv::Size(m_inputSize, m_inputSize));
cv::Mat rgb;
cv::cvtColor(resized, rgb, cv::COLOR_BGR2RGB);
const size_t planeSize = static_cast<size_t>(m_inputSize) * m_inputSize;
std::vector<uint8_t> imgBuffer(3 * planeSize);
cv::Mat channels[3];
cv::split(rgb, channels);
for (int c = 0; c < 3; ++c)
std::memcpy(imgBuffer.data() + c * planeSize, channels[c].data, planeSize);
if (isUint8Input) {
cudaMemcpyAsync(m_imgEncoder.gpuBuffers[imgInputIdx], imgBuffer.data(),
imgBuffer.size(), cudaMemcpyHostToDevice, m_cudaStream);
}
else {
std::vector<float> imgFloat(imgBuffer.size());
for (size_t i = 0; i < imgBuffer.size(); ++i)
imgFloat[i] = static_cast<float>(imgBuffer[i]);
cudaMemcpyAsync(m_imgEncoder.gpuBuffers[imgInputIdx], imgFloat.data(),
imgFloat.size() * sizeof(float), cudaMemcpyHostToDevice, m_cudaStream);
}
}
// Set image input shape
nvinfer1::Dims imgDims;
imgDims.nbDims = 3;
imgDims.d[0] = 3;
imgDims.d[1] = m_inputSize;
imgDims.d[2] = m_inputSize;
m_imgEncoder.context->setInputShape(imgInputName, imgDims);
// ---- 2) Run image encoder ----
// Check for prior CUDA errors (e.g. OOM from memcpy) before enqueue
{
cudaError_t preErr = cudaGetLastError();
if (preErr != cudaSuccess) {
_logger.LogError("ANSSAM3::Detect",
std::string("CUDA error before enqueue: ") + cudaGetErrorString(preErr),
__FILE__, __LINE__);
return {};
}
}
#if NV_TENSORRT_MAJOR >= 10
bool okImg = m_imgEncoder.context->enqueueV3(m_cudaStream);
#else
bool okImg = m_imgEncoder.context->enqueueV2(
reinterpret_cast<void**>(m_imgEncoder.gpuBuffers.data()), m_cudaStream, nullptr);
#endif
if (!okImg) {
cudaError_t postErr = cudaGetLastError();
_logger.LogError("ANSSAM3::Detect",
std::string("Image encoder enqueue failed") +
(postErr != cudaSuccess ? std::string(": ") + cudaGetErrorString(postErr) : ""),
__FILE__, __LINE__);
return {};
}
cudaStreamSynchronize(m_cudaStream);
// ---- 3) Feed encoder outputs into TRT decoder (zero-copy via setTensorAddress) ----
// Helper: bind an encoder output buffer directly as the matching decoder input (no copy)
auto feedImgToDec = [&](const std::string& tensorName) {
auto srcIt = m_imgEncoder.nameToIdx.find(tensorName);
auto dstIt = m_decoder.nameToIdx.find(tensorName);
if (srcIt == m_imgEncoder.nameToIdx.end() || dstIt == m_decoder.nameToIdx.end()) return;
int srcIdx = srcIt->second;
int dstIdx = dstIt->second;
// Point decoder input directly at encoder output buffer (true zero-copy)
const char* dstName = m_decoder.engine->getIOTensorName(dstIdx);
m_decoder.context->setTensorAddress(dstName, m_imgEncoder.gpuBuffers[srcIdx]);
// Set input shape from encoder's actual output shape
const char* srcName = m_imgEncoder.engine->getIOTensorName(srcIdx);
auto shape = m_imgEncoder.context->getTensorShape(srcName);
m_decoder.context->setInputShape(dstName, shape);
};
feedImgToDec("vision_pos_enc_2");
feedImgToDec("backbone_fpn_0");
feedImgToDec("backbone_fpn_1");
feedImgToDec("backbone_fpn_2");
// Language features — point decoder inputs at cached GPU buffers
{
auto it = m_decoder.nameToIdx.find("language_mask");
if (it != m_decoder.nameToIdx.end()) {
const char* name = m_decoder.engine->getIOTensorName(it->second);
m_decoder.context->setTensorAddress(name, m_cachedLangMask);
nvinfer1::Dims d; d.nbDims = 2; d.d[0] = 1; d.d[1] = m_tokenLength;
m_decoder.context->setInputShape(name, d);
}
}
{
auto it = m_decoder.nameToIdx.find("language_features");
if (it != m_decoder.nameToIdx.end()) {
const char* name = m_decoder.engine->getIOTensorName(it->second);
m_decoder.context->setTensorAddress(name, m_cachedLangFeats);
nvinfer1::Dims d; d.nbDims = 3; d.d[0] = m_tokenLength; d.d[1] = 1; d.d[2] = 256;
m_decoder.context->setInputShape(name, d);
}
}
// Scalar inputs (original_height, original_width) — host-memory shape tensors
{
auto it = m_decoder.nameToIdx.find("original_height");
if (it != m_decoder.nameToIdx.end()) {
int idx = it->second;
auto dtype = m_decoder.engine->getTensorDataType(m_decoder.engine->getIOTensorName(idx));
if (dtype == nvinfer1::DataType::kINT64)
*reinterpret_cast<int64_t*>(m_decoder.gpuBuffers[idx]) = static_cast<int64_t>(origH);
else
*reinterpret_cast<int32_t*>(m_decoder.gpuBuffers[idx]) = origH;
}
}
{
auto it = m_decoder.nameToIdx.find("original_width");
if (it != m_decoder.nameToIdx.end()) {
int idx = it->second;
auto dtype = m_decoder.engine->getTensorDataType(m_decoder.engine->getIOTensorName(idx));
if (dtype == nvinfer1::DataType::kINT64)
*reinterpret_cast<int64_t*>(m_decoder.gpuBuffers[idx]) = static_cast<int64_t>(origW);
else
*reinterpret_cast<int32_t*>(m_decoder.gpuBuffers[idx]) = origW;
}
}
// Prompt inputs: box_coords [1,1,4], box_labels [1,1], box_masks [1,1]
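// A zeroed box with label -1 and mask flag 0 presumably encodes "no box prompt"
// (SAM-style padding), leaving the cached text prompt as the only query.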
{
auto it = m_decoder.nameToIdx.find("box_coords");
if (it != m_decoder.nameToIdx.end()) {
int idx = it->second;
float boxCoords[4] = { 0.f, 0.f, 0.f, 0.f };
cudaMemcpyAsync(m_decoder.gpuBuffers[idx], boxCoords, sizeof(boxCoords),
cudaMemcpyHostToDevice, m_cudaStream);
const char* name = m_decoder.engine->getIOTensorName(idx);
nvinfer1::Dims d; d.nbDims = 3; d.d[0] = 1; d.d[1] = 1; d.d[2] = 4;
m_decoder.context->setInputShape(name, d);
}
}
{
auto it = m_decoder.nameToIdx.find("box_labels");
if (it != m_decoder.nameToIdx.end()) {
int idx = it->second;
const char* name = m_decoder.engine->getIOTensorName(idx);
auto dtype = m_decoder.engine->getTensorDataType(name);
if (dtype == nvinfer1::DataType::kINT64) {
int64_t val = -1;
cudaMemcpyAsync(m_decoder.gpuBuffers[idx], &val, sizeof(val),
cudaMemcpyHostToDevice, m_cudaStream);
} else {
int32_t val = -1;
cudaMemcpyAsync(m_decoder.gpuBuffers[idx], &val, sizeof(val),
cudaMemcpyHostToDevice, m_cudaStream);
}
nvinfer1::Dims d; d.nbDims = 2; d.d[0] = 1; d.d[1] = 1;
m_decoder.context->setInputShape(name, d);
}
}
{
auto it = m_decoder.nameToIdx.find("box_masks");
if (it != m_decoder.nameToIdx.end()) {
int idx = it->second;
const char* name = m_decoder.engine->getIOTensorName(idx);
auto dtype = m_decoder.engine->getTensorDataType(name);
if (dtype == nvinfer1::DataType::kBOOL) {
bool val = false;
cudaMemcpyAsync(m_decoder.gpuBuffers[idx], &val, sizeof(val),
cudaMemcpyHostToDevice, m_cudaStream);
} else {
int32_t val = 0;
cudaMemcpyAsync(m_decoder.gpuBuffers[idx], &val, sizeof(val),
cudaMemcpyHostToDevice, m_cudaStream);
}
nvinfer1::Dims d; d.nbDims = 2; d.d[0] = 1; d.d[1] = 1;
m_decoder.context->setInputShape(name, d);
}
}
// ---- 4) Run TRT decoder ----
// Find output tensor indices for boxes/scores/masks
std::string boxesTName, scoresTName, masksTName;
int boxesIdx = -1, scoresIdx = -1, masksIdx = -1;
for (auto& kv : m_decoder.nameToIdx) {
auto mode = m_decoder.engine->getTensorIOMode(kv.first.c_str());
if (mode != nvinfer1::TensorIOMode::kOUTPUT) continue;
if (kv.first.find("box") != std::string::npos && kv.first.find("mask") == std::string::npos)
{ boxesTName = kv.first; boxesIdx = kv.second; }
else if (kv.first.find("score") != std::string::npos)
{ scoresTName = kv.first; scoresIdx = kv.second; }
else if (kv.first.find("mask") != std::string::npos)
{ masksTName = kv.first; masksIdx = kv.second; }
}
if (boxesIdx < 0 || scoresIdx < 0 || masksIdx < 0) {
_logger.LogError("ANSSAM3::Detect", "Cannot find decoder output tensors", __FILE__, __LINE__);
return {};
}
#if NV_TENSORRT_MAJOR >= 10
// Stack-local allocators that return the pre-allocated gpuBuffers.
// Registering these enables getTensorShape() to return actual
// (not -1) dimensions for data-dependent outputs after enqueueV3.
PassthroughOutputAllocator boxAlloc (m_decoder.gpuBuffers[boxesIdx], m_decoder.gpuBufferSizes[boxesIdx]);
PassthroughOutputAllocator scoreAlloc(m_decoder.gpuBuffers[scoresIdx], m_decoder.gpuBufferSizes[scoresIdx]);
PassthroughOutputAllocator maskAlloc (m_decoder.gpuBuffers[masksIdx], m_decoder.gpuBufferSizes[masksIdx]);
m_decoder.context->setOutputAllocator(boxesTName.c_str(), &boxAlloc);
m_decoder.context->setOutputAllocator(scoresTName.c_str(), &scoreAlloc);
m_decoder.context->setOutputAllocator(masksTName.c_str(), &maskAlloc);
#endif
#if NV_TENSORRT_MAJOR >= 10
bool okDec = m_decoder.context->enqueueV3(m_cudaStream);
#else
bool okDec = m_decoder.context->enqueueV2(
reinterpret_cast<void**>(m_decoder.gpuBuffers.data()), m_cudaStream, nullptr);
#endif
cudaStreamSynchronize(m_cudaStream);
if (!okDec) {
_logger.LogError("ANSSAM3::Detect", "Decoder enqueue failed", __FILE__, __LINE__);
return {};
}
// ---- 5) Parse TRT decoder outputs: boxes [N,4], scores [N], masks [N,1,H,W] ----
int numBoxes = 0, maskH = 0, maskW = 0;
void* boxesGpu = nullptr;
void* scoresGpu = nullptr;
void* masksGpu = nullptr;
#if NV_TENSORRT_MAJOR >= 10
// Read actual shapes from notifyShape() callback
if (boxAlloc.shapeKnown) {
auto& s = boxAlloc.actualDims;
numBoxes = (s.nbDims >= 1) ? static_cast<int>(s.d[0]) : 0;
boxesGpu = boxAlloc.preAllocBuf;
}
if (scoreAlloc.shapeKnown) {
scoresGpu = scoreAlloc.preAllocBuf;
}
if (maskAlloc.shapeKnown) {
auto& s = maskAlloc.actualDims;
maskH = (s.nbDims >= 3) ? static_cast<int>(s.d[2]) : 0;
maskW = (s.nbDims >= 4) ? static_cast<int>(s.d[3]) : 0;
masksGpu = maskAlloc.preAllocBuf;
}
#else
{
auto boxShape = m_decoder.context->getTensorShape(boxesTName.c_str());
auto maskShape = m_decoder.context->getTensorShape(masksTName.c_str());
numBoxes = (boxShape.nbDims >= 1) ? static_cast<int>(boxShape.d[0]) : 0;
maskH = (maskShape.nbDims >= 3) ? static_cast<int>(maskShape.d[2]) : 0;
maskW = (maskShape.nbDims >= 4) ? static_cast<int>(maskShape.d[3]) : 0;
boxesGpu = m_decoder.gpuBuffers[boxesIdx];
scoresGpu = m_decoder.gpuBuffers[scoresIdx];
masksGpu = m_decoder.gpuBuffers[masksIdx];
}
#endif
if (numBoxes <= 0 || !boxesGpu || !scoresGpu) return {};
// Download decoder outputs from GPU → CPU for postprocessing
size_t boxesBytes = static_cast<size_t>(numBoxes) * 4 * sizeof(float);
size_t scoresBytes = static_cast<size_t>(numBoxes) * sizeof(float);
size_t masksBytes = static_cast<size_t>(numBoxes) * 1 * maskH * maskW * sizeof(bool);
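// NOTE: assumes sizeof(bool) == 1 (true for MSVC/GCC/Clang targets), so the
// bool mask tensor maps 1:1 onto the uint8 staging buffer below.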
std::vector<float> boxesCpu(numBoxes * 4);
std::vector<float> scoresCpu(numBoxes);
cudaMemcpy(boxesCpu.data(), boxesGpu, boxesBytes, cudaMemcpyDeviceToHost);
cudaMemcpy(scoresCpu.data(), scoresGpu, scoresBytes, cudaMemcpyDeviceToHost);
// Masks may be bool or float depending on TRT's internal optimization.
// Download to a raw buffer and convert to bool.
auto maskDtype = m_decoder.engine->getTensorDataType(masksTName.c_str());
std::vector<uint8_t> masksRaw(masksBytes);
if (masksGpu && maskH > 0 && maskW > 0) {
if (maskDtype == nvinfer1::DataType::kBOOL) {
cudaMemcpy(masksRaw.data(), masksGpu, masksBytes, cudaMemcpyDeviceToHost);
} else {
// Float masks — download and threshold
size_t floatBytes = static_cast<size_t>(numBoxes) * 1 * maskH * maskW * sizeof(float);
std::vector<float> masksFloat(static_cast<size_t>(numBoxes) * maskH * maskW);
cudaMemcpy(masksFloat.data(), masksGpu, floatBytes, cudaMemcpyDeviceToHost);
for (size_t j = 0; j < masksFloat.size(); ++j)
masksRaw[j] = masksFloat[j] > m_segThreshold ? 1 : 0;
}
}
// Convert raw buffer to bool pointer for PostprocessInstances
const bool* masksData = reinterpret_cast<const bool*>(masksRaw.data());
auto ret = PostprocessInstances(boxesCpu.data(), numBoxes, scoresCpu.data(),
masksData, maskH, maskW,
origW, origH, camera_id);
if (_trackerEnabled) {
ret = ApplyTracking(ret, camera_id);
if (_stabilizationEnabled) ret = StabilizeDetections(ret, camera_id);
}
return ret;
}
// =========================================================================
// PostprocessInstances — same logic as ONNXSAM3::postprocessResults
// =========================================================================
std::vector<Object> ANSSAM3::PostprocessInstances(
const float* boxesData, int numBoxes,
const float* scoresData,
const bool* masksData,
int maskH, int maskW,
int origWidth, int origHeight,
const std::string& camera_id)
{
std::vector<Object> results;
for (int i = 0; i < numBoxes; ++i) {
float score = scoresData[i];
if (score < m_segThreshold)
continue;
// Box: [x1, y1, x2, y2] in original image coordinates
float x1 = std::max(0.0f, std::min(boxesData[i * 4 + 0], static_cast<float>(origWidth)));
float y1 = std::max(0.0f, std::min(boxesData[i * 4 + 1], static_cast<float>(origHeight)));
float x2 = std::max(0.0f, std::min(boxesData[i * 4 + 2], static_cast<float>(origWidth)));
float y2 = std::max(0.0f, std::min(boxesData[i * 4 + 3], static_cast<float>(origHeight)));
cv::Rect box(static_cast<int>(x1), static_cast<int>(y1),
static_cast<int>(x2 - x1), static_cast<int>(y2 - y1));
if (box.width <= 0 || box.height <= 0)
continue;
// Extract this instance's mask: [1, H, W] at index i.
// masksData holds one byte per pixel (bool output, or float already
// thresholded during download) — convert to 0/255 uint8 for OpenCV.
cv::Mat boolMask(maskH, maskW, CV_8UC1);
size_t maskOffset = static_cast<size_t>(i) * 1 * maskH * maskW;
const bool* src = masksData + maskOffset;
for (int y = 0; y < maskH; ++y)
for (int x = 0; x < maskW; ++x)
boolMask.at<uint8_t>(y, x) = src[y * maskW + x] ? 255 : 0;
// Resize mask to original resolution
cv::Mat fullMask;
cv::resize(boolMask, fullMask, cv::Size(origWidth, origHeight), 0, 0, cv::INTER_LINEAR);
cv::threshold(fullMask, fullMask, 127, 255, cv::THRESH_BINARY);
// Crop to bounding box
cv::Mat roiMask = fullMask(box).clone();
Object obj;
obj.box = box;
obj.confidence = score;
obj.classId = 0;
obj.className = "object";
obj.cameraId = camera_id;
obj.mask = roiMask;
// Create normalized polygon from mask (closed, maxPoints-limited)
obj.polygon = ANSUtilityHelper::MaskToNormalizedPolygon(
roiMask, box,
static_cast<float>(origWidth), static_cast<float>(origHeight));
// Fallback: normalized box corners if mask polygon failed
if (obj.polygon.empty()) {
obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(
box, static_cast<float>(origWidth), static_cast<float>(origHeight));
}
results.push_back(std::move(obj));
}
return results;
}
// =========================================================================
// Destroy
// =========================================================================
bool ANSSAM3::Destroy()
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
try {
// TRT engine bundles
m_imgEncoder.destroy();
m_langEncoder.destroy();
m_decoder.destroy();
// Cached language encoder GPU buffers
if (m_cachedLangMask) { cudaFree(m_cachedLangMask); m_cachedLangMask = nullptr; }
m_cachedLangMaskBytes = 0;
if (m_cachedLangFeats) { cudaFree(m_cachedLangFeats); m_cachedLangFeats = nullptr; }
m_cachedLangFeatsBytes = 0;
if (m_cudaStream) { cudaStreamDestroy(m_cudaStream); m_cudaStream = nullptr; }
m_tokenizer.reset();
m_promptSet = false;
_modelLoadValid = false;
_isInitialized = false;
return true;
}
catch (const std::exception& e) {
_logger.LogFatal("ANSSAM3::Destroy", e.what(), __FILE__, __LINE__);
return false;
}
}
ANSSAM3::~ANSSAM3()
{
Destroy();
}
}