1495 lines
66 KiB
C++
1495 lines
66 KiB
C++
#include "ANSSAM3.h"

#include "ANSCLIPTokenizer.h"

#include "Utility.h"

#include <NvInferPlugin.h>

#include <opencv2/opencv.hpp>

#include <algorithm>

#include <cctype>

#include <cmath>

#include <cstdint>

#include <cstring>

#include <filesystem>

#include <fstream>

#include <numeric>
|
|
|
|
namespace ANSCENTER
|
|
{
|
|
// =========================================================================
|
|
// Helpers
|
|
// =========================================================================
|
|
|
|
// Portable FP16 ↔ FP32 conversion (works in plain C++ without NVCC)
|
|
// Portable FP16 -> FP32 conversion (works in plain C++ without NVCC).
// Handles signed zeros, subnormals, infinities and NaN per IEEE-754 binary16.
// BUG FIX: the old code set expo = 255 for Inf/NaN and then still applied the
// +112 bias, so (255 + 112) << 23 overflowed into the sign bit and Inf/NaN
// decoded to a wrong, sign-flipped finite value. Inf/NaN now emit the FP32
// exponent field 0xFF directly.
static float fp16ToFloat(uint16_t h)
{
    uint32_t sign = static_cast<uint32_t>(h >> 15) << 31;
    uint32_t expo = (h >> 10) & 0x1Fu;
    uint32_t mant = h & 0x3FFu;

    if (expo == 0) {
        // Zero or subnormal.
        if (mant == 0) { float f; std::memcpy(&f, &sign, 4); return f; }
        // Normalize the subnormal: shift until the implicit bit (bit 10)
        // appears. expo wraps below zero as unsigned, but the later modular
        // addition of the bias (mod 2^32) yields the correct exponent field.
        while (!(mant & 0x400u)) { mant <<= 1; expo--; }
        expo++; mant &= 0x3FFu;
    } else if (expo == 31) {
        // Inf/NaN: exponent field must become 0xFF with NO re-biasing.
        uint32_t bits = sign | 0x7F800000u | (mant << 13);
        float f; std::memcpy(&f, &bits, 4); return f;
    }

    uint32_t bits = sign | ((expo + 127u - 15u) << 23) | (mant << 13);
    float f; std::memcpy(&f, &bits, 4); return f;
}
|
|
// Portable FP32 -> FP16 conversion (truncating; subnormal halves flush to 0).
// BUG FIX: NaN inputs previously fell into the generic "overflow" branch and
// silently became +-Inf; NaN is now preserved as a quiet half-precision NaN.
static uint16_t floatToFp16(float val)
{
    uint32_t bits; std::memcpy(&bits, &val, 4);
    uint16_t sign = static_cast<uint16_t>((bits >> 16) & 0x8000u);
    int32_t expo = ((bits >> 23) & 0xFF) - 127 + 15;
    uint32_t mant = bits & 0x7FFFFFu;

    // FP32 exponent 0xFF encodes Inf (mant == 0) or NaN (mant != 0).
    if (((bits >> 23) & 0xFF) == 0xFF) {
        if (mant != 0) return sign | 0x7E00u;  // NaN -> canonical quiet NaN
        return sign | 0x7C00u;                 // +-Inf
    }

    if (expo <= 0) return sign;                // underflow -> +-0
    if (expo >= 31) return sign | 0x7C00u;     // overflow -> +-inf
    return sign | static_cast<uint16_t>(expo << 10) | static_cast<uint16_t>(mant >> 13);
}
|
|
|
|
// Byte width of one element of the given TensorRT data type.
// Unrecognized types fall back to 4 bytes (same as kFLOAT / kINT32).
size_t ANSSAM3::DataTypeSize(nvinfer1::DataType dtype)
{
    using DT = nvinfer1::DataType;

    if (dtype == DT::kHALF)
        return 2;
    if (dtype == DT::kINT64)
        return 8;
    if (dtype == DT::kINT8 || dtype == DT::kBOOL)
        return 1;
#if NV_TENSORRT_MAJOR >= 10
    if (dtype == DT::kUINT8 || dtype == DT::kFP8)
        return 1;
    if (dtype == DT::kBF16)
        return 2;
    if (dtype == DT::kINT4)
        return 1; // conservative — 4-bit packed
#endif
    // kFLOAT, kINT32, and any future/unknown type.
    return 4;
}
|
|
|
|
void ANSSAM3::TRTBundle::destroy()
|
|
{
|
|
context.reset();
|
|
engine.reset();
|
|
runtime.reset();
|
|
for (int i = 0; i < (int)gpuBuffers.size(); ++i) {
|
|
if (!gpuBuffers[i]) continue;
|
|
if (hostBufferIdx.count(i))
|
|
free(gpuBuffers[i]); // host-allocated (shape tensor)
|
|
else
|
|
cudaFree(gpuBuffers[i]); // device-allocated
|
|
gpuBuffers[i] = nullptr;
|
|
}
|
|
gpuBuffers.clear();
|
|
gpuBufferSizes.clear();
|
|
hostBufferIdx.clear();
|
|
nameToIdx.clear();
|
|
}
|
|
|
|
// (CreateOrtDecoderSession removed — decoder now runs under TRT)
|
|
|
|
// -----------------------------------------------------------------
|
|
// PassthroughOutputAllocator — lightweight IOutputAllocator that simply
|
|
// returns the pre-allocated gpuBuffer. Defined here (not in the header)
|
|
// so the vtable and CUDA symbols stay inside the engine DLL.
|
|
// Created on the stack in Detect() — no persistent class members needed.
|
|
// -----------------------------------------------------------------
|
|
#if NV_TENSORRT_MAJOR >= 10
// Lightweight IOutputAllocator that hands TRT a pre-allocated device buffer.
// Created on the stack in Detect(); preAllocBuf/preAllocSize mirror the
// bundle's gpuBuffers[idx] / gpuBufferSizes[idx].
struct PassthroughOutputAllocator : public nvinfer1::IOutputAllocator
{
    void* preAllocBuf;            // existing gpuBuffers[idx]
    size_t preAllocSize;          // existing gpuBufferSizes[idx]
    nvinfer1::Dims actualDims{};  // shape reported by notifyShape()
    bool shapeKnown = false;      // set once notifyShape() has run

    PassthroughOutputAllocator(void* buf, size_t sz)
        : preAllocBuf(buf), preAllocSize(sz) {}

    // Return the pre-allocated buffer when it is large enough; otherwise
    // grow. Per the TRT contract, returning nullptr signals allocation
    // failure — NEVER return a buffer smaller than `size`, or TRT will
    // write past its end (the old code did exactly that when cudaMalloc
    // failed).
    void* reallocateOutput(
        char const* /*tensorName*/, void* currentMemory,
        uint64_t size, uint64_t /*alignment*/) noexcept override
    {
        if (size <= preAllocSize) return preAllocBuf;

        // Fallback: grow (should not happen with generous pre-allocation).
        // NOTE(review): the grown buffer is intentionally not freed here —
        // the caller reads results through this allocator after inference,
        // so ownership stays with whoever inspects preAllocBuf.
        void* newBuf = nullptr;
        if (cudaMalloc(&newBuf, size) != cudaSuccess) {
            return nullptr; // allocation failed — let TRT report the error
        }
        preAllocBuf = newBuf;
        preAllocSize = size;
        return preAllocBuf;
    }

    // Record the actual (data-dependent) output shape for the caller.
    void notifyShape(char const* /*tensorName*/,
                     nvinfer1::Dims const& dims) noexcept override
    {
        actualDims = dims;
        shapeKnown = true;
    }
};
#endif
|
|
|
|
// =========================================================================
|
|
// EngineFileName — generate cache path: <stem>.engine.<GPUName>.<fp16|fp32>
|
|
// =========================================================================
|
|
|
|
std::string ANSSAM3::EngineFileName(const std::string& onnxPath, TrtPrecision precision) const
|
|
{
|
|
// Extract stem from ONNX path
|
|
std::filesystem::path p(onnxPath);
|
|
std::string stem = p.stem().string();
|
|
|
|
// Get GPU name
|
|
cudaDeviceProp prop;
|
|
cudaGetDeviceProperties(&prop, 0);
|
|
std::string gpuName(prop.name);
|
|
gpuName.erase(std::remove_if(gpuName.begin(), gpuName.end(), ::isspace), gpuName.end());
|
|
|
|
// BF16 requires Ampere+ (compute capability >= 8.0).
|
|
// On older GPUs BuildAndLoadEngine silently falls back to FP32,
|
|
// so the filename must match to avoid a .bf16 / .fp32 mismatch.
|
|
TrtPrecision effective = precision;
|
|
if (precision == TrtPrecision::BF16 && prop.major < 8)
|
|
effective = TrtPrecision::FP32;
|
|
|
|
std::string precStr;
|
|
switch (effective) {
|
|
case TrtPrecision::FP16: precStr = "fp16"; break;
|
|
case TrtPrecision::BF16: precStr = "bf16"; break;
|
|
case TrtPrecision::FP32: precStr = "fp32"; break;
|
|
}
|
|
|
|
std::string dir = p.parent_path().string();
|
|
return dir + "\\" + stem + ".engine." + gpuName + "." + precStr;
|
|
}
|
|
|
|
// =========================================================================
|
|
// BuildAndLoadEngine — build TRT engine from ONNX + load for inference
|
|
// =========================================================================
|
|
|
|
/// Build a TensorRT engine from an ONNX model (or reuse the on-disk cache)
/// and load it into `bundle`, ready for inference.
///
/// Pipeline: cache check -> ONNX parse -> builder config (workspace limit,
/// precision flags, optimization profile, optional FP32 pinning of decoder
/// score/NMS layers) -> serialized build -> save to disk -> load via
/// LoadTRTEngineBundle().
///
/// @param bundle    Receives runtime/engine/context and I/O buffers on success.
/// @param onnxPath  Path to the source ONNX file (.onnx_data resolved beside it).
/// @param label     Log tag ("ImageEncoder" / "LangEncoder" / "Decoder"); also
///                  selects the workspace cap and the decoder mixed-precision path.
/// @param precision Requested build precision (FP32 / FP16 / BF16).
/// @return true on success; false on any failure (all failures are logged).
bool ANSSAM3::BuildAndLoadEngine(TRTBundle& bundle, const std::string& onnxPath,
    const std::string& label, TrtPrecision precision)
{
    // Register TRT built-in plugins (needed for RoiAlign in decoder, etc.)
    // Safe to call multiple times — idempotent.
    initLibNvInferPlugins(&m_trtLogger, "");

    std::string enginePath = EngineFileName(onnxPath, precision);

    // Check for cached engine — skip the (minutes-long) build when a
    // serialized engine for this GPU + precision already exists.
    if (FileExist(enginePath)) {
        std::cout << "[ANSSAM3] " << label << ": cached engine found: " << enginePath << std::endl;
        return LoadTRTEngineBundle(bundle, enginePath, label);
    }

    // --- Build from ONNX ---
    std::cout << "[ANSSAM3] " << label << ": building TRT engine from " << onnxPath << std::endl;

    if (!FileExist(onnxPath)) {
        _logger.LogError("ANSSAM3::BuildAndLoadEngine", label + ": ONNX file not found: " + onnxPath, __FILE__, __LINE__);
        return false;
    }

    auto builder = std::unique_ptr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(m_trtLogger));
    if (!builder) {
        _logger.LogError("ANSSAM3::BuildAndLoadEngine", label + ": createInferBuilder failed", __FILE__, __LINE__);
        return false;
    }

    auto network = std::unique_ptr<nvinfer1::INetworkDefinition>(TRT_CREATE_NETWORK(builder));
    if (!network) {
        _logger.LogError("ANSSAM3::BuildAndLoadEngine", label + ": createNetworkV2 failed", __FILE__, __LINE__);
        return false;
    }

    auto parser = std::unique_ptr<nvonnxparser::IParser>(nvonnxparser::createParser(*network, m_trtLogger));
    if (!parser) {
        _logger.LogError("ANSSAM3::BuildAndLoadEngine", label + ": createParser failed", __FILE__, __LINE__);
        return false;
    }

    // parseFromFile resolves .onnx_data external files relative to the ONNX directory
    if (!parser->parseFromFile(onnxPath.c_str(), static_cast<int>(nvinfer1::ILogger::Severity::kWARNING))) {
        for (int i = 0; i < parser->getNbErrors(); ++i)
            std::cerr << "[ANSSAM3] " << label << " parse error: " << parser->getError(i)->desc() << std::endl;
        _logger.LogError("ANSSAM3::BuildAndLoadEngine", label + ": parseFromFile failed", __FILE__, __LINE__);
        return false;
    }
    std::cout << "[ANSSAM3] " << label << ": ONNX parsed successfully." << std::endl;

    // --- Log and configure input tensors ---
    auto config = std::unique_ptr<nvinfer1::IBuilderConfig>(builder->createBuilderConfig());

    // Workspace governs max scratch memory TRT can use at runtime.
    // ImageEncoder FP32 needs huge scratch (3.5 GiB at 4 GiB workspace) which
    // causes OOM on 8 GiB GPUs. Cap it to 2 GiB — TRT picks leaner tactics
    // with minimal quality loss. LangEncoder/Decoder are small; keep 4 GiB.
    const bool isImageEncoder = (label.find("ImageEncoder") != std::string::npos);
    size_t workspaceBytes = isImageEncoder
        ? 2048ULL * 1024 * 1024 // 2 GiB for ImageEncoder (prevents OOM)
        : 4096ULL * 1024 * 1024; // 4 GiB for LangEncoder / Decoder
#if NV_TENSORRT_MAJOR >= 10
    config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, workspaceBytes);
#else
    config->setMaxWorkspaceSize(workspaceBytes);
#endif

    // Set precision flags
    switch (precision) {
    case TrtPrecision::BF16:
    {
        // BF16 requires Ampere+ (compute capability >= 8.0) and TRT 8.6+.
        // Check GPU capability, then attempt to set the flag if available.
        cudaDeviceProp devProp;
        cudaGetDeviceProperties(&devProp, 0);
#if NV_TENSORRT_MAJOR > 8 || (NV_TENSORRT_MAJOR == 8 && NV_TENSORRT_MINOR >= 6)
        if (devProp.major >= 8) {
            config->setFlag(nvinfer1::BuilderFlag::kBF16);
            std::cout << "[ANSSAM3] " << label << ": BF16 precision enabled." << std::endl;
        } else {
            std::cout << "[ANSSAM3] " << label << ": GPU CC " << devProp.major << "." << devProp.minor
                << " does not support BF16, falling back to FP32." << std::endl;
        }
#else
        (void)devProp; // suppress unused warning
        std::cout << "[ANSSAM3] " << label << ": TensorRT version does not support BF16, falling back to FP32." << std::endl;
#endif
        break;
    }
    case TrtPrecision::FP16:
        if (builder->platformHasFastFp16()) {
            config->setFlag(nvinfer1::BuilderFlag::kFP16);
            std::cout << "[ANSSAM3] " << label << ": FP16 precision enabled." << std::endl;
        }
        break;
    case TrtPrecision::FP32:
        // No precision flags = FP32
        break;
    }

    // Create optimization profile with actual ONNX dimensions
    auto profile = builder->createOptimizationProfile();
    int numInputs = network->getNbInputs();
    for (int i = 0; i < numInputs; ++i) {
        auto input = network->getInput(i);
        const char* name = input->getName();
        auto dims = input->getDimensions();

        // Log the declared ONNX shape; -1 entries are dynamic dimensions.
        std::cout << "[ANSSAM3] " << label << " input[" << i << "] '" << name << "': [";
        for (int d = 0; d < dims.nbDims; ++d) {
            if (d > 0) std::cout << ", ";
            std::cout << (dims.d[d] == -1 ? "dyn" : std::to_string(dims.d[d]));
        }
        std::cout << "]" << (input->isShapeTensor() ? " (shape tensor)" : "") << std::endl;

        // Shape tensors: scalar int64 inputs whose VALUES determine output shapes
        // (e.g. original_height, original_width). Use setShapeValues() not setDimensions().
        if (input->isShapeTensor()) {
            // nbValues = product of dims; for a scalar (nbDims==0) that is 1
            int nbValues = 1;
            for (int d = 0; d < dims.nbDims; ++d) {
                if (dims.d[d] > 0) nbValues *= dims.d[d];
            }
            // Allowed VALUE range 1..4096, optimized for 1024 (the SAM input size).
            std::vector<int32_t> minV(nbValues, 1);
            std::vector<int32_t> optV(nbValues, 1024);
            std::vector<int32_t> maxV(nbValues, 4096);
            profile->setShapeValues(name, nvinfer1::OptProfileSelector::kMIN, minV.data(), nbValues);
            profile->setShapeValues(name, nvinfer1::OptProfileSelector::kOPT, optV.data(), nbValues);
            profile->setShapeValues(name, nvinfer1::OptProfileSelector::kMAX, maxV.data(), nbValues);
            continue;
        }

        // Regular execution tensors: replace dynamic dims with concrete values
        bool hasDynamic = false;
        nvinfer1::Dims fixedDims = dims;
        for (int d = 0; d < dims.nbDims; ++d) {
            if (dims.d[d] == -1) {
                hasDynamic = true;
                fixedDims.d[d] = 1; // default batch or sequence
            }
        }

        // MIN == OPT == MAX pins every dynamic dim to exactly 1.
        if (hasDynamic) {
            profile->setDimensions(name, nvinfer1::OptProfileSelector::kMIN, fixedDims);
            profile->setDimensions(name, nvinfer1::OptProfileSelector::kOPT, fixedDims);
            profile->setDimensions(name, nvinfer1::OptProfileSelector::kMAX, fixedDims);
        }
    }
    config->addOptimizationProfile(profile);

    // --- Mixed precision for Decoder: keep FP16 for bulk ops but force
    //     score/NMS/comparison layers to FP32 so that the internal
    //     thresholding doesn't lose detections due to half-precision
    //     rounding. We mark layers whose names contain NMS-related
    //     keywords, plus all layers of types that perform comparisons
    //     or index-selection (which are part of the NMS pipeline).
    if (precision == TrtPrecision::FP16 && label == std::string("Decoder")) {
        config->setFlag(nvinfer1::BuilderFlag::kPREFER_PRECISION_CONSTRAINTS);

        int numLayers = network->getNbLayers();
        int markedCount = 0;
        for (int li = 0; li < numLayers; ++li) {
            auto* layer = network->getLayer(li);
            if (!layer) continue;

            std::string lname(layer->getName());
            auto ltype = layer->getType();

            // Force FP32 on layers involved in score thresholding / NMS:
            //   - Comparison ops (Greater, Less, Equal)
            //   - Sigmoid (final score activation)
            //   - TopK, NonZero, Gather, Select (index-selection in NMS)
            //   - Any layer whose name hints at score/nms/threshold
            bool needFP32 = false;

            // By layer type
            switch (ltype) {
            case nvinfer1::LayerType::kTOPK:
            case nvinfer1::LayerType::kGATHER:
            case nvinfer1::LayerType::kSELECT:
            case nvinfer1::LayerType::kNON_ZERO:
            case nvinfer1::LayerType::kSCATTER:
                needFP32 = true;
                break;
            default:
                break;
            }

            // By layer name (ONNX op names often preserved by parser)
            if (!needFP32) {
                // Convert to lowercase for matching
                std::string lower = lname;
                std::transform(lower.begin(), lower.end(), lower.begin(), ::tolower);
                if (lower.find("score") != std::string::npos ||
                    lower.find("nms") != std::string::npos ||
                    lower.find("sigmoid") != std::string::npos ||
                    lower.find("threshold") != std::string::npos ||
                    lower.find("greater") != std::string::npos ||
                    lower.find("less") != std::string::npos ||
                    lower.find("where") != std::string::npos ||
                    lower.find("nonzero") != std::string::npos ||
                    lower.find("topk") != std::string::npos) {
                    needFP32 = true;
                }
            }

            // TensorRT forbids setPrecision(kFLOAT) on layers that
            // produce non-float types (booleans, indices/int32, int64).
            // Only force FP32 when ALL outputs are floating-point.
            if (needFP32) {
                bool allFloat = true;
                for (int oi = 0; oi < layer->getNbOutputs(); ++oi) {
                    auto dt = layer->getOutputType(oi);
                    if (dt != nvinfer1::DataType::kFLOAT &&
                        dt != nvinfer1::DataType::kHALF) {
                        allFloat = false;
                        break;
                    }
                }
                if (allFloat) {
                    layer->setPrecision(nvinfer1::DataType::kFLOAT);
                    for (int oi = 0; oi < layer->getNbOutputs(); ++oi)
                        layer->setOutputType(oi, nvinfer1::DataType::kFLOAT);
                    ++markedCount;
                }
            }
        }
        std::cout << "[ANSSAM3] " << label << ": mixed precision — "
            << markedCount << "/" << numLayers
            << " layers forced to FP32 (score/NMS ops)." << std::endl;
    }

    // --- Build serialized engine ---
    std::cout << "[ANSSAM3] " << label << ": building engine (this may take a few minutes)..." << std::endl;

    // SEH-guarded build: a driver/TRT crash is reported via sehCode instead
    // of taking the whole process down.
    unsigned long sehCode = 0;
    auto plan = std::unique_ptr<nvinfer1::IHostMemory>(
        buildSerializedNetworkSafe(builder.get(), *network, *config, &sehCode));

    if (sehCode != 0) {
        _logger.LogError("ANSSAM3::BuildAndLoadEngine",
            label + ": engine build crashed: " + formatCrashCode(sehCode), __FILE__, __LINE__);
        return false;
    }
    if (!plan) {
        _logger.LogError("ANSSAM3::BuildAndLoadEngine", label + ": buildSerializedNetwork returned null", __FILE__, __LINE__);
        return false;
    }

    // --- Save to disk ---
    std::ofstream outFile(enginePath, std::ios::binary);
    if (!outFile.is_open()) {
        _logger.LogError("ANSSAM3::BuildAndLoadEngine", label + ": cannot write engine file: " + enginePath, __FILE__, __LINE__);
        return false;
    }
    outFile.write(reinterpret_cast<const char*>(plan->data()), plan->size());
    outFile.close();
    std::cout << "[ANSSAM3] " << label << ": engine saved to " << enginePath << std::endl;

    // Release the serialized plan before deserializing to cut peak memory.
    plan.reset();

    // --- Load the just-built engine ---
    return LoadTRTEngineBundle(bundle, enginePath, label);
}
|
|
|
|
// =========================================================================
|
|
// LoadTRTEngineBundle — deserialize engine, allocate GPU buffers, bind
|
|
// =========================================================================
|
|
|
|
/// Deserialize a cached TensorRT engine and prepare `bundle` for inference:
/// read the file, create runtime/engine/context, optionally enable weight
/// streaming (TRT 10+, ImageEncoder only), allocate one buffer per I/O
/// tensor (host memory for shape tensors, device memory otherwise), bind
/// all tensor addresses, and set concrete input shapes.
///
/// @param bundle     Receives all TRT objects and buffers; caller owns them
///                   and releases via TRTBundle::destroy().
/// @param enginePath Serialized engine file produced by BuildAndLoadEngine.
/// @param label      Log tag for diagnostics.
/// @return true on success; false on any failure (logged). On failure the
///         bundle may be partially populated — callers destroy() it.
bool ANSSAM3::LoadTRTEngineBundle(TRTBundle& bundle, const std::string& enginePath, const std::string& label)
{
    // Read engine file (opened at end to get the size in one call).
    std::ifstream file(enginePath, std::ios::binary | std::ios::ate);
    if (!file.is_open()) {
        _logger.LogError("ANSSAM3::LoadTRTEngineBundle", label + ": cannot open: " + enginePath, __FILE__, __LINE__);
        return false;
    }
    std::streamsize fileSize = file.tellg();
    file.seekg(0, std::ios::beg);
    std::vector<char> engineData(fileSize);
    if (!file.read(engineData.data(), fileSize)) {
        _logger.LogError("ANSSAM3::LoadTRTEngineBundle", label + ": read failed", __FILE__, __LINE__);
        return false;
    }
    file.close();

    // Deserialize
    bundle.runtime = std::unique_ptr<nvinfer1::IRuntime>(nvinfer1::createInferRuntime(m_trtLogger));
    if (!bundle.runtime) {
        _logger.LogError("ANSSAM3::LoadTRTEngineBundle", label + ": createInferRuntime failed", __FILE__, __LINE__);
        return false;
    }

    // SEH-guarded deserialize: report a crash code rather than taking the
    // process down on a corrupt/incompatible engine file.
    unsigned long sehCode = 0;
    bundle.engine = std::unique_ptr<nvinfer1::ICudaEngine>(
        deserializeCudaEngineSafe(bundle.runtime.get(), engineData.data(), engineData.size(), &sehCode));
    if (sehCode != 0) {
        _logger.LogError("ANSSAM3::LoadTRTEngineBundle",
            label + ": deserialize crashed: " + formatCrashCode(sehCode), __FILE__, __LINE__);
        return false;
    }
    if (!bundle.engine) {
        _logger.LogError("ANSSAM3::LoadTRTEngineBundle", label + ": deserialize returned null", __FILE__, __LINE__);
        return false;
    }

    // --- Weight streaming (TRT 10+): keep only a budget of weights on GPU,
    //     stream the rest from CPU pinned memory on demand.
    //     Saves ~1.3 GiB VRAM for ImageEncoder (1.8 GiB weights → 512 MiB on GPU).
#if NV_TENSORRT_MAJOR >= 10
    {
        int64_t streamableBytes = bundle.engine->getStreamableWeightsSize();
        if (streamableBytes > 0 && label.find("ImageEncoder") != std::string::npos) {
            // Budget = how much weight memory stays on GPU.
            // 512 MiB keeps hot layers cached; rest streamed via PCIe.
            const int64_t budgetBytes = 512LL * 1024 * 1024;
            int64_t actualBudget = std::min(budgetBytes, streamableBytes);
            bundle.engine->setWeightStreamingBudgetV2(actualBudget);
            std::cout << "[ANSSAM3] " << label
                << ": weight streaming enabled (streamable="
                << (streamableBytes / (1024*1024)) << " MiB, budget="
                << (actualBudget / (1024*1024)) << " MiB)" << std::endl;
        }
    }
#endif

    bundle.context = std::unique_ptr<nvinfer1::IExecutionContext>(bundle.engine->createExecutionContext());
    if (!bundle.context) {
        _logger.LogError("ANSSAM3::LoadTRTEngineBundle", label + ": createExecutionContext failed", __FILE__, __LINE__);
        return false;
    }

    // Set optimization profile (async API requires a sync before the
    // context is used).
    int numProfiles = bundle.engine->getNbOptimizationProfiles();
    if (numProfiles > 0) {
        bundle.context->setOptimizationProfileAsync(0, m_cudaStream);
        cudaStreamSynchronize(m_cudaStream);
    }

    // Allocate buffers (device for execution tensors, host for shape tensors)
    const int numTensors = bundle.engine->getNbIOTensors();
    bundle.gpuBuffers.resize(numTensors, nullptr);
    bundle.gpuBufferSizes.resize(numTensors, 0);
    bundle.hostBufferIdx.clear();
    bundle.nameToIdx.clear();

    for (int i = 0; i < numTensors; ++i) {
        const char* name = bundle.engine->getIOTensorName(i);
        auto mode = bundle.engine->getTensorIOMode(name);
        auto shape = bundle.engine->getTensorShape(name);
        auto dtype = bundle.engine->getTensorDataType(name);
        auto loc = bundle.engine->getTensorLocation(name);
        // kHOST location marks a shape tensor — TRT reads its VALUE from
        // host memory rather than treating it as device data.
        bool isHost = (loc == nvinfer1::TensorLocation::kHOST);

        // Check if any dimension is dynamic (-1)
        bool hasDynamic = false;
        int64_t numElements = 1;
        for (int d = 0; d < shape.nbDims; ++d) {
            int64_t v = shape.d[d];
            if (v <= 0) { hasDynamic = true; v = 1; }
            numElements *= v;
        }
        // Scalars (0-dim) still need at least 1 element
        if (numElements < 1) numElements = 1;

        // For output tensors with ANY dynamic dim, pre-allocate a generous buffer.
        // The decoder outputs (boxes [-1,4], scores [-1], masks [-1,-1,-1,-1])
        // all have data-dependent first dimension from NonZero/NMS.
        if (mode == nvinfer1::TensorIOMode::kOUTPUT && hasDynamic) {
            // Pre-allocate for up to 256 detections with generous mask size
            numElements = 256 * 1 * 256 * 256;
        }

        size_t bufSize = numElements * DataTypeSize(dtype);
        bundle.gpuBufferSizes[i] = bufSize;

        if (isHost) {
            // Shape tensor — allocate host memory (zero-initialized; freed
            // with free() in TRTBundle::destroy()).
            bundle.gpuBuffers[i] = calloc(numElements, DataTypeSize(dtype));
            if (!bundle.gpuBuffers[i]) {
                _logger.LogError("ANSSAM3::LoadTRTEngineBundle",
                    label + ": host alloc failed for " + std::string(name), __FILE__, __LINE__);
                return false;
            }
            bundle.hostBufferIdx.insert(i);
        } else {
            // Execution tensor — allocate device memory
            cudaError_t err = cudaMalloc(&bundle.gpuBuffers[i], bufSize);
            if (err != cudaSuccess) {
                _logger.LogError("ANSSAM3::LoadTRTEngineBundle",
                    label + ": cudaMalloc failed for " + std::string(name) + ": " + cudaGetErrorString(err),
                    __FILE__, __LINE__);
                return false;
            }
            cudaMemset(bundle.gpuBuffers[i], 0, bufSize);
        }

        // Bind tensor address (host ptr for shape tensors, device ptr for execution tensors)
        if (!bundle.context->setTensorAddress(name, bundle.gpuBuffers[i])) {
            _logger.LogError("ANSSAM3::LoadTRTEngineBundle",
                label + ": setTensorAddress failed for " + std::string(name), __FILE__, __LINE__);
            return false;
        }

        bundle.nameToIdx[std::string(name)] = i;

        std::cout << "[ANSSAM3] " << label << " tensor[" << i << "] '"
            << name << "' "
            << (mode == nvinfer1::TensorIOMode::kINPUT ? "INPUT" : "OUTPUT")
            << (isHost ? " HOST" : " DEVICE")
            << " dtype=" << static_cast<int>(dtype)
            << " bufSize=" << bufSize << std::endl;
    }

    // Set input shapes (replace dynamic dims with concrete values)
    for (int i = 0; i < numTensors; ++i) {
        const char* name = bundle.engine->getIOTensorName(i);
        if (bundle.engine->getTensorIOMode(name) != nvinfer1::TensorIOMode::kINPUT)
            continue;

        // Shape tensors (scalar, host memory): TRT reads the value directly
        // from the host buffer. setInputShape for scalars uses Dims{0, {}}.
        // Write a default value (1024) into the host buffer at load time.
        if (bundle.hostBufferIdx.count(i)) {
            auto dtype = bundle.engine->getTensorDataType(name);
            if (dtype == nvinfer1::DataType::kINT64)
                *reinterpret_cast<int64_t*>(bundle.gpuBuffers[i]) = 1024;
            else
                *reinterpret_cast<int32_t*>(bundle.gpuBuffers[i]) = 1024;

            nvinfer1::Dims scalarDims;
            scalarDims.nbDims = 0;
            bundle.context->setInputShape(name, scalarDims);
            continue;
        }

        // Regular inputs: pin every remaining dynamic dim to 1 (matches the
        // MIN==OPT==MAX profile set at build time).
        auto dims = bundle.engine->getTensorShape(name);
        nvinfer1::Dims inputDims = dims;
        for (int d = 0; d < inputDims.nbDims; ++d) {
            if (inputDims.d[d] == -1)
                inputDims.d[d] = 1;
        }

        if (!bundle.context->setInputShape(name, inputDims)) {
            _logger.LogError("ANSSAM3::LoadTRTEngineBundle",
                label + ": setInputShape failed for " + std::string(name), __FILE__, __LINE__);
            return false;
        }
    }

    std::cout << "[ANSSAM3] " << label << ": loaded successfully (" << numTensors << " tensors)." << std::endl;
    return true;
}
|
|
|
|
|
|
// =========================================================================
|
|
// EnsureEnginesBuilt — pre-build uncached engines one at a time
|
|
// Avoids GPU OOM when building one engine while others are already loaded.
|
|
// =========================================================================
|
|
|
|
bool ANSSAM3::EnsureEnginesBuilt(const std::string& imgOnnx, const std::string& langOnnx, const std::string& decOnnx)
|
|
{
|
|
struct Job { const std::string* onnx; const char* label; TrtPrecision prec; };
|
|
Job jobs[] = {
|
|
{&langOnnx, "LangEncoder", TrtPrecision::FP16}, // FP16 — verified identical to FP32
|
|
{&decOnnx, "Decoder", TrtPrecision::FP16}, // FP16 decoder
|
|
{&imgOnnx, "ImageEncoder", TrtPrecision::FP32}, // FP32 — FP16/BF16 both corrupt backbone FPN
|
|
};
|
|
for (auto& j : jobs) {
|
|
if (!FileExist(EngineFileName(*j.onnx, j.prec))) {
|
|
TRTBundle tmp;
|
|
if (!BuildAndLoadEngine(tmp, *j.onnx, j.label, j.prec)) {
|
|
_logger.LogError("ANSSAM3::EnsureEnginesBuilt",
|
|
std::string("Failed to pre-build engine: ") + j.label, __FILE__, __LINE__);
|
|
tmp.destroy();
|
|
return false;
|
|
}
|
|
tmp.destroy(); // free GPU memory before next build
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// =========================================================================
|
|
// OptimizeModel
|
|
// =========================================================================
|
|
|
|
bool ANSSAM3::OptimizeModel(bool fp16, std::string& optimizedModelFolder)
|
|
{
|
|
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
|
if (!ANSODBase::OptimizeModel(fp16, optimizedModelFolder))
|
|
return false;
|
|
|
|
_fp16 = fp16;
|
|
optimizedModelFolder = _modelFolder;
|
|
|
|
std::string imgOnnx = CreateFilePath(_modelFolder, "sam3_image_encoder.onnx");
|
|
std::string langOnnx = CreateFilePath(_modelFolder, "sam3_language_encoder.onnx");
|
|
std::string decOnnx = CreateFilePath(_modelFolder, "sam3_decoder.onnx");
|
|
|
|
cudaStreamCreateWithFlags(&m_cudaStream, cudaStreamNonBlocking);
|
|
|
|
// Build engines one at a time, destroying each to free GPU memory
|
|
TRTBundle tmp;
|
|
bool ok = true;
|
|
|
|
ok = BuildAndLoadEngine(tmp, imgOnnx, "ImageEncoder", TrtPrecision::FP32);
|
|
tmp.destroy();
|
|
|
|
if (ok) { ok = BuildAndLoadEngine(tmp, langOnnx, "LangEncoder", TrtPrecision::FP16); tmp.destroy(); }
|
|
if (ok) { ok = BuildAndLoadEngine(tmp, decOnnx, "Decoder", TrtPrecision::FP16); tmp.destroy(); }
|
|
|
|
if (m_cudaStream) { cudaStreamDestroy(m_cudaStream); m_cudaStream = nullptr; }
|
|
return ok;
|
|
}
|
|
|
|
// =========================================================================
|
|
// Initialize
|
|
// =========================================================================
|
|
|
|
/// Initialize the detector from an encrypted model package: validate the
/// license via the base class, fix up the model config, create the CUDA
/// stream, build/load all three TRT engines, and load the CLIP tokenizer.
/// @param licenseKey        License string, validated by ANSODBase::Initialize.
/// @param modelConfig       Caller-supplied configuration (copied into _modelConfig
///                          by the base call — TODO confirm, base class not visible here).
/// @param modelZipFilePath  Encrypted model archive path.
/// @param modelZipPassword  Archive password.
/// @param labelMap          Out: label map produced by the base class.
/// @return true on full success; false (with _modelLoadValid = false) otherwise.
bool ANSSAM3::Initialize(std::string licenseKey, ModelConfig modelConfig,
    const std::string& modelZipFilePath, const std::string& modelZipPassword,
    std::string& labelMap)
{
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    try {
        bool result = ANSODBase::Initialize(licenseKey, modelConfig, modelZipFilePath, modelZipPassword, labelMap);
        if (!result) return false;

        // Force segmentation output; guard against an unset/too-low
        // confidence threshold by defaulting to 0.5.
        _modelConfig.detectionType = DetectionType::SEGMENTATION;
        if (_modelConfig.modelConfThreshold < 0.1f)
            _modelConfig.modelConfThreshold = 0.5f;
        m_segThreshold = _modelConfig.modelConfThreshold;
        _fp16 = true;

        // Create CUDA stream
        cudaSetDevice(_modelConfig.gpuDeviceIndex);
        cudaStreamCreateWithFlags(&m_cudaStream, cudaStreamNonBlocking);

        // Build/load TRT engines for image + language encoders
        std::string imgOnnx = CreateFilePath(_modelFolder, "sam3_image_encoder.onnx");
        std::string langOnnx = CreateFilePath(_modelFolder, "sam3_language_encoder.onnx");
        std::string decOnnx = CreateFilePath(_modelFolder, "sam3_decoder.onnx");

        // Pre-build uncached TRT engines (avoids GPU OOM during build)
        if (!EnsureEnginesBuilt(imgOnnx, langOnnx, decOnnx)) {
            _modelLoadValid = false;
            return false;
        }

        // Engines were just pre-built above, so these calls should hit the
        // disk cache and only deserialize + allocate buffers.
        if (!BuildAndLoadEngine(m_imgEncoder, imgOnnx, "ImageEncoder", TrtPrecision::FP32) ||
            !BuildAndLoadEngine(m_langEncoder, langOnnx, "LangEncoder", TrtPrecision::FP16) ||
            !BuildAndLoadEngine(m_decoder, decOnnx, "Decoder", TrtPrecision::FP16)) {
            _logger.LogError("ANSSAM3::Initialize", "Failed to build/load TRT engines", __FILE__, __LINE__);
            _modelLoadValid = false;
            return false;
        }

        _modelLoadValid = true;
        _isInitialized = true;

        // Load tokenizer — optional: only loaded when merges.txt is present.
        m_tokenizer = std::make_unique<ANSCLIPTokenizer>();
        std::string tokenizerPath = CreateFilePath(_modelFolder, "merges.txt");
        if (FileExist(tokenizerPath)) {
            m_tokenizer->Load(tokenizerPath);
            _logger.LogDebug("ANSSAM3::Initialize", "CLIP tokenizer loaded", __FILE__, __LINE__);
        }

        return true;
    }
    catch (const std::exception& e) {
        _logger.LogFatal("ANSSAM3::Initialize", e.what(), __FILE__, __LINE__);
        return false;
    }
}
|
|
|
|
// =========================================================================
|
|
// LoadModel
|
|
// =========================================================================
|
|
|
|
/// Load the model from an encrypted package and build/load all three TRT
/// engines. Same pipeline as Initialize() minus license/labelMap handling.
/// @param modelZipFilePath  Encrypted model archive path.
/// @param modelZipPassword  Archive password.
/// @return true on full success; false (with _modelLoadValid = false) otherwise.
bool ANSSAM3::LoadModel(const std::string& modelZipFilePath, const std::string& modelZipPassword)
{
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    try {
        bool result = ANSODBase::LoadModel(modelZipFilePath, modelZipPassword);
        if (!result) return false;

        // Force segmentation output; guard against an unset/too-low
        // confidence threshold by defaulting to 0.5.
        _modelConfig.detectionType = DetectionType::SEGMENTATION;
        if (_modelConfig.modelConfThreshold < 0.1f)
            _modelConfig.modelConfThreshold = 0.5f;
        m_segThreshold = _modelConfig.modelConfThreshold;
        _fp16 = true;

        cudaSetDevice(_modelConfig.gpuDeviceIndex);
        cudaStreamCreateWithFlags(&m_cudaStream, cudaStreamNonBlocking);

        std::string imgOnnx = CreateFilePath(_modelFolder, "sam3_image_encoder.onnx");
        std::string langOnnx = CreateFilePath(_modelFolder, "sam3_language_encoder.onnx");
        std::string decOnnx = CreateFilePath(_modelFolder, "sam3_decoder.onnx");

        // Pre-build uncached TRT engines (avoids GPU OOM during build)
        if (!EnsureEnginesBuilt(imgOnnx, langOnnx, decOnnx)) {
            _modelLoadValid = false;
            return false;
        }

        // Engines were just pre-built above, so these calls should hit the
        // disk cache and only deserialize + allocate buffers.
        if (!BuildAndLoadEngine(m_imgEncoder, imgOnnx, "ImageEncoder", TrtPrecision::FP32) ||
            !BuildAndLoadEngine(m_langEncoder, langOnnx, "LangEncoder", TrtPrecision::FP16) ||
            !BuildAndLoadEngine(m_decoder, decOnnx, "Decoder", TrtPrecision::FP16)) {
            _logger.LogError("ANSSAM3::LoadModel", "Failed to build/load TRT engines", __FILE__, __LINE__);
            _modelLoadValid = false;
            return false;
        }

        _modelLoadValid = true;
        _isInitialized = true;

        // Tokenizer is optional — only loaded when merges.txt is present.
        m_tokenizer = std::make_unique<ANSCLIPTokenizer>();
        std::string tokenizerPath = CreateFilePath(_modelFolder, "merges.txt");
        if (FileExist(tokenizerPath)) {
            m_tokenizer->Load(tokenizerPath);
        }

        return true;
    }
    catch (const std::exception& e) {
        _logger.LogFatal("ANSSAM3::LoadModel", e.what(), __FILE__, __LINE__);
        return false;
    }
}
|
|
|
|
// =========================================================================
|
|
// LoadModelFromFolder
|
|
// =========================================================================
|
|
|
|
/// Load the model from an unpacked folder (no zip archive) and build/load
/// all three TRT engines. Unlike Initialize/LoadModel, this overload also
/// copies the caller's modelConfig into _modelConfig explicitly.
/// @param licenseKey   License string, validated by the base class.
/// @param modelConfig  Configuration; copied into _modelConfig here.
/// @param modelName    Model identifier forwarded to the base class.
/// @param className    Class identifier forwarded to the base class.
/// @param modelFolder  Folder containing the ONNX files and merges.txt.
/// @param labelMap     Out: label map produced by the base class.
/// @return true on full success; false (with _modelLoadValid = false) otherwise.
bool ANSSAM3::LoadModelFromFolder(std::string licenseKey, ModelConfig modelConfig,
    std::string modelName, std::string className,
    const std::string& modelFolder, std::string& labelMap)
{
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    try {
        bool result = ANSODBase::LoadModelFromFolder(licenseKey, modelConfig, modelName, className, modelFolder, labelMap);
        if (!result) return false;

        // Adopt the caller's config, then force segmentation output and
        // guard against an unset/too-low confidence threshold.
        _modelConfig = modelConfig;
        _modelConfig.detectionType = DetectionType::SEGMENTATION;
        if (_modelConfig.modelConfThreshold < 0.1f)
            _modelConfig.modelConfThreshold = 0.5f;
        m_segThreshold = _modelConfig.modelConfThreshold;
        _fp16 = true;

        cudaSetDevice(_modelConfig.gpuDeviceIndex);
        cudaStreamCreateWithFlags(&m_cudaStream, cudaStreamNonBlocking);

        // Note: paths are built from the modelFolder parameter here, not
        // _modelFolder as in Initialize/LoadModel.
        std::string imgOnnx = CreateFilePath(modelFolder, "sam3_image_encoder.onnx");
        std::string langOnnx = CreateFilePath(modelFolder, "sam3_language_encoder.onnx");
        std::string decOnnx = CreateFilePath(modelFolder, "sam3_decoder.onnx");

        // Pre-build uncached TRT engines (avoids GPU OOM during build)
        if (!EnsureEnginesBuilt(imgOnnx, langOnnx, decOnnx)) {
            _modelLoadValid = false;
            return false;
        }

        // Engines were just pre-built above, so these calls should hit the
        // disk cache and only deserialize + allocate buffers.
        if (!BuildAndLoadEngine(m_imgEncoder, imgOnnx, "ImageEncoder", TrtPrecision::FP32) ||
            !BuildAndLoadEngine(m_langEncoder, langOnnx, "LangEncoder", TrtPrecision::FP16) ||
            !BuildAndLoadEngine(m_decoder, decOnnx, "Decoder", TrtPrecision::FP16)) {
            _logger.LogError("ANSSAM3::LoadModelFromFolder", "Failed to build/load TRT engines", __FILE__, __LINE__);
            _modelLoadValid = false;
            return false;
        }

        _modelLoadValid = true;
        _isInitialized = true;

        // Tokenizer is optional — only loaded when merges.txt is present.
        m_tokenizer = std::make_unique<ANSCLIPTokenizer>();
        std::string tokenizerPath = CreateFilePath(modelFolder, "merges.txt");
        if (FileExist(tokenizerPath)) {
            m_tokenizer->Load(tokenizerPath);
            _logger.LogDebug("ANSSAM3::LoadModelFromFolder", "CLIP tokenizer loaded", __FILE__, __LINE__);
        }

        return true;
    }
    catch (const std::exception& e) {
        _logger.LogFatal("ANSSAM3::LoadModelFromFolder", e.what(), __FILE__, __LINE__);
        return false;
    }
}
|
|
|
|
// =========================================================================
|
|
// SetPrompt — run language encoder, cache outputs on GPU
|
|
// =========================================================================
|
|
|
|
// Runs the language encoder once on pre-tokenized input and caches the two
// outputs the decoder consumes (text attention mask + text memory) in
// dedicated GPU buffers, so one text prompt can be reused across many frames.
//
// @param inputIds      CLIP token ids (engine expects shape [1, len]).
// @param attentionMask NOTE(review): currently unused in this function — the
//                      encoder emits its own text_attention_mask output.
//                      Confirm whether this parameter should be uploaded or
//                      can be dropped from the signature.
// @return true when the encoder ran and its outputs were cached on the GPU.
bool ANSSAM3::SetPrompt(const std::vector<int64_t>& inputIds, const std::vector<int64_t>& attentionMask)
{
    std::lock_guard<std::recursive_mutex> lock(_mutex);

    if (!m_langEncoder.context) {
        _logger.LogError("ANSSAM3::SetPrompt", "Language encoder not loaded", __FILE__, __LINE__);
        return false;
    }

    // Language encoder input: "tokens" [1, 32] int64
    // Find the tokens input tensor
    auto it = m_langEncoder.nameToIdx.find("tokens");
    if (it == m_langEncoder.nameToIdx.end()) {
        // Try first input
        const char* firstName = m_langEncoder.engine->getIOTensorName(0);
        it = m_langEncoder.nameToIdx.find(firstName);
    }
    if (it == m_langEncoder.nameToIdx.end()) {
        _logger.LogError("ANSSAM3::SetPrompt", "Cannot find tokens input tensor", __FILE__, __LINE__);
        return false;
    }

    int tokIdx = it->second;
    const char* tokName = m_langEncoder.engine->getIOTensorName(tokIdx);
    auto tokDtype = m_langEncoder.engine->getTensorDataType(tokName);

    // Upload tokens — handle int64 vs int32 data type
    // NOTE(review): if the engine reports any other dtype, nothing is
    // uploaded and the encoder runs on stale buffer contents — consider
    // failing fast here instead of falling through silently.
    if (tokDtype == nvinfer1::DataType::kINT64) {
        cudaMemcpyAsync(m_langEncoder.gpuBuffers[tokIdx], inputIds.data(),
            inputIds.size() * sizeof(int64_t),
            cudaMemcpyHostToDevice, m_cudaStream);
    }
    else if (tokDtype == nvinfer1::DataType::kINT32) {
        // TRT may have converted int64 to int32 at build time
        std::vector<int32_t> tokens32(inputIds.size());
        for (size_t i = 0; i < inputIds.size(); ++i)
            tokens32[i] = static_cast<int32_t>(inputIds[i]);
        cudaMemcpyAsync(m_langEncoder.gpuBuffers[tokIdx], tokens32.data(),
            tokens32.size() * sizeof(int32_t),
            cudaMemcpyHostToDevice, m_cudaStream);
    }

    // Set input shape: [1, numTokens]
    nvinfer1::Dims tokenDims;
    tokenDims.nbDims = 2;
    tokenDims.d[0] = 1;
    tokenDims.d[1] = static_cast<int>(inputIds.size());
    m_langEncoder.context->setInputShape(tokName, tokenDims);

    // Run language encoder
    #if NV_TENSORRT_MAJOR >= 10
    bool ok = m_langEncoder.context->enqueueV3(m_cudaStream);
    #else
    bool ok = m_langEncoder.context->enqueueV2(
        reinterpret_cast<void**>(m_langEncoder.gpuBuffers.data()), m_cudaStream, nullptr);
    #endif
    if (!ok) {
        _logger.LogError("ANSSAM3::SetPrompt", "Language encoder enqueue failed", __FILE__, __LINE__);
        return false;
    }
    cudaStreamSynchronize(m_cudaStream);

    // Find language encoder outputs: text_attention_mask and text_memory
    // output[0]: text_attention_mask [1, 32] bool → cached as m_cachedLangMask
    // output[1]: text_memory [32, 1, 256] float32 → cached as m_cachedLangFeats
    // output[2]: text_embeds [32, 1, 1024] float32 → NOT used
    // Outputs are matched by substring of the tensor name first; if the
    // engine uses unexpected names we fall back to positional order below.
    int maskOutIdx = -1, featsOutIdx = -1;
    const int numTensors = m_langEncoder.engine->getNbIOTensors();
    for (int i = 0; i < numTensors; ++i) {
        const char* name = m_langEncoder.engine->getIOTensorName(i);
        if (m_langEncoder.engine->getTensorIOMode(name) != nvinfer1::TensorIOMode::kOUTPUT)
            continue;
        std::string sname(name);
        if (sname.find("attention_mask") != std::string::npos ||
            sname.find("text_attention") != std::string::npos) {
            maskOutIdx = i;
        }
        else if (sname.find("text_memory") != std::string::npos ||
            sname.find("memory") != std::string::npos) {
            featsOutIdx = i;
        }
    }

    // Fallback: first 2 outputs in order
    if (maskOutIdx < 0 || featsOutIdx < 0) {
        int outCount = 0;
        for (int i = 0; i < numTensors; ++i) {
            const char* name = m_langEncoder.engine->getIOTensorName(i);
            if (m_langEncoder.engine->getTensorIOMode(name) != nvinfer1::TensorIOMode::kOUTPUT)
                continue;
            if (outCount == 0 && maskOutIdx < 0) maskOutIdx = i;
            else if (outCount == 1 && featsOutIdx < 0) featsOutIdx = i;
            outCount++;
        }
    }

    // Cache mask on GPU. The cached buffer is grown-on-demand: it is only
    // reallocated when the new output is larger than the existing allocation.
    // NOTE(review): cudaMalloc return codes are not checked here.
    if (maskOutIdx >= 0) {
        size_t bytes = m_langEncoder.gpuBufferSizes[maskOutIdx];
        if (m_cachedLangMask && m_cachedLangMaskBytes < bytes) {
            cudaFree(m_cachedLangMask); m_cachedLangMask = nullptr;
        }
        if (!m_cachedLangMask) {
            cudaMalloc(&m_cachedLangMask, bytes);
        }
        m_cachedLangMaskBytes = bytes;
        cudaMemcpyAsync(m_cachedLangMask, m_langEncoder.gpuBuffers[maskOutIdx],
            bytes, cudaMemcpyDeviceToDevice, m_cudaStream);
    }

    // Cache features on GPU (same grow-on-demand strategy as the mask).
    if (featsOutIdx >= 0) {
        size_t bytes = m_langEncoder.gpuBufferSizes[featsOutIdx];
        if (m_cachedLangFeats && m_cachedLangFeatsBytes < bytes) {
            cudaFree(m_cachedLangFeats); m_cachedLangFeats = nullptr;
        }
        if (!m_cachedLangFeats) {
            cudaMalloc(&m_cachedLangFeats, bytes);
        }
        m_cachedLangFeatsBytes = bytes;
        cudaMemcpyAsync(m_cachedLangFeats, m_langEncoder.gpuBuffers[featsOutIdx],
            bytes, cudaMemcpyDeviceToDevice, m_cudaStream);
    }

    // Make sure the device-to-device copies have completed before the cached
    // buffers are advertised as valid via m_promptSet.
    cudaStreamSynchronize(m_cudaStream);
    m_promptSet = true;
    return true;
}
|
|
|
|
bool ANSSAM3::SetPrompt(const std::string& text)
|
|
{
|
|
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
|
if (!m_tokenizer || !m_tokenizer->IsLoaded()) {
|
|
_logger.LogError("ANSSAM3::SetPrompt",
|
|
"Tokenizer not loaded. Place merges.txt in model folder.", __FILE__, __LINE__);
|
|
return false;
|
|
}
|
|
auto result = m_tokenizer->Tokenize(text, m_tokenLength);
|
|
SetPrompt(result.inputIds, result.attentionMask);
|
|
return true;
|
|
}
|
|
|
|
// =========================================================================
|
|
// RunInference
|
|
// =========================================================================
|
|
|
|
// Convenience overload: run inference without a camera identifier by
// delegating to the two-argument variant with an empty id.
std::vector<Object> ANSSAM3::RunInference(const cv::Mat& input)
{
    return RunInference(input, std::string{});
}
|
|
|
|
// Public inference entry point: validates model/prompt/frame state, then runs
// the detection pipeline. State checks happen under the instance mutex, but
// the lock is released before Detect() so the GPU pipeline does not hold it.
std::vector<Object> ANSSAM3::RunInference(const cv::Mat& input, const std::string& camera_id)
{
    {
        std::lock_guard<std::recursive_mutex> guard(_mutex);
        if (!_modelLoadValid || !_isInitialized)
            return {};
        if (!m_promptSet) {
            _logger.LogError("ANSSAM3::RunInference", "No prompt set", __FILE__, __LINE__);
            return {};
        }
        // Reject empty or degenerate frames (anything under 10px per side).
        const bool frameUnusable = input.empty() || input.cols < 10 || input.rows < 10;
        if (frameUnusable)
            return {};
    }
    try {
        return Detect(input, camera_id);
    }
    catch (const std::exception& e) {
        _logger.LogFatal("ANSSAM3::RunInference", e.what(), __FILE__, __LINE__);
        return {};
    }
}
|
|
|
|
// =========================================================================
|
|
// Detect — image encoder + decoder pipeline
|
|
// =========================================================================
|
|
|
|
// Single-frame detection pipeline:
//   preprocess (NV12 fast path or CPU fallback) → image encoder →
//   wire encoder outputs + cached language features into the decoder
//   (zero-copy) → decoder → download boxes/scores/masks → postprocess.
// Requires a prompt to have been cached by SetPrompt() beforehand
// (m_cachedLangMask / m_cachedLangFeats are bound as decoder inputs).
std::vector<Object> ANSSAM3::Detect(const cv::Mat& input, const std::string& camera_id)
{
    if (!_modelLoadValid || !m_imgEncoder.context || !m_cudaStream) {
        return {};
    }

    const int origW = input.cols;
    const int origH = input.rows;

    // ---- 1) Find image encoder input tensor and determine dtype ----
    auto imgIt = m_imgEncoder.nameToIdx.find("image");
    if (imgIt == m_imgEncoder.nameToIdx.end()) {
        // Fall back to the engine's first IO tensor if "image" is absent.
        const char* firstName = m_imgEncoder.engine->getIOTensorName(0);
        imgIt = m_imgEncoder.nameToIdx.find(firstName);
    }
    if (imgIt == m_imgEncoder.nameToIdx.end()) {
        _logger.LogError("ANSSAM3::Detect", "Cannot find image input tensor", __FILE__, __LINE__);
        return {};
    }

    int imgInputIdx = imgIt->second;
    const char* imgInputName = m_imgEncoder.engine->getIOTensorName(imgInputIdx);
    auto imgDtype = m_imgEncoder.engine->getTensorDataType(imgInputName);

    // Treat INT8/BOOL (and UINT8 on TRT>=10) as byte-per-channel input;
    // anything else gets float32 pixels.
    bool isUint8Input = (imgDtype == nvinfer1::DataType::kINT8 || imgDtype == nvinfer1::DataType::kBOOL);
    #if NV_TENSORRT_MAJOR >= 10
    isUint8Input = isUint8Input || (imgDtype == nvinfer1::DataType::kUINT8);
    #endif

    // ---- 1b) Try NV12 fast path — fused NV12→RGB resize CHW directly into TRT buffer ----
    bool usedNV12 = false;
    {
        auto nv12 = m_nv12Helper.tryNV12DirectToBuffer(
            input, 0 /*inferenceGpu*/,
            m_imgEncoder.gpuBuffers[imgInputIdx],
            m_inputSize, m_inputSize,
            !isUint8Input, // float32 if not uint8
            m_cudaStream,
            _logger, "ANSSAM3");
        usedNV12 = nv12.succeeded;
        m_nv12Helper.tickInference();
    }

    // ---- 1c) CPU fallback: BGR → RGB, resize to 1008, HWC→CHW, upload ----
    if (!usedNV12) {
        cv::Mat resized;
        cv::resize(input, resized, cv::Size(m_inputSize, m_inputSize));
        cv::Mat rgb;
        cv::cvtColor(resized, rgb, cv::COLOR_BGR2RGB);

        // Repack interleaved HWC into planar CHW (one contiguous plane per channel).
        const size_t planeSize = static_cast<size_t>(m_inputSize) * m_inputSize;
        std::vector<uint8_t> imgBuffer(3 * planeSize);
        cv::Mat channels[3];
        cv::split(rgb, channels);
        for (int c = 0; c < 3; ++c)
            std::memcpy(imgBuffer.data() + c * planeSize, channels[c].data, planeSize);

        if (isUint8Input) {
            cudaMemcpyAsync(m_imgEncoder.gpuBuffers[imgInputIdx], imgBuffer.data(),
                imgBuffer.size(), cudaMemcpyHostToDevice, m_cudaStream);
        }
        else {
            // Widen to float32; no normalization is applied here.
            std::vector<float> imgFloat(imgBuffer.size());
            for (size_t i = 0; i < imgBuffer.size(); ++i)
                imgFloat[i] = static_cast<float>(imgBuffer[i]);
            cudaMemcpyAsync(m_imgEncoder.gpuBuffers[imgInputIdx], imgFloat.data(),
                imgFloat.size() * sizeof(float), cudaMemcpyHostToDevice, m_cudaStream);
        }
    }

    // Set image input shape: CHW [3, size, size] (no batch dimension).
    nvinfer1::Dims imgDims;
    imgDims.nbDims = 3;
    imgDims.d[0] = 3;
    imgDims.d[1] = m_inputSize;
    imgDims.d[2] = m_inputSize;
    m_imgEncoder.context->setInputShape(imgInputName, imgDims);

    // ---- 2) Run image encoder ----
    // Check for prior CUDA errors (e.g. OOM from memcpy) before enqueue
    {
        cudaError_t preErr = cudaGetLastError();
        if (preErr != cudaSuccess) {
            _logger.LogError("ANSSAM3::Detect",
                std::string("CUDA error before enqueue: ") + cudaGetErrorString(preErr),
                __FILE__, __LINE__);
            return {};
        }
    }
    #if NV_TENSORRT_MAJOR >= 10
    bool okImg = m_imgEncoder.context->enqueueV3(m_cudaStream);
    #else
    bool okImg = m_imgEncoder.context->enqueueV2(
        reinterpret_cast<void**>(m_imgEncoder.gpuBuffers.data()), m_cudaStream, nullptr);
    #endif
    if (!okImg) {
        cudaError_t postErr = cudaGetLastError();
        _logger.LogError("ANSSAM3::Detect",
            std::string("Image encoder enqueue failed") +
            (postErr != cudaSuccess ? std::string(": ") + cudaGetErrorString(postErr) : ""),
            __FILE__, __LINE__);
        return {};
    }
    cudaStreamSynchronize(m_cudaStream);

    // ---- 3) Feed encoder outputs into TRT decoder (zero-copy via setTensorAddress) ----
    // Helper: point a decoder input at the encoder output buffer of the same
    // name and copy over the encoder's actual output shape. Silently a no-op
    // if either side lacks the tensor.
    auto feedImgToDec = [&](const std::string& tensorName) {
        auto srcIt = m_imgEncoder.nameToIdx.find(tensorName);
        auto dstIt = m_decoder.nameToIdx.find(tensorName);
        if (srcIt == m_imgEncoder.nameToIdx.end() || dstIt == m_decoder.nameToIdx.end()) return;

        int srcIdx = srcIt->second;
        int dstIdx = dstIt->second;

        // Point decoder input directly at encoder output buffer (true zero-copy)
        const char* dstName = m_decoder.engine->getIOTensorName(dstIdx);
        m_decoder.context->setTensorAddress(dstName, m_imgEncoder.gpuBuffers[srcIdx]);

        // Set input shape from encoder's actual output shape
        const char* srcName = m_imgEncoder.engine->getIOTensorName(srcIdx);
        auto shape = m_imgEncoder.context->getTensorShape(srcName);
        m_decoder.context->setInputShape(dstName, shape);
    };

    feedImgToDec("vision_pos_enc_2");
    feedImgToDec("backbone_fpn_0");
    feedImgToDec("backbone_fpn_1");
    feedImgToDec("backbone_fpn_2");

    // Language features — point decoder inputs at the GPU buffers cached by
    // SetPrompt(); no copy is made.
    {
        auto it = m_decoder.nameToIdx.find("language_mask");
        if (it != m_decoder.nameToIdx.end()) {
            const char* name = m_decoder.engine->getIOTensorName(it->second);
            m_decoder.context->setTensorAddress(name, m_cachedLangMask);
            nvinfer1::Dims d; d.nbDims = 2; d.d[0] = 1; d.d[1] = m_tokenLength;
            m_decoder.context->setInputShape(name, d);
        }
    }
    {
        auto it = m_decoder.nameToIdx.find("language_features");
        if (it != m_decoder.nameToIdx.end()) {
            const char* name = m_decoder.engine->getIOTensorName(it->second);
            m_decoder.context->setTensorAddress(name, m_cachedLangFeats);
            // Shape [tokens, 1, 256] mirrors the language encoder's
            // text_memory layout documented in SetPrompt().
            nvinfer1::Dims d; d.nbDims = 3; d.d[0] = m_tokenLength; d.d[1] = 1; d.d[2] = 256;
            m_decoder.context->setInputShape(name, d);
        }
    }

    // Scalar inputs (original_height, original_width) — host-memory shape tensors
    // NOTE(review): these assignments dereference gpuBuffers[] on the host,
    // which is valid only if these particular entries are host/managed
    // allocations reserved for shape tensors (as the comment above implies).
    // Confirm in the buffer-allocation code.
    {
        auto it = m_decoder.nameToIdx.find("original_height");
        if (it != m_decoder.nameToIdx.end()) {
            int idx = it->second;
            auto dtype = m_decoder.engine->getTensorDataType(m_decoder.engine->getIOTensorName(idx));
            if (dtype == nvinfer1::DataType::kINT64)
                *reinterpret_cast<int64_t*>(m_decoder.gpuBuffers[idx]) = static_cast<int64_t>(origH);
            else
                *reinterpret_cast<int32_t*>(m_decoder.gpuBuffers[idx]) = origH;
        }
    }
    {
        auto it = m_decoder.nameToIdx.find("original_width");
        if (it != m_decoder.nameToIdx.end()) {
            int idx = it->second;
            auto dtype = m_decoder.engine->getTensorDataType(m_decoder.engine->getIOTensorName(idx));
            if (dtype == nvinfer1::DataType::kINT64)
                *reinterpret_cast<int64_t*>(m_decoder.gpuBuffers[idx]) = static_cast<int64_t>(origW);
            else
                *reinterpret_cast<int32_t*>(m_decoder.gpuBuffers[idx]) = origW;
        }
    }

    // Prompt inputs: box_coords [1,1,4], box_labels [1,1], box_masks [1,1]
    // Text-prompt-only mode: a zero box with label -1 / mask false tells the
    // decoder no geometric prompt is present.
    {
        auto it = m_decoder.nameToIdx.find("box_coords");
        if (it != m_decoder.nameToIdx.end()) {
            int idx = it->second;
            float boxCoords[4] = { 0.f, 0.f, 0.f, 0.f };
            cudaMemcpyAsync(m_decoder.gpuBuffers[idx], boxCoords, sizeof(boxCoords),
                cudaMemcpyHostToDevice, m_cudaStream);
            const char* name = m_decoder.engine->getIOTensorName(idx);
            nvinfer1::Dims d; d.nbDims = 3; d.d[0] = 1; d.d[1] = 1; d.d[2] = 4;
            m_decoder.context->setInputShape(name, d);
        }
    }
    {
        auto it = m_decoder.nameToIdx.find("box_labels");
        if (it != m_decoder.nameToIdx.end()) {
            int idx = it->second;
            const char* name = m_decoder.engine->getIOTensorName(idx);
            auto dtype = m_decoder.engine->getTensorDataType(name);
            if (dtype == nvinfer1::DataType::kINT64) {
                int64_t val = -1;
                cudaMemcpyAsync(m_decoder.gpuBuffers[idx], &val, sizeof(val),
                    cudaMemcpyHostToDevice, m_cudaStream);
            } else {
                int32_t val = -1;
                cudaMemcpyAsync(m_decoder.gpuBuffers[idx], &val, sizeof(val),
                    cudaMemcpyHostToDevice, m_cudaStream);
            }
            nvinfer1::Dims d; d.nbDims = 2; d.d[0] = 1; d.d[1] = 1;
            m_decoder.context->setInputShape(name, d);
        }
    }
    {
        auto it = m_decoder.nameToIdx.find("box_masks");
        if (it != m_decoder.nameToIdx.end()) {
            int idx = it->second;
            const char* name = m_decoder.engine->getIOTensorName(idx);
            auto dtype = m_decoder.engine->getTensorDataType(name);
            if (dtype == nvinfer1::DataType::kBOOL) {
                bool val = false;
                cudaMemcpyAsync(m_decoder.gpuBuffers[idx], &val, sizeof(val),
                    cudaMemcpyHostToDevice, m_cudaStream);
            } else {
                int32_t val = 0;
                cudaMemcpyAsync(m_decoder.gpuBuffers[idx], &val, sizeof(val),
                    cudaMemcpyHostToDevice, m_cudaStream);
            }
            nvinfer1::Dims d; d.nbDims = 2; d.d[0] = 1; d.d[1] = 1;
            m_decoder.context->setInputShape(name, d);
        }
    }

    // ---- 4) Run TRT decoder ----
    // Find output tensor indices for boxes/scores/masks by name substring;
    // "box" must not also contain "mask" so boxed-mask tensors don't match.
    std::string boxesTName, scoresTName, masksTName;
    int boxesIdx = -1, scoresIdx = -1, masksIdx = -1;
    for (auto& kv : m_decoder.nameToIdx) {
        auto mode = m_decoder.engine->getTensorIOMode(kv.first.c_str());
        if (mode != nvinfer1::TensorIOMode::kOUTPUT) continue;
        if (kv.first.find("box") != std::string::npos && kv.first.find("mask") == std::string::npos)
        { boxesTName = kv.first; boxesIdx = kv.second; }
        else if (kv.first.find("score") != std::string::npos)
        { scoresTName = kv.first; scoresIdx = kv.second; }
        else if (kv.first.find("mask") != std::string::npos)
        { masksTName = kv.first; masksIdx = kv.second; }
    }
    if (boxesIdx < 0 || scoresIdx < 0 || masksIdx < 0) {
        _logger.LogError("ANSSAM3::Detect", "Cannot find decoder output tensors", __FILE__, __LINE__);
        return {};
    }

    #if NV_TENSORRT_MAJOR >= 10
    // Stack-local allocators that return the pre-allocated gpuBuffers.
    // Registering these enables getTensorShape() to return actual
    // (not -1) dimensions for data-dependent outputs after enqueueV3.
    PassthroughOutputAllocator boxAlloc (m_decoder.gpuBuffers[boxesIdx], m_decoder.gpuBufferSizes[boxesIdx]);
    PassthroughOutputAllocator scoreAlloc(m_decoder.gpuBuffers[scoresIdx], m_decoder.gpuBufferSizes[scoresIdx]);
    PassthroughOutputAllocator maskAlloc (m_decoder.gpuBuffers[masksIdx], m_decoder.gpuBufferSizes[masksIdx]);

    m_decoder.context->setOutputAllocator(boxesTName.c_str(), &boxAlloc);
    m_decoder.context->setOutputAllocator(scoresTName.c_str(), &scoreAlloc);
    m_decoder.context->setOutputAllocator(masksTName.c_str(), &maskAlloc);
    #endif

    #if NV_TENSORRT_MAJOR >= 10
    bool okDec = m_decoder.context->enqueueV3(m_cudaStream);
    #else
    bool okDec = m_decoder.context->enqueueV2(
        reinterpret_cast<void**>(m_decoder.gpuBuffers.data()), m_cudaStream, nullptr);
    #endif
    cudaStreamSynchronize(m_cudaStream);

    if (!okDec) {
        _logger.LogError("ANSSAM3::Detect", "Decoder enqueue failed", __FILE__, __LINE__);
        return {};
    }

    // ---- 5) Parse TRT decoder outputs: boxes [N,4], scores [N], masks [N,1,H,W] ----
    int numBoxes = 0, maskH = 0, maskW = 0;
    void* boxesGpu = nullptr;
    void* scoresGpu = nullptr;
    void* masksGpu = nullptr;

    #if NV_TENSORRT_MAJOR >= 10
    // Read actual shapes from notifyShape() callback
    if (boxAlloc.shapeKnown) {
        auto& s = boxAlloc.actualDims;
        numBoxes = (s.nbDims >= 1) ? static_cast<int>(s.d[0]) : 0;
        boxesGpu = boxAlloc.preAllocBuf;
    }
    if (scoreAlloc.shapeKnown) {
        scoresGpu = scoreAlloc.preAllocBuf;
    }
    if (maskAlloc.shapeKnown) {
        auto& s = maskAlloc.actualDims;
        maskH = (s.nbDims >= 3) ? static_cast<int>(s.d[2]) : 0;
        maskW = (s.nbDims >= 4) ? static_cast<int>(s.d[3]) : 0;
        masksGpu = maskAlloc.preAllocBuf;
    }
    #else
    {
        auto boxShape = m_decoder.context->getTensorShape(boxesTName.c_str());
        auto maskShape = m_decoder.context->getTensorShape(masksTName.c_str());
        numBoxes = (boxShape.nbDims >= 1) ? static_cast<int>(boxShape.d[0]) : 0;
        maskH = (maskShape.nbDims >= 3) ? static_cast<int>(maskShape.d[2]) : 0;
        maskW = (maskShape.nbDims >= 4) ? static_cast<int>(maskShape.d[3]) : 0;
        boxesGpu = m_decoder.gpuBuffers[boxesIdx];
        scoresGpu = m_decoder.gpuBuffers[scoresIdx];
        masksGpu = m_decoder.gpuBuffers[masksIdx];
    }
    #endif

    if (numBoxes <= 0 || !boxesGpu || !scoresGpu) return {};

    // Download decoder outputs from GPU → CPU for postprocessing
    // (sizing assumes sizeof(bool) == 1, which holds on mainstream ABIs).
    size_t boxesBytes = static_cast<size_t>(numBoxes) * 4 * sizeof(float);
    size_t scoresBytes = static_cast<size_t>(numBoxes) * sizeof(float);
    size_t masksBytes = static_cast<size_t>(numBoxes) * 1 * maskH * maskW * sizeof(bool);

    std::vector<float> boxesCpu(numBoxes * 4);
    std::vector<float> scoresCpu(numBoxes);

    cudaMemcpy(boxesCpu.data(), boxesGpu, boxesBytes, cudaMemcpyDeviceToHost);
    cudaMemcpy(scoresCpu.data(), scoresGpu, scoresBytes, cudaMemcpyDeviceToHost);

    // Masks may be bool or float depending on TRT's internal optimization.
    // Download to a raw buffer and convert to bool.
    auto maskDtype = m_decoder.engine->getTensorDataType(masksTName.c_str());
    std::vector<uint8_t> masksRaw(masksBytes);
    if (masksGpu && maskH > 0 && maskW > 0) {
        if (maskDtype == nvinfer1::DataType::kBOOL) {
            cudaMemcpy(masksRaw.data(), masksGpu, masksBytes, cudaMemcpyDeviceToHost);
        } else {
            // Float masks — download and threshold at m_segThreshold.
            size_t floatBytes = static_cast<size_t>(numBoxes) * 1 * maskH * maskW * sizeof(float);
            std::vector<float> masksFloat(static_cast<size_t>(numBoxes) * maskH * maskW);
            cudaMemcpy(masksFloat.data(), masksGpu, floatBytes, cudaMemcpyDeviceToHost);
            for (size_t j = 0; j < masksFloat.size(); ++j)
                masksRaw[j] = masksFloat[j] > m_segThreshold ? 1 : 0;
        }
    }

    // Convert raw buffer to bool pointer for PostprocessInstances
    const bool* masksData = reinterpret_cast<const bool*>(masksRaw.data());

    auto ret = PostprocessInstances(boxesCpu.data(), numBoxes, scoresCpu.data(),
        masksData, maskH, maskW,
        origW, origH, camera_id);
    // Optional tracking / stabilization passes over the raw detections.
    if (_trackerEnabled) {
        ret = ApplyTracking(ret, camera_id);
        if (_stabilizationEnabled) ret = StabilizeDetections(ret, camera_id);
    }
    return ret;
}
|
|
|
|
// =========================================================================
|
|
// PostprocessInstances — same logic as ONNXSAM3::postprocessResults
|
|
// =========================================================================
|
|
|
|
std::vector<Object> ANSSAM3::PostprocessInstances(
|
|
const float* boxesData, int numBoxes,
|
|
const float* scoresData,
|
|
const bool* masksData,
|
|
int maskH, int maskW,
|
|
int origWidth, int origHeight,
|
|
const std::string& camera_id)
|
|
{
|
|
std::vector<Object> results;
|
|
|
|
for (int i = 0; i < numBoxes; ++i) {
|
|
float score = scoresData[i];
|
|
if (score < m_segThreshold)
|
|
continue;
|
|
|
|
// Box: [x1, y1, x2, y2] in original image coordinates
|
|
float x1 = std::max(0.0f, std::min(boxesData[i * 4 + 0], static_cast<float>(origWidth)));
|
|
float y1 = std::max(0.0f, std::min(boxesData[i * 4 + 1], static_cast<float>(origHeight)));
|
|
float x2 = std::max(0.0f, std::min(boxesData[i * 4 + 2], static_cast<float>(origWidth)));
|
|
float y2 = std::max(0.0f, std::min(boxesData[i * 4 + 3], static_cast<float>(origHeight)));
|
|
|
|
cv::Rect box(static_cast<int>(x1), static_cast<int>(y1),
|
|
static_cast<int>(x2 - x1), static_cast<int>(y2 - y1));
|
|
if (box.width <= 0 || box.height <= 0)
|
|
continue;
|
|
|
|
// Extract this instance's mask: [1, H, W] at index i
|
|
// ORT decoder always outputs bool masks — convert to 0/255 uint8
|
|
cv::Mat boolMask(maskH, maskW, CV_8UC1);
|
|
size_t maskOffset = static_cast<size_t>(i) * 1 * maskH * maskW;
|
|
const bool* src = masksData + maskOffset;
|
|
for (int y = 0; y < maskH; ++y)
|
|
for (int x = 0; x < maskW; ++x)
|
|
boolMask.at<uint8_t>(y, x) = src[y * maskW + x] ? 255 : 0;
|
|
|
|
// Resize mask to original resolution
|
|
cv::Mat fullMask;
|
|
cv::resize(boolMask, fullMask, cv::Size(origWidth, origHeight), 0, 0, cv::INTER_LINEAR);
|
|
cv::threshold(fullMask, fullMask, 127, 255, cv::THRESH_BINARY);
|
|
|
|
// Crop to bounding box
|
|
cv::Mat roiMask = fullMask(box).clone();
|
|
|
|
Object obj;
|
|
obj.box = box;
|
|
obj.confidence = score;
|
|
obj.classId = 0;
|
|
obj.className = "object";
|
|
obj.cameraId = camera_id;
|
|
obj.mask = roiMask;
|
|
|
|
// Create normalized polygon from mask (closed, maxPoints-limited)
|
|
obj.polygon = ANSUtilityHelper::MaskToNormalizedPolygon(
|
|
roiMask, box,
|
|
static_cast<float>(origWidth), static_cast<float>(origHeight));
|
|
|
|
// Fallback: normalized box corners if mask polygon failed
|
|
if (obj.polygon.empty()) {
|
|
obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(
|
|
box, static_cast<float>(origWidth), static_cast<float>(origHeight));
|
|
}
|
|
|
|
results.push_back(std::move(obj));
|
|
}
|
|
|
|
return results;
|
|
}
|
|
|
|
// =========================================================================
|
|
// Destroy
|
|
// =========================================================================
|
|
|
|
bool ANSSAM3::Destroy()
|
|
{
|
|
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
|
try {
|
|
// TRT engine bundles
|
|
m_imgEncoder.destroy();
|
|
m_langEncoder.destroy();
|
|
m_decoder.destroy();
|
|
|
|
// Cached language encoder GPU buffers
|
|
if (m_cachedLangMask) { cudaFree(m_cachedLangMask); m_cachedLangMask = nullptr; }
|
|
m_cachedLangMaskBytes = 0;
|
|
if (m_cachedLangFeats) { cudaFree(m_cachedLangFeats); m_cachedLangFeats = nullptr; }
|
|
m_cachedLangFeatsBytes = 0;
|
|
|
|
if (m_cudaStream) { cudaStreamDestroy(m_cudaStream); m_cudaStream = nullptr; }
|
|
|
|
m_tokenizer.reset();
|
|
m_promptSet = false;
|
|
_modelLoadValid = false;
|
|
_isInitialized = false;
|
|
|
|
return true;
|
|
}
|
|
catch (const std::exception& e) {
|
|
_logger.LogFatal("ANSSAM3::Destroy", e.what(), __FILE__, __LINE__);
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Destructor: delegates to Destroy() so explicit and implicit teardown share
// one code path for releasing engines, GPU buffers, and the CUDA stream.
ANSSAM3::~ANSSAM3()
{
    Destroy();
}
|
|
}
|