#include "ANSSAM3.h"
#include "ANSCLIPTokenizer.h"
#include "Utility.h"
#include <algorithm>
#include <cstring>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <memory>
#include <vector>

namespace ANSCENTER {

// =========================================================================
// Helpers
// =========================================================================

// Portable FP16 <-> FP32 conversion (works in plain C++ without NVCC).
// Bit-exact for normals, subnormals, zeros and infinities; NaN payloads
// are preserved as quiet NaN.
static float fp16ToFloat(uint16_t h) {
    uint32_t sign = static_cast<uint32_t>(h >> 15) << 31;
    uint32_t expo = (h >> 10) & 0x1Fu;
    uint32_t mant = h & 0x3FFu;
    if (expo == 0) {
        if (mant == 0) {
            // +/-0
            float f;
            std::memcpy(&f, &sign, 4);
            return f;
        }
        // Subnormal half: renormalize so the implicit leading bit is set.
        // expo wraps below zero here, but the later +112 bias folds it back.
        while (!(mant & 0x400u)) {
            mant <<= 1;
            expo--;
        }
        expo++;
        mant &= 0x3FFu;
    } else if (expo == 31) {
        expo = 255; // inf / NaN: saturate to the float special exponent
    }
    uint32_t bits = sign | ((expo + 127u - 15u) << 23) | (mant << 13);
    float f;
    std::memcpy(&f, &bits, 4);
    return f;
}

// Truncating (round-toward-zero) FP32 -> FP16 conversion.
// Subnormal results flush to +/-0; overflow saturates to +/-inf.
static uint16_t floatToFp16(float val) {
    uint32_t bits;
    std::memcpy(&bits, &val, 4);
    uint16_t sign = static_cast<uint16_t>((bits >> 16) & 0x8000u);
    uint32_t rawExpo = (bits >> 23) & 0xFFu;
    uint32_t mant = bits & 0x7FFFFFu;
    // NaN must stay NaN — without this check it would fall through the
    // overflow branch below and silently become +/-inf.
    if (rawExpo == 0xFFu && mant != 0)
        return static_cast<uint16_t>(sign | 0x7E00u);   // quiet NaN
    int32_t expo = static_cast<int32_t>(rawExpo) - 127 + 15;
    if (expo <= 0) return sign;                          // underflow -> +/-0
    if (expo >= 31) return static_cast<uint16_t>(sign | 0x7C00u); // overflow -> +/-inf
    return static_cast<uint16_t>(sign | static_cast<uint16_t>(expo << 10) |
                                 static_cast<uint16_t>(mant >> 13));
}

// Size in bytes of one element of the given TensorRT tensor data type.
size_t ANSSAM3::DataTypeSize(nvinfer1::DataType dtype) {
    switch (dtype) {
    case nvinfer1::DataType::kFLOAT: return 4;
    case nvinfer1::DataType::kHALF:  return 2;
    case nvinfer1::DataType::kINT32: return 4;
    case nvinfer1::DataType::kINT64: return 8;
    case nvinfer1::DataType::kINT8:  return 1;
    case nvinfer1::DataType::kBOOL:  return 1;
#if NV_TENSORRT_MAJOR >= 10
    case nvinfer1::DataType::kUINT8: return 1;
    case nvinfer1::DataType::kFP8:   return 1;
    case nvinfer1::DataType::kBF16:  return 2;
    case nvinfer1::DataType::kINT4:  return 1; // conservative — 4-bit packed
#endif
    default: return 4;
    }
}

// Release all TRT objects and the per-tensor buffers owned by this bundle.
// Host-side buffers (shape tensors) were calloc'd, device buffers cudaMalloc'd,
// so each must be freed with its matching deallocator.
void ANSSAM3::TRTBundle::destroy() {
    context.reset();
    engine.reset();
    runtime.reset();
    for (int i = 0; i < (int)gpuBuffers.size(); ++i) {
        if (!gpuBuffers[i]) continue;
        if (hostBufferIdx.count(i))
            free(gpuBuffers[i]);       // host-allocated (shape tensor)
        else
            cudaFree(gpuBuffers[i]);   // device-allocated
        gpuBuffers[i] = nullptr;
    }
    gpuBuffers.clear();
    gpuBufferSizes.clear();
    hostBufferIdx.clear();
    nameToIdx.clear();
}

// (CreateOrtDecoderSession removed — decoder now runs under TRT)

// -----------------------------------------------------------------
// PassthroughOutputAllocator — lightweight IOutputAllocator that simply
// returns the pre-allocated gpuBuffer. Defined here (not in the header)
// so the vtable and CUDA symbols stay inside the engine DLL.
// Created on the stack in Detect() — no persistent class members needed.
// -----------------------------------------------------------------
#if NV_TENSORRT_MAJOR >= 10
struct PassthroughOutputAllocator : public nvinfer1::IOutputAllocator {
    void* preAllocBuf;    // existing gpuBuffers[idx]
    size_t preAllocSize;  // existing gpuBufferSizes[idx]
    nvinfer1::Dims actualDims{};
    bool shapeKnown = false;

    PassthroughOutputAllocator(void* buf, size_t sz)
        : preAllocBuf(buf), preAllocSize(sz) {}

    void* reallocateOutput(char const* /*tensorName*/, void* currentMemory,
                           uint64_t size, uint64_t /*alignment*/) noexcept override {
        if (size <= preAllocSize) return preAllocBuf;
        // Fallback: grow (should not happen with generous pre-allocation)
        void* newBuf = nullptr;
        if (cudaMalloc(&newBuf, size) == cudaSuccess) {
            preAllocBuf = newBuf;
            preAllocSize = size;
        }
        return preAllocBuf;
    }

    void notifyShape(char const* /*tensorName*/, nvinfer1::Dims const& dims) noexcept override {
        actualDims = dims;
        shapeKnown = true;
    }
};
#endif

// =========================================================================
// EngineFileName — generate cache path: <stem>.engine.<gpuName>.<precision>
// ========================================================================= std::string ANSSAM3::EngineFileName(const std::string& onnxPath, TrtPrecision precision) const { // Extract stem from ONNX path std::filesystem::path p(onnxPath); std::string stem = p.stem().string(); // Get GPU name cudaDeviceProp prop; cudaGetDeviceProperties(&prop, 0); std::string gpuName(prop.name); gpuName.erase(std::remove_if(gpuName.begin(), gpuName.end(), ::isspace), gpuName.end()); // BF16 requires Ampere+ (compute capability >= 8.0). // On older GPUs BuildAndLoadEngine silently falls back to FP32, // so the filename must match to avoid a .bf16 / .fp32 mismatch. TrtPrecision effective = precision; if (precision == TrtPrecision::BF16 && prop.major < 8) effective = TrtPrecision::FP32; std::string precStr; switch (effective) { case TrtPrecision::FP16: precStr = "fp16"; break; case TrtPrecision::BF16: precStr = "bf16"; break; case TrtPrecision::FP32: precStr = "fp32"; break; } std::string dir = p.parent_path().string(); return dir + "\\" + stem + ".engine." + gpuName + "." + precStr; } // ========================================================================= // BuildAndLoadEngine — build TRT engine from ONNX + load for inference // ========================================================================= bool ANSSAM3::BuildAndLoadEngine(TRTBundle& bundle, const std::string& onnxPath, const std::string& label, TrtPrecision precision) { // Register TRT built-in plugins (needed for RoiAlign in decoder, etc.) // Safe to call multiple times — idempotent. 
initLibNvInferPlugins(&m_trtLogger, ""); std::string enginePath = EngineFileName(onnxPath, precision); // Check for cached engine if (FileExist(enginePath)) { std::cout << "[ANSSAM3] " << label << ": cached engine found: " << enginePath << std::endl; return LoadTRTEngineBundle(bundle, enginePath, label); } // --- Build from ONNX --- std::cout << "[ANSSAM3] " << label << ": building TRT engine from " << onnxPath << std::endl; if (!FileExist(onnxPath)) { _logger.LogError("ANSSAM3::BuildAndLoadEngine", label + ": ONNX file not found: " + onnxPath, __FILE__, __LINE__); return false; } auto builder = std::unique_ptr(nvinfer1::createInferBuilder(m_trtLogger)); if (!builder) { _logger.LogError("ANSSAM3::BuildAndLoadEngine", label + ": createInferBuilder failed", __FILE__, __LINE__); return false; } auto network = std::unique_ptr(TRT_CREATE_NETWORK(builder)); if (!network) { _logger.LogError("ANSSAM3::BuildAndLoadEngine", label + ": createNetworkV2 failed", __FILE__, __LINE__); return false; } auto parser = std::unique_ptr(nvonnxparser::createParser(*network, m_trtLogger)); if (!parser) { _logger.LogError("ANSSAM3::BuildAndLoadEngine", label + ": createParser failed", __FILE__, __LINE__); return false; } // parseFromFile resolves .onnx_data external files relative to the ONNX directory if (!parser->parseFromFile(onnxPath.c_str(), static_cast(nvinfer1::ILogger::Severity::kWARNING))) { for (int i = 0; i < parser->getNbErrors(); ++i) std::cerr << "[ANSSAM3] " << label << " parse error: " << parser->getError(i)->desc() << std::endl; _logger.LogError("ANSSAM3::BuildAndLoadEngine", label + ": parseFromFile failed", __FILE__, __LINE__); return false; } std::cout << "[ANSSAM3] " << label << ": ONNX parsed successfully." << std::endl; // --- Log and configure input tensors --- auto config = std::unique_ptr(builder->createBuilderConfig()); // Workspace governs max scratch memory TRT can use at runtime. 
// ImageEncoder FP32 needs huge scratch (3.5 GiB at 4 GiB workspace) which // causes OOM on 8 GiB GPUs. Cap it to 2 GiB — TRT picks leaner tactics // with minimal quality loss. LangEncoder/Decoder are small; keep 4 GiB. const bool isImageEncoder = (label.find("ImageEncoder") != std::string::npos); size_t workspaceBytes = isImageEncoder ? 2048ULL * 1024 * 1024 // 2 GiB for ImageEncoder (prevents OOM) : 4096ULL * 1024 * 1024; // 4 GiB for LangEncoder / Decoder #if NV_TENSORRT_MAJOR >= 10 config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, workspaceBytes); #else config->setMaxWorkspaceSize(workspaceBytes); #endif // Set precision flags switch (precision) { case TrtPrecision::BF16: { // BF16 requires Ampere+ (compute capability >= 8.0) and TRT 8.6+. // Check GPU capability, then attempt to set the flag if available. cudaDeviceProp devProp; cudaGetDeviceProperties(&devProp, 0); #if NV_TENSORRT_MAJOR > 8 || (NV_TENSORRT_MAJOR == 8 && NV_TENSORRT_MINOR >= 6) if (devProp.major >= 8) { config->setFlag(nvinfer1::BuilderFlag::kBF16); std::cout << "[ANSSAM3] " << label << ": BF16 precision enabled." << std::endl; } else { std::cout << "[ANSSAM3] " << label << ": GPU CC " << devProp.major << "." << devProp.minor << " does not support BF16, falling back to FP32." << std::endl; } #else (void)devProp; // suppress unused warning std::cout << "[ANSSAM3] " << label << ": TensorRT version does not support BF16, falling back to FP32." << std::endl; #endif break; } case TrtPrecision::FP16: if (builder->platformHasFastFp16()) { config->setFlag(nvinfer1::BuilderFlag::kFP16); std::cout << "[ANSSAM3] " << label << ": FP16 precision enabled." 
<< std::endl; } break; case TrtPrecision::FP32: // No precision flags = FP32 break; } // Create optimization profile with actual ONNX dimensions auto profile = builder->createOptimizationProfile(); int numInputs = network->getNbInputs(); for (int i = 0; i < numInputs; ++i) { auto input = network->getInput(i); const char* name = input->getName(); auto dims = input->getDimensions(); std::cout << "[ANSSAM3] " << label << " input[" << i << "] '" << name << "': ["; for (int d = 0; d < dims.nbDims; ++d) { if (d > 0) std::cout << ", "; std::cout << (dims.d[d] == -1 ? "dyn" : std::to_string(dims.d[d])); } std::cout << "]" << (input->isShapeTensor() ? " (shape tensor)" : "") << std::endl; // Shape tensors: scalar int64 inputs whose VALUES determine output shapes // (e.g. original_height, original_width). Use setShapeValues() not setDimensions(). if (input->isShapeTensor()) { // nbValues = product of dims; for a scalar (nbDims==0) that is 1 int nbValues = 1; for (int d = 0; d < dims.nbDims; ++d) { if (dims.d[d] > 0) nbValues *= dims.d[d]; } std::vector minV(nbValues, 1); std::vector optV(nbValues, 1024); std::vector maxV(nbValues, 4096); profile->setShapeValues(name, nvinfer1::OptProfileSelector::kMIN, minV.data(), nbValues); profile->setShapeValues(name, nvinfer1::OptProfileSelector::kOPT, optV.data(), nbValues); profile->setShapeValues(name, nvinfer1::OptProfileSelector::kMAX, maxV.data(), nbValues); continue; } // Regular execution tensors: replace dynamic dims with concrete values bool hasDynamic = false; nvinfer1::Dims fixedDims = dims; for (int d = 0; d < dims.nbDims; ++d) { if (dims.d[d] == -1) { hasDynamic = true; fixedDims.d[d] = 1; // default batch or sequence } } if (hasDynamic) { profile->setDimensions(name, nvinfer1::OptProfileSelector::kMIN, fixedDims); profile->setDimensions(name, nvinfer1::OptProfileSelector::kOPT, fixedDims); profile->setDimensions(name, nvinfer1::OptProfileSelector::kMAX, fixedDims); } } config->addOptimizationProfile(profile); // --- Mixed 
precision for Decoder: keep FP16 for bulk ops but force // score/NMS/comparison layers to FP32 so that the internal // thresholding doesn't lose detections due to half-precision // rounding. We mark layers whose names contain NMS-related // keywords, plus all layers of types that perform comparisons // or index-selection (which are part of the NMS pipeline). if (precision == TrtPrecision::FP16 && label == std::string("Decoder")) { config->setFlag(nvinfer1::BuilderFlag::kPREFER_PRECISION_CONSTRAINTS); int numLayers = network->getNbLayers(); int markedCount = 0; for (int li = 0; li < numLayers; ++li) { auto* layer = network->getLayer(li); if (!layer) continue; std::string lname(layer->getName()); auto ltype = layer->getType(); // Force FP32 on layers involved in score thresholding / NMS: // - Comparison ops (Greater, Less, Equal) // - Sigmoid (final score activation) // - TopK, NonZero, Gather, Select (index-selection in NMS) // - Any layer whose name hints at score/nms/threshold bool needFP32 = false; // By layer type switch (ltype) { case nvinfer1::LayerType::kTOPK: case nvinfer1::LayerType::kGATHER: case nvinfer1::LayerType::kSELECT: case nvinfer1::LayerType::kNON_ZERO: case nvinfer1::LayerType::kSCATTER: needFP32 = true; break; default: break; } // By layer name (ONNX op names often preserved by parser) if (!needFP32) { // Convert to lowercase for matching std::string lower = lname; std::transform(lower.begin(), lower.end(), lower.begin(), ::tolower); if (lower.find("score") != std::string::npos || lower.find("nms") != std::string::npos || lower.find("sigmoid") != std::string::npos || lower.find("threshold") != std::string::npos || lower.find("greater") != std::string::npos || lower.find("less") != std::string::npos || lower.find("where") != std::string::npos || lower.find("nonzero") != std::string::npos || lower.find("topk") != std::string::npos) { needFP32 = true; } } // TensorRT forbids setPrecision(kFLOAT) on layers that // produce non-float types (booleans, 
indices/int32, int64). // Only force FP32 when ALL outputs are floating-point. if (needFP32) { bool allFloat = true; for (int oi = 0; oi < layer->getNbOutputs(); ++oi) { auto dt = layer->getOutputType(oi); if (dt != nvinfer1::DataType::kFLOAT && dt != nvinfer1::DataType::kHALF) { allFloat = false; break; } } if (allFloat) { layer->setPrecision(nvinfer1::DataType::kFLOAT); for (int oi = 0; oi < layer->getNbOutputs(); ++oi) layer->setOutputType(oi, nvinfer1::DataType::kFLOAT); ++markedCount; } } } std::cout << "[ANSSAM3] " << label << ": mixed precision — " << markedCount << "/" << numLayers << " layers forced to FP32 (score/NMS ops)." << std::endl; } // --- Build serialized engine --- std::cout << "[ANSSAM3] " << label << ": building engine (this may take a few minutes)..." << std::endl; unsigned long sehCode = 0; auto plan = std::unique_ptr( buildSerializedNetworkSafe(builder.get(), *network, *config, &sehCode)); if (sehCode != 0) { _logger.LogError("ANSSAM3::BuildAndLoadEngine", label + ": engine build crashed: " + formatCrashCode(sehCode), __FILE__, __LINE__); return false; } if (!plan) { _logger.LogError("ANSSAM3::BuildAndLoadEngine", label + ": buildSerializedNetwork returned null", __FILE__, __LINE__); return false; } // --- Save to disk --- std::ofstream outFile(enginePath, std::ios::binary); if (!outFile.is_open()) { _logger.LogError("ANSSAM3::BuildAndLoadEngine", label + ": cannot write engine file: " + enginePath, __FILE__, __LINE__); return false; } outFile.write(reinterpret_cast(plan->data()), plan->size()); outFile.close(); std::cout << "[ANSSAM3] " << label << ": engine saved to " << enginePath << std::endl; plan.reset(); // --- Load the just-built engine --- return LoadTRTEngineBundle(bundle, enginePath, label); } // ========================================================================= // LoadTRTEngineBundle — deserialize engine, allocate GPU buffers, bind // ========================================================================= bool 
ANSSAM3::LoadTRTEngineBundle(TRTBundle& bundle, const std::string& enginePath, const std::string& label) { // Read engine file std::ifstream file(enginePath, std::ios::binary | std::ios::ate); if (!file.is_open()) { _logger.LogError("ANSSAM3::LoadTRTEngineBundle", label + ": cannot open: " + enginePath, __FILE__, __LINE__); return false; } std::streamsize fileSize = file.tellg(); file.seekg(0, std::ios::beg); std::vector engineData(fileSize); if (!file.read(engineData.data(), fileSize)) { _logger.LogError("ANSSAM3::LoadTRTEngineBundle", label + ": read failed", __FILE__, __LINE__); return false; } file.close(); // Deserialize bundle.runtime = std::unique_ptr(nvinfer1::createInferRuntime(m_trtLogger)); if (!bundle.runtime) { _logger.LogError("ANSSAM3::LoadTRTEngineBundle", label + ": createInferRuntime failed", __FILE__, __LINE__); return false; } unsigned long sehCode = 0; bundle.engine = std::unique_ptr( deserializeCudaEngineSafe(bundle.runtime.get(), engineData.data(), engineData.size(), &sehCode)); if (sehCode != 0) { _logger.LogError("ANSSAM3::LoadTRTEngineBundle", label + ": deserialize crashed: " + formatCrashCode(sehCode), __FILE__, __LINE__); return false; } if (!bundle.engine) { _logger.LogError("ANSSAM3::LoadTRTEngineBundle", label + ": deserialize returned null", __FILE__, __LINE__); return false; } // --- Weight streaming (TRT 10+): keep only a budget of weights on GPU, // stream the rest from CPU pinned memory on demand. // Saves ~1.3 GiB VRAM for ImageEncoder (1.8 GiB weights → 512 MiB on GPU). #if NV_TENSORRT_MAJOR >= 10 { int64_t streamableBytes = bundle.engine->getStreamableWeightsSize(); if (streamableBytes > 0 && label.find("ImageEncoder") != std::string::npos) { // Budget = how much weight memory stays on GPU. // 512 MiB keeps hot layers cached; rest streamed via PCIe. 
const int64_t budgetBytes = 512LL * 1024 * 1024; int64_t actualBudget = std::min(budgetBytes, streamableBytes); bundle.engine->setWeightStreamingBudgetV2(actualBudget); std::cout << "[ANSSAM3] " << label << ": weight streaming enabled (streamable=" << (streamableBytes / (1024*1024)) << " MiB, budget=" << (actualBudget / (1024*1024)) << " MiB)" << std::endl; } } #endif bundle.context = std::unique_ptr(bundle.engine->createExecutionContext()); if (!bundle.context) { _logger.LogError("ANSSAM3::LoadTRTEngineBundle", label + ": createExecutionContext failed", __FILE__, __LINE__); return false; } // Set optimization profile int numProfiles = bundle.engine->getNbOptimizationProfiles(); if (numProfiles > 0) { bundle.context->setOptimizationProfileAsync(0, m_cudaStream); cudaStreamSynchronize(m_cudaStream); } // Allocate buffers (device for execution tensors, host for shape tensors) const int numTensors = bundle.engine->getNbIOTensors(); bundle.gpuBuffers.resize(numTensors, nullptr); bundle.gpuBufferSizes.resize(numTensors, 0); bundle.hostBufferIdx.clear(); bundle.nameToIdx.clear(); for (int i = 0; i < numTensors; ++i) { const char* name = bundle.engine->getIOTensorName(i); auto mode = bundle.engine->getTensorIOMode(name); auto shape = bundle.engine->getTensorShape(name); auto dtype = bundle.engine->getTensorDataType(name); auto loc = bundle.engine->getTensorLocation(name); bool isHost = (loc == nvinfer1::TensorLocation::kHOST); // Check if any dimension is dynamic (-1) bool hasDynamic = false; int64_t numElements = 1; for (int d = 0; d < shape.nbDims; ++d) { int64_t v = shape.d[d]; if (v <= 0) { hasDynamic = true; v = 1; } numElements *= v; } // Scalars (0-dim) still need at least 1 element if (numElements < 1) numElements = 1; // For output tensors with ANY dynamic dim, pre-allocate a generous buffer. // The decoder outputs (boxes [-1,4], scores [-1], masks [-1,-1,-1,-1]) // all have data-dependent first dimension from NonZero/NMS. 
if (mode == nvinfer1::TensorIOMode::kOUTPUT && hasDynamic) { // Pre-allocate for up to 256 detections with generous mask size numElements = 256 * 1 * 256 * 256; } size_t bufSize = numElements * DataTypeSize(dtype); bundle.gpuBufferSizes[i] = bufSize; if (isHost) { // Shape tensor — allocate host memory bundle.gpuBuffers[i] = calloc(numElements, DataTypeSize(dtype)); if (!bundle.gpuBuffers[i]) { _logger.LogError("ANSSAM3::LoadTRTEngineBundle", label + ": host alloc failed for " + std::string(name), __FILE__, __LINE__); return false; } bundle.hostBufferIdx.insert(i); } else { // Execution tensor — allocate device memory cudaError_t err = cudaMalloc(&bundle.gpuBuffers[i], bufSize); if (err != cudaSuccess) { _logger.LogError("ANSSAM3::LoadTRTEngineBundle", label + ": cudaMalloc failed for " + std::string(name) + ": " + cudaGetErrorString(err), __FILE__, __LINE__); return false; } cudaMemset(bundle.gpuBuffers[i], 0, bufSize); } // Bind tensor address (host ptr for shape tensors, device ptr for execution tensors) if (!bundle.context->setTensorAddress(name, bundle.gpuBuffers[i])) { _logger.LogError("ANSSAM3::LoadTRTEngineBundle", label + ": setTensorAddress failed for " + std::string(name), __FILE__, __LINE__); return false; } bundle.nameToIdx[std::string(name)] = i; std::cout << "[ANSSAM3] " << label << " tensor[" << i << "] '" << name << "' " << (mode == nvinfer1::TensorIOMode::kINPUT ? "INPUT" : "OUTPUT") << (isHost ? " HOST" : " DEVICE") << " dtype=" << static_cast(dtype) << " bufSize=" << bufSize << std::endl; } // Set input shapes (replace dynamic dims with concrete values) for (int i = 0; i < numTensors; ++i) { const char* name = bundle.engine->getIOTensorName(i); if (bundle.engine->getTensorIOMode(name) != nvinfer1::TensorIOMode::kINPUT) continue; // Shape tensors (scalar, host memory): TRT reads the value directly // from the host buffer. setInputShape for scalars uses Dims{0, {}}. // Write a default value (1024) into the host buffer at load time. 
if (bundle.hostBufferIdx.count(i)) { auto dtype = bundle.engine->getTensorDataType(name); if (dtype == nvinfer1::DataType::kINT64) *reinterpret_cast(bundle.gpuBuffers[i]) = 1024; else *reinterpret_cast(bundle.gpuBuffers[i]) = 1024; nvinfer1::Dims scalarDims; scalarDims.nbDims = 0; bundle.context->setInputShape(name, scalarDims); continue; } auto dims = bundle.engine->getTensorShape(name); nvinfer1::Dims inputDims = dims; for (int d = 0; d < inputDims.nbDims; ++d) { if (inputDims.d[d] == -1) inputDims.d[d] = 1; } if (!bundle.context->setInputShape(name, inputDims)) { _logger.LogError("ANSSAM3::LoadTRTEngineBundle", label + ": setInputShape failed for " + std::string(name), __FILE__, __LINE__); return false; } } std::cout << "[ANSSAM3] " << label << ": loaded successfully (" << numTensors << " tensors)." << std::endl; return true; } // ========================================================================= // EnsureEnginesBuilt — pre-build uncached engines one at a time // Avoids GPU OOM when building one engine while others are already loaded. 
// ========================================================================= bool ANSSAM3::EnsureEnginesBuilt(const std::string& imgOnnx, const std::string& langOnnx, const std::string& decOnnx) { struct Job { const std::string* onnx; const char* label; TrtPrecision prec; }; Job jobs[] = { {&langOnnx, "LangEncoder", TrtPrecision::FP16}, // FP16 — verified identical to FP32 {&decOnnx, "Decoder", TrtPrecision::FP16}, // FP16 decoder {&imgOnnx, "ImageEncoder", TrtPrecision::FP32}, // FP32 — FP16/BF16 both corrupt backbone FPN }; for (auto& j : jobs) { if (!FileExist(EngineFileName(*j.onnx, j.prec))) { TRTBundle tmp; if (!BuildAndLoadEngine(tmp, *j.onnx, j.label, j.prec)) { _logger.LogError("ANSSAM3::EnsureEnginesBuilt", std::string("Failed to pre-build engine: ") + j.label, __FILE__, __LINE__); tmp.destroy(); return false; } tmp.destroy(); // free GPU memory before next build } } return true; } // ========================================================================= // OptimizeModel // ========================================================================= bool ANSSAM3::OptimizeModel(bool fp16, std::string& optimizedModelFolder) { std::lock_guard lock(_mutex); if (!ANSODBase::OptimizeModel(fp16, optimizedModelFolder)) return false; _fp16 = fp16; optimizedModelFolder = _modelFolder; std::string imgOnnx = CreateFilePath(_modelFolder, "sam3_image_encoder.onnx"); std::string langOnnx = CreateFilePath(_modelFolder, "sam3_language_encoder.onnx"); std::string decOnnx = CreateFilePath(_modelFolder, "sam3_decoder.onnx"); cudaStreamCreateWithFlags(&m_cudaStream, cudaStreamNonBlocking); // Build engines one at a time, destroying each to free GPU memory TRTBundle tmp; bool ok = true; ok = BuildAndLoadEngine(tmp, imgOnnx, "ImageEncoder", TrtPrecision::FP32); tmp.destroy(); if (ok) { ok = BuildAndLoadEngine(tmp, langOnnx, "LangEncoder", TrtPrecision::FP16); tmp.destroy(); } if (ok) { ok = BuildAndLoadEngine(tmp, decOnnx, "Decoder", TrtPrecision::FP16); tmp.destroy(); } if 
(m_cudaStream) { cudaStreamDestroy(m_cudaStream); m_cudaStream = nullptr; } return ok; } // ========================================================================= // Initialize // ========================================================================= bool ANSSAM3::Initialize(std::string licenseKey, ModelConfig modelConfig, const std::string& modelZipFilePath, const std::string& modelZipPassword, std::string& labelMap) { std::lock_guard lock(_mutex); try { bool result = ANSODBase::Initialize(licenseKey, modelConfig, modelZipFilePath, modelZipPassword, labelMap); if (!result) return false; _modelConfig.detectionType = DetectionType::SEGMENTATION; if (_modelConfig.modelConfThreshold < 0.1f) _modelConfig.modelConfThreshold = 0.5f; m_segThreshold = _modelConfig.modelConfThreshold; _fp16 = true; // Create CUDA stream cudaSetDevice(_modelConfig.gpuDeviceIndex); cudaStreamCreateWithFlags(&m_cudaStream, cudaStreamNonBlocking); // Build/load TRT engines for image + language encoders std::string imgOnnx = CreateFilePath(_modelFolder, "sam3_image_encoder.onnx"); std::string langOnnx = CreateFilePath(_modelFolder, "sam3_language_encoder.onnx"); std::string decOnnx = CreateFilePath(_modelFolder, "sam3_decoder.onnx"); // Pre-build uncached TRT engines (avoids GPU OOM during build) if (!EnsureEnginesBuilt(imgOnnx, langOnnx, decOnnx)) { _modelLoadValid = false; return false; } if (!BuildAndLoadEngine(m_imgEncoder, imgOnnx, "ImageEncoder", TrtPrecision::FP32) || !BuildAndLoadEngine(m_langEncoder, langOnnx, "LangEncoder", TrtPrecision::FP16) || !BuildAndLoadEngine(m_decoder, decOnnx, "Decoder", TrtPrecision::FP16)) { _logger.LogError("ANSSAM3::Initialize", "Failed to build/load TRT engines", __FILE__, __LINE__); _modelLoadValid = false; return false; } _modelLoadValid = true; _isInitialized = true; // Load tokenizer m_tokenizer = std::make_unique(); std::string tokenizerPath = CreateFilePath(_modelFolder, "merges.txt"); if (FileExist(tokenizerPath)) { 
m_tokenizer->Load(tokenizerPath); _logger.LogDebug("ANSSAM3::Initialize", "CLIP tokenizer loaded", __FILE__, __LINE__); } return true; } catch (const std::exception& e) { _logger.LogFatal("ANSSAM3::Initialize", e.what(), __FILE__, __LINE__); return false; } } // ========================================================================= // LoadModel // ========================================================================= bool ANSSAM3::LoadModel(const std::string& modelZipFilePath, const std::string& modelZipPassword) { std::lock_guard lock(_mutex); try { bool result = ANSODBase::LoadModel(modelZipFilePath, modelZipPassword); if (!result) return false; _modelConfig.detectionType = DetectionType::SEGMENTATION; if (_modelConfig.modelConfThreshold < 0.1f) _modelConfig.modelConfThreshold = 0.5f; m_segThreshold = _modelConfig.modelConfThreshold; _fp16 = true; cudaSetDevice(_modelConfig.gpuDeviceIndex); cudaStreamCreateWithFlags(&m_cudaStream, cudaStreamNonBlocking); std::string imgOnnx = CreateFilePath(_modelFolder, "sam3_image_encoder.onnx"); std::string langOnnx = CreateFilePath(_modelFolder, "sam3_language_encoder.onnx"); std::string decOnnx = CreateFilePath(_modelFolder, "sam3_decoder.onnx"); // Pre-build uncached TRT engines (avoids GPU OOM during build) if (!EnsureEnginesBuilt(imgOnnx, langOnnx, decOnnx)) { _modelLoadValid = false; return false; } if (!BuildAndLoadEngine(m_imgEncoder, imgOnnx, "ImageEncoder", TrtPrecision::FP32) || !BuildAndLoadEngine(m_langEncoder, langOnnx, "LangEncoder", TrtPrecision::FP16) || !BuildAndLoadEngine(m_decoder, decOnnx, "Decoder", TrtPrecision::FP16)) { _logger.LogError("ANSSAM3::LoadModel", "Failed to build/load TRT engines", __FILE__, __LINE__); _modelLoadValid = false; return false; } _modelLoadValid = true; _isInitialized = true; m_tokenizer = std::make_unique(); std::string tokenizerPath = CreateFilePath(_modelFolder, "merges.txt"); if (FileExist(tokenizerPath)) { m_tokenizer->Load(tokenizerPath); } return true; } catch 
(const std::exception& e) { _logger.LogFatal("ANSSAM3::LoadModel", e.what(), __FILE__, __LINE__); return false; } } // ========================================================================= // LoadModelFromFolder // ========================================================================= bool ANSSAM3::LoadModelFromFolder(std::string licenseKey, ModelConfig modelConfig, std::string modelName, std::string className, const std::string& modelFolder, std::string& labelMap) { std::lock_guard lock(_mutex); try { bool result = ANSODBase::LoadModelFromFolder(licenseKey, modelConfig, modelName, className, modelFolder, labelMap); if (!result) return false; _modelConfig = modelConfig; _modelConfig.detectionType = DetectionType::SEGMENTATION; if (_modelConfig.modelConfThreshold < 0.1f) _modelConfig.modelConfThreshold = 0.5f; m_segThreshold = _modelConfig.modelConfThreshold; _fp16 = true; cudaSetDevice(_modelConfig.gpuDeviceIndex); cudaStreamCreateWithFlags(&m_cudaStream, cudaStreamNonBlocking); std::string imgOnnx = CreateFilePath(modelFolder, "sam3_image_encoder.onnx"); std::string langOnnx = CreateFilePath(modelFolder, "sam3_language_encoder.onnx"); std::string decOnnx = CreateFilePath(modelFolder, "sam3_decoder.onnx"); // Pre-build uncached TRT engines (avoids GPU OOM during build) if (!EnsureEnginesBuilt(imgOnnx, langOnnx, decOnnx)) { _modelLoadValid = false; return false; } if (!BuildAndLoadEngine(m_imgEncoder, imgOnnx, "ImageEncoder", TrtPrecision::FP32) || !BuildAndLoadEngine(m_langEncoder, langOnnx, "LangEncoder", TrtPrecision::FP16) || !BuildAndLoadEngine(m_decoder, decOnnx, "Decoder", TrtPrecision::FP16)) { _logger.LogError("ANSSAM3::LoadModelFromFolder", "Failed to build/load TRT engines", __FILE__, __LINE__); _modelLoadValid = false; return false; } _modelLoadValid = true; _isInitialized = true; m_tokenizer = std::make_unique(); std::string tokenizerPath = CreateFilePath(modelFolder, "merges.txt"); if (FileExist(tokenizerPath)) { 
m_tokenizer->Load(tokenizerPath); _logger.LogDebug("ANSSAM3::LoadModelFromFolder", "CLIP tokenizer loaded", __FILE__, __LINE__); } return true; } catch (const std::exception& e) { _logger.LogFatal("ANSSAM3::LoadModelFromFolder", e.what(), __FILE__, __LINE__); return false; } } // ========================================================================= // SetPrompt — run language encoder, cache outputs on GPU // ========================================================================= bool ANSSAM3::SetPrompt(const std::vector& inputIds, const std::vector& attentionMask) { std::lock_guard lock(_mutex); if (!m_langEncoder.context) { _logger.LogError("ANSSAM3::SetPrompt", "Language encoder not loaded", __FILE__, __LINE__); return false; } // Language encoder input: "tokens" [1, 32] int64 // Find the tokens input tensor auto it = m_langEncoder.nameToIdx.find("tokens"); if (it == m_langEncoder.nameToIdx.end()) { // Try first input const char* firstName = m_langEncoder.engine->getIOTensorName(0); it = m_langEncoder.nameToIdx.find(firstName); } if (it == m_langEncoder.nameToIdx.end()) { _logger.LogError("ANSSAM3::SetPrompt", "Cannot find tokens input tensor", __FILE__, __LINE__); return false; } int tokIdx = it->second; const char* tokName = m_langEncoder.engine->getIOTensorName(tokIdx); auto tokDtype = m_langEncoder.engine->getTensorDataType(tokName); // Upload tokens — handle int64 vs int32 data type if (tokDtype == nvinfer1::DataType::kINT64) { cudaMemcpyAsync(m_langEncoder.gpuBuffers[tokIdx], inputIds.data(), inputIds.size() * sizeof(int64_t), cudaMemcpyHostToDevice, m_cudaStream); } else if (tokDtype == nvinfer1::DataType::kINT32) { // TRT may have converted int64 to int32 at build time std::vector tokens32(inputIds.size()); for (size_t i = 0; i < inputIds.size(); ++i) tokens32[i] = static_cast(inputIds[i]); cudaMemcpyAsync(m_langEncoder.gpuBuffers[tokIdx], tokens32.data(), tokens32.size() * sizeof(int32_t), cudaMemcpyHostToDevice, m_cudaStream); } // Set input shape 
nvinfer1::Dims tokenDims; tokenDims.nbDims = 2; tokenDims.d[0] = 1; tokenDims.d[1] = static_cast(inputIds.size()); m_langEncoder.context->setInputShape(tokName, tokenDims); // Run language encoder #if NV_TENSORRT_MAJOR >= 10 bool ok = m_langEncoder.context->enqueueV3(m_cudaStream); #else bool ok = m_langEncoder.context->enqueueV2( reinterpret_cast(m_langEncoder.gpuBuffers.data()), m_cudaStream, nullptr); #endif if (!ok) { _logger.LogError("ANSSAM3::SetPrompt", "Language encoder enqueue failed", __FILE__, __LINE__); return false; } cudaStreamSynchronize(m_cudaStream); // Find language encoder outputs: text_attention_mask and text_memory // output[0]: text_attention_mask [1, 32] bool → cached as m_cachedLangMask // output[1]: text_memory [32, 1, 256] float32 → cached as m_cachedLangFeats // output[2]: text_embeds [32, 1, 1024] float32 → NOT used int maskOutIdx = -1, featsOutIdx = -1; const int numTensors = m_langEncoder.engine->getNbIOTensors(); for (int i = 0; i < numTensors; ++i) { const char* name = m_langEncoder.engine->getIOTensorName(i); if (m_langEncoder.engine->getTensorIOMode(name) != nvinfer1::TensorIOMode::kOUTPUT) continue; std::string sname(name); if (sname.find("attention_mask") != std::string::npos || sname.find("text_attention") != std::string::npos) { maskOutIdx = i; } else if (sname.find("text_memory") != std::string::npos || sname.find("memory") != std::string::npos) { featsOutIdx = i; } } // Fallback: first 2 outputs in order if (maskOutIdx < 0 || featsOutIdx < 0) { int outCount = 0; for (int i = 0; i < numTensors; ++i) { const char* name = m_langEncoder.engine->getIOTensorName(i); if (m_langEncoder.engine->getTensorIOMode(name) != nvinfer1::TensorIOMode::kOUTPUT) continue; if (outCount == 0 && maskOutIdx < 0) maskOutIdx = i; else if (outCount == 1 && featsOutIdx < 0) featsOutIdx = i; outCount++; } } // Cache mask on GPU if (maskOutIdx >= 0) { size_t bytes = m_langEncoder.gpuBufferSizes[maskOutIdx]; if (m_cachedLangMask && m_cachedLangMaskBytes < 
bytes) { cudaFree(m_cachedLangMask); m_cachedLangMask = nullptr; } if (!m_cachedLangMask) { cudaMalloc(&m_cachedLangMask, bytes); } m_cachedLangMaskBytes = bytes; cudaMemcpyAsync(m_cachedLangMask, m_langEncoder.gpuBuffers[maskOutIdx], bytes, cudaMemcpyDeviceToDevice, m_cudaStream); } // Cache features on GPU if (featsOutIdx >= 0) { size_t bytes = m_langEncoder.gpuBufferSizes[featsOutIdx]; if (m_cachedLangFeats && m_cachedLangFeatsBytes < bytes) { cudaFree(m_cachedLangFeats); m_cachedLangFeats = nullptr; } if (!m_cachedLangFeats) { cudaMalloc(&m_cachedLangFeats, bytes); } m_cachedLangFeatsBytes = bytes; cudaMemcpyAsync(m_cachedLangFeats, m_langEncoder.gpuBuffers[featsOutIdx], bytes, cudaMemcpyDeviceToDevice, m_cudaStream); } cudaStreamSynchronize(m_cudaStream); m_promptSet = true; return true; } bool ANSSAM3::SetPrompt(const std::string& text) { std::lock_guard lock(_mutex); if (!m_tokenizer || !m_tokenizer->IsLoaded()) { _logger.LogError("ANSSAM3::SetPrompt", "Tokenizer not loaded. Place merges.txt in model folder.", __FILE__, __LINE__); return false; } auto result = m_tokenizer->Tokenize(text, m_tokenLength); SetPrompt(result.inputIds, result.attentionMask); return true; } // ========================================================================= // RunInference // ========================================================================= std::vector ANSSAM3::RunInference(const cv::Mat& input) { return RunInference(input, ""); } std::vector ANSSAM3::RunInference(const cv::Mat& input, const std::string& camera_id) { { std::lock_guard lock(_mutex); if (!_modelLoadValid || !_isInitialized) return {}; if (!m_promptSet) { _logger.LogError("ANSSAM3::RunInference", "No prompt set", __FILE__, __LINE__); return {}; } if (input.empty() || input.cols < 10 || input.rows < 10) return {}; } try { return Detect(input, camera_id); } catch (const std::exception& e) { _logger.LogFatal("ANSSAM3::RunInference", e.what(), __FILE__, __LINE__); return {}; } } // 
========================================================================= // Detect — image encoder + decoder pipeline // ========================================================================= std::vector ANSSAM3::Detect(const cv::Mat& input, const std::string& camera_id) { if (!_modelLoadValid || !m_imgEncoder.context || !m_cudaStream) { return {}; } const int origW = input.cols; const int origH = input.rows; // ---- 1) Find image encoder input tensor and determine dtype ---- auto imgIt = m_imgEncoder.nameToIdx.find("image"); if (imgIt == m_imgEncoder.nameToIdx.end()) { const char* firstName = m_imgEncoder.engine->getIOTensorName(0); imgIt = m_imgEncoder.nameToIdx.find(firstName); } if (imgIt == m_imgEncoder.nameToIdx.end()) { _logger.LogError("ANSSAM3::Detect", "Cannot find image input tensor", __FILE__, __LINE__); return {}; } int imgInputIdx = imgIt->second; const char* imgInputName = m_imgEncoder.engine->getIOTensorName(imgInputIdx); auto imgDtype = m_imgEncoder.engine->getTensorDataType(imgInputName); bool isUint8Input = (imgDtype == nvinfer1::DataType::kINT8 || imgDtype == nvinfer1::DataType::kBOOL); #if NV_TENSORRT_MAJOR >= 10 isUint8Input = isUint8Input || (imgDtype == nvinfer1::DataType::kUINT8); #endif // ---- 1b) Try NV12 fast path — fused NV12→RGB resize CHW directly into TRT buffer ---- bool usedNV12 = false; { auto nv12 = m_nv12Helper.tryNV12DirectToBuffer( input, 0 /*inferenceGpu*/, m_imgEncoder.gpuBuffers[imgInputIdx], m_inputSize, m_inputSize, !isUint8Input, // float32 if not uint8 m_cudaStream, _logger, "ANSSAM3"); usedNV12 = nv12.succeeded; m_nv12Helper.tickInference(); } // ---- 1c) CPU fallback: BGR → RGB, resize to 1008, HWC→CHW, upload ---- if (!usedNV12) { cv::Mat resized; cv::resize(input, resized, cv::Size(m_inputSize, m_inputSize)); cv::Mat rgb; cv::cvtColor(resized, rgb, cv::COLOR_BGR2RGB); const size_t planeSize = static_cast(m_inputSize) * m_inputSize; std::vector imgBuffer(3 * planeSize); cv::Mat channels[3]; cv::split(rgb, 
channels);
        // Pack HWC→CHW: copy each colour plane contiguously into imgBuffer.
        for (int c = 0; c < 3; ++c)
            std::memcpy(imgBuffer.data() + c * planeSize, channels[c].data, planeSize);
        if (isUint8Input) {
            // 8-bit engine input — upload the packed buffer as-is.
            // NOTE(review): cudaMemcpyAsync from a pageable host buffer that leaves
            // scope at the end of this block; relies on pageable H2D staging being
            // effectively synchronous — confirm, or sync/pin here if it misbehaves.
            cudaMemcpyAsync(m_imgEncoder.gpuBuffers[imgInputIdx], imgBuffer.data(),
                            imgBuffer.size(), cudaMemcpyHostToDevice, m_cudaStream);
        } else {
            // Float engine input — widen each byte (no scaling/normalisation here).
            std::vector imgFloat(imgBuffer.size());
            for (size_t i = 0; i < imgBuffer.size(); ++i) imgFloat[i] = static_cast(imgBuffer[i]);
            cudaMemcpyAsync(m_imgEncoder.gpuBuffers[imgInputIdx], imgFloat.data(),
                            imgFloat.size() * sizeof(float), cudaMemcpyHostToDevice, m_cudaStream);
        }
    }
    // Set image input shape — CHW, no batch dimension.
    nvinfer1::Dims imgDims;
    imgDims.nbDims = 3;
    imgDims.d[0] = 3;
    imgDims.d[1] = m_inputSize;
    imgDims.d[2] = m_inputSize;
    m_imgEncoder.context->setInputShape(imgInputName, imgDims);
    // ---- 2) Run image encoder ----
    // Check for prior CUDA errors (e.g. OOM from memcpy) before enqueue
    {
        cudaError_t preErr = cudaGetLastError();
        if (preErr != cudaSuccess) {
            _logger.LogError("ANSSAM3::Detect",
                std::string("CUDA error before enqueue: ") + cudaGetErrorString(preErr),
                __FILE__, __LINE__);
            return {};
        }
    }
#if NV_TENSORRT_MAJOR >= 10
    bool okImg = m_imgEncoder.context->enqueueV3(m_cudaStream);
#else
    bool okImg = m_imgEncoder.context->enqueueV2(
        reinterpret_cast(m_imgEncoder.gpuBuffers.data()), m_cudaStream, nullptr);
#endif
    if (!okImg) {
        // Include the pending CUDA error (if any) in the log message.
        cudaError_t postErr = cudaGetLastError();
        _logger.LogError("ANSSAM3::Detect",
            std::string("Image encoder enqueue failed") + (postErr != cudaSuccess ?
std::string(": ") + cudaGetErrorString(postErr) : ""),
            __FILE__, __LINE__);
        return {};
    }
    cudaStreamSynchronize(m_cudaStream);
    // ---- 3) Feed encoder outputs into TRT decoder (zero-copy via setTensorAddress) ----
    // Helper: wire the image-encoder output named tensorName straight into the
    // decoder input of the same name (aliases the buffer; no copy is made).
    auto feedImgToDec = [&](const std::string& tensorName) {
        auto srcIt = m_imgEncoder.nameToIdx.find(tensorName);
        auto dstIt = m_decoder.nameToIdx.find(tensorName);
        if (srcIt == m_imgEncoder.nameToIdx.end() || dstIt == m_decoder.nameToIdx.end()) return;
        int srcIdx = srcIt->second;
        int dstIdx = dstIt->second;
        // Point decoder input directly at encoder output buffer (true zero-copy)
        const char* dstName = m_decoder.engine->getIOTensorName(dstIdx);
        m_decoder.context->setTensorAddress(dstName, m_imgEncoder.gpuBuffers[srcIdx]);
        // Set input shape from encoder's actual output shape
        const char* srcName = m_imgEncoder.engine->getIOTensorName(srcIdx);
        auto shape = m_imgEncoder.context->getTensorShape(srcName);
        m_decoder.context->setInputShape(dstName, shape);
    };
    feedImgToDec("vision_pos_enc_2");
    feedImgToDec("backbone_fpn_0");
    feedImgToDec("backbone_fpn_1");
    feedImgToDec("backbone_fpn_2");
    // Language features — point decoder inputs at the GPU buffers cached by SetPrompt
    {
        auto it = m_decoder.nameToIdx.find("language_mask");
        if (it != m_decoder.nameToIdx.end()) {
            const char* name = m_decoder.engine->getIOTensorName(it->second);
            m_decoder.context->setTensorAddress(name, m_cachedLangMask);
            // [1, token_length]
            nvinfer1::Dims d; d.nbDims = 2; d.d[0] = 1; d.d[1] = m_tokenLength;
            m_decoder.context->setInputShape(name, d);
        }
    }
    {
        auto it = m_decoder.nameToIdx.find("language_features");
        if (it != m_decoder.nameToIdx.end()) {
            const char* name = m_decoder.engine->getIOTensorName(it->second);
            m_decoder.context->setTensorAddress(name, m_cachedLangFeats);
            // [token_length, 1, 256]
            nvinfer1::Dims d; d.nbDims = 3; d.d[0] = m_tokenLength; d.d[1] = 1; d.d[2] = 256;
            m_decoder.context->setInputShape(name, d);
        }
    }
    // Scalar inputs (original_height, original_width) — host-memory shape tensors;
    // these gpuBuffers entries are host-allocated (see TRTBundle::destroy), so they
    // are written through plain pointers, not cudaMemcpy.
    {
        auto it = m_decoder.nameToIdx.find("original_height");
        if (it != m_decoder.nameToIdx.end()) {
            int idx = it->second;
            auto dtype = m_decoder.engine->getTensorDataType(m_decoder.engine->getIOTensorName(idx));
            if (dtype == nvinfer1::DataType::kINT64)
                *reinterpret_cast(m_decoder.gpuBuffers[idx]) = static_cast(origH);
            else
                *reinterpret_cast(m_decoder.gpuBuffers[idx]) = origH;
        }
    }
    {
        auto it = m_decoder.nameToIdx.find("original_width");
        if (it != m_decoder.nameToIdx.end()) {
            int idx = it->second;
            auto dtype = m_decoder.engine->getTensorDataType(m_decoder.engine->getIOTensorName(idx));
            if (dtype == nvinfer1::DataType::kINT64)
                *reinterpret_cast(m_decoder.gpuBuffers[idx]) = static_cast(origW);
            else
                *reinterpret_cast(m_decoder.gpuBuffers[idx]) = origW;
        }
    }
    // Prompt inputs: box_coords [1,1,4], box_labels [1,1], box_masks [1,1]
    // Text-prompt mode: upload a zero box with label -1 and mask flag 0.
    {
        auto it = m_decoder.nameToIdx.find("box_coords");
        if (it != m_decoder.nameToIdx.end()) {
            int idx = it->second;
            float boxCoords[4] = { 0.f, 0.f, 0.f, 0.f };
            cudaMemcpyAsync(m_decoder.gpuBuffers[idx], boxCoords, sizeof(boxCoords), cudaMemcpyHostToDevice, m_cudaStream);
            const char* name = m_decoder.engine->getIOTensorName(idx);
            nvinfer1::Dims d; d.nbDims = 3; d.d[0] = 1; d.d[1] = 1; d.d[2] = 4;
            m_decoder.context->setInputShape(name, d);
        }
    }
    {
        auto it = m_decoder.nameToIdx.find("box_labels");
        if (it != m_decoder.nameToIdx.end()) {
            int idx = it->second;
            const char* name = m_decoder.engine->getIOTensorName(idx);
            auto dtype = m_decoder.engine->getTensorDataType(name);
            // NOTE(review): async H2D copy from a stack-local scalar about to leave
            // scope — relies on pageable-copy staging behaviour; confirm.
            if (dtype == nvinfer1::DataType::kINT64) {
                int64_t val = -1;
                cudaMemcpyAsync(m_decoder.gpuBuffers[idx], &val, sizeof(val), cudaMemcpyHostToDevice, m_cudaStream);
            } else {
                int32_t val = -1;
                cudaMemcpyAsync(m_decoder.gpuBuffers[idx], &val, sizeof(val), cudaMemcpyHostToDevice, m_cudaStream);
            }
            nvinfer1::Dims d; d.nbDims = 2; d.d[0] = 1; d.d[1] = 1;
            m_decoder.context->setInputShape(name, d);
        }
    }
    {
        auto it = m_decoder.nameToIdx.find("box_masks");
        if (it != m_decoder.nameToIdx.end()) {
            int idx = it->second;
            const char* name = m_decoder.engine->getIOTensorName(idx);
            auto dtype = m_decoder.engine->getTensorDataType(name);
            if (dtype == nvinfer1::DataType::kBOOL) {
                bool val = false;
                cudaMemcpyAsync(m_decoder.gpuBuffers[idx], &val, sizeof(val), cudaMemcpyHostToDevice, m_cudaStream);
            } else {
                int32_t val = 0;
                cudaMemcpyAsync(m_decoder.gpuBuffers[idx], &val, sizeof(val), cudaMemcpyHostToDevice, m_cudaStream);
            }
            nvinfer1::Dims d; d.nbDims = 2; d.d[0] = 1; d.d[1] = 1;
            m_decoder.context->setInputShape(name, d);
        }
    }
    // ---- 4) Run TRT decoder ----
    // Find output tensor indices for boxes/scores/masks by name substring.
    std::string boxesTName, scoresTName, masksTName;
    int boxesIdx = -1, scoresIdx = -1, masksIdx = -1;
    for (auto& kv : m_decoder.nameToIdx) {
        auto mode = m_decoder.engine->getTensorIOMode(kv.first.c_str());
        if (mode != nvinfer1::TensorIOMode::kOUTPUT) continue;
        // "box" but not "mask" → boxes (avoids matching "box_masks"-style names)
        if (kv.first.find("box") != std::string::npos && kv.first.find("mask") == std::string::npos) {
            boxesTName = kv.first; boxesIdx = kv.second;
        } else if (kv.first.find("score") != std::string::npos) {
            scoresTName = kv.first; scoresIdx = kv.second;
        } else if (kv.first.find("mask") != std::string::npos) {
            masksTName = kv.first; masksIdx = kv.second;
        }
    }
    if (boxesIdx < 0 || scoresIdx < 0 || masksIdx < 0) {
        _logger.LogError("ANSSAM3::Detect", "Cannot find decoder output tensors", __FILE__, __LINE__);
        return {};
    }
#if NV_TENSORRT_MAJOR >= 10
    // Stack-local allocators that return the pre-allocated gpuBuffers.
    // Registering these enables getTensorShape() to return actual
    // (not -1) dimensions for data-dependent outputs after enqueueV3.
PassthroughOutputAllocator boxAlloc (m_decoder.gpuBuffers[boxesIdx], m_decoder.gpuBufferSizes[boxesIdx]);
    PassthroughOutputAllocator scoreAlloc(m_decoder.gpuBuffers[scoresIdx], m_decoder.gpuBufferSizes[scoresIdx]);
    PassthroughOutputAllocator maskAlloc (m_decoder.gpuBuffers[masksIdx], m_decoder.gpuBufferSizes[masksIdx]);
    // Allocators outlive the enqueue + synchronize below, so stack lifetime is safe.
    m_decoder.context->setOutputAllocator(boxesTName.c_str(), &boxAlloc);
    m_decoder.context->setOutputAllocator(scoresTName.c_str(), &scoreAlloc);
    m_decoder.context->setOutputAllocator(masksTName.c_str(), &maskAlloc);
#endif
#if NV_TENSORRT_MAJOR >= 10
    bool okDec = m_decoder.context->enqueueV3(m_cudaStream);
#else
    bool okDec = m_decoder.context->enqueueV2(
        reinterpret_cast(m_decoder.gpuBuffers.data()), m_cudaStream, nullptr);
#endif
    cudaStreamSynchronize(m_cudaStream);
    if (!okDec) {
        _logger.LogError("ANSSAM3::Detect", "Decoder enqueue failed", __FILE__, __LINE__);
        return {};
    }
    // ---- 5) Parse TRT decoder outputs: boxes [N,4], scores [N], masks [N,1,H,W] ----
    int numBoxes = 0, maskH = 0, maskW = 0;
    void* boxesGpu = nullptr;
    void* scoresGpu = nullptr;
    void* masksGpu = nullptr;
#if NV_TENSORRT_MAJOR >= 10
    // Read actual shapes from notifyShape() callback
    if (boxAlloc.shapeKnown) {
        auto& s = boxAlloc.actualDims;
        numBoxes = (s.nbDims >= 1) ? static_cast(s.d[0]) : 0;
        boxesGpu = boxAlloc.preAllocBuf;
    }
    if (scoreAlloc.shapeKnown) { scoresGpu = scoreAlloc.preAllocBuf; }
    if (maskAlloc.shapeKnown) {
        auto& s = maskAlloc.actualDims;
        maskH = (s.nbDims >= 3) ? static_cast(s.d[2]) : 0;
        maskW = (s.nbDims >= 4) ? static_cast(s.d[3]) : 0;
        masksGpu = maskAlloc.preAllocBuf;
    }
#else
    {
        // Pre-TRT10: query shapes directly from the execution context.
        auto boxShape = m_decoder.context->getTensorShape(boxesTName.c_str());
        auto maskShape = m_decoder.context->getTensorShape(masksTName.c_str());
        numBoxes = (boxShape.nbDims >= 1) ? static_cast(boxShape.d[0]) : 0;
        maskH = (maskShape.nbDims >= 3) ? static_cast(maskShape.d[2]) : 0;
        maskW = (maskShape.nbDims >= 4) ? static_cast(maskShape.d[3]) : 0;
        boxesGpu = m_decoder.gpuBuffers[boxesIdx];
        scoresGpu = m_decoder.gpuBuffers[scoresIdx];
        masksGpu = m_decoder.gpuBuffers[masksIdx];
    }
#endif
    if (numBoxes <= 0 || !boxesGpu || !scoresGpu) return {};
    // Download decoder outputs from GPU → CPU for postprocessing
    size_t boxesBytes = static_cast(numBoxes) * 4 * sizeof(float);
    size_t scoresBytes = static_cast(numBoxes) * sizeof(float);
    size_t masksBytes = static_cast(numBoxes) * 1 * maskH * maskW * sizeof(bool);
    std::vector boxesCpu(numBoxes * 4);
    std::vector scoresCpu(numBoxes);
    cudaMemcpy(boxesCpu.data(), boxesGpu, boxesBytes, cudaMemcpyDeviceToHost);
    cudaMemcpy(scoresCpu.data(), scoresGpu, scoresBytes, cudaMemcpyDeviceToHost);
    // Masks may be bool or float depending on TRT's internal optimization.
    // Download to a raw buffer and convert to bool (0/1 bytes).
    auto maskDtype = m_decoder.engine->getTensorDataType(masksTName.c_str());
    std::vector masksRaw(masksBytes);
    if (masksGpu && maskH > 0 && maskW > 0) {
        if (maskDtype == nvinfer1::DataType::kBOOL) {
            cudaMemcpy(masksRaw.data(), masksGpu, masksBytes, cudaMemcpyDeviceToHost);
        } else {
            // Float masks — download and threshold against m_segThreshold
            size_t floatBytes = static_cast(numBoxes) * 1 * maskH * maskW * sizeof(float);
            std::vector masksFloat(static_cast(numBoxes) * maskH * maskW);
            cudaMemcpy(masksFloat.data(), masksGpu, floatBytes, cudaMemcpyDeviceToHost);
            for (size_t j = 0; j < masksFloat.size(); ++j) masksRaw[j] = masksFloat[j] > m_segThreshold ?
1 : 0;
        }
    }
    // Convert raw 0/1 byte buffer to a bool pointer for PostprocessInstances
    const bool* masksData = reinterpret_cast(masksRaw.data());
    auto ret = PostprocessInstances(boxesCpu.data(), numBoxes, scoresCpu.data(),
                                    masksData, maskH, maskW, origW, origH, camera_id);
    // Optional tracking/stabilization stages applied on top of raw detections.
    if (_trackerEnabled) {
        ret = ApplyTracking(ret, camera_id);
        if (_stabilizationEnabled) ret = StabilizeDetections(ret, camera_id);
    }
    return ret;
}

// =========================================================================
// PostprocessInstances — same logic as ONNXSAM3::postprocessResults
//
// Converts raw decoder outputs into Object instances:
//   boxesData  — [numBoxes, 4] boxes as x1,y1,x2,y2 in original image coords
//   scoresData — [numBoxes] confidence scores (filtered by m_segThreshold)
//   masksData  — [numBoxes, 1, maskH, maskW] boolean masks
// Each kept instance gets a clamped box, a per-box cropped 0/255 mask, and a
// normalized polygon (mask-derived, falling back to the box corners).
// =========================================================================
std::vector ANSSAM3::PostprocessInstances(
    const float* boxesData, int numBoxes, const float* scoresData,
    const bool* masksData, int maskH, int maskW,
    int origWidth, int origHeight, const std::string& camera_id) {
    std::vector results;
    for (int i = 0; i < numBoxes; ++i) {
        float score = scoresData[i];
        if (score < m_segThreshold) continue;
        // Box: [x1, y1, x2, y2] in original image coordinates, clamped to bounds
        float x1 = std::max(0.0f, std::min(boxesData[i * 4 + 0], static_cast(origWidth)));
        float y1 = std::max(0.0f, std::min(boxesData[i * 4 + 1], static_cast(origHeight)));
        float x2 = std::max(0.0f, std::min(boxesData[i * 4 + 2], static_cast(origWidth)));
        float y2 = std::max(0.0f, std::min(boxesData[i * 4 + 3], static_cast(origHeight)));
        cv::Rect box(static_cast(x1), static_cast(y1),
                     static_cast(x2 - x1), static_cast(y2 - y1));
        if (box.width <= 0 || box.height <= 0) continue;
        // Extract this instance's mask: [1, H, W] at index i.
        // Mask bytes arrive as 0/1 (bool output, or float thresholded in Detect)
        // — expand to a 0/255 uint8 image for OpenCV processing.
        cv::Mat boolMask(maskH, maskW, CV_8UC1);
        size_t maskOffset = static_cast(i) * 1 * maskH * maskW;
        const bool* src = masksData + maskOffset;
        for (int y = 0; y < maskH; ++y)
            for (int x = 0; x < maskW; ++x)
                boolMask.at(y, x) = src[y * maskW + x] ? 255 : 0;
        // Resize mask to original resolution, then re-binarize after the
        // linear interpolation introduced intermediate grey values.
        cv::Mat fullMask;
        cv::resize(boolMask, fullMask, cv::Size(origWidth, origHeight), 0, 0, cv::INTER_LINEAR);
        cv::threshold(fullMask, fullMask, 127, 255, cv::THRESH_BINARY);
        // Crop to bounding box
        cv::Mat roiMask = fullMask(box).clone();
        Object obj;
        obj.box = box;
        obj.confidence = score;
        obj.classId = 0;
        obj.className = "object";
        obj.cameraId = camera_id;
        obj.mask = roiMask;
        // Create normalized polygon from mask (closed, maxPoints-limited)
        obj.polygon = ANSUtilityHelper::MaskToNormalizedPolygon(
            roiMask, box, static_cast(origWidth), static_cast(origHeight));
        // Fallback: normalized box corners if mask polygon failed
        if (obj.polygon.empty()) {
            obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(
                box, static_cast(origWidth), static_cast(origHeight));
        }
        results.push_back(std::move(obj));
    }
    return results;
}

// =========================================================================
// Destroy — release all engine bundles, cached GPU buffers, the CUDA stream
// and the tokenizer, then reset state flags. Returns false only if cleanup
// throws. Safe to call more than once (pointers are nulled after release).
// =========================================================================
bool ANSSAM3::Destroy() {
    std::lock_guard lock(_mutex);
    try {
        // TRT engine bundles
        m_imgEncoder.destroy();
        m_langEncoder.destroy();
        m_decoder.destroy();
        // Cached language encoder GPU buffers
        if (m_cachedLangMask) { cudaFree(m_cachedLangMask); m_cachedLangMask = nullptr; }
        m_cachedLangMaskBytes = 0;
        if (m_cachedLangFeats) { cudaFree(m_cachedLangFeats); m_cachedLangFeats = nullptr; }
        m_cachedLangFeatsBytes = 0;
        if (m_cudaStream) { cudaStreamDestroy(m_cudaStream); m_cudaStream = nullptr; }
        m_tokenizer.reset();
        m_promptSet = false;
        _modelLoadValid = false;
        _isInitialized = false;
        return true;
    } catch (const std::exception& e) {
        _logger.LogFatal("ANSSAM3::Destroy", e.what(), __FILE__, __LINE__);
        return false;
    }
}

// Destructor — full teardown via Destroy().
ANSSAM3::~ANSSAM3() { Destroy(); }
} // namespace ANSCENTER