#include "ANSSAM3.h" #include "ANSCLIPTokenizer.h" #include "Utility.h" #include #include #include #include #include #include #include namespace ANSCENTER { // ========================================================================= // Helpers // ========================================================================= // Portable FP16 ↔ FP32 conversion (works in plain C++ without NVCC) static float fp16ToFloat(uint16_t h) { uint32_t sign = static_cast(h >> 15) << 31; uint32_t expo = (h >> 10) & 0x1Fu; uint32_t mant = h & 0x3FFu; if (expo == 0) { if (mant == 0) { float f; std::memcpy(&f, &sign, 4); return f; } while (!(mant & 0x400u)) { mant <<= 1; expo--; } expo++; mant &= 0x3FFu; } else if (expo == 31) { expo = 255; } uint32_t bits = sign | ((expo + 127u - 15u) << 23) | (mant << 13); float f; std::memcpy(&f, &bits, 4); return f; } static uint16_t floatToFp16(float val) { uint32_t bits; std::memcpy(&bits, &val, 4); uint16_t sign = static_cast((bits >> 16) & 0x8000u); int32_t expo = ((bits >> 23) & 0xFF) - 127 + 15; uint32_t mant = bits & 0x7FFFFFu; if (expo <= 0) return sign; // underflow → ±0 if (expo >= 31) return sign | 0x7C00u; // overflow → ±inf return sign | static_cast(expo << 10) | static_cast(mant >> 13); } size_t ANSSAM3::DataTypeSize(nvinfer1::DataType dtype) { switch (dtype) { case nvinfer1::DataType::kFLOAT: return 4; case nvinfer1::DataType::kHALF: return 2; case nvinfer1::DataType::kINT32: return 4; case nvinfer1::DataType::kINT64: return 8; case nvinfer1::DataType::kINT8: return 1; case nvinfer1::DataType::kBOOL: return 1; #if NV_TENSORRT_MAJOR >= 10 case nvinfer1::DataType::kUINT8: return 1; case nvinfer1::DataType::kFP8: return 1; case nvinfer1::DataType::kBF16: return 2; case nvinfer1::DataType::kINT4: return 1; // conservative — 4-bit packed #endif default: return 4; } } void ANSSAM3::TRTBundle::destroy() { context.reset(); engine.reset(); runtime.reset(); for (int i = 0; i < (int)gpuBuffers.size(); ++i) { if (!gpuBuffers[i]) continue; if (hostBufferIdx.count(i)) free(gpuBuffers[i]); // host-allocated (shape tensor) else cudaFree(gpuBuffers[i]); // device-allocated gpuBuffers[i] = nullptr; } gpuBuffers.clear(); gpuBufferSizes.clear(); hostBufferIdx.clear(); nameToIdx.clear(); } // (CreateOrtDecoderSession removed — decoder now runs under TRT) // ----------------------------------------------------------------- // PassthroughOutputAllocator — lightweight IOutputAllocator that simply // returns the pre-allocated gpuBuffer. Defined here (not in the header) // so the vtable and CUDA symbols stay inside the engine DLL. // Created on the stack in Detect() — no persistent class members needed. // ----------------------------------------------------------------- #if NV_TENSORRT_MAJOR >= 10 struct PassthroughOutputAllocator : public nvinfer1::IOutputAllocator { void* preAllocBuf; // existing gpuBuffers[idx] size_t preAllocSize; // existing gpuBufferSizes[idx] nvinfer1::Dims actualDims{}; bool shapeKnown = false; PassthroughOutputAllocator(void* buf, size_t sz) : preAllocBuf(buf), preAllocSize(sz) {} void* reallocateOutput( char const* /*tensorName*/, void* currentMemory, uint64_t size, uint64_t /*alignment*/) noexcept override { if (size <= preAllocSize) return preAllocBuf; // Fallback: grow (should not happen with generous pre-allocation) void* newBuf = nullptr; if (cudaMalloc(&newBuf, size) == cudaSuccess) { preAllocBuf = newBuf; preAllocSize = size; } return preAllocBuf; } void notifyShape(char const* /*tensorName*/, nvinfer1::Dims const& dims) noexcept override { actualDims = dims; shapeKnown = true; } }; #endif // ========================================================================= // EngineFileName — generate cache path: .engine.. // ========================================================================= std::string ANSSAM3::EngineFileName(const std::string& onnxPath, TrtPrecision precision) const { // Extract stem from ONNX path std::filesystem::path p(onnxPath); std::string stem = p.stem().string(); // Get GPU name cudaDeviceProp prop; cudaGetDeviceProperties(&prop, 0); std::string gpuName(prop.name); gpuName.erase(std::remove_if(gpuName.begin(), gpuName.end(), ::isspace), gpuName.end()); // BF16 requires Ampere+ (compute capability >= 8.0). // On older GPUs BuildAndLoadEngine silently falls back to FP32, // so the filename must match to avoid a .bf16 / .fp32 mismatch. TrtPrecision effective = precision; if (precision == TrtPrecision::BF16 && prop.major < 8) effective = TrtPrecision::FP32; std::string precStr; switch (effective) { case TrtPrecision::FP16: precStr = "fp16"; break; case TrtPrecision::BF16: precStr = "bf16"; break; case TrtPrecision::FP32: precStr = "fp32"; break; } std::string dir = p.parent_path().string(); return dir + "\\" + stem + ".engine." + gpuName + "." + precStr; } // ========================================================================= // BuildAndLoadEngine — build TRT engine from ONNX + load for inference // ========================================================================= bool ANSSAM3::BuildAndLoadEngine(TRTBundle& bundle, const std::string& onnxPath, const std::string& label, TrtPrecision precision) { // Register TRT built-in plugins (needed for RoiAlign in decoder, etc.) // Safe to call multiple times — idempotent. initLibNvInferPlugins(&m_trtLogger, ""); std::string enginePath = EngineFileName(onnxPath, precision); // Check for cached engine if (FileExist(enginePath)) { std::cout << "[ANSSAM3] " << label << ": cached engine found: " << enginePath << std::endl; return LoadTRTEngineBundle(bundle, enginePath, label); } // --- Build from ONNX --- std::cout << "[ANSSAM3] " << label << ": building TRT engine from " << onnxPath << std::endl; if (!FileExist(onnxPath)) { _logger.LogError("ANSSAM3::BuildAndLoadEngine", label + ": ONNX file not found: " + onnxPath, __FILE__, __LINE__); return false; } auto builder = std::unique_ptr(nvinfer1::createInferBuilder(m_trtLogger)); if (!builder) { _logger.LogError("ANSSAM3::BuildAndLoadEngine", label + ": createInferBuilder failed", __FILE__, __LINE__); return false; } auto network = std::unique_ptr(TRT_CREATE_NETWORK(builder)); if (!network) { _logger.LogError("ANSSAM3::BuildAndLoadEngine", label + ": createNetworkV2 failed", __FILE__, __LINE__); return false; } auto parser = std::unique_ptr(nvonnxparser::createParser(*network, m_trtLogger)); if (!parser) { _logger.LogError("ANSSAM3::BuildAndLoadEngine", label + ": createParser failed", __FILE__, __LINE__); return false; } // parseFromFile resolves .onnx_data external files relative to the ONNX directory if (!parser->parseFromFile(onnxPath.c_str(), static_cast(nvinfer1::ILogger::Severity::kWARNING))) { for (int i = 0; i < parser->getNbErrors(); ++i) std::cerr << "[ANSSAM3] " << label << " parse error: " << parser->getError(i)->desc() << std::endl; _logger.LogError("ANSSAM3::BuildAndLoadEngine", label + ": parseFromFile failed", __FILE__, __LINE__); return false; } std::cout << "[ANSSAM3] " << label << ": ONNX parsed successfully." << std::endl; // --- Log and configure input tensors --- auto config = std::unique_ptr(builder->createBuilderConfig()); // Workspace governs max scratch memory TRT can use at runtime. // ImageEncoder FP32 needs huge scratch (3.5 GiB at 4 GiB workspace) which // causes OOM on 8 GiB GPUs. Cap it to 2 GiB — TRT picks leaner tactics // with minimal quality loss. LangEncoder/Decoder are small; keep 4 GiB. const bool isImageEncoder = (label.find("ImageEncoder") != std::string::npos); size_t workspaceBytes = isImageEncoder ? 2048ULL * 1024 * 1024 // 2 GiB for ImageEncoder (prevents OOM) : 4096ULL * 1024 * 1024; // 4 GiB for LangEncoder / Decoder #if NV_TENSORRT_MAJOR >= 10 config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, workspaceBytes); #else config->setMaxWorkspaceSize(workspaceBytes); #endif // Set precision flags switch (precision) { case TrtPrecision::BF16: { // BF16 requires Ampere+ (compute capability >= 8.0) and TRT 8.6+. // Check GPU capability, then attempt to set the flag if available. cudaDeviceProp devProp; cudaGetDeviceProperties(&devProp, 0); #if NV_TENSORRT_MAJOR > 8 || (NV_TENSORRT_MAJOR == 8 && NV_TENSORRT_MINOR >= 6) if (devProp.major >= 8) { config->setFlag(nvinfer1::BuilderFlag::kBF16); std::cout << "[ANSSAM3] " << label << ": BF16 precision enabled." << std::endl; } else { std::cout << "[ANSSAM3] " << label << ": GPU CC " << devProp.major << "." << devProp.minor << " does not support BF16, falling back to FP32." << std::endl; } #else (void)devProp; // suppress unused warning std::cout << "[ANSSAM3] " << label << ": TensorRT version does not support BF16, falling back to FP32." << std::endl; #endif break; } case TrtPrecision::FP16: if (builder->platformHasFastFp16()) { config->setFlag(nvinfer1::BuilderFlag::kFP16); std::cout << "[ANSSAM3] " << label << ": FP16 precision enabled." << std::endl; } break; case TrtPrecision::FP32: // No precision flags = FP32 break; } // Create optimization profile with actual ONNX dimensions auto profile = builder->createOptimizationProfile(); int numInputs = network->getNbInputs(); for (int i = 0; i < numInputs; ++i) { auto input = network->getInput(i); const char* name = input->getName(); auto dims = input->getDimensions(); std::cout << "[ANSSAM3] " << label << " input[" << i << "] '" << name << "': ["; for (int d = 0; d < dims.nbDims; ++d) { if (d > 0) std::cout << ", "; std::cout << (dims.d[d] == -1 ? "dyn" : std::to_string(dims.d[d])); } std::cout << "]" << (input->isShapeTensor() ? " (shape tensor)" : "") << std::endl; // Shape tensors: scalar int64 inputs whose VALUES determine output shapes // (e.g. original_height, original_width). Use setShapeValues() not setDimensions(). if (input->isShapeTensor()) { // nbValues = product of dims; for a scalar (nbDims==0) that is 1 int nbValues = 1; for (int d = 0; d < dims.nbDims; ++d) { if (dims.d[d] > 0) nbValues *= dims.d[d]; } std::vector minV(nbValues, 1); std::vector optV(nbValues, 1024); std::vector maxV(nbValues, 4096); profile->setShapeValues(name, nvinfer1::OptProfileSelector::kMIN, minV.data(), nbValues); profile->setShapeValues(name, nvinfer1::OptProfileSelector::kOPT, optV.data(), nbValues); profile->setShapeValues(name, nvinfer1::OptProfileSelector::kMAX, maxV.data(), nbValues); continue; } // Regular execution tensors: replace dynamic dims with concrete values bool hasDynamic = false; nvinfer1::Dims fixedDims = dims; for (int d = 0; d < dims.nbDims; ++d) { if (dims.d[d] == -1) { hasDynamic = true; fixedDims.d[d] = 1; // default batch or sequence } } if (hasDynamic) { profile->setDimensions(name, nvinfer1::OptProfileSelector::kMIN, fixedDims); profile->setDimensions(name, nvinfer1::OptProfileSelector::kOPT, fixedDims); profile->setDimensions(name, nvinfer1::OptProfileSelector::kMAX, fixedDims); } } config->addOptimizationProfile(profile); // --- Mixed precision for Decoder: keep FP16 for bulk ops but force // score/NMS/comparison layers to FP32 so that the internal // thresholding doesn't lose detections due to half-precision // rounding. We mark layers whose names contain NMS-related // keywords, plus all layers of types that perform comparisons // or index-selection (which are part of the NMS pipeline). if (precision == TrtPrecision::FP16 && label == std::string("Decoder")) { config->setFlag(nvinfer1::BuilderFlag::kPREFER_PRECISION_CONSTRAINTS); int numLayers = network->getNbLayers(); int markedCount = 0; for (int li = 0; li < numLayers; ++li) { auto* layer = network->getLayer(li); if (!layer) continue; std::string lname(layer->getName()); auto ltype = layer->getType(); // Force FP32 on layers involved in score thresholding / NMS: // - Comparison ops (Greater, Less, Equal) // - Sigmoid (final score activation) // - TopK, NonZero, Gather, Select (index-selection in NMS) // - Any layer whose name hints at score/nms/threshold bool needFP32 = false; // By layer type switch (ltype) { case nvinfer1::LayerType::kTOPK: case nvinfer1::LayerType::kGATHER: case nvinfer1::LayerType::kSELECT: case nvinfer1::LayerType::kNON_ZERO: case nvinfer1::LayerType::kSCATTER: needFP32 = true; break; default: break; } // By layer name (ONNX op names often preserved by parser) if (!needFP32) { // Convert to lowercase for matching std::string lower = lname; std::transform(lower.begin(), lower.end(), lower.begin(), ::tolower); if (lower.find("score") != std::string::npos || lower.find("nms") != std::string::npos || lower.find("sigmoid") != std::string::npos || lower.find("threshold") != std::string::npos || lower.find("greater") != std::string::npos || lower.find("less") != std::string::npos || lower.find("where") != std::string::npos || lower.find("nonzero") != std::string::npos || lower.find("topk") != std::string::npos) { needFP32 = true; } } // TensorRT forbids setPrecision(kFLOAT) on layers that // produce non-float types (booleans, indices/int32, int64). // Only force FP32 when ALL outputs are floating-point. if (needFP32) { bool allFloat = true; for (int oi = 0; oi < layer->getNbOutputs(); ++oi) { auto dt = layer->getOutputType(oi); if (dt != nvinfer1::DataType::kFLOAT && dt != nvinfer1::DataType::kHALF) { allFloat = false; break; } } if (allFloat) { layer->setPrecision(nvinfer1::DataType::kFLOAT); for (int oi = 0; oi < layer->getNbOutputs(); ++oi) layer->setOutputType(oi, nvinfer1::DataType::kFLOAT); ++markedCount; } } } std::cout << "[ANSSAM3] " << label << ": mixed precision — " << markedCount << "/" << numLayers << " layers forced to FP32 (score/NMS ops)." << std::endl; } // --- Build serialized engine --- std::cout << "[ANSSAM3] " << label << ": building engine (this may take a few minutes)..." << std::endl; unsigned long sehCode = 0; auto plan = std::unique_ptr( buildSerializedNetworkSafe(builder.get(), *network, *config, &sehCode)); if (sehCode != 0) { _logger.LogError("ANSSAM3::BuildAndLoadEngine", label + ": engine build crashed: " + formatCrashCode(sehCode), __FILE__, __LINE__); return false; } if (!plan) { _logger.LogError("ANSSAM3::BuildAndLoadEngine", label + ": buildSerializedNetwork returned null", __FILE__, __LINE__); return false; } // --- Save to disk --- std::ofstream outFile(enginePath, std::ios::binary); if (!outFile.is_open()) { _logger.LogError("ANSSAM3::BuildAndLoadEngine", label + ": cannot write engine file: " + enginePath, __FILE__, __LINE__); return false; } outFile.write(reinterpret_cast(plan->data()), plan->size()); outFile.close(); std::cout << "[ANSSAM3] " << label << ": engine saved to " << enginePath << std::endl; plan.reset(); // --- Load the just-built engine --- return LoadTRTEngineBundle(bundle, enginePath, label); } // ========================================================================= // LoadTRTEngineBundle — deserialize engine, allocate GPU buffers, bind // ========================================================================= bool ANSSAM3::LoadTRTEngineBundle(TRTBundle& bundle, const std::string& enginePath, const std::string& label) { // Read engine file std::ifstream file(enginePath, std::ios::binary | std::ios::ate); if (!file.is_open()) { _logger.LogError("ANSSAM3::LoadTRTEngineBundle", label + ": cannot open: " + enginePath, __FILE__, __LINE__); return false; } std::streamsize fileSize = file.tellg(); file.seekg(0, std::ios::beg); std::vector engineData(fileSize); if (!file.read(engineData.data(), fileSize)) { _logger.LogError("ANSSAM3::LoadTRTEngineBundle", label + ": read failed", __FILE__, __LINE__); return false; } file.close(); // Deserialize bundle.runtime = std::unique_ptr(nvinfer1::createInferRuntime(m_trtLogger)); if (!bundle.runtime) { _logger.LogError("ANSSAM3::LoadTRTEngineBundle", label + ": createInferRuntime failed", __FILE__, __LINE__); return false; } unsigned long sehCode = 0; bundle.engine = std::unique_ptr( deserializeCudaEngineSafe(bundle.runtime.get(), engineData.data(), engineData.size(), &sehCode)); if (sehCode != 0) { _logger.LogError("ANSSAM3::LoadTRTEngineBundle", label + ": deserialize crashed: " + formatCrashCode(sehCode), __FILE__, __LINE__); return false; } if (!bundle.engine) { _logger.LogError("ANSSAM3::LoadTRTEngineBundle", label + ": deserialize returned null", __FILE__, __LINE__); return false; } // --- Weight streaming (TRT 10+): keep only a budget of weights on GPU, // stream the rest from CPU pinned memory on demand. // Saves ~1.3 GiB VRAM for ImageEncoder (1.8 GiB weights → 512 MiB on GPU). #if NV_TENSORRT_MAJOR >= 10 { int64_t streamableBytes = bundle.engine->getStreamableWeightsSize(); if (streamableBytes > 0 && label.find("ImageEncoder") != std::string::npos) { // Budget = how much weight memory stays on GPU. // 512 MiB keeps hot layers cached; rest streamed via PCIe. const int64_t budgetBytes = 512LL * 1024 * 1024; int64_t actualBudget = std::min(budgetBytes, streamableBytes); bundle.engine->setWeightStreamingBudgetV2(actualBudget); std::cout << "[ANSSAM3] " << label << ": weight streaming enabled (streamable=" << (streamableBytes / (1024*1024)) << " MiB, budget=" << (actualBudget / (1024*1024)) << " MiB)" << std::endl; } } #endif bundle.context = std::unique_ptr(bundle.engine->createExecutionContext()); if (!bundle.context) { _logger.LogError("ANSSAM3::LoadTRTEngineBundle", label + ": createExecutionContext failed", __FILE__, __LINE__); return false; } // Set optimization profile int numProfiles = bundle.engine->getNbOptimizationProfiles(); if (numProfiles > 0) { bundle.context->setOptimizationProfileAsync(0, m_cudaStream); cudaStreamSynchronize(m_cudaStream); } // Allocate buffers (device for execution tensors, host for shape tensors) const int numTensors = bundle.engine->getNbIOTensors(); bundle.gpuBuffers.resize(numTensors, nullptr); bundle.gpuBufferSizes.resize(numTensors, 0); bundle.hostBufferIdx.clear(); bundle.nameToIdx.clear(); for (int i = 0; i < numTensors; ++i) { const char* name = bundle.engine->getIOTensorName(i); auto mode = bundle.engine->getTensorIOMode(name); auto shape = bundle.engine->getTensorShape(name); auto dtype = bundle.engine->getTensorDataType(name); auto loc = bundle.engine->getTensorLocation(name); bool isHost = (loc == nvinfer1::TensorLocation::kHOST); // Check if any dimension is dynamic (-1) bool hasDynamic = false; int64_t numElements = 1; for (int d = 0; d < shape.nbDims; ++d) { int64_t v = shape.d[d]; if (v <= 0) { hasDynamic = true; v = 1; } numElements *= v; } // Scalars (0-dim) still need at least 1 element if (numElements < 1) numElements = 1; // For output tensors with ANY dynamic dim, pre-allocate a generous buffer. // The decoder outputs (boxes [-1,4], scores [-1], masks [-1,-1,-1,-1]) // all have data-dependent first dimension from NonZero/NMS. if (mode == nvinfer1::TensorIOMode::kOUTPUT && hasDynamic) { // Pre-allocate for up to 256 detections with generous mask size numElements = 256 * 1 * 256 * 256; } size_t bufSize = numElements * DataTypeSize(dtype); bundle.gpuBufferSizes[i] = bufSize; if (isHost) { // Shape tensor — allocate host memory bundle.gpuBuffers[i] = calloc(numElements, DataTypeSize(dtype)); if (!bundle.gpuBuffers[i]) { _logger.LogError("ANSSAM3::LoadTRTEngineBundle", label + ": host alloc failed for " + std::string(name), __FILE__, __LINE__); return false; } bundle.hostBufferIdx.insert(i); } else { // Execution tensor — allocate device memory cudaError_t err = cudaMalloc(&bundle.gpuBuffers[i], bufSize); if (err != cudaSuccess) { _logger.LogError("ANSSAM3::LoadTRTEngineBundle", label + ": cudaMalloc failed for " + std::string(name) + ": " + cudaGetErrorString(err), __FILE__, __LINE__); return false; } cudaMemset(bundle.gpuBuffers[i], 0, bufSize); } // Bind tensor address (host ptr for shape tensors, device ptr for execution tensors) if (!bundle.context->setTensorAddress(name, bundle.gpuBuffers[i])) { _logger.LogError("ANSSAM3::LoadTRTEngineBundle", label + ": setTensorAddress failed for " + std::string(name), __FILE__, __LINE__); return false; } bundle.nameToIdx[std::string(name)] = i; std::cout << "[ANSSAM3] " << label << " tensor[" << i << "] '" << name << "' " << (mode == nvinfer1::TensorIOMode::kINPUT ? "INPUT" : "OUTPUT") << (isHost ? " HOST" : " DEVICE") << " dtype=" << static_cast(dtype) << " bufSize=" << bufSize << std::endl; } // Set input shapes (replace dynamic dims with concrete values) for (int i = 0; i < numTensors; ++i) { const char* name = bundle.engine->getIOTensorName(i); if (bundle.engine->getTensorIOMode(name) != nvinfer1::TensorIOMode::kINPUT) continue; // Shape tensors (scalar, host memory): TRT reads the value directly // from the host buffer. setInputShape for scalars uses Dims{0, {}}. // Write a default value (1024) into the host buffer at load time. if (bundle.hostBufferIdx.count(i)) { auto dtype = bundle.engine->getTensorDataType(name); if (dtype == nvinfer1::DataType::kINT64) *reinterpret_cast(bundle.gpuBuffers[i]) = 1024; else *reinterpret_cast(bundle.gpuBuffers[i]) = 1024; nvinfer1::Dims scalarDims; scalarDims.nbDims = 0; bundle.context->setInputShape(name, scalarDims); continue; } auto dims = bundle.engine->getTensorShape(name); nvinfer1::Dims inputDims = dims; for (int d = 0; d < inputDims.nbDims; ++d) { if (inputDims.d[d] == -1) inputDims.d[d] = 1; } if (!bundle.context->setInputShape(name, inputDims)) { _logger.LogError("ANSSAM3::LoadTRTEngineBundle", label + ": setInputShape failed for " + std::string(name), __FILE__, __LINE__); return false; } } std::cout << "[ANSSAM3] " << label << ": loaded successfully (" << numTensors << " tensors)." << std::endl; return true; } // ========================================================================= // EnsureEnginesBuilt — pre-build uncached engines one at a time // Avoids GPU OOM when building one engine while others are already loaded. // ========================================================================= bool ANSSAM3::EnsureEnginesBuilt(const std::string& imgOnnx, const std::string& langOnnx, const std::string& decOnnx) { struct Job { const std::string* onnx; const char* label; TrtPrecision prec; }; Job jobs[] = { {&langOnnx, "LangEncoder", TrtPrecision::FP16}, // FP16 — verified identical to FP32 {&decOnnx, "Decoder", TrtPrecision::FP16}, // FP16 decoder {&imgOnnx, "ImageEncoder", TrtPrecision::FP32}, // FP32 — FP16/BF16 both corrupt backbone FPN }; for (auto& j : jobs) { if (!FileExist(EngineFileName(*j.onnx, j.prec))) { TRTBundle tmp; if (!BuildAndLoadEngine(tmp, *j.onnx, j.label, j.prec)) { _logger.LogError("ANSSAM3::EnsureEnginesBuilt", std::string("Failed to pre-build engine: ") + j.label, __FILE__, __LINE__); tmp.destroy(); return false; } tmp.destroy(); // free GPU memory before next build } } return true; } // ========================================================================= // OptimizeModel // ========================================================================= bool ANSSAM3::OptimizeModel(bool fp16, std::string& optimizedModelFolder) { std::lock_guard lock(_mutex); if (!ANSODBase::OptimizeModel(fp16, optimizedModelFolder)) return false; _fp16 = fp16; optimizedModelFolder = _modelFolder; std::string imgOnnx = CreateFilePath(_modelFolder, "sam3_image_encoder.onnx"); std::string langOnnx = CreateFilePath(_modelFolder, "sam3_language_encoder.onnx"); std::string decOnnx = CreateFilePath(_modelFolder, "sam3_decoder.onnx"); cudaStreamCreateWithFlags(&m_cudaStream, cudaStreamNonBlocking); // Build engines one at a time, destroying each to free GPU memory TRTBundle tmp; bool ok = true; ok = BuildAndLoadEngine(tmp, imgOnnx, "ImageEncoder", TrtPrecision::FP32); tmp.destroy(); if (ok) { ok = BuildAndLoadEngine(tmp, langOnnx, "LangEncoder", TrtPrecision::FP16); tmp.destroy(); } if (ok) { ok = BuildAndLoadEngine(tmp, decOnnx, "Decoder", TrtPrecision::FP16); tmp.destroy(); } if (m_cudaStream) { cudaStreamDestroy(m_cudaStream); m_cudaStream = nullptr; } return ok; } // ========================================================================= // Initialize // ========================================================================= bool ANSSAM3::Initialize(std::string licenseKey, ModelConfig modelConfig, const std::string& modelZipFilePath, const std::string& modelZipPassword, std::string& labelMap) { std::lock_guard lock(_mutex); try { bool result = ANSODBase::Initialize(licenseKey, modelConfig, modelZipFilePath, modelZipPassword, labelMap); if (!result) return false; _modelConfig.detectionType = DetectionType::SEGMENTATION; if (_modelConfig.modelConfThreshold < 0.1f) _modelConfig.modelConfThreshold = 0.5f; m_segThreshold = _modelConfig.modelConfThreshold; _fp16 = true; // Create CUDA stream cudaSetDevice(_modelConfig.gpuDeviceIndex); cudaStreamCreateWithFlags(&m_cudaStream, cudaStreamNonBlocking); // Build/load TRT engines for image + language encoders std::string imgOnnx = CreateFilePath(_modelFolder, "sam3_image_encoder.onnx"); std::string langOnnx = CreateFilePath(_modelFolder, "sam3_language_encoder.onnx"); std::string decOnnx = CreateFilePath(_modelFolder, "sam3_decoder.onnx"); // Pre-build uncached TRT engines (avoids GPU OOM during build) if (!EnsureEnginesBuilt(imgOnnx, langOnnx, decOnnx)) { _modelLoadValid = false; return false; } if (!BuildAndLoadEngine(m_imgEncoder, imgOnnx, "ImageEncoder", TrtPrecision::FP32) || !BuildAndLoadEngine(m_langEncoder, langOnnx, "LangEncoder", TrtPrecision::FP16) || !BuildAndLoadEngine(m_decoder, decOnnx, "Decoder", TrtPrecision::FP16)) { _logger.LogError("ANSSAM3::Initialize", "Failed to build/load TRT engines", __FILE__, __LINE__); _modelLoadValid = false; return false; } _modelLoadValid = true; _isInitialized = true; // Load tokenizer m_tokenizer = std::make_unique(); std::string tokenizerPath = CreateFilePath(_modelFolder, "merges.txt"); if (FileExist(tokenizerPath)) { m_tokenizer->Load(tokenizerPath); _logger.LogDebug("ANSSAM3::Initialize", "CLIP tokenizer loaded", __FILE__, __LINE__); } return true; } catch (const std::exception& e) { _logger.LogFatal("ANSSAM3::Initialize", e.what(), __FILE__, __LINE__); return false; } } // ========================================================================= // LoadModel // ========================================================================= bool ANSSAM3::LoadModel(const std::string& modelZipFilePath, const std::string& modelZipPassword) { std::lock_guard lock(_mutex); try { bool result = ANSODBase::LoadModel(modelZipFilePath, modelZipPassword); if (!result) return false; _modelConfig.detectionType = DetectionType::SEGMENTATION; if (_modelConfig.modelConfThreshold < 0.1f) _modelConfig.modelConfThreshold = 0.5f; m_segThreshold = _modelConfig.modelConfThreshold; _fp16 = true; cudaSetDevice(_modelConfig.gpuDeviceIndex); cudaStreamCreateWithFlags(&m_cudaStream, cudaStreamNonBlocking); std::string imgOnnx = CreateFilePath(_modelFolder, "sam3_image_encoder.onnx"); std::string langOnnx = CreateFilePath(_modelFolder, "sam3_language_encoder.onnx"); std::string decOnnx = CreateFilePath(_modelFolder, "sam3_decoder.onnx"); // Pre-build uncached TRT engines (avoids GPU OOM during build) if (!EnsureEnginesBuilt(imgOnnx, langOnnx, decOnnx)) { _modelLoadValid = false; return false; } if (!BuildAndLoadEngine(m_imgEncoder, imgOnnx, "ImageEncoder", TrtPrecision::FP32) || !BuildAndLoadEngine(m_langEncoder, langOnnx, "LangEncoder", TrtPrecision::FP16) || !BuildAndLoadEngine(m_decoder, decOnnx, "Decoder", TrtPrecision::FP16)) { _logger.LogError("ANSSAM3::LoadModel", "Failed to build/load TRT engines", __FILE__, __LINE__); _modelLoadValid = false; return false; } _modelLoadValid = true; _isInitialized = true; m_tokenizer = std::make_unique(); std::string tokenizerPath = CreateFilePath(_modelFolder, "merges.txt"); if (FileExist(tokenizerPath)) { m_tokenizer->Load(tokenizerPath); } return true; } catch (const std::exception& e) { _logger.LogFatal("ANSSAM3::LoadModel", e.what(), __FILE__, __LINE__); return false; } } // ========================================================================= // LoadModelFromFolder // ========================================================================= bool ANSSAM3::LoadModelFromFolder(std::string licenseKey, ModelConfig modelConfig, std::string modelName, std::string className, const std::string& modelFolder, std::string& labelMap) { std::lock_guard lock(_mutex); try { bool result = ANSODBase::LoadModelFromFolder(licenseKey, modelConfig, modelName, className, modelFolder, labelMap); if (!result) return false; _modelConfig = modelConfig; _modelConfig.detectionType = DetectionType::SEGMENTATION; if (_modelConfig.modelConfThreshold < 0.1f) _modelConfig.modelConfThreshold = 0.5f; m_segThreshold = _modelConfig.modelConfThreshold; _fp16 = true; cudaSetDevice(_modelConfig.gpuDeviceIndex); cudaStreamCreateWithFlags(&m_cudaStream, cudaStreamNonBlocking); std::string imgOnnx = CreateFilePath(modelFolder, "sam3_image_encoder.onnx"); std::string langOnnx = CreateFilePath(modelFolder, "sam3_language_encoder.onnx"); std::string decOnnx = CreateFilePath(modelFolder, "sam3_decoder.onnx"); // Pre-build uncached TRT engines (avoids GPU OOM during build) if (!EnsureEnginesBuilt(imgOnnx, langOnnx, decOnnx)) { _modelLoadValid = false; return false; } if (!BuildAndLoadEngine(m_imgEncoder, imgOnnx, "ImageEncoder", TrtPrecision::FP32) || !BuildAndLoadEngine(m_langEncoder, langOnnx, "LangEncoder", TrtPrecision::FP16) || !BuildAndLoadEngine(m_decoder, decOnnx, "Decoder", TrtPrecision::FP16)) { _logger.LogError("ANSSAM3::LoadModelFromFolder", "Failed to build/load TRT engines", __FILE__, __LINE__); _modelLoadValid = false; return false; } _modelLoadValid = true; _isInitialized = true; m_tokenizer = std::make_unique(); std::string tokenizerPath = CreateFilePath(modelFolder, "merges.txt"); if (FileExist(tokenizerPath)) { m_tokenizer->Load(tokenizerPath); _logger.LogDebug("ANSSAM3::LoadModelFromFolder", "CLIP tokenizer loaded", __FILE__, __LINE__); } return true; } catch (const std::exception& e) { _logger.LogFatal("ANSSAM3::LoadModelFromFolder", e.what(), __FILE__, __LINE__); return false; } } // ========================================================================= // SetPrompt — run language encoder, cache outputs on GPU // ========================================================================= bool ANSSAM3::SetPrompt(const std::vector& inputIds, const std::vector& attentionMask) { std::lock_guard lock(_mutex); if (!m_langEncoder.context) { _logger.LogError("ANSSAM3::SetPrompt", "Language encoder not loaded", __FILE__, __LINE__); return false; } // Language encoder input: "tokens" [1, 32] int64 // Find the tokens input tensor auto it = m_langEncoder.nameToIdx.find("tokens"); if (it == m_langEncoder.nameToIdx.end()) { // Try first input const char* firstName = m_langEncoder.engine->getIOTensorName(0); it = m_langEncoder.nameToIdx.find(firstName); } if (it == m_langEncoder.nameToIdx.end()) { _logger.LogError("ANSSAM3::SetPrompt", "Cannot find tokens input tensor", __FILE__, __LINE__); return false; } int tokIdx = it->second; const char* tokName = m_langEncoder.engine->getIOTensorName(tokIdx); auto tokDtype = m_langEncoder.engine->getTensorDataType(tokName); // Upload tokens — handle int64 vs int32 data type if (tokDtype == nvinfer1::DataType::kINT64) { cudaMemcpyAsync(m_langEncoder.gpuBuffers[tokIdx], inputIds.data(), inputIds.size() * sizeof(int64_t), cudaMemcpyHostToDevice, m_cudaStream); } else if (tokDtype == nvinfer1::DataType::kINT32) { // TRT may have converted int64 to int32 at build time std::vector tokens32(inputIds.size()); for (size_t i = 0; i < inputIds.size(); ++i) tokens32[i] = static_cast(inputIds[i]); cudaMemcpyAsync(m_langEncoder.gpuBuffers[tokIdx], tokens32.data(), tokens32.size() * sizeof(int32_t), cudaMemcpyHostToDevice, m_cudaStream); } // Set input shape nvinfer1::Dims tokenDims; tokenDims.nbDims = 2; tokenDims.d[0] = 1; tokenDims.d[1] = static_cast(inputIds.size()); m_langEncoder.context->setInputShape(tokName, tokenDims); // Run language encoder #if NV_TENSORRT_MAJOR >= 10 bool ok = m_langEncoder.context->enqueueV3(m_cudaStream); #else bool ok = m_langEncoder.context->enqueueV2( reinterpret_cast(m_langEncoder.gpuBuffers.data()), m_cudaStream, nullptr); #endif if (!ok) { _logger.LogError("ANSSAM3::SetPrompt", "Language encoder enqueue failed", __FILE__, __LINE__); return false; } cudaStreamSynchronize(m_cudaStream); // Find language encoder outputs: text_attention_mask and text_memory // output[0]: text_attention_mask [1, 32] bool → cached as m_cachedLangMask // output[1]: text_memory [32, 1, 256] float32 → cached as m_cachedLangFeats // output[2]: text_embeds [32, 1, 1024] float32 → NOT used int maskOutIdx = -1, featsOutIdx = -1; const int numTensors = m_langEncoder.engine->getNbIOTensors(); for (int i = 0; i < numTensors; ++i) { const char* name = m_langEncoder.engine->getIOTensorName(i); if (m_langEncoder.engine->getTensorIOMode(name) != nvinfer1::TensorIOMode::kOUTPUT) continue; std::string sname(name); if (sname.find("attention_mask") != std::string::npos || sname.find("text_attention") != std::string::npos) { maskOutIdx = i; } else if (sname.find("text_memory") != std::string::npos || sname.find("memory") != std::string::npos) { featsOutIdx = i; } } // Fallback: first 2 outputs in order if (maskOutIdx < 0 || featsOutIdx < 0) { int outCount = 0; for (int i = 0; i < numTensors; ++i) { const char* name = m_langEncoder.engine->getIOTensorName(i); if (m_langEncoder.engine->getTensorIOMode(name) != nvinfer1::TensorIOMode::kOUTPUT) continue; if (outCount == 0 && maskOutIdx < 0) maskOutIdx = i; else if (outCount == 1 && featsOutIdx < 0) featsOutIdx = i; outCount++; } } // Cache mask on GPU if (maskOutIdx >= 0) { size_t bytes = m_langEncoder.gpuBufferSizes[maskOutIdx]; if (m_cachedLangMask && m_cachedLangMaskBytes < bytes) { cudaFree(m_cachedLangMask); m_cachedLangMask = nullptr; } if (!m_cachedLangMask) { cudaMalloc(&m_cachedLangMask, bytes); } m_cachedLangMaskBytes = bytes; cudaMemcpyAsync(m_cachedLangMask, m_langEncoder.gpuBuffers[maskOutIdx], bytes, cudaMemcpyDeviceToDevice, m_cudaStream); } // Cache features on GPU if (featsOutIdx >= 0) { size_t bytes = m_langEncoder.gpuBufferSizes[featsOutIdx]; if (m_cachedLangFeats && m_cachedLangFeatsBytes < bytes) { cudaFree(m_cachedLangFeats); m_cachedLangFeats = nullptr; } if (!m_cachedLangFeats) { cudaMalloc(&m_cachedLangFeats, bytes); } m_cachedLangFeatsBytes = bytes; cudaMemcpyAsync(m_cachedLangFeats, m_langEncoder.gpuBuffers[featsOutIdx], bytes, cudaMemcpyDeviceToDevice, m_cudaStream); } cudaStreamSynchronize(m_cudaStream); m_promptSet = true; return true; } bool ANSSAM3::SetPrompt(const std::string& text) { std::lock_guard lock(_mutex); if (!m_tokenizer || !m_tokenizer->IsLoaded()) { _logger.LogError("ANSSAM3::SetPrompt", "Tokenizer not loaded. Place merges.txt in model folder.", __FILE__, __LINE__); return false; } auto result = m_tokenizer->Tokenize(text, m_tokenLength); SetPrompt(result.inputIds, result.attentionMask); return true; } // ========================================================================= // RunInference // ========================================================================= std::vector