// ANSCORE/engines/ONNXEngine/ONNXEngine.cpp
// NOTE(review): the original capture began with file-viewer chrome
// ("Files", "Raw Blame History", a Unicode-ambiguity warning) that is not
// part of the source; it has been replaced by this header comment.
#include "ONNXEngine.h"
#include "EPLoader.h"
#include "Utility.h"
#include <algorithm>
#include <cstdlib>
#include <filesystem>
#include <fstream>
#include <limits>
#include <memory>
#include <system_error>
namespace ANSCENTER {
// ====================================================================
// BasicOrtHandler — constructors
// ====================================================================
std::string BasicOrtHandler::QueryModelInputName(const std::string& onnxPath)
{
try {
// Make sure the Ort API pointer is initialised in THIS DLL.
if (Ort::Global<void>::api_ == nullptr) {
Ort::InitApi(static_cast<const OrtApi*>(EPLoader::GetOrtApiRaw()));
}
Ort::Env env(ORT_LOGGING_LEVEL_ERROR, "QueryModelInputName");
Ort::SessionOptions opts;
opts.SetIntraOpNumThreads(1);
opts.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_DISABLE_ALL);
// Intentionally NOT attaching CUDA/TRT EP — CPU is fastest
// for a no-inference metadata read.
std::wstring wpath(onnxPath.begin(), onnxPath.end());
Ort::Session session(env, wpath.c_str(), opts);
Ort::AllocatorWithDefaultOptions alloc;
auto inName = session.GetInputNameAllocated(0, alloc);
return std::string(inName.get());
}
catch (const Ort::Exception& e) {
std::cerr << "[QueryModelInputName] ORT exception: "
<< e.what() << " (path=" << onnxPath << ")" << std::endl;
return "";
}
catch (const std::exception& e) {
std::cerr << "[QueryModelInputName] std exception: "
<< e.what() << " (path=" << onnxPath << ")" << std::endl;
return "";
}
}
// Construct a handler for the given model; the engine type is left as -1
// so initialize_handler() resolves it via EPLoader auto-detection.
// NOTE(review): log_id stores the caller's string buffer pointer — it
// dangles if the caller's string is destroyed before this handler;
// confirm all call sites pass long-lived strings.
BasicOrtHandler::BasicOrtHandler(const std::string& _onnx_path,
unsigned int _num_threads)
: log_id(_onnx_path.data()),
num_threads(_num_threads),
m_engineType(static_cast<EngineType>(-1)),
onnx_path_w(_onnx_path.begin(), _onnx_path.end()) // ← stored as member
{
onnx_path = onnx_path_w.c_str(); // ← safe, member owns storage
initialize_handler();
}
// Construct with an explicitly chosen execution provider (engineType)
// instead of EPLoader auto-detection.
// NOTE(review): log_id keeps a pointer into the caller's string — see the
// dangling-pointer caveat on the first constructor.
BasicOrtHandler::BasicOrtHandler(const std::string& _onnx_path,
EngineType engineType,
unsigned int _num_threads)
: log_id(_onnx_path.data()),
num_threads(_num_threads),
m_engineType(engineType),
onnx_path_w(_onnx_path.begin(), _onnx_path.end()) // ← stored as member
{
onnx_path = onnx_path_w.c_str(); // ← safe, member owns storage
initialize_handler();
}
// Construct with an explicit engine type AND per-handler options (TRT
// cache dir, FP16, profile shapes, cuDNN workspace policy, ...).
BasicOrtHandler::BasicOrtHandler(const std::string& _onnx_path,
EngineType engineType,
const OrtHandlerOptions& options,
unsigned int _num_threads)
: log_id(_onnx_path.data()),
num_threads(_num_threads),
m_engineType(engineType),
m_handlerOptions(options),
onnx_path_w(_onnx_path.begin(), _onnx_path.end())
{
onnx_path = onnx_path_w.c_str();
initialize_handler();
}
// Construct with per-handler options while letting EPLoader auto-detect
// the execution provider (engine type -1).
BasicOrtHandler::BasicOrtHandler(const std::string& _onnx_path,
const OrtHandlerOptions& options,
unsigned int _num_threads)
: log_id(_onnx_path.data()),
num_threads(_num_threads),
m_engineType(static_cast<EngineType>(-1)), // EPLoader auto-detect
m_handlerOptions(options),
onnx_path_w(_onnx_path.begin(), _onnx_path.end())
{
onnx_path = onnx_path_w.c_str();
initialize_handler();
}
BasicOrtHandler::~BasicOrtHandler()
{
    // Tear down in reverse order of creation. `delete nullptr` is a
    // no-op, so no null checks are needed; nulling the members afterwards
    // keeps any accidental re-entry harmless.
    delete ort_session;
    ort_session = nullptr;
    delete memory_info_handler;
    memory_info_handler = nullptr;
    delete ort_env;
    ort_env = nullptr;
}
// ====================================================================
// EP appenders
// ====================================================================
// Attach the CUDA execution provider to `session_options`.
// Returns true on success, false on any ORT failure (caller falls back).
//
// Fixes over the previous version:
//  1. The OrtStatus* returned by CreateCUDAProviderOptions /
//     UpdateCUDAProviderOptions was silently dropped, so failures there
//     went undetected — they now go through Ort::ThrowOnError.
//  2. The options struct leaked when Update/Append threw before
//     ReleaseCUDAProviderOptions ran — it is now owned by a unique_ptr
//     with a custom deleter, released on every path.
bool BasicOrtHandler::TryAppendCUDA(Ort::SessionOptions& session_options)
{
    try {
        OrtCUDAProviderOptionsV2* raw = nullptr;
        Ort::ThrowOnError(Ort::GetApi().CreateCUDAProviderOptions(&raw));
        auto releaser = [](OrtCUDAProviderOptionsV2* p) {
            Ort::GetApi().ReleaseCUDAProviderOptions(p);
        };
        std::unique_ptr<OrtCUDAProviderOptionsV2, decltype(releaser)>
            cuda_options(raw, releaser);
        // Memory-safe GPU configuration for multi-model environments:
        // - arena_extend_strategy = 1 (kSameAsRequested) to avoid
        //   pre-allocating huge GPU memory blocks that may exceed VRAM
        // - cudnn_conv_algo_search = HEURISTIC for faster session init
        // - cudnn_conv_use_max_workspace defaults to "0" to prevent
        //   CUDNN_BACKEND_API_FAILED when TRT engines already occupy
        //   most VRAM on the same GPU. OCR sub-models that need fast
        //   convs opt into "1" via OrtHandlerOptions::useMaxCudnnWorkspace
        // - gpu_mem_limit — cap ONNX Runtime's GPU memory arena to 2 GB
        //   so it doesn't compete with TensorRT for the remaining VRAM
        const char* maxWorkspace =
            m_handlerOptions.useMaxCudnnWorkspace ? "1" : "0";
        const char* keys[] = {
            "device_id",
            "arena_extend_strategy",
            "cudnn_conv_algo_search",
            "cudnn_conv_use_max_workspace",
            "gpu_mem_limit"
        };
        const char* values[] = {
            "0",
            "1",          // kSameAsRequested
            "HEURISTIC",  // avoid exhaustive algo search on large model
            maxWorkspace, // "1" for OCR (perf), "0" elsewhere (safety)
            "2147483648"  // 2 GB arena limit
        };
        Ort::ThrowOnError(Ort::GetApi().UpdateCUDAProviderOptions(
            cuda_options.get(), keys, values, 5));
        session_options.AppendExecutionProvider_CUDA_V2(*cuda_options);
        std::cout << "[ORT] CUDA EP attached (arena=SameAsRequested, "
            "cudnn=HEURISTIC, maxWorkspace=" << maxWorkspace
            << ", memLimit=2GB)." << std::endl;
        return true;
    }
    catch (const Ort::Exception& e) {
        std::cerr << "[ORT] CUDA EP failed: " << e.what() << std::endl;
        return false;
    }
}
// Attach the TensorRT execution provider with on-disk engine/timing
// caching and (optionally) dynamic-shape profiles.
// Returns true on success, false on any failure (caller falls back).
//
// Fixes over the previous version:
//  1. OrtStatus* results from Create/Update were silently dropped —
//     they now go through Ort::ThrowOnError.
//  2. The options struct leaked when Update/Append threw before
//     ReleaseTensorRTProviderOptions ran — now owned by a unique_ptr.
//  3. A dedicated "timing.cache" file path was computed but never used
//     (dead code); removed. NOTE(review): ORT receives cacheDir for
//     trt_timing_cache_path, which the EP treats as a location it keys
//     per GPU — confirm against the ORT TensorRT EP docs if changed.
bool BasicOrtHandler::TryAppendTensorRT(Ort::SessionOptions& session_options)
{
    try {
        OrtTensorRTProviderOptionsV2* raw = nullptr;
        Ort::ThrowOnError(Ort::GetApi().CreateTensorRTProviderOptions(&raw));
        auto releaser = [](OrtTensorRTProviderOptionsV2* p) {
            Ort::GetApi().ReleaseTensorRTProviderOptions(p);
        };
        std::unique_ptr<OrtTensorRTProviderOptionsV2, decltype(releaser)>
            trt_options(raw, releaser);
        // Cache built engines on disk so subsequent runs skip the
        // multi-minute build. Engines are keyed on (model hash, GPU
        // arch, shape profile) so changing any of those triggers
        // a rebuild automatically.
        std::string cacheDir = m_handlerOptions.trtEngineCacheDir;
        if (cacheDir.empty()) {
            // %TEMP%\ANSCENTER\TRTEngineCache
            const char* tmp = std::getenv("TEMP");
            if (!tmp) tmp = std::getenv("TMP");
            if (!tmp) tmp = ".";
            std::filesystem::path p(tmp);
            p /= "ANSCENTER";
            p /= "TRTEngineCache";
            std::error_code ec;
            std::filesystem::create_directories(p, ec);
            cacheDir = p.string();
        }
        // Builder options tuned for *fast first-run*:
        // - opt_level 1: builds in seconds, ~5-10 % runtime cost vs 3
        // - workspace 1 GB: leaves room for CUDA EP arena and the
        //   LPD's own TRT engine on the same GPU
        // - timing cache: persists kernel timings between runs so
        //   builds at new shapes get progressively faster
        // - profile shapes (if set): build ONE dynamic-shape
        //   engine that handles all (batch, width) combos instead
        //   of rebuilding per unique input. Critical for variable
        //   batch workloads — without this, TRT EP rebuilds every
        //   time runtime sees a new shape pair, causing 60-90 s
        //   hangs mid-stream.
        const bool haveProfile = !m_handlerOptions.trtProfileMinShapes.empty()
            && !m_handlerOptions.trtProfileOptShapes.empty()
            && !m_handlerOptions.trtProfileMaxShapes.empty();
        // Build the key/value arrays. We always set the first 8 keys;
        // the profile shapes are appended only when provided.
        std::vector<const char*> keys = {
            "device_id",
            "trt_fp16_enable",
            "trt_engine_cache_enable",
            "trt_engine_cache_path",
            "trt_max_workspace_size",
            "trt_builder_optimization_level",
            "trt_timing_cache_enable",
            "trt_timing_cache_path"
        };
        std::vector<const char*> values = {
            "0",
            m_handlerOptions.trtFP16 ? "1" : "0",
            "1",
            cacheDir.c_str(),
            "1073741824", // 1 GB build workspace
            "1",          // fast build (was "3")
            "1",
            cacheDir.c_str()
        };
        if (haveProfile) {
            keys.push_back("trt_profile_min_shapes");
            values.push_back(m_handlerOptions.trtProfileMinShapes.c_str());
            keys.push_back("trt_profile_opt_shapes");
            values.push_back(m_handlerOptions.trtProfileOptShapes.c_str());
            keys.push_back("trt_profile_max_shapes");
            values.push_back(m_handlerOptions.trtProfileMaxShapes.c_str());
        }
        Ort::ThrowOnError(Ort::GetApi().UpdateTensorRTProviderOptions(
            trt_options.get(), keys.data(), values.data(), keys.size()));
        session_options.AppendExecutionProvider_TensorRT_V2(*trt_options);
        std::cout << "[ORT] TensorRT EP attached (fp16="
            << (m_handlerOptions.trtFP16 ? "1" : "0")
            << ", cache=" << cacheDir
            << ", profile=" << (haveProfile ? "dynamic" : "static")
            << ")." << std::endl;
        if (haveProfile) {
            std::cout << "[ORT] profile min: "
                << m_handlerOptions.trtProfileMinShapes << std::endl
                << "[ORT] profile opt: "
                << m_handlerOptions.trtProfileOptShapes << std::endl
                << "[ORT] profile max: "
                << m_handlerOptions.trtProfileMaxShapes << std::endl;
        }
        return true;
    }
    catch (const Ort::Exception& e) {
        std::cerr << "[ORT] TensorRT EP failed: " << e.what() << std::endl;
        return false;
    }
    catch (const std::exception& e) {
        std::cerr << "[ORT] TensorRT EP failed (std): " << e.what() << std::endl;
        return false;
    }
}
bool BasicOrtHandler::TryAppendDirectML(Ort::SessionOptions& session_options)
{
    // Attach DirectML on adapter 0. AppendExecutionProvider("DML") is the
    // correct API for DirectML — there is no V2 variant, so the
    // string-based option map is intentional here.
    const std::unordered_map<std::string, std::string> dmlOptions{
        { "device_id", "0" }
    };
    try {
        session_options.AppendExecutionProvider("DML", dmlOptions);
    }
    catch (const Ort::Exception& e) {
        std::cerr << "[ORT] DirectML EP failed: " << e.what() << std::endl;
        return false;
    }
    std::cout << "[ORT] DirectML EP attached (device 0)." << std::endl;
    return true;
}
// Attach the OpenVINO execution provider, trying device configs in
// priority order (NPU+GPU → discrete GPU → second GPU → GPU+CPU) and
// returning on the first that attaches. Returns false if all fail.
bool BasicOrtHandler::TryAppendOpenVINO(Ort::SessionOptions& session_options)
{
// Use AppendExecutionProvider_OpenVINO_V2 instead of the generic string API,
// matching the pattern used in YOLOOD/YOLO12OD/ANSONNXCL etc.
// Try device configs in priority order, falling back gracefully.
//
// NPU availability is probed once per process. If AUTO:NPU,GPU fails on
// the first call, we skip it for all subsequent models to avoid repeated
// "Failed to load shared library" errors cluttering the log.
// NOTE(review): these probe flags are plain (non-atomic) function-local
// statics — assumed to only be touched from one initialisation thread;
// confirm if handlers can be constructed concurrently.
static bool s_npuProbed = false;
static bool s_npuAvailable = false;
// Fixed EP settings shared by every candidate device config.
const std::string precision = "FP16";
const std::string numberOfThreads = "4";
const std::string numberOfStreams = "4";
auto makeConfig = [&](const std::string& device) {
return std::unordered_map<std::string, std::string>{
{"device_type", device}, {"precision", precision},
{"num_of_threads", numberOfThreads}, {"num_streams", numberOfStreams},
{"enable_opencl_throttling", "False"}, {"enable_qdq_optimizer", "True"}
};
};
// Candidate configs, most preferred first.
std::vector<std::unordered_map<std::string, std::string>> try_configs;
// Only try NPU if it hasn't been probed yet or was previously available
if (!s_npuProbed || s_npuAvailable) {
try_configs.push_back(makeConfig("AUTO:NPU,GPU"));
}
try_configs.push_back(makeConfig("GPU.0"));
try_configs.push_back(makeConfig("GPU.1"));
try_configs.push_back(makeConfig("AUTO:GPU,CPU"));
// First config that attaches wins; a throwing config falls through to
// the next candidate.
for (const auto& config : try_configs) {
try {
session_options.AppendExecutionProvider_OpenVINO_V2(config);
const auto& device = config.at("device_type");
std::cout << "[ORT] OpenVINO EP attached ("
<< device << ", " << precision << ")." << std::endl;
ANS_DBG("OrtHandler", "OpenVINO EP attached: %s", device.c_str());
// If NPU config succeeded, mark it available
if (device.find("NPU") != std::string::npos) {
s_npuProbed = true;
s_npuAvailable = true;
}
return true;
}
catch (const Ort::Exception& e) {
const auto& device = config.at("device_type");
// If NPU config failed, remember so we skip it next time
if (device.find("NPU") != std::string::npos) {
if (!s_npuProbed) {
std::cout << "[ORT] NPU not available — skipping NPU configs for subsequent models." << std::endl;
ANS_DBG("OrtHandler", "NPU not available, will skip in future");
}
s_npuProbed = true;
s_npuAvailable = false;
} else {
std::cerr << "[ORT] OpenVINO EP failed for device "
<< device << ": " << e.what() << std::endl;
}
}
}
// Every candidate threw — the caller will fall back to CPU.
std::cerr << "[ORT] OpenVINO EP: all device configs failed." << std::endl;
return false;
}
// ====================================================================
// initialize_handler
// ====================================================================
// Resolve the execution provider, build session options, attach the EP,
// create the Ort::Session (with CPU fallback and special handling for
// large external-data models), then cache input/output node names and
// shapes for inference.
//
// Fixes over the previous version:
//  1. The "resolved engine" debug line tested m_engineType == -1 AFTER
//     m_engineType had been overwritten with the resolved value, so it
//     always reported "explicit". The auto-detect flag is now captured
//     before the assignment.
//  2. The working directory was only restored on the success path; any
//     exception out of session creation left the process CWD pointing at
//     the model directory. A small RAII guard now restores it on every
//     exit path.
void BasicOrtHandler::initialize_handler()
{
    ANS_DBG("OrtHandler", "initialize_handler: m_engineType=%d", static_cast<int>(m_engineType));
    const auto& epInfo = EPLoader::Current();
    ANS_DBG("OrtHandler", "initialize_handler: EPLoader type=%d dir=%s",
        static_cast<int>(epInfo.type), epInfo.libraryDir.c_str());
    if (Ort::Global<void>::api_ == nullptr)
        Ort::InitApi(static_cast<const OrtApi*>(EPLoader::GetOrtApiRaw()));
    std::cout << "[ORT] api_ = " << (void*)Ort::Global<void>::api_ << std::endl;
    // Capture auto-detect status BEFORE m_engineType is overwritten (fix #1).
    const bool autoDetected = (static_cast<int>(m_engineType) == -1);
    EngineType engine = autoDetected ? epInfo.type : m_engineType;
    // Persist the resolved engine type so subclasses (e.g. ONNXYOLO)
    // can branch on the actual EP at inference time (IoBinding for DML).
    m_engineType = engine;
    ANS_DBG("OrtHandler", "initialize_handler: resolved engine=%d (from %s)",
        static_cast<int>(engine),
        autoDetected ? "EPLoader" : "explicit");
    ort_env = new Ort::Env(ORT_LOGGING_LEVEL_ERROR, log_id);
    memory_info_handler = new Ort::MemoryInfo(
        Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault));
    Ort::SessionOptions session_options;
    session_options.SetIntraOpNumThreads(num_threads);
    // Start with full optimization — will be downgraded to DISABLE_ALL
    // later if we detect a large external data file (e.g. SAM3's 3.3 GB
    // .onnx_data). Normal small models keep ORT_ENABLE_ALL.
    session_options.SetGraphOptimizationLevel(
        GraphOptimizationLevel::ORT_ENABLE_ALL);
    session_options.SetLogSeverityLevel(4);
    // DirectML REQUIRES these two settings per ORT documentation:
    // - DisableMemPattern: DML manages its own memory; ORT's memory
    //   pattern optimization conflicts with DML's D3D12 allocator.
    // - ORT_SEQUENTIAL: DML uses a single command queue and cannot
    //   handle parallel execution mode — doing so causes deadlocks
    //   when synchronizing GPU→CPU data transfers.
    if (engine == EngineType::AMD_GPU) {
        session_options.DisableMemPattern();
        session_options.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL);
        // DirectML 1.15.4 (the latest; Microsoft has moved DirectML into
        // sustained engineering only) has a deterministic crash path in
        // amdkmdag.sys +0xf03d on RDNA2 iGPUs (Radeon 680M on Ryzen 6000)
        // when ORT_ENABLE_ALL applies layout-reorder transforms to
        // YOLO-style conv graphs. Downgrade to EXTENDED on DML: still
        // keeps constant folding and Conv+BN+ReLU fusion (the big wins),
        // drops the risky layout transforms. Perf impact on YOLO is
        // typically under 5%.
        session_options.SetGraphOptimizationLevel(
            GraphOptimizationLevel::ORT_ENABLE_EXTENDED);
        ANS_DBG("OrtHandler",
            "DirectML: DisableMemPattern + ORT_SEQUENTIAL + EXTENDED opt");
    }
    std::vector<std::string> available = Ort::GetAvailableProviders();
    std::cout << "[ORT] Available providers: ";
    for (auto& p : available) std::cout << p << " ";
    std::cout << std::endl;
    auto hasProvider = [&](const std::string& name) -> bool {
        return std::find(available.begin(), available.end(), name)
            != available.end();
    };
    bool epAttached = false;
    switch (engine)
    {
    // --------------------------------------------------------
    case EngineType::NVIDIA_GPU:
        // Try TensorRT EP first when explicitly requested. Falls
        // through to CUDA EP if TRT is missing or option creation
        // fails. Both EPs may be attached at once — ORT picks TRT
        // for nodes it supports and CUDA for the rest.
        if (m_handlerOptions.preferTensorRT
            && hasProvider("TensorrtExecutionProvider")) {
            ANS_DBG("OrtHandler", "Trying TensorRT EP...");
            if (TryAppendTensorRT(session_options)) {
                epAttached = true;
            }
            else {
                std::cerr << "[ORT] TensorRT EP attach failed — "
                    "falling back to CUDA EP." << std::endl;
            }
        }
        ANS_DBG("OrtHandler", "Trying CUDA EP...");
        if (hasProvider("CUDAExecutionProvider")) {
            if (TryAppendCUDA(session_options)) {
                epAttached = true;
            }
        }
        if (!epAttached) {
            std::cerr << "[ORT] CUDA EP unavailable — falling back to CPU."
                << std::endl;
            ANS_DBG("OrtHandler", "CUDA EP FAILED — fallback to CPU");
        }
        break;
    // --------------------------------------------------------
    case EngineType::AMD_GPU:
        ANS_DBG("OrtHandler", "Trying DirectML EP...");
        if (hasProvider("DmlExecutionProvider"))
            epAttached = TryAppendDirectML(session_options);
        if (!epAttached) {
            std::cerr << "[ORT] DirectML EP unavailable — falling back to CPU."
                << std::endl;
            ANS_DBG("OrtHandler", "DirectML EP FAILED — fallback to CPU");
        }
        break;
    // --------------------------------------------------------
    case EngineType::OPENVINO_GPU:
        ANS_DBG("OrtHandler", "Trying OpenVINO EP...");
        if (hasProvider("OpenVINOExecutionProvider"))
            epAttached = TryAppendOpenVINO(session_options);
        if (!epAttached) {
            std::cerr << "[ORT] OpenVINO EP unavailable — falling back to CPU."
                << std::endl;
            ANS_DBG("OrtHandler", "OpenVINO EP FAILED — fallback to CPU");
        }
        break;
    // --------------------------------------------------------
    case EngineType::CPU:
    default:
        std::cout << "[ORT] Using CPU EP." << std::endl;
        ANS_DBG("OrtHandler", "Using CPU EP");
        epAttached = true;
        break;
    }
    if (!epAttached) {
        std::cout << "[ORT] Running on CPU EP (fallback)." << std::endl;
        ANS_DBG("OrtHandler", "EP not attached — running on CPU fallback");
    } else {
        ANS_DBG("OrtHandler", "EP attached successfully");
    }
    // ----------------------------------------------------------------
    // Create session
    // ----------------------------------------------------------------
    // ORT resolves external data files (e.g. .onnx_data) relative to
    // the CWD rather than the model file's directory. Temporarily
    // switch CWD so ORT can locate them.
    //
    // Additionally, ORT's internal memory-mapping of very large
    // external data files (>2 GB) can crash with an access violation
    // on Windows. When we detect a large .onnx_data file, we
    // pre-load it with standard file I/O and pass the buffer via
    // AddExternalInitializersFromFilesInMemory() so ORT never
    // memory-maps the file itself.
    // ----------------------------------------------------------------
    std::filesystem::path modelFsPath(onnx_path); // wchar_t*
    std::filesystem::path modelDir = modelFsPath.parent_path();
    std::filesystem::path prevCwd = std::filesystem::current_path();
    // RAII guard restoring the CWD on every exit path (fix #2). The
    // explicit restore further down still runs first on success; this
    // guard then restores the same path again, which is harmless.
    struct CwdRestorer {
        std::filesystem::path saved;
        ~CwdRestorer() {
            std::error_code ec;
            std::filesystem::current_path(saved, ec); // best-effort
        }
    } cwdRestorer{ prevCwd };
    if (!modelDir.empty() && std::filesystem::is_directory(modelDir)) {
        std::filesystem::current_path(modelDir);
        std::cout << "[ORT] CWD -> " << modelDir.string() << std::endl;
    }
    // --- Pre-load external data files if they exist -----------------
    // Keep the buffer alive across session creation (must outlive the
    // Ort::Session constructor call).
    std::vector<char> extDataBuffer;
    {
        // Build the expected external-data filename:
        //   <model_stem>.onnx_data  (e.g. anssam3.onnx_data)
        std::filesystem::path extDataPath =
            modelDir / (modelFsPath.stem().wstring() + L".onnx_data");
        if (std::filesystem::exists(extDataPath)) {
            auto fileSize = std::filesystem::file_size(extDataPath);
            std::cout << "[ORT] External data file found: "
                << extDataPath.string()
                << " (" << (fileSize / (1024*1024)) << " MB)" << std::endl;
            // Read entire file into memory with standard I/O.
            // This avoids ORT's internal memory-mapping which can crash
            // with access violation for files > 2 GB on Windows.
            try {
                std::ifstream ifs(extDataPath, std::ios::binary);
                if (!ifs) {
                    std::cerr << "[ORT] ERROR: Could not open external data file."
                        << std::endl;
                } else {
                    extDataBuffer.resize(static_cast<size_t>(fileSize));
                    std::cout << "[ORT] Reading external data into memory..."
                        << std::endl;
                    ifs.read(extDataBuffer.data(), static_cast<std::streamsize>(fileSize));
                    ifs.close();
                    std::cout << "[ORT] External data loaded ("
                        << extDataBuffer.size() << " bytes)." << std::endl;
                    // Tell ORT to use our in-memory buffer instead of
                    // memory-mapping the file.
                    std::vector<std::basic_string<ORTCHAR_T>> extFileNames = {
                        extDataPath.filename().wstring()
                    };
                    std::vector<char*> extBuffers = { extDataBuffer.data() };
                    std::vector<size_t> extLengths = { extDataBuffer.size() };
                    session_options.AddExternalInitializersFromFilesInMemory(
                        extFileNames, extBuffers, extLengths);
                    std::cout << "[ORT] External initializers registered."
                        << std::endl;
                    // Large external-data models crash ORT's CUDA graph
                    // optimization passes. Disable all optimization for
                    // these models only. Normal small models (SCRFD, YOLO,
                    // GlintArcFace, etc.) keep ORT_ENABLE_ALL.
                    session_options.SetGraphOptimizationLevel(
                        GraphOptimizationLevel::ORT_DISABLE_ALL);
                    std::cout << "[ORT] Graph optimization set to DISABLE_ALL "
                        "(large external data detected)." << std::endl;
                }
            }
            catch (const std::bad_alloc&) {
                std::cerr << "[ORT] WARNING: Could not allocate "
                    << (fileSize / (1024*1024)) << " MB for external data. "
                    << "Falling back to ORT file mapping." << std::endl;
                extDataBuffer.clear();
                extDataBuffer.shrink_to_fit();
            }
        }
    }
    // --- Load the .onnx model file into a memory buffer too ----------
    // This avoids ORT opening/mapping ANY files during CreateSession.
    std::vector<char> modelBuffer;
    bool useModelBuffer = false;
    if (!extDataBuffer.empty()) {
        // External data was pre-loaded, so also load the .onnx itself
        try {
            auto modelFileSize = std::filesystem::file_size(modelFsPath);
            modelBuffer.resize(static_cast<size_t>(modelFileSize));
            std::ifstream mifs(modelFsPath, std::ios::binary);
            if (mifs) {
                mifs.read(modelBuffer.data(), static_cast<std::streamsize>(modelFileSize));
                mifs.close();
                useModelBuffer = true;
                std::cout << "[ORT] Model proto loaded into memory ("
                    << modelBuffer.size() << " bytes)." << std::endl;
            }
        }
        catch (const std::exception& e) {
            std::cerr << "[ORT] WARNING: Could not read model file into memory: "
                << e.what() << ". Using file path." << std::endl;
        }
    }
    // --- Attempt session creation (with CUDA → CPU fallback) --------
    auto createSession = [&](Ort::SessionOptions& opts, const char* label) {
        std::cout << "[ORT] Creating session (" << label << ")..." << std::endl;
        if (useModelBuffer) {
            ort_session = new Ort::Session(*ort_env,
                modelBuffer.data(), modelBuffer.size(), opts);
        } else {
            ort_session = new Ort::Session(*ort_env, onnx_path, opts);
        }
        std::cout << "[ORT] Session created OK (" << label << ")." << std::endl;
    };
    ANS_DBG("OrtHandler", "Creating session for model: %ls", onnx_path);
    try {
        createSession(session_options, "primary EP");
        ANS_DBG("OrtHandler", "Session created OK with primary EP");
    }
    catch (const Ort::Exception& e) {
        ANS_DBG("OrtHandler", "Session FAILED with primary EP: %s", e.what());
        std::cerr << "[ORT] Session creation FAILED with primary EP: "
            << e.what() << std::endl;
        // If we were using a GPU EP, fall back to CPU
        if (engine != EngineType::CPU && epAttached) {
            ANS_DBG("OrtHandler", "Retrying with CPU fallback...");
            std::cerr << "[ORT] Retrying with CPU EP (fallback)..." << std::endl;
            // Build fresh session options — no GPU EP, no graph opt
            Ort::SessionOptions cpuOpts;
            cpuOpts.SetIntraOpNumThreads(num_threads);
            cpuOpts.SetGraphOptimizationLevel(
                GraphOptimizationLevel::ORT_DISABLE_ALL);
            cpuOpts.SetLogSeverityLevel(4);
            // Re-register the in-memory external data if we have it
            if (!extDataBuffer.empty()) {
                std::filesystem::path extDataPath =
                    modelDir / (modelFsPath.stem().wstring() + L".onnx_data");
                std::vector<std::basic_string<ORTCHAR_T>> extFileNames = {
                    extDataPath.filename().wstring()
                };
                std::vector<char*> extBuffers = { extDataBuffer.data() };
                std::vector<size_t> extLengths = { extDataBuffer.size() };
                cpuOpts.AddExternalInitializersFromFilesInMemory(
                    extFileNames, extBuffers, extLengths);
            }
            createSession(cpuOpts, "CPU fallback");
        } else {
            throw; // re-throw if already on CPU (CwdRestorer restores CWD)
        }
    }
    catch (const std::exception& e) {
        ANS_DBG("OrtHandler", "Session FAILED (std::exception): %s", e.what());
        std::cerr << "[ORT] Session creation FAILED (std::exception): "
            << e.what() << std::endl;
        throw; // CwdRestorer restores CWD
    }
    // Restore previous CWD & release buffers
    std::filesystem::current_path(prevCwd);
    extDataBuffer.clear();
    extDataBuffer.shrink_to_fit();
    modelBuffer.clear();
    modelBuffer.shrink_to_fit();
    Ort::Allocator allocator(*ort_session, *memory_info_handler);
    std::cout << "[ORT] Allocator created OK." << std::endl;
    // Input: cache name, dims and flattened element count (dynamic dims
    // <= 0 are skipped in the product).
    input_node_names.resize(1);
    input_node_names_.resize(1);
    input_node_names_[0] = OrtCompatiableGetInputName(0, allocator, ort_session);
    input_node_names[0] = input_node_names_[0].data();
    Ort::TypeInfo type_info = ort_session->GetInputTypeInfo(0);
    auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
    input_tensor_size = 1;
    input_node_dims = tensor_info.GetShape();
    for (auto dim : input_node_dims) {
        if (dim > 0) input_tensor_size *= static_cast<size_t>(dim);
    }
    input_values_handler.resize(input_tensor_size);
    // Outputs: cache every output's name and shape.
    num_outputs = static_cast<int>(ort_session->GetOutputCount());
    output_node_names.resize(num_outputs);
    output_node_names_.resize(num_outputs);
    for (int i = 0; i < num_outputs; ++i) {
        output_node_names_[i] =
            OrtCompatiableGetOutputName(i, allocator, ort_session);
        output_node_names[i] = output_node_names_[i].data();
        output_node_dims.push_back(
            ort_session->GetOutputTypeInfo(i)
                .GetTensorTypeAndShapeInfo().GetShape());
    }
}
// ====================================================================
// GlintArcFace
// ====================================================================
Ort::Value GlintArcFace::transform(const cv::Mat& mat)
{
    // Preprocess one face crop: resize to the network's HxW, BGR→RGB,
    // float32, normalize in place, then wrap as a CHW tensor. A dynamic
    // batch dim (-1) is pinned to 1.
    if (mat.empty())
        throw std::runtime_error("GlintArcFace::transform — input is empty.");
    const int net_w = static_cast<int>(input_node_dims.at(3));
    const int net_h = static_cast<int>(input_node_dims.at(2));
    cv::Mat rgb;
    cv::resize(mat, rgb, cv::Size(net_w, net_h));
    cv::cvtColor(rgb, rgb, cv::COLOR_BGR2RGB);
    if (rgb.type() != CV_32FC3)
        rgb.convertTo(rgb, CV_32FC3);
    utils::transform::normalize_inplace(rgb, mean_val, scale_val);
    std::vector<int64_t> tensor_shape = input_node_dims;
    if (tensor_shape[0] == -1)
        tensor_shape[0] = 1;
    return utils::transform::create_tensor(
        rgb, tensor_shape, *memory_info_handler,
        input_values_handler, utils::transform::CHW);
}
// Preprocess every crop (resize → RGB → float32 → normalize) and pack the
// whole batch into one NCHW tensor.
// @param images input face crops; all must be non-empty
// @throws std::runtime_error on an empty batch or an empty image
//
// Rewritten to mirror GlintCosFace::transformBatch: a fresh mat per
// iteration instead of three scratch mats reused across iterations, and
// the redundant post-normalize clone removed (the mat is moved into the
// batch instead). Pixel output is identical.
Ort::Value GlintArcFace::transformBatch(const std::vector<cv::Mat>& images)
{
    if (images.empty())
        throw std::runtime_error("GlintArcFace::transformBatch — batch is empty.");
    const int width = input_node_dims.at(3);
    const int height = input_node_dims.at(2);
    std::vector<cv::Mat> batch;
    batch.reserve(images.size());
    for (const auto& mat : images) {
        if (mat.empty())
            throw std::runtime_error("GlintArcFace::transformBatch — empty image in batch.");
        cv::Mat canvas;
        cv::resize(mat, canvas, cv::Size(width, height));
        cv::cvtColor(canvas, canvas, cv::COLOR_BGR2RGB);
        if (canvas.type() != CV_32FC3)
            canvas.convertTo(canvas, CV_32FC3);
        utils::transform::normalize_inplace(canvas, mean_val, scale_val);
        batch.push_back(std::move(canvas));
    }
    std::vector<int64_t> shape = input_node_dims;
    shape[0] = static_cast<int64_t>(images.size());
    return utils::transform::create_tensor_batch(
        batch, shape, *memory_info_handler,
        input_values_handler, utils::transform::CHW);
}
void GlintArcFace::detect(const cv::Mat& mat, types::FaceContent& face_content)
{
    // Run a single face crop through the network and store the
    // L2-normalised embedding into face_content. No-op on empty input.
    if (mat.empty()) return;
    Ort::Value input_tensor = transform(mat);
    auto outputs = ort_session->Run(
        Ort::RunOptions{ nullptr },
        input_node_names.data(), &input_tensor, 1,
        output_node_names.data(), num_outputs);
    const auto dim =
        static_cast<unsigned int>(output_node_dims.at(0).at(1));
    const float* raw = outputs.at(0).GetTensorMutableData<float>();
    std::vector<float> emb(raw, raw + dim);
    cv::normalize(emb, emb);
    face_content.embedding = std::move(emb);
    face_content.dim = dim;
    face_content.flag = true;
}
// Batched variant of detect(): runs all crops through ONE session Run and
// L2-normalises each row of the [batch, hidden_dim] output.
// @param images input face crops (resized/converted in transformBatch)
// @param face_contents filled with one embedding per input image
// @throws Ort::Exception re-thrown after clearing any partial results
void GlintArcFace::detectBatch(const std::vector<cv::Mat>& images,
std::vector<types::FaceContent>& face_contents)
{
if (images.empty()) return;
const size_t batch_size = images.size();
face_contents.clear();
face_contents.reserve(batch_size);
try {
Ort::Value input_tensor = transformBatch(images);
auto output_tensors = ort_session->Run(
Ort::RunOptions{ nullptr },
input_node_names.data(), &input_tensor, 1,
output_node_names.data(), num_outputs);
// Release the batch-sized preprocessing buffer right after Run.
// NOTE(review): assumes utils::transform re-grows this buffer on the
// next create_tensor/create_tensor_batch call — confirm.
input_values_handler.clear();
input_values_handler.shrink_to_fit();
const float* vals = output_tensors[0].GetTensorData<float>();
const unsigned int hidden_dim =
static_cast<unsigned int>(output_node_dims.at(0).at(1));
face_contents.resize(batch_size);
for (size_t i = 0; i < batch_size; ++i) {
// Wrap row i of the output without copying, then L2-normalise.
cv::Mat emb_mat(1, hidden_dim, CV_32F,
const_cast<float*>(vals + i * hidden_dim));
cv::Mat emb_norm;
cv::normalize(emb_mat, emb_norm);
face_contents[i].embedding = std::vector<float>(
emb_norm.begin<float>(), emb_norm.end<float>());
face_contents[i].dim = hidden_dim;
face_contents[i].flag = true;
}
}
catch (const Ort::Exception&) {
// Never hand back a partially-filled batch.
face_contents.clear();
throw;
}
}
// ====================================================================
// GlintCosFace
// ====================================================================
Ort::Value GlintCosFace::transform(const cv::Mat& mat)
{
    // Preprocess one face crop: resize to net input, BGR→RGB, float32,
    // normalize, then wrap as a CHW tensor (dynamic batch pinned to 1).
    if (mat.empty())
        throw std::runtime_error("GlintCosFace::transform — input is empty.");
    const cv::Size net_size(input_node_dims.at(3), input_node_dims.at(2));
    cv::Mat rgb;
    cv::resize(mat, rgb, net_size);
    cv::cvtColor(rgb, rgb, cv::COLOR_BGR2RGB);
    rgb.convertTo(rgb, CV_32FC3);
    utils::transform::normalize_inplace(rgb, mean_val, scale_val);
    std::vector<int64_t> tensor_shape = input_node_dims;
    if (tensor_shape[0] == -1)
        tensor_shape[0] = 1;
    return utils::transform::create_tensor(
        rgb, tensor_shape, *memory_info_handler,
        input_values_handler, utils::transform::CHW);
}
Ort::Value GlintCosFace::transformBatch(const std::vector<cv::Mat>& images)
{
    // Preprocess each crop and pack the whole batch into one NCHW tensor.
    // Throws on an empty batch or any empty image.
    if (images.empty())
        throw std::runtime_error("GlintCosFace::transformBatch — batch is empty.");
    const cv::Size net_size(input_node_dims.at(3), input_node_dims.at(2));
    std::vector<cv::Mat> prepared;
    prepared.reserve(images.size());
    for (const auto& img : images) {
        if (img.empty())
            throw std::runtime_error("GlintCosFace::transformBatch — empty image in batch.");
        cv::Mat work;
        cv::resize(img, work, net_size);
        cv::cvtColor(work, work, cv::COLOR_BGR2RGB);
        work.convertTo(work, CV_32FC3);
        utils::transform::normalize_inplace(work, mean_val, scale_val);
        prepared.emplace_back(std::move(work));
    }
    std::vector<int64_t> tensor_shape = input_node_dims;
    tensor_shape[0] = static_cast<int64_t>(images.size());
    return utils::transform::create_tensor_batch(
        prepared, tensor_shape, *memory_info_handler,
        input_values_handler, utils::transform::CHW);
}
void GlintCosFace::detect(const cv::Mat& mat, types::FaceContent& face_content)
{
    // Single-crop embedding extraction with L2 normalisation.
    // No-op on empty input.
    if (mat.empty()) return;
    Ort::Value input_tensor = transform(mat);
    auto outputs = ort_session->Run(
        Ort::RunOptions{ nullptr },
        input_node_names.data(), &input_tensor, 1,
        output_node_names.data(), num_outputs);
    const auto dim =
        static_cast<unsigned int>(output_node_dims.at(0).at(1));
    const float* raw = outputs.at(0).GetTensorMutableData<float>();
    std::vector<float> emb(raw, raw + dim);
    cv::normalize(emb, emb);
    face_content.embedding = std::move(emb);
    face_content.dim = dim;
    face_content.flag = true;
}
void GlintCosFace::detectBatch(const std::vector<cv::Mat>& images,
    std::vector<types::FaceContent>& face_contents)
{
    // Batched embedding extraction: one session Run, then one
    // L2-normalised FaceContent per input image.
    if (images.empty()) return;
    face_contents.clear();
    face_contents.reserve(images.size());
    Ort::Value input_tensor = transformBatch(images);
    auto outputs = ort_session->Run(
        Ort::RunOptions{ nullptr },
        input_node_names.data(), &input_tensor, 1,
        output_node_names.data(), num_outputs);
    const float* raw = outputs.at(0).GetTensorMutableData<float>();
    const auto dim =
        static_cast<unsigned int>(output_node_dims.at(0).at(1));
    for (size_t i = 0; i < images.size(); ++i) {
        // Row i of the [batch, dim] output.
        const float* first = raw + i * dim;
        std::vector<float> emb(first, first + dim);
        cv::normalize(emb, emb);
        types::FaceContent fc;
        fc.embedding = std::move(emb);
        fc.dim = dim;
        fc.flag = true;
        face_contents.emplace_back(std::move(fc));
    }
}
// ====================================================================
// SCRFD — constructors
// ====================================================================
// Construct an SCRFD face detector with the default execution provider.
// @param _onnx_path   path to the SCRFD ONNX model file
// @param _num_threads intra-op thread count forwarded to ORT
SCRFD::SCRFD(const std::string& _onnx_path, unsigned int _num_threads)
    : BasicOrtHandler(_onnx_path, _num_threads)
{
    // Derive the FPN head layout from the output count discovered by the base class.
    initial_context();
}
// Construct an SCRFD face detector on a specific execution provider.
// @param _onnx_path   path to the SCRFD ONNX model file
// @param engineType   execution provider selection forwarded to the base handler
// @param _num_threads intra-op thread count forwarded to ORT
SCRFD::SCRFD(const std::string& _onnx_path,
    EngineType engineType,
    unsigned int _num_threads)
    : BasicOrtHandler(_onnx_path, engineType, _num_threads)
{
    // Derive the FPN head layout from the output count discovered by the base class.
    initial_context();
}
void SCRFD::initial_context()
{
    // Configure the FPN head layout from the number of model outputs
    // discovered at session-load time:
    //   6 outputs -> score + box heads per stride (no keypoints)
    //   9 outputs -> score + box + landmark heads per stride
    switch (num_outputs) {
    case 6:
        fmc = 3;
        feat_stride_fpn = { 8, 16, 32 };
        num_anchors = 2;
        use_kps = false;
        break;
    case 9:
        fmc = 3;
        feat_stride_fpn = { 8, 16, 32 };
        num_anchors = 2;
        use_kps = true;
        break;
    default:
        // Unknown layout: leave the members at their prior values,
        // matching the original behaviour.
        break;
    }
}
void SCRFD::resize_unscale(const cv::Mat& mat, cv::Mat& mat_rs,
    int target_height, int target_width,
    SCRFDScaleParams& scale_params)
{
    // Letterbox resize: shrink the image to fit the target canvas while
    // keeping aspect ratio, centre it, and pad the border with black.
    // The ratio and offsets are recorded in `scale_params` so detections
    // can later be mapped back to the source image.
    if (mat.empty()) return;

    const float scale = std::min(
        static_cast<float>(target_width) / mat.cols,
        static_cast<float>(target_height) / mat.rows);
    const int scaled_w = static_cast<int>(mat.cols * scale);
    const int scaled_h = static_cast<int>(mat.rows * scale);
    const int pad_x = (target_width - scaled_w) / 2;
    const int pad_y = (target_height - scaled_h) / 2;

    mat_rs = cv::Mat(target_height, target_width, CV_8UC3, cv::Scalar(0, 0, 0));
    cv::Mat shrunk;
    cv::resize(mat, shrunk, cv::Size(scaled_w, scaled_h));
    shrunk.copyTo(mat_rs(cv::Rect(pad_x, pad_y, scaled_w, scaled_h)));

    scale_params.ratio = scale;
    scale_params.dw = pad_x;
    scale_params.dh = pad_y;
    scale_params.flag = true;
}
Ort::Value SCRFD::transform(const cv::Mat& mat_rs)
{
    // BGR -> RGB, per-channel normalisation, then pack into a CHW
    // input tensor. Works on a copy so the caller's mat is untouched.
    cv::Mat rgb = mat_rs.clone();
    cv::cvtColor(rgb, rgb, cv::COLOR_BGR2RGB);
    utils::transform::normalize_inplace(rgb, mean_vals, scale_vals);
    return utils::transform::create_tensor(
        rgb, input_node_dims, *memory_info_handler,
        input_values_handler, utils::transform::CHW);
}
Ort::Value SCRFD::transformBatch(const std::vector<cv::Mat>& images)
{
    // Preprocess every image (resize -> RGB -> float -> normalise) and
    // pack them all into a single [N,C,H,W] tensor.
    // Throws std::runtime_error on an empty batch or an empty image.
    if (images.empty())
        throw std::runtime_error("SCRFD::transformBatch — batch is empty.");

    const cv::Size net_size(
        static_cast<int>(input_node_dims.at(3)),
        static_cast<int>(input_node_dims.at(2)));

    std::vector<cv::Mat> prepared;
    prepared.reserve(images.size());
    for (const auto& image : images) {
        if (image.empty())
            throw std::runtime_error("SCRFD::transformBatch — empty image in batch.");
        cv::Mat work;
        cv::resize(image, work, net_size);
        cv::cvtColor(work, work, cv::COLOR_BGR2RGB);
        work.convertTo(work, CV_32FC3);
        utils::transform::normalize_inplace(work, mean_vals, scale_vals);
        prepared.push_back(std::move(work));
    }

    // Same dims as a single input, with the batch dimension widened.
    std::vector<int64_t> batch_dims = input_node_dims;
    batch_dims[0] = static_cast<int64_t>(images.size());
    return utils::transform::create_tensor_batch(
        prepared, batch_dims, *memory_info_handler,
        input_values_handler, utils::transform::CHW);
}
void SCRFD::detect(const cv::Mat& mat,
    std::vector<types::BoxfWithLandmarks>& detected_boxes_kps,
    float score_threshold, float iou_threshold, unsigned int topk)
{
    // Full detection pipeline: letterbox -> forward pass -> per-stride
    // decoding -> NMS. Kept detections land in `detected_boxes_kps`.
    if (mat.empty()) return;

    const float src_h = static_cast<float>(mat.rows);
    const float src_w = static_cast<float>(mat.cols);
    const int net_h = static_cast<int>(input_node_dims.at(2));
    const int net_w = static_cast<int>(input_node_dims.at(3));

    SCRFDScaleParams scale_params;
    cv::Mat letterboxed;
    resize_unscale(mat, letterboxed, net_h, net_w, scale_params);

    Ort::Value input_tensor = transform(letterboxed);
    auto output_tensors = ort_session->Run(
        Ort::RunOptions{ nullptr },
        input_node_names.data(), &input_tensor, 1,
        output_node_names.data(), num_outputs);

    std::vector<types::BoxfWithLandmarks> candidates;
    generate_bboxes_kps(scale_params, candidates, output_tensors,
        score_threshold, src_h, src_w);
    nms_bboxes_kps(candidates, detected_boxes_kps, iou_threshold, topk);
}
void SCRFD::generate_points(int target_height, int target_width)
{
    // Lazily build the anchor-centre grid for every FPN stride.
    // Each grid cell is repeated `num_anchors` times so the point list
    // stays 1:1 with the flattened network predictions. The grid is
    // built once and cached (guarded by center_points_is_update).
    if (center_points_is_update) return;

    for (auto stride : feat_stride_fpn) {
        const unsigned int cols = target_width / stride;
        const unsigned int rows = target_height / stride;
        auto& points = center_points[stride];
        for (unsigned int row = 0; row < rows; ++row) {
            for (unsigned int col = 0; col < cols; ++col) {
                for (unsigned int a = 0; a < num_anchors; ++a) {
                    SCRFDPoint pt;
                    pt.cx = static_cast<float>(col);
                    pt.cy = static_cast<float>(row);
                    pt.stride = static_cast<float>(stride);
                    points.push_back(pt);
                }
            }
        }
    }
    center_points_is_update = true;
}
// Decode all strides' raw outputs into image-space detections.
// Output tensor ordering is stride-major: indices 0..2 are the score
// heads for strides 8/16/32, 3..5 the matching box heads, and — for
// 9-output models (use_kps) — 6..8 the keypoint heads.
// NOTE(review): this ordering is assumed from the exported model;
// confirm against the ONNX graph if the model is re-exported.
void SCRFD::generate_bboxes_kps(const SCRFDScaleParams& scale_params,
    std::vector<types::BoxfWithLandmarks>& bbox_kps_collection,
    std::vector<Ort::Value>& output_tensors,
    float score_threshold,
    float img_height, float img_width)
{
    // The anchor grid is generated for the network input size (cached
    // after the first call).
    const float input_height = static_cast<float>(input_node_dims.at(2));
    const float input_width = static_cast<float>(input_node_dims.at(3));
    generate_points(static_cast<int>(input_height),
        static_cast<int>(input_width));
    bbox_kps_collection.clear();
    if (use_kps) {
        // score / box / keypoint heads for each stride.
        generate_bboxes_kps_single_stride(scale_params,
            output_tensors.at(0), output_tensors.at(3), output_tensors.at(6),
            8, score_threshold, img_height, img_width, bbox_kps_collection);
        generate_bboxes_kps_single_stride(scale_params,
            output_tensors.at(1), output_tensors.at(4), output_tensors.at(7),
            16, score_threshold, img_height, img_width, bbox_kps_collection);
        generate_bboxes_kps_single_stride(scale_params,
            output_tensors.at(2), output_tensors.at(5), output_tensors.at(8),
            32, score_threshold, img_height, img_width, bbox_kps_collection);
    }
    else {
        // score / box heads only (6-output models).
        generate_bboxes_single_stride(scale_params,
            output_tensors.at(0), output_tensors.at(3),
            8, score_threshold, img_height, img_width, bbox_kps_collection);
        generate_bboxes_single_stride(scale_params,
            output_tensors.at(1), output_tensors.at(4),
            16, score_threshold, img_height, img_width, bbox_kps_collection);
        generate_bboxes_single_stride(scale_params,
            output_tensors.at(2), output_tensors.at(5),
            32, score_threshold, img_height, img_width, bbox_kps_collection);
    }
}
// Decode one stride's score/box heads into image-space boxes.
// The number of anchor points is taken from dim 1 of the score tensor;
// bbox_pred supplies 4 values per point (distances from the anchor
// centre, in stride units — presumably left/top/right/bottom; confirm
// against the SCRFD export). Decoded boxes are un-letterboxed via
// scale_params, clipped to the source image, and appended to the
// cross-stride collection.
void SCRFD::generate_bboxes_single_stride(
    const SCRFDScaleParams& scale_params,
    Ort::Value& score_pred, Ort::Value& bbox_pred,
    unsigned int stride, float score_threshold,
    float img_height, float img_width,
    std::vector<types::BoxfWithLandmarks>& bbox_kps_collection)
{
    // Allow proportionally more pre-NMS candidates at larger strides.
    unsigned int nms_pre_ = std::max(nms_pre, (stride / 8) * nms_pre);
    auto stride_dims = score_pred.GetTypeInfo().GetTensorTypeAndShapeInfo().GetShape();
    const unsigned int num_points = static_cast<unsigned int>(stride_dims.at(1));
    const float* score_ptr = score_pred.GetTensorMutableData<float>();
    const float* bbox_ptr = bbox_pred.GetTensorMutableData<float>();
    // Letterbox parameters used to map back to source-image coordinates.
    float ratio = scale_params.ratio;
    int dw = scale_params.dw;
    int dh = scale_params.dh;
    unsigned int count = 0;
    auto& stride_points = center_points[stride];
    for (unsigned int i = 0; i < num_points; ++i) {
        if (score_ptr[i] < score_threshold) continue;
        const auto& point = stride_points.at(i);
        const float* offsets = bbox_ptr + i * 4;
        // Distance-to-box decode, then undo padding/scale.
        float x1 = ((point.cx - offsets[0]) * point.stride - dw) / ratio;
        float y1 = ((point.cy - offsets[1]) * point.stride - dh) / ratio;
        float x2 = ((point.cx + offsets[2]) * point.stride - dw) / ratio;
        float y2 = ((point.cy + offsets[3]) * point.stride - dh) / ratio;
        types::BoxfWithLandmarks box_kps;
        // Clip to the source image bounds.
        box_kps.box.x1 = std::max(0.f, x1);
        box_kps.box.y1 = std::max(0.f, y1);
        box_kps.box.x2 = std::min(img_width - 1.f, x2);
        box_kps.box.y2 = std::min(img_height - 1.f, y2);
        box_kps.box.score = score_ptr[i];
        box_kps.box.label = 1;
        box_kps.box.label_text = "face";
        box_kps.box.flag = true;
        box_kps.flag = true;
        bbox_kps_collection.push_back(box_kps);
        // Hard cap on candidates accepted for this stride (note: the
        // break fires after max_nms has been exceeded, not reached).
        if (++count > max_nms) break;
    }
    // Truncate the whole running collection (all strides so far) to the
    // best nms_pre_ candidates by score.
    if (bbox_kps_collection.size() > nms_pre_) {
        std::sort(bbox_kps_collection.begin(), bbox_kps_collection.end(),
            [](const types::BoxfWithLandmarks& a, const types::BoxfWithLandmarks& b) {
                return a.box.score > b.box.score; });
        bbox_kps_collection.resize(nms_pre_);
    }
}
// Decode one stride's score/box/keypoint heads into image-space boxes
// with 5-point landmarks. Identical to generate_bboxes_single_stride
// plus keypoint decoding: kps_pred supplies 10 floats per anchor point
// (presumably 5 x/y offsets from the anchor centre — confirm against
// the SCRFD export).
void SCRFD::generate_bboxes_kps_single_stride(
    const SCRFDScaleParams& scale_params,
    Ort::Value& score_pred, Ort::Value& bbox_pred, Ort::Value& kps_pred,
    unsigned int stride, float score_threshold,
    float img_height, float img_width,
    std::vector<types::BoxfWithLandmarks>& bbox_kps_collection)
{
    // Allow proportionally more pre-NMS candidates at larger strides.
    unsigned int nms_pre_ = std::max(nms_pre, (stride / 8) * nms_pre);
    auto stride_dims = score_pred.GetTypeInfo().GetTensorTypeAndShapeInfo().GetShape();
    const unsigned int num_points = static_cast<unsigned int>(stride_dims.at(1));
    const float* score_ptr = score_pred.GetTensorMutableData<float>();
    const float* bbox_ptr = bbox_pred.GetTensorMutableData<float>();
    const float* kps_ptr = kps_pred.GetTensorMutableData<float>();
    // Letterbox parameters used to map back to source-image coordinates.
    float ratio = scale_params.ratio;
    int dw = scale_params.dw;
    int dh = scale_params.dh;
    unsigned int count = 0;
    auto& stride_points = center_points[stride];
    for (unsigned int i = 0; i < num_points; ++i) {
        if (score_ptr[i] < score_threshold) continue;
        const auto& point = stride_points.at(i);
        const float* offsets = bbox_ptr + i * 4;
        // Distance-to-box decode, then undo padding/scale.
        float x1 = ((point.cx - offsets[0]) * point.stride - dw) / ratio;
        float y1 = ((point.cy - offsets[1]) * point.stride - dh) / ratio;
        float x2 = ((point.cx + offsets[2]) * point.stride - dw) / ratio;
        float y2 = ((point.cy + offsets[3]) * point.stride - dh) / ratio;
        types::BoxfWithLandmarks box_kps;
        // Clip to the source image bounds.
        box_kps.box.x1 = std::max(0.f, x1);
        box_kps.box.y1 = std::max(0.f, y1);
        box_kps.box.x2 = std::min(img_width - 1.f, x2);
        box_kps.box.y2 = std::min(img_height - 1.f, y2);
        box_kps.box.score = score_ptr[i];
        box_kps.box.label = 1;
        box_kps.box.label_text = "face";
        box_kps.box.flag = true;
        // Decode the 5 landmark points, each clamped to image bounds.
        const float* kps_offsets = kps_ptr + i * 10;
        for (unsigned int j = 0; j < 10; j += 2) {
            cv::Point2f kp;
            kp.x = std::min(std::max(0.f,
                ((point.cx + kps_offsets[j]) * point.stride - dw) / ratio),
                img_width - 1.f);
            kp.y = std::min(std::max(0.f,
                ((point.cy + kps_offsets[j + 1]) * point.stride - dh) / ratio),
                img_height - 1.f);
            box_kps.landmarks.points.push_back(kp);
        }
        box_kps.landmarks.flag = true;
        box_kps.flag = true;
        bbox_kps_collection.push_back(box_kps);
        // Hard cap on candidates accepted for this stride (note: the
        // break fires after max_nms has been exceeded, not reached).
        if (++count > max_nms) break;
    }
    // Truncate the whole running collection (all strides so far) to the
    // best nms_pre_ candidates by score.
    if (bbox_kps_collection.size() > nms_pre_) {
        std::sort(bbox_kps_collection.begin(), bbox_kps_collection.end(),
            [](const types::BoxfWithLandmarks& a, const types::BoxfWithLandmarks& b) {
                return a.box.score > b.box.score; });
        bbox_kps_collection.resize(nms_pre_);
    }
}
void SCRFD::nms_bboxes_kps(std::vector<types::BoxfWithLandmarks>& input,
    std::vector<types::BoxfWithLandmarks>& output,
    float iou_threshold, unsigned int topk)
{
    // Greedy hard-NMS: repeatedly keep the best-scoring remaining box
    // and suppress every later box overlapping it above the threshold.
    // Stops once `topk` boxes have been kept. `input` is sorted in place.
    if (input.empty()) return;

    std::sort(input.begin(), input.end(),
        [](const types::BoxfWithLandmarks& a, const types::BoxfWithLandmarks& b) {
            return a.box.score > b.box.score; });

    const unsigned int total = static_cast<unsigned int>(input.size());
    std::vector<int> suppressed(total, 0);
    unsigned int kept = 0;
    for (unsigned int i = 0; i < total; ++i) {
        if (suppressed[i]) continue;
        suppressed[i] = 1;
        output.push_back(input[i]);
        for (unsigned int j = i + 1; j < total; ++j) {
            if (suppressed[j]) continue;
            if (input[i].box.iou_of(input[j].box) > iou_threshold)
                suppressed[j] = 1;
        }
        if (++kept >= topk) break;
    }
}
// ====================================================================
// MOVINET
// ====================================================================
// Construct a MoViNet action recogniser with default input parameters
// and the default execution provider.
MOVINET::MOVINET(const std::string& _onnx_path, unsigned int _num_threads)
    : BasicOrtHandler(_onnx_path, _num_threads)
{
    // Cache the model's input/output tensor names for Run().
    init_io_names();
}
// Construct a MoViNet action recogniser with explicit clip geometry.
// @param _temporal number of frames per clip (T)
// @param _width    frame width (W)
// @param _height   frame height (H)
// @param _channels colour channels per frame (C)
MOVINET::MOVINET(const std::string& _onnx_path,
    int _temporal, int _width, int _height, int _channels,
    unsigned int _num_threads)
    : BasicOrtHandler(_onnx_path, _num_threads)
{
    input_params.temporal = _temporal;
    input_params.width = _width;
    input_params.height = _height;
    input_params.channels = _channels;
    // Cache the model's input/output tensor names for Run().
    init_io_names();
}
// Construct a MoViNet action recogniser on a specific execution
// provider, keeping the default input parameters.
MOVINET::MOVINET(const std::string& _onnx_path,
    EngineType engineType,
    unsigned int _num_threads)
    : BasicOrtHandler(_onnx_path, engineType, _num_threads)
{
    // Cache the model's input/output tensor names for Run().
    init_io_names();
}
// Construct a MoViNet action recogniser on a specific execution
// provider with explicit clip geometry (T/W/H/C).
MOVINET::MOVINET(const std::string& _onnx_path,
    EngineType engineType,
    int _temporal, int _width, int _height, int _channels,
    unsigned int _num_threads)
    : BasicOrtHandler(_onnx_path, engineType, _num_threads)
{
    input_params.temporal = _temporal;
    input_params.width = _width;
    input_params.height = _height;
    input_params.channels = _channels;
    // Cache the model's input/output tensor names for Run().
    init_io_names();
}
// Cache the model's first input and output tensor names as std::string.
// GetInputNameAllocated returns an owning AllocatedStringPtr temporary;
// the std::string copy is taken before that temporary is destroyed at
// the end of the full expression, so this is lifetime-safe.
void MOVINET::init_io_names()
{
    Ort::AllocatorWithDefaultOptions allocator;
    _MoviNetInputName =
        ort_session->GetInputNameAllocated(0, allocator).get();
    _MoviNetOutputName =
        ort_session->GetOutputNameAllocated(0, allocator).get();
}
Ort::Value MOVINET::transform(const std::deque<cv::Mat>& frames)
{
    // Pack a clip of exactly `temporal` frames into a [1,C,T,H,W]
    // float tensor. Throws std::runtime_error on a size mismatch.
    if (frames.size() != static_cast<size_t>(input_params.temporal))
        throw std::runtime_error("MOVINET::transform — frame count != temporal length.");

    const std::vector<int64_t> clip_shape = {
        1,
        input_params.channels,
        input_params.temporal,
        input_params.height,
        input_params.width
    };
    auto cpu_mem = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
    return utils::transform::create_video_tensor_5d(
        frames, clip_shape, cpu_mem, input_tensor_values);
}
std::pair<int, float> MOVINET::post_processing(const float* p)
{
    // Softmax over the raw logits (max-subtracted for numerical
    // stability), then return {argmax label, its probability}.
    const int num_classes = output_params.num_classes;
    const float peak = *std::max_element(p, p + num_classes);

    std::vector<float> probs(num_classes);
    float denom = 0.f;
    for (int i = 0; i < num_classes; ++i) {
        probs[i] = std::exp(p[i] - peak);
        denom += probs[i];
    }
    for (float& v : probs) v /= denom;

    const auto best = std::max_element(probs.begin(), probs.end());
    const int label = static_cast<int>(best - probs.begin());
    return { label, *best };
}
void MOVINET::inference(const std::deque<cv::Mat>& frames,
    std::pair<int, float>& out_result)
{
    // Run action recognition over a clip of frames. On an invalid
    // frame count the result is flagged as {-1, 0}.
    const bool valid_clip =
        !frames.empty() &&
        frames.size() == static_cast<size_t>(input_params.temporal);
    if (!valid_clip) {
        std::cerr << "[MOVINET] Invalid frame count." << std::endl;
        out_result = { -1, 0.f };
        return;
    }

    Ort::Value input_tensor = transform(frames);
    const char* in_names[] = { _MoviNetInputName.c_str() };
    const char* out_names[] = { _MoviNetOutputName.c_str() };
    auto outputs = ort_session->Run(
        Ort::RunOptions{ nullptr },
        in_names, &input_tensor, 1, out_names, 1);
    // Softmax + argmax over the single logits output.
    out_result = post_processing(outputs[0].GetTensorData<float>());
}
Ort::Value MOVINET::transform(const cv::Mat& mat)
{
    // Single-image convenience: replicate the frame `temporal` times to
    // form a static clip, then reuse the clip-based transform.
    std::deque<cv::Mat> clip;
    for (int t = 0; t < input_params.temporal; ++t)
        clip.push_back(mat.clone());
    return transform(clip);
}
// Build a clip from an arbitrary image batch by cycling through the
// images until `temporal` frames are collected.
// NOTE(review): with an empty `images`, `frames` stays empty and the
// deque-based transform() will throw (frame count != temporal) —
// confirm callers never pass an empty vector here.
Ort::Value MOVINET::transformBatch(const std::vector<cv::Mat>& images)
{
    std::deque<cv::Mat> frames;
    if (!images.empty()) {
        // i % images.size() repeats the batch cyclically to fill T frames.
        for (int i = 0; i < input_params.temporal; ++i)
            frames.push_back(images[i % images.size()].clone());
    }
    return transform(frames);
}
// ====================================================================
// utils::transform
// ====================================================================
Ort::Value ANSCENTER::utils::transform::create_tensor(
    const cv::Mat& mat,
    const std::vector<int64_t>& tensor_dims,
    const Ort::MemoryInfo& memory_info_handler,
    std::vector<float>& tensor_value_handler,
    unsigned int data_format)
{
    // Copy a single image into a 4-D ORT tensor: [1,C,H,W] for CHW or
    // [1,H,W,C] for HWC. Pixel data is staged in tensor_value_handler,
    // which must outlive the returned Ort::Value (CreateTensor wraps the
    // buffer, it does not copy). Returns a null Ort::Value on an empty
    // image, a non-4D dims vector, a batch dim != 1, or a channel
    // mismatch between the image and the dims.
    if (mat.empty() || tensor_dims.size() != 4 || tensor_dims.at(0) != 1)
        return Ort::Value(nullptr);
    const unsigned int channels = mat.channels();
    // Work in float32; convert only when necessary.
    cv::Mat mat_ref;
    if (mat.type() != CV_32FC(channels))
        mat.convertTo(mat_ref, CV_32FC(channels));
    else
        mat_ref = mat;
    if (data_format == CHW) {
        const unsigned int H = tensor_dims.at(2);
        const unsigned int W = tensor_dims.at(3);
        const unsigned int C = tensor_dims.at(1);
        if (C != channels) return Ort::Value(nullptr);
        const unsigned int total = C * H * W;
        tensor_value_handler.resize(total);
        cv::Mat resized;
        if (H != static_cast<unsigned int>(mat_ref.rows) ||
            W != static_cast<unsigned int>(mat_ref.cols))
            cv::resize(mat_ref, resized, cv::Size(W, H));
        else
            resized = mat_ref;
        // HWC -> CHW: cv::split always yields one continuous plane per
        // channel, so the per-plane memcpy below is safe.
        std::vector<cv::Mat> chans;
        cv::split(resized, chans);
        for (unsigned int c = 0; c < C; ++c)
            std::memcpy(tensor_value_handler.data() + c * H * W,
                chans[c].data, H * W * sizeof(float));
        return Ort::Value::CreateTensor<float>(
            memory_info_handler, tensor_value_handler.data(),
            total, tensor_dims.data(), tensor_dims.size());
    }
    // HWC layout: dims are [1,H,W,C].
    const unsigned int H = tensor_dims.at(1);
    const unsigned int W = tensor_dims.at(2);
    const unsigned int C = tensor_dims.at(3);
    const unsigned int total = C * H * W;
    if (C != channels) return Ort::Value(nullptr);
    tensor_value_handler.resize(total);
    cv::Mat resized;
    if (H != static_cast<unsigned int>(mat_ref.rows) ||
        W != static_cast<unsigned int>(mat_ref.cols))
        cv::resize(mat_ref, resized, cv::Size(W, H));
    else
        resized = mat_ref;
    // Bug fix: if the caller passed an ROI view already in CV_32FC
    // (no convertTo, no resize), `resized` can be non-continuous and a
    // flat memcpy would read row padding. Force a packed copy first.
    if (!resized.isContinuous())
        resized = resized.clone();
    std::memcpy(tensor_value_handler.data(), resized.data, total * sizeof(float));
    return Ort::Value::CreateTensor<float>(
        memory_info_handler, tensor_value_handler.data(),
        total, tensor_dims.data(), tensor_dims.size());
}
Ort::Value ANSCENTER::utils::transform::create_tensor_batch(
    const std::vector<cv::Mat>& batch_mats,
    const std::vector<int64_t>& tensor_dims,
    const Ort::MemoryInfo& memory_info_handler,
    std::vector<float>& tensor_value_handler,
    unsigned int data_format)
{
    // Copy a batch of images into a single 4-D ORT tensor whose dims
    // come from tensor_dims ([N,C,H,W]; for HWC the same buffer is
    // filled in the mats' native HWC order). The staging buffer must
    // outlive the returned Ort::Value (CreateTensor does not copy).
    // Returns a null Ort::Value on an empty batch, non-4D dims, an
    // empty image, or a channel mismatch.
    if (batch_mats.empty() || tensor_dims.size() != 4)
        return Ort::Value(nullptr);
    const size_t N = batch_mats.size();
    const unsigned int C = tensor_dims.at(1);
    const unsigned int H = tensor_dims.at(2);
    const unsigned int W = tensor_dims.at(3);
    const unsigned int image_size = C * H * W;
    const unsigned int total = static_cast<unsigned int>(N) * image_size;
    tensor_value_handler.resize(total);
    for (size_t b = 0; b < N; ++b) {
        const cv::Mat& mat = batch_mats[b];
        if (mat.empty() || static_cast<unsigned int>(mat.channels()) != C)
            return Ort::Value(nullptr);
        // Work in float32; convert only when necessary.
        cv::Mat mat_ref;
        if (mat.type() != CV_32FC(C))
            mat.convertTo(mat_ref, CV_32FC(C));
        else
            mat_ref = mat;
        cv::Mat resized;
        if (static_cast<unsigned int>(mat_ref.rows) != H ||
            static_cast<unsigned int>(mat_ref.cols) != W)
            cv::resize(mat_ref, resized, cv::Size(W, H));
        else
            resized = mat_ref;
        // Bug fix: both branches below access the pixel data as one flat
        // packed array (ptr<float>(0) with linear indexing / memcpy). A
        // non-continuous mat (e.g. an ROI view passed in already as
        // CV_32FC) would silently read row padding — force continuity.
        if (!resized.isContinuous())
            resized = resized.clone();
        const size_t batch_offset = b * image_size;
        if (data_format == CHW) {
            // Interleaved HWC -> planar CHW gather.
            const float* src = resized.ptr<float>(0);
            const size_t plane_size = H * W;
            for (unsigned int c = 0; c < C; ++c) {
                float* dst = tensor_value_handler.data() + batch_offset + c * plane_size;
                for (size_t i = 0; i < plane_size; ++i)
                    dst[i] = src[i * C + c];
            }
        }
        else {
            // HWC: layout already matches, straight copy.
            std::memcpy(tensor_value_handler.data() + batch_offset,
                resized.data, image_size * sizeof(float));
        }
    }
    return Ort::Value::CreateTensor<float>(
        memory_info_handler, tensor_value_handler.data(),
        total, tensor_dims.data(), tensor_dims.size());
}
// Pack T frames into a [1,C,T,H,W] float tensor (planar layout:
// channel-major, then time, then rows). Each frame is converted to
// float, resized to WxH if needed, and swapped BGR->RGB before the
// channel split. Throws on a bad dims vector or frame-count mismatch.
// NOTE(review): the 1/255 scaling is applied only when a frame is NOT
// already CV_32FC(C) — pre-converted float frames are packed as-is;
// confirm callers never mix the two conventions.
// NOTE(review): cvtColor(BGR2RGB) requires 3-channel input, so this
// effectively assumes C == 3 — verify against the model.
Ort::Value ANSCENTER::utils::transform::create_video_tensor_5d(
    const std::deque<cv::Mat>& frames,
    const std::vector<int64_t>& tensor_dims,
    const Ort::MemoryInfo& memory_info_handler,
    std::vector<float>& tensor_value_handler)
{
    if (tensor_dims.size() != 5 || tensor_dims[0] != 1)
        throw std::runtime_error("create_video_tensor_5d: expect [1,C,T,H,W]");
    const unsigned int C = tensor_dims[1];
    const unsigned int T = tensor_dims[2];
    const unsigned int H = tensor_dims[3];
    const unsigned int W = tensor_dims[4];
    if (frames.size() != T)
        throw std::runtime_error("create_video_tensor_5d: frame count != T");
    const size_t total = static_cast<size_t>(C) * T * H * W;
    tensor_value_handler.resize(total);
    for (unsigned int t = 0; t < T; ++t) {
        // Convert to float32 (with 1/255 scaling) only when needed.
        cv::Mat frame_ref;
        if (frames[t].type() != CV_32FC(C))
            frames[t].convertTo(frame_ref, CV_32FC(C), 1.0 / 255.0);
        else
            frame_ref = frames[t];
        cv::Mat resized;
        if (static_cast<unsigned int>(frame_ref.rows) != H ||
            static_cast<unsigned int>(frame_ref.cols) != W)
            cv::resize(frame_ref, resized, cv::Size(W, H));
        else
            resized = frame_ref;
        cv::cvtColor(resized, resized, cv::COLOR_BGR2RGB);
        // split() yields continuous per-channel planes, safe to memcpy.
        std::vector<cv::Mat> chans;
        cv::split(resized, chans);
        for (unsigned int c = 0; c < C; ++c) {
            // Destination offset: channel plane c, frame t within it.
            float* dst = tensor_value_handler.data()
                + c * (T * H * W) + t * (H * W);
            std::memcpy(dst, chans[c].data, H * W * sizeof(float));
        }
    }
    return Ort::Value::CreateTensor<float>(
        memory_info_handler, tensor_value_handler.data(),
        total, tensor_dims.data(), tensor_dims.size());
}
cv::Mat ANSCENTER::utils::transform::normalize(
    const cv::Mat& mat, float mean, float scale)
{
    // Return (mat - mean) * scale as a new float32 image; the input
    // itself is never modified.
    cv::Mat as_float;
    if (mat.type() == CV_32FC3) as_float = mat;
    else mat.convertTo(as_float, CV_32FC3);
    return (as_float - mean) * scale;
}
cv::Mat ANSCENTER::utils::transform::normalize(
    const cv::Mat& mat, const float* mean, const float* scale)
{
    // Per-channel (x - mean[c]) * scale[c]; always returns a fresh
    // float32 copy, leaving the input untouched.
    cv::Mat result;
    if (mat.type() == CV_32FC3) result = mat.clone();
    else mat.convertTo(result, CV_32FC3);
    for (int row = 0; row < result.rows; ++row) {
        cv::Vec3f* px = result.ptr<cv::Vec3f>(row);
        for (int col = 0; col < result.cols; ++col) {
            for (int c = 0; c < 3; ++c)
                px[col][c] = (px[col][c] - mean[c]) * scale[c];
        }
    }
    return result;
}
// Out-parameter wrapper around the scalar normalize(): writes
// (inmat - mean) * scale into outmat as float32.
void ANSCENTER::utils::transform::normalize(
    const cv::Mat& inmat, cv::Mat& outmat, float mean, float scale)
{
    outmat = ANSCENTER::utils::transform::normalize(inmat, mean, scale);
}
// In-place scalar normalisation: mat <- (mat - mean) * scale, after
// converting to float32 when needed.
void ANSCENTER::utils::transform::normalize_inplace(
    cv::Mat& mat_inplace, float mean, float scale)
{
    if (mat_inplace.type() != CV_32FC3)
        mat_inplace.convertTo(mat_inplace, CV_32FC3);
    // Delegates to the out-parameter overload with src == dst.
    ANSCENTER::utils::transform::normalize(mat_inplace, mat_inplace, mean, scale);
}
void ANSCENTER::utils::transform::normalize_inplace(
    cv::Mat& mat_inplace, const float* mean, const float* scale)
{
    // In-place per-channel normalisation: x <- (x - mean[c]) * scale[c],
    // converting to float32 first when needed.
    if (mat_inplace.type() != CV_32FC3)
        mat_inplace.convertTo(mat_inplace, CV_32FC3);
    for (int row = 0; row < mat_inplace.rows; ++row) {
        cv::Vec3f* px = mat_inplace.ptr<cv::Vec3f>(row);
        for (int col = 0; col < mat_inplace.cols; ++col) {
            for (int c = 0; c < 3; ++c)
                px[col][c] = (px[col][c] - mean[c]) * scale[c];
        }
    }
}
// ====================================================================
// BoundingBoxType template method implementations
// ====================================================================
// Intersection-over-union with another box (converted to this box's
// value/score types first). Width/height use the inclusive "+1" pixel
// convention, matching width()/height() below.
// NOTE(review): non-overlapping boxes return
// std::numeric_limits<value_type>::min() — for floating-point types a
// tiny positive number rather than 0. Threshold comparisons are
// unaffected, but exact-zero checks would be.
template<typename T1, typename T2>
template<typename O1, typename O2>
inline typename ANSCENTER::types::BoundingBoxType<T1, T2>::value_type
ANSCENTER::types::BoundingBoxType<T1, T2>::iou_of(
    const BoundingBoxType<O1, O2>& other) const
{
    auto tbox = other.template convert_type<value_type, score_type>();
    // Intersection rectangle corners.
    value_type ix1 = std::max(x1, tbox.x1);
    value_type iy1 = std::max(y1, tbox.y1);
    value_type ix2 = std::min(x2, tbox.x2);
    value_type iy2 = std::min(y2, tbox.y2);
    value_type iw = ix2 - ix1 + static_cast<value_type>(1);
    value_type ih = iy2 - iy1 + static_cast<value_type>(1);
    if (iw <= 0 || ih <= 0)
        return std::numeric_limits<value_type>::min();
    value_type inter = iw * ih;
    // IoU = intersection / union.
    return inter / (area() + tbox.area() - inter);
}
// The box as an integer cv::Rect (x, y, width, height).
template<typename T1, typename T2>
inline ::cv::Rect ANSCENTER::types::BoundingBoxType<T1, T2>::rect() const
{
    auto b = convert_type<int>();
    return ::cv::Rect(b.x1, b.y1, b.width(), b.height());
}
// Top-left corner as an integer point.
template<typename T1, typename T2>
inline ::cv::Point2i ANSCENTER::types::BoundingBoxType<T1, T2>::tl() const
{
    auto b = convert_type<int>();
    return ::cv::Point2i(b.x1, b.y1);
}
// Bottom-right corner as an integer point.
template<typename T1, typename T2>
inline ::cv::Point2i ANSCENTER::types::BoundingBoxType<T1, T2>::rb() const
{
    auto b = convert_type<int>();
    return ::cv::Point2i(b.x2, b.y2);
}
// Box width, inclusive of both edge pixels (hence the +1).
template<typename T1, typename T2>
inline typename ANSCENTER::types::BoundingBoxType<T1, T2>::value_type
ANSCENTER::types::BoundingBoxType<T1, T2>::width() const
{
    return x2 - x1 + static_cast<value_type>(1);
}
// Box height, inclusive of both edge pixels (hence the +1).
template<typename T1, typename T2>
inline typename ANSCENTER::types::BoundingBoxType<T1, T2>::value_type
ANSCENTER::types::BoundingBoxType<T1, T2>::height() const
{
    return y2 - y1 + static_cast<value_type>(1);
}
// Box area using the inclusive width/height convention; fabs guards
// against a degenerate (inverted) box producing a negative product.
template<typename T1, typename T2>
inline typename ANSCENTER::types::BoundingBoxType<T1, T2>::value_type
ANSCENTER::types::BoundingBoxType<T1, T2>::area() const
{
    return std::fabs(width() * height());
}
} // namespace ANSCENTER