#ifndef ANSCLIPTOKENIZER_H
#define ANSCLIPTOKENIZER_H
#pragma once

#include <cstdint>
#include <map>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

namespace ANSCENTER {

    /// Token IDs and attention mask produced by the tokenizer, shaped for
    /// feeding an ONNX text encoder.
    /// NOTE(review): the vector element type was lost in extraction and is
    /// reconstructed here as int64_t (the usual dtype for ONNX `input_ids` /
    /// `attention_mask` tensors) — confirm against the inference code that
    /// consumes TokenizerResult.
    struct TokenizerResult {
        std::vector<int64_t> inputIds;
        std::vector<int64_t> attentionMask;
    };

    /// CLIP BPE tokenizer for text-prompted segmentation models.
    ///
    /// Requires a BPE merges file (merges.txt from HuggingFace
    /// openai/clip-vit-base-patch32). Place the file alongside the
    /// ONNX model in the model folder.
    class ANSCLIPTokenizer {
    public:
        /// Load BPE vocabulary from a merges file.
        /// @param mergesFilePath Path to the CLIP BPE merges file (merges.txt).
        /// @return true on success.
        bool Load(const std::string& mergesFilePath);

        /// @return true if vocabulary has been loaded.
        bool IsLoaded() const { return m_loaded; }

        /// Tokenize a text prompt into input IDs and attention mask.
        /// @param text The text to tokenize (e.g., "person").
        /// @param maxLength Output sequence length (padded/truncated). Default 32.
        /// @return TokenizerResult with inputIds and attentionMask vectors.
        TokenizerResult Tokenize(const std::string& text, int maxLength = 32) const;

    private:
        bool m_loaded = false;

        // Byte value (0-255) -> unicode string representation (CLIP byte encoding)
        std::string m_byteEncoder[256];

        // BPE merge: (token_a, token_b) -> priority rank (lower = merge first)
        std::map<std::pair<std::string, std::string>, int> m_bpeRanks;

        // Token string -> integer ID
        std::unordered_map<std::string, int> m_encoder;

        static constexpr int BOS_TOKEN = 49406; // <|startoftext|>
        static constexpr int EOS_TOKEN = 49407; // <|endoftext|>

        // BPE result cache (word -> BPE token list); mutable so the
        // const Tokenize()/bpe() path can memoize.
        mutable std::unordered_map<std::string, std::vector<std::string>> m_cache;

        // Populate m_byteEncoder with CLIP's byte-to-unicode mapping.
        void initByteEncoder();
        // Apply BPE merges to one pre-tokenized word; returns subword tokens.
        std::vector<std::string> bpe(const std::string& word) const;
        // Split raw text into words/punctuation chunks prior to BPE.
        static std::vector<std::string> preTokenize(const std::string& text);
        // Encode a unicode codepoint as a UTF-8 byte sequence.
        static std::string codepointToUtf8(int codepoint);
    };

} // namespace ANSCENTER

#endif // ANSCLIPTOKENIZER_H