Initial setup for CLion
This commit is contained in:
65
ANSODEngine/ANSCLIPTokenizer.h
Normal file
65
ANSODEngine/ANSCLIPTokenizer.h
Normal file
@@ -0,0 +1,65 @@
|
||||
#ifndef ANSCLIPTOKENIZER_H
|
||||
#define ANSCLIPTOKENIZER_H
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
#include <map>
|
||||
#include <cstdint>
|
||||
|
||||
namespace ANSCENTER
{
    /// Result of ANSCLIPTokenizer::Tokenize().
    ///
    /// Both vectors hold 64-bit integers.
    struct TokenizerResult
    {
        /// Token IDs produced for the prompt.
        std::vector<int64_t> inputIds;

        /// Attention mask accompanying inputIds.
        /// NOTE(review): presumably 1 for real tokens and 0 for padding, with the
        /// same length as inputIds -- confirm against the implementation file.
        std::vector<int64_t> attentionMask;
    };

    /// CLIP BPE tokenizer for text-prompted segmentation models.
    ///
    /// Requires a BPE merges file (merges.txt from HuggingFace
    /// openai/clip-vit-base-patch32). Place the file alongside the
    /// ONNX model in the model folder.
    ///
    /// NOTE(review): m_cache is declared `mutable` and bpe()/Tokenize() are
    /// const, so the cache is presumably updated from const calls. If that is
    /// the case, concurrent Tokenize() calls on one instance need external
    /// synchronization -- confirm against the implementation file.
    class ANSCLIPTokenizer
    {
    public:
        /// Load BPE vocabulary from a merges file.
        /// @param mergesFilePath Path to the CLIP BPE merges file (merges.txt).
        /// @return true on success.
        bool Load(const std::string& mergesFilePath);

        /// @return true if vocabulary has been loaded.
        bool IsLoaded() const { return m_loaded; }

        /// Tokenize a text prompt into input IDs and attention mask.
        /// @param text The text to tokenize (e.g., "person").
        /// @param maxLength Output sequence length (padded/truncated). Default 32.
        /// @return TokenizerResult with inputIds and attentionMask vectors.
        TokenizerResult Tokenize(const std::string& text, int maxLength = 32) const;

    private:
        /// Value reported by IsLoaded(); starts out false.
        bool m_loaded = false;

        // Byte value (0-255) -> unicode string representation (CLIP byte encoding)
        std::string m_byteEncoder[256];

        // BPE merge: (token_a, token_b) -> priority rank (lower = merge first)
        std::map<std::pair<std::string, std::string>, int> m_bpeRanks;

        // Token string -> integer ID
        std::unordered_map<std::string, int> m_encoder;

        static constexpr int BOS_TOKEN = 49406; // <|startoftext|>
        static constexpr int EOS_TOKEN = 49407; // <|endoftext|>

        // BPE result cache (word -> BPE token list).
        // Declared mutable so that const member functions can use it.
        mutable std::unordered_map<std::string, std::vector<std::string>> m_cache;

        /// Populates m_byteEncoder (judging by the name; defined in the implementation file).
        void initByteEncoder();

        /// Splits one word into its list of BPE token strings.
        std::vector<std::string> bpe(const std::string& word) const;

        /// Splits raw text into pieces ahead of BPE; the exact splitting rules
        /// live in the implementation file.
        static std::vector<std::string> preTokenize(const std::string& text);

        /// Converts a Unicode code point to a UTF-8 string (per the name).
        static std::string codepointToUtf8(int codepoint);
    };
} // namespace ANSCENTER
|
||||
#endif
|
||||
Reference in New Issue
Block a user