Expand description
§smoltok-core
Core library for smoltok, providing BPE (Byte Pair Encoding) tokenization functionality.
This crate defines the Tokenizer trait and implements simple and regex-based BPE tokenizers.
Re-exports§
pub use regex::GPT4_SPLIT_PATTERN;
pub use regex::ParallelRegexBPETokenizer;
pub use regex::ParallelRegexBPETokenizerConfig;
pub use regex::RegexBPETokenizer;
pub use regex::RegexBPETokenizerConfig;
pub use regex::RegexBPETokenizerConfigError;
pub use regex::RegexCompilationError;
pub use simple::SimpleBPETokenizer;
pub use simple::SimpleBPETokenizerConfig;
pub use tokenizer::Deserializable;
pub use tokenizer::MergeRule;
pub use tokenizer::Serializable;
pub use tokenizer::TokenId;
pub use tokenizer::TokenPair;
pub use tokenizer::Tokenizer;
pub use tokenizer::Trainable;