smoltok_core/lib.rs
1//! # smoltok-core
2//!
3//! Core library for `smoltok`, providing BPE (Byte Pair Encoding) tokenization functionality.
4//! This crate defines the `Tokenizer` trait and implements simple and regex-based BPE tokenizers.
5
// Public submodules of the crate. Selected items from each are also
// re-exported at the crate root by the `pub use` declarations in this file.
// `regex`: the regex-based BPE tokenizers and their configuration/error items.
6pub mod regex;
// `simple`: the simple BPE tokenizer and its configuration.
7pub mod simple;
// `tokenizer`: the `Tokenizer` trait and the other shared tokenizer items.
8pub mod tokenizer;
9
// Crate-root re-exports from the `regex` module: the split-pattern constant,
// the regex tokenizers (including the parallel variant), their configs, and
// the associated error items.
10pub use regex::{
11 GPT4_SPLIT_PATTERN, ParallelRegexBPETokenizer, ParallelRegexBPETokenizerConfig, RegexBPETokenizer,
12 RegexBPETokenizerConfig, RegexBPETokenizerConfigError, RegexCompilationError,
13};
// Crate-root re-exports from the `simple` module: the tokenizer and its config.
14pub use simple::{SimpleBPETokenizer, SimpleBPETokenizerConfig};
// Crate-root re-exports from the `tokenizer` module, including the `Tokenizer`
// trait named in the crate-level docs, so callers need not spell out the module path.
15pub use tokenizer::{Deserializable, MergeRule, Serializable, TokenId, TokenPair, Tokenizer, Trainable};