smoltok_core/
lib.rs

1//! # smoltok-core
2//!
3//! Core library for `smoltok`, providing BPE (Byte Pair Encoding) tokenization functionality.
4//! This crate defines the [`Tokenizer`] trait and implements simple, regex-based, and parallel regex-based BPE tokenizers.
5
6pub mod regex;
7pub mod simple;
8pub mod tokenizer;
9
10pub use regex::{
11    GPT4_SPLIT_PATTERN, ParallelRegexBPETokenizer, ParallelRegexBPETokenizerConfig, RegexBPETokenizer,
12    RegexBPETokenizerConfig, RegexBPETokenizerConfigError, RegexCompilationError,
13};
14pub use simple::{SimpleBPETokenizer, SimpleBPETokenizerConfig};
15pub use tokenizer::{Deserializable, MergeRule, Serializable, TokenId, TokenPair, Tokenizer, Trainable};