Skip to main content

entrenar/tokenizer/
mod.rs

1//! Subword Tokenization Module (#26)
2//!
3//! Just-in-Time tokenization for training pipelines with BPE and WordPiece support.
4//! Includes integration with aprender for HuggingFace-compatible tokenizer loading.
5//!
6//! # Toyota Principle: Just-in-Time (ジャスト・イン・タイム)
7//!
8//! Tokenize on demand during training rather than up front, reducing the memory
9//! footprint and enabling dynamic vocabulary adaptation.
10//!
11//! # Example
12//!
13//! ```
14//! use entrenar::tokenizer::{BPETokenizer, Tokenizer, TokenizerConfig};
15//!
16//! fn example() -> Result<(), Box<dyn std::error::Error>> {
17//!     // Create a BPE tokenizer
18//!     let config = TokenizerConfig::bpe().with_vocab_size(1000);
19//!     let mut tokenizer = BPETokenizer::new(config);
20//!
21//!     // Train on corpus
22//!     let corpus = vec!["hello world", "hello there"];
23//!     tokenizer.train(&corpus)?;
24//!
25//!     // Tokenize text
26//!     let tokens = tokenizer.encode("hello world")?;
27//!     let decoded = tokenizer.decode(&tokens)?;
28//!     Ok(())
29//! }
30//! ```
31//!
32//! # HuggingFace Integration
33//!
34//! Load pre-trained tokenizers from HuggingFace `tokenizer.json` files:
35//!
36//! ```rust,ignore
37//! use entrenar::tokenizer::HfTokenizer;
38//!
39//! fn example() -> Result<(), Box<dyn std::error::Error>> {
40//!     // Load from HuggingFace tokenizer.json
41//!     let tokenizer = HfTokenizer::from_file("path/to/tokenizer.json")?;
42//!     let tokens = tokenizer.encode("Hello, world!");
43//!     Ok(())
44//! }
45//! ```
46
47mod bpe;
48mod char;
49mod config;
50mod error;
51mod hf;
52mod traits;
53
54// Re-export all public types for API compatibility
55pub use bpe::BPETokenizer;
56pub use char::CharTokenizer;
57pub use config::{Normalization, SpecialTokens, TokenizerConfig, TokenizerType};
58pub use error::{Result, TokenizerError};
59pub use hf::{
60    bytes_to_unicode, load_hf_from_files, load_hf_from_json, HfBpeConfig, HfBpeTokenizer,
61    HfTokenizer, MergeRule, Qwen2BpeTokenizer,
62};
63pub use traits::{TokenId, Tokenizer};