entrenar/tokenizer/mod.rs
1//! Subword Tokenization Module (#26)
2//!
3//! Just-in-Time tokenization for training pipelines with BPE and WordPiece support.
4//! Includes integration with aprender for HuggingFace-compatible tokenizer loading.
5//!
6//! # Toyota Principle: Just-in-Time (ジャスト・イン・タイム)
7//!
8//! Tokenize on demand during training, not upfront - reducing memory footprint
9//! and enabling dynamic vocabulary adaptation.
10//!
11//! # Example
12//!
13//! ```
14//! use entrenar::tokenizer::{BPETokenizer, Tokenizer, TokenizerConfig};
15//!
16//! fn example() -> Result<(), Box<dyn std::error::Error>> {
17//! // Create a BPE tokenizer
18//! let config = TokenizerConfig::bpe().with_vocab_size(1000);
19//! let mut tokenizer = BPETokenizer::new(config);
20//!
21//! // Train on corpus
22//! let corpus = vec!["hello world", "hello there"];
23//! tokenizer.train(&corpus)?;
24//!
25//! // Tokenize text
26//! let tokens = tokenizer.encode("hello world")?;
27//! let decoded = tokenizer.decode(&tokens)?;
28//! Ok(())
29//! }
30//! ```
31//!
32//! # HuggingFace Integration
33//!
34//! Load pre-trained tokenizers from HuggingFace `tokenizer.json` files:
35//!
36//! ```rust,ignore
37//! use entrenar::tokenizer::HfTokenizer;
38//!
39//! fn example() -> Result<(), Box<dyn std::error::Error>> {
40//! // Load from HuggingFace tokenizer.json
41//! let tokenizer = HfTokenizer::from_file("path/to/tokenizer.json")?;
42//! let tokens = tokenizer.encode("Hello, world!");
43//! Ok(())
44//! }
45//! ```
46
47mod bpe;
48mod char;
49mod config;
50mod error;
51mod hf;
52mod traits;
53
54// Re-export the public items (types, traits, and helper functions) for API compatibility
55pub use bpe::BPETokenizer;
56pub use char::CharTokenizer;
57pub use config::{Normalization, SpecialTokens, TokenizerConfig, TokenizerType};
58pub use error::{Result, TokenizerError};
59pub use hf::{
60 bytes_to_unicode, load_hf_from_files, load_hf_from_json, HfBpeConfig, HfBpeTokenizer,
61 HfTokenizer, MergeRule, Qwen2BpeTokenizer,
62};
63pub use traits::{TokenId, Tokenizer};