//! Subword Tokenization Module (#26)
//!
//! Just-in-Time tokenization for training pipelines with BPE and WordPiece support.
//! Includes integration with aprender for HuggingFace-compatible tokenizer loading.
//!
//! # Toyota Principle: Just-in-Time (ジャスト・イン・タイム)
//!
//! Tokenize on demand during training, not upfront - reducing memory footprint
//! and enabling dynamic vocabulary adaptation.
//!
//! # Example
//!
//! ```
//! use entrenar::tokenizer::{BPETokenizer, Tokenizer, TokenizerConfig};
//!
//! fn example() -> Result<(), Box<dyn std::error::Error>> {
//! // Create a BPE tokenizer
//! let config = TokenizerConfig::bpe().with_vocab_size(1000);
//! let mut tokenizer = BPETokenizer::new(config);
//!
//! // Train on corpus
//! let corpus = vec!["hello world", "hello there"];
//! tokenizer.train(&corpus)?;
//!
//! // Tokenize text
//! let tokens = tokenizer.encode("hello world")?;
//! let decoded = tokenizer.decode(&tokens)?;
//! Ok(())
//! }
//! ```
//!
//! # HuggingFace Integration
//!
//! Load pre-trained tokenizers from HuggingFace tokenizer.json files:
//!
//! ```rust,ignore
//! use entrenar::tokenizer::HfTokenizer;
//!
//! fn example() -> Result<(), Box<dyn std::error::Error>> {
//! // Load from HuggingFace tokenizer.json
//! let tokenizer = HfTokenizer::from_file("path/to/tokenizer.json")?;
//! let tokens = tokenizer.encode("Hello, world!");
//! Ok(())
//! }
//! ```
// Re-export all public types for API compatibility
pub use BPETokenizer;
pub use charCharTokenizer;
pub use ;
pub use ;
pub use ;
pub use ;