pub struct BpeTokenizer { /* private fields */ }
A Byte Pair Encoding (BPE) tokenizer
BPE is a subword tokenization algorithm that iteratively merges the most frequent pair of adjacent symbols (bytes or characters) into a new, longer token.
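For intuition, the core training loop can be sketched in a few lines: count adjacent symbol pairs across the corpus, merge the most frequent pair into a single new symbol, and repeat until the vocabulary budget is exhausted. The sketch below is illustrative only; it does not use this crate's types, and the helper names most_frequent_pair and merge_pair are hypothetical.
use std::collections::HashMap;

// Count adjacent symbol pairs and return the most frequent one (ties arbitrary).
fn most_frequent_pair(words: &[Vec<String>]) -> Option<(String, String)> {
    let mut counts: HashMap<(String, String), usize> = HashMap::new();
    for word in words {
        for pair in word.windows(2) {
            *counts.entry((pair[0].clone(), pair[1].clone())).or_insert(0) += 1;
        }
    }
    counts.into_iter().max_by_key(|(_, c)| *c).map(|(p, _)| p)
}

// Replace every occurrence of the pair with the merged symbol.
fn merge_pair(words: &mut [Vec<String>], pair: &(String, String)) {
    for word in words.iter_mut() {
        let mut merged = Vec::with_capacity(word.len());
        let mut i = 0;
        while i < word.len() {
            if i + 1 < word.len() && word[i] == pair.0 && word[i + 1] == pair.1 {
                merged.push(format!("{}{}", pair.0, pair.1));
                i += 2;
            } else {
                merged.push(word[i].clone());
                i += 1;
            }
        }
        *word = merged;
    }
}

fn main() {
    // Start from character-level symbols for each word.
    let mut words: Vec<Vec<String>> = ["low", "lower", "lowest"]
        .iter()
        .map(|w| w.chars().map(|c| c.to_string()).collect())
        .collect();
    for _ in 0..3 {
        if let Some(pair) = most_frequent_pair(&words) {
            merge_pair(&mut words, &pair);
        }
    }
    // After three merges: [["low"], ["lowe", "r"], ["lowe", "s", "t"]]
    println!("{words:?}");
}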
Implementations
impl BpeTokenizer
pub fn new(config: BpeConfig) -> Self
Create a new BPE tokenizer with the given configuration
Examples found in repository
examples/bpe_tokenizer_example.rs (line 25)
// Note: the file's use-declarations are elided by the example scraper; the
// snippet relies on this crate's BpeTokenizer, BpeConfig, and Result types,
// plus std::path::Path.
fn main() -> Result<()> {
    // Example corpus for training the tokenizer
    let corpus = [
        "this is a test sentence for bpe tokenization",
        "another test sentence with some overlapping words",
        "bpe works by merging common character pairs",
        "the algorithm builds a vocabulary of subword units",
        "these subword tokens can handle out-of-vocabulary words",
    ];

    println!("Training BPE tokenizer...");

    // Create a BPE tokenizer with custom configuration
    let config = BpeConfig {
        vocab_size: 100,
        min_frequency: 1,
        special_tokens: vec!["<pad>".to_string(), "<unk>".to_string()],
        ..Default::default()
    };

    let mut tokenizer = BpeTokenizer::new(config);

    // Train the tokenizer on the corpus
    tokenizer.train(&corpus)?;

    println!("Vocabulary size: {}", tokenizer.vocab_size());

    // Test the tokenizer on a new sentence
    let testtext = "this is an unseen sentence with some new words";
    let tokens = tokenizer.tokenize(testtext)?;

    println!("\nInput text: {testtext}");
    println!("Tokenized: {tokens:?}");

    // Save the vocabulary for later use
    let vocab_path = Path::new("bpe_vocab.json");
    tokenizer.save_vocabulary(vocab_path)?;
    println!("\nVocabulary saved to: {vocab_path:?}");

    // Create a new tokenizer and load the saved vocabulary
    let mut new_tokenizer = BpeTokenizer::with_defaults();
    new_tokenizer.load_vocabulary(vocab_path)?;

    // Test that the loaded tokenizer produces the same tokens
    let new_tokens = new_tokenizer.tokenize(testtext)?;
    println!("\nTokenized with loaded vocabulary: {new_tokens:?}");
    assert_eq!(tokens, new_tokens);

    // Clean up the vocabulary file
    std::fs::remove_file(vocab_path)?;

    Ok(())
}
More examples
examples/bpe_tokenization_demo.rs (lines 18-30)
// Note: imports are elided by the example scraper; the snippet relies on
// this crate's BpeTokenizer, BpeConfig, and Result types.
fn main() -> Result<()> {
    println!("Byte Pair Encoding (BPE) Tokenization Demo");
    println!("===========================================\n");

    // Create a simple corpus for training
    let corpus = [
        "Hello, this is a demonstration of BPE tokenization.",
        "BPE learns subword units by iteratively merging the most frequent pairs.",
        "It is particularly useful for languages with rich morphology.",
        "Words like 'uncommonness' can be broken into 'un', 'common', 'ness'.",
        "This improves handling of rare and out-of-vocabulary words.",
    ];

    // Configure and train the BPE tokenizer
    let mut tokenizer = BpeTokenizer::new(BpeConfig {
        vocab_size: 100,  // Small vocabulary for demonstration
        min_frequency: 2, // Only merge pairs that appear at least twice
        special_tokens: vec![
            // Add special tokens
            "<PAD>".to_string(),
            "<UNK>".to_string(),
            "<BOS>".to_string(),
            "<EOS>".to_string(),
        ],
        character_level: true, // Start with characters (not words)
        lowercase: true,       // Convert all text to lowercase
    });

    println!("Training BPE tokenizer on a small corpus...");
    tokenizer.train(&corpus)?;

    // Display vocabulary information
    let vocab_size = tokenizer.vocab_size();
    println!("Learned vocabulary size: {vocab_size}\n");

    // Tokenize some examples
    let examples = [
        "Hello world!",
        "uncommonness",
        "tokenization demonstration",
        "Out-of-vocabulary handling",
    ];

    for example in &examples {
        let tokens = tokenizer.tokenize(example)?;
        println!("Original: \"{example}\"");
        println!("Tokenized: {tokens:?}");
        println!("Token count: {}\n", tokens.len());
    }

    // Save the tokenizer's vocabulary to a file
    let vocab_path = "bpe_vocab.txt";
    tokenizer.save_vocabulary(vocab_path)?;
    println!("Saved vocabulary to {vocab_path}");

    // Load the vocabulary and tokenize again
    let mut new_tokenizer = BpeTokenizer::with_defaults();
    new_tokenizer.load_vocabulary(vocab_path)?;

    let testtext = "Hello, demonstrating vocabulary loading!";
    let tokens = new_tokenizer.tokenize(testtext)?;
    println!("\nTokenization after loading vocabulary:");
    println!("Original: \"{testtext}\"");
    println!("Tokenized: {tokens:?}");

    Ok(())
}
pub fn with_defaults() -> Self
Create a new BPE tokenizer with default configuration
Examples found in repository
examples/bpe_tokenizer_example.rs (line 45) and examples/bpe_tokenization_demo.rs (line 60); the full listings are shown under new above.
pub fn vocab_size(&self) -> usize
Get the vocabulary size
Examples found in repository
examples/bpe_tokenizer_example.rs (line 30) and examples/bpe_tokenization_demo.rs (line 36); the full listings are shown under new above.
pub fn has_vocabulary(&self) -> bool
Check if the tokenizer has a vocabulary
pub fn vocabulary(&self) -> Option<&BpeVocabulary>
Get a reference to the tokenizer’s vocabulary
pub fn set_vocabulary(&mut self, vocabulary: BpeVocabulary)
Set the tokenizer’s vocabulary
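Together with has_vocabulary and vocabulary, this allows a trained vocabulary to be transplanted into another tokenizer without retraining. A minimal sketch, assuming BpeVocabulary implements Clone (an assumption; this page does not document its traits):
// Hypothetical helper: copy a trained vocabulary into a fresh tokenizer.
// Assumes `trained` already has a vocabulary (e.g. after `train`) and that
// BpeVocabulary implements Clone; this is an assumption, not confirmed here.
fn clone_into_fresh(trained: &BpeTokenizer) -> Option<BpeTokenizer> {
    let vocab = trained.vocabulary()?.clone();
    let mut fresh = BpeTokenizer::with_defaults();
    fresh.set_vocabulary(vocab);
    debug_assert!(fresh.has_vocabulary());
    Some(fresh)
}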
pub fn save_vocabulary(&self, path: impl AsRef<Path>) -> Result<()>
Save the tokenizer’s vocabulary to a file
Examples found in repository
examples/bpe_tokenizer_example.rs (line 41) and examples/bpe_tokenization_demo.rs (line 56); the full listings are shown under new above.
pub fn load_vocabulary(&mut self, path: impl AsRef<Path>) -> Result<()>
Load the tokenizer’s vocabulary from a file
Examples found in repository
examples/bpe_tokenizer_example.rs (line 46) and examples/bpe_tokenization_demo.rs (line 61); the full listings are shown under new above.
pub fn train(&mut self, corpus: &[&str]) -> Result<()>
Train the BPE tokenizer on a corpus
Examples found in repository
examples/bpe_tokenizer_example.rs (line 28) and examples/bpe_tokenization_demo.rs (line 33); the full listings are shown under new above.
Trait Implementations
impl Clone for BpeTokenizer
fn clone(&self) -> BpeTokenizer
Returns a duplicate of the value.
1.0.0 · fn clone_from(&mut self, source: &Self)
Performs copy-assignment from source.
impl Debug for BpeTokenizer
Auto Trait Implementations
impl Freeze for BpeTokenizer
impl RefUnwindSafe for BpeTokenizer
impl Send for BpeTokenizer
impl Sync for BpeTokenizer
impl Unpin for BpeTokenizer
impl UnwindSafe for BpeTokenizer
Blanket Implementations
impl<T> BorrowMut<T> for T
where
    T: ?Sized,
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value.
impl<T> CloneToUninit for T
where
    T: Clone,
impl<T> IntoEither for T
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left is true. Converts self into a Right variant of Either<Self, Self> otherwise.
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true. Converts self into a Right variant of Either<Self, Self> otherwise.
impl<T> Pointable for T
impl<SS, SP> SupersetOf<SS> for SP
where
    SS: SubsetOf<SP>,
fn to_subset(&self) -> Option<SS>
The inverse inclusion map: attempts to construct self from the equivalent element of its superset.
fn is_in_subset(&self) -> bool
Checks if self is actually part of its subset T (and can be converted to it).
fn to_subset_unchecked(&self) -> SS
Use with care! Same as self.to_subset but without any property checks. Always succeeds.
fn from_subset(element: &SS) -> SP
The inclusion map: converts self to the equivalent element of its superset.