Struct BpeTokenizer

pub struct BpeTokenizer { /* private fields */ }

A Byte Pair Encoding (BPE) tokenizer

BPE is a subword tokenization algorithm that iteratively merges the most frequent adjacent pairs of tokens (initially bytes or characters) into new, larger tokens.
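To make the merge procedure concrete, the following is a minimal, self-contained sketch of the core training loop, independent of this crate's implementation: count adjacent token pairs, merge the most frequent pair into a single token, and repeat.

use std::collections::HashMap;

// Find the most frequent adjacent pair of tokens, if any.
fn most_frequent_pair(tokens: &[String]) -> Option<(String, String)> {
    let mut counts: HashMap<(String, String), usize> = HashMap::new();
    for pair in tokens.windows(2) {
        *counts.entry((pair[0].clone(), pair[1].clone())).or_insert(0) += 1;
    }
    counts.into_iter().max_by_key(|&(_, count)| count).map(|(pair, _)| pair)
}

// Replace every occurrence of `pair` with the concatenated token.
fn merge_pair(tokens: Vec<String>, pair: &(String, String)) -> Vec<String> {
    let mut merged = Vec::with_capacity(tokens.len());
    let mut i = 0;
    while i < tokens.len() {
        if i + 1 < tokens.len() && tokens[i] == pair.0 && tokens[i + 1] == pair.1 {
            merged.push(format!("{}{}", pair.0, pair.1));
            i += 2;
        } else {
            merged.push(tokens[i].clone());
            i += 1;
        }
    }
    merged
}

fn main() {
    // Start from characters; each merge step adds one new subword token.
    let mut tokens: Vec<String> = "abababc".chars().map(|c| c.to_string()).collect();
    for _ in 0..2 {
        if let Some(pair) = most_frequent_pair(&tokens) {
            tokens = merge_pair(tokens, &pair);
        }
    }
    // First merge: "a"+"b" -> "ab"; second merge: "ab"+"ab" -> "abab".
    println!("{tokens:?}"); // ["abab", "ab", "c"]
}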

Implementations

impl BpeTokenizer

pub fn new(config: BpeConfig) -> Self

Create a new BPE tokenizer with the given configuration

Examples found in repository: examples/bpe_tokenizer_example.rs (line 25)
5fn main() -> Result<()> {
6    // Example corpus for training the tokenizer
7    let corpus = [
8        "this is a test sentence for bpe tokenization",
9        "another test sentence with some overlapping words",
10        "bpe works by merging common character pairs",
11        "the algorithm builds a vocabulary of subword units",
12        "these subword tokens can handle out-of-vocabulary words",
13    ];
14
15    println!("Training BPE tokenizer...");
16
17    // Create a BPE tokenizer with custom configuration
18    let config = BpeConfig {
19        vocab_size: 100,
20        min_frequency: 1,
21        special_tokens: vec!["<pad>".to_string(), "<unk>".to_string()],
22        ..Default::default()
23    };
24
25    let mut tokenizer = BpeTokenizer::new(config);
26
27    // Train the tokenizer on the corpus
28    tokenizer.train(&corpus)?;
29
30    println!("Vocabulary size: {}", tokenizer.vocab_size());
31
32    // Test the tokenizer on a new sentence
33    let test_text = "this is an unseen sentence with some new words";
34    let tokens = tokenizer.tokenize(test_text)?;
35
36    println!("\nInput text: {test_text}");
37    println!("Tokenized: {tokens:?}");
38
39    // Save the vocabulary for later use
40    let vocab_path = Path::new("bpe_vocab.json");
41    tokenizer.save_vocabulary(vocab_path)?;
42    println!("\nVocabulary saved to: {vocab_path:?}");
43
44    // Create a new tokenizer and load the saved vocabulary
45    let mut new_tokenizer = BpeTokenizer::with_defaults();
46    new_tokenizer.load_vocabulary(vocab_path)?;
47
48    // Test that the loaded tokenizer produces the same tokens
49    let new_tokens = new_tokenizer.tokenize(test_text)?;
50    println!("\nTokenized with loaded vocabulary: {new_tokens:?}");
51    assert_eq!(tokens, new_tokens);
52
53    // Clean up the vocabulary file
54    std::fs::remove_file(vocab_path)?;
55
56    Ok(())
57}
More examples: examples/bpe_tokenization_demo.rs (lines 18-30)
4fn main() -> Result<()> {
5    println!("Byte Pair Encoding (BPE) Tokenization Demo");
6    println!("===========================================\n");
7
8    // Create a simple corpus for training
9    let corpus = [
10        "Hello, this is a demonstration of BPE tokenization.",
11        "BPE learns subword units by iteratively merging the most frequent pairs.",
12        "It is particularly useful for languages with rich morphology.",
13        "Words like 'uncommonness' can be broken into 'un', 'common', 'ness'.",
14        "This improves handling of rare and out-of-vocabulary words.",
15    ];
16
17    // Configure and train the BPE tokenizer
18    let mut tokenizer = BpeTokenizer::new(BpeConfig {
19        vocab_size: 100,  // Small vocabulary for demonstration
20        min_frequency: 2, // Only merge pairs that appear at least twice
21        special_tokens: vec![
22            // Add special tokens
23            "<PAD>".to_string(),
24            "<UNK>".to_string(),
25            "<BOS>".to_string(),
26            "<EOS>".to_string(),
27        ],
28        character_level: true, // Start with characters (not words)
29        lowercase: true,       // Convert all text to lowercase
30    });
31
32    println!("Training BPE tokenizer on a small corpus...");
33    tokenizer.train(&corpus)?;
34
35    // Display vocabulary information
36    let vocab_size = tokenizer.vocab_size();
37    println!("Learned vocabulary size: {vocab_size}\n");
38
39    // Tokenize some examples
40    let examples = [
41        "Hello world!",
42        "uncommonness",
43        "tokenization demonstration",
44        "Out-of-vocabulary handling",
45    ];
46
47    for example in &examples {
48        let tokens = tokenizer.tokenize(example)?;
49        println!("Original: \"{example}\"");
50        println!("Tokenized: {tokens:?}");
51        println!("Token count: {}\n", tokens.len());
52    }
53
54    // Save the tokenizer's vocabulary to a file
55    let vocab_path = "bpe_vocab.txt";
56    tokenizer.save_vocabulary(vocab_path)?;
57    println!("Saved vocabulary to {vocab_path}");
58
59    // Load the vocabulary and tokenize again
60    let mut new_tokenizer = BpeTokenizer::with_defaults();
61    new_tokenizer.load_vocabulary(vocab_path)?;
62
63    let test_text = "Hello, demonstrating vocabulary loading!";
64    let tokens = new_tokenizer.tokenize(test_text)?;
65    println!("\nTokenization after loading vocabulary:");
66    println!("Original: \"{test_text}\"");
67    println!("Tokenized: {tokens:?}");
68
69    Ok(())
70}
pub fn with_defaults() -> Self

Create a new BPE tokenizer with default configuration

Examples found in repository: examples/bpe_tokenizer_example.rs (line 45) and examples/bpe_tokenization_demo.rs (line 60); see the full listings under new above.

pub fn vocab_size(&self) -> usize

Get the vocabulary size

Examples found in repository: examples/bpe_tokenizer_example.rs (line 30) and examples/bpe_tokenization_demo.rs (line 36); see the full listings under new above.

pub fn has_vocabulary(&self) -> bool

Check if the tokenizer has a vocabulary

pub fn vocabulary(&self) -> Option<&BpeVocabulary>

Get a reference to the tokenizer’s vocabulary

pub fn set_vocabulary(&mut self, vocabulary: BpeVocabulary)

Set the tokenizer’s vocabulary
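A brief sketch of how these three accessors compose, assuming a trained tokenizer, a second fresh tokenizer named other, and that BpeVocabulary implements Clone (an assumption, not confirmed by these docs):

// Hypothetical usage: `tokenizer` is trained, `other` is a fresh BpeTokenizer,
// and BpeVocabulary is assumed to implement Clone.
assert!(tokenizer.has_vocabulary());
if let Some(vocab) = tokenizer.vocabulary() {
    // Transfer the learned vocabulary without retraining.
    other.set_vocabulary(vocab.clone());
    assert!(other.has_vocabulary());
}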

pub fn save_vocabulary(&self, path: impl AsRef<Path>) -> Result<()>

Save the tokenizer’s vocabulary to a file

Examples found in repository: examples/bpe_tokenizer_example.rs (line 41) and examples/bpe_tokenization_demo.rs (line 56); see the full listings under new above.

pub fn load_vocabulary(&mut self, path: impl AsRef<Path>) -> Result<()>

Load the tokenizer’s vocabulary from a file

Examples found in repository: examples/bpe_tokenizer_example.rs (line 46) and examples/bpe_tokenization_demo.rs (line 61); see the full listings under new above.

pub fn train(&mut self, corpus: &[&str]) -> Result<()>

Train the BPE tokenizer on a corpus

Examples found in repository: examples/bpe_tokenizer_example.rs (line 28) and examples/bpe_tokenization_demo.rs (line 33); see the full listings under new above.

Trait Implementations

impl Clone for BpeTokenizer

fn clone(&self) -> BpeTokenizer

Returns a duplicate of the value. Read more

1.0.0 · fn clone_from(&mut self, source: &Self)

Performs copy-assignment from source. Read more

impl Debug for BpeTokenizer

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Formats the value using the given formatter. Read more

impl Tokenizer for BpeTokenizer

fn tokenize(&self, text: &str) -> Result<Vec<String>>

Tokenize the input text into tokens

fn clone_box(&self) -> Box<dyn Tokenizer + Send + Sync>

Clone the tokenizer (for use in parallel processing)

fn tokenize_batch(&self, texts: &[&str]) -> Result<Vec<Vec<String>>>

Tokenize a batch of texts
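Because clone_box returns a boxed trait object that is Send + Sync, the tokenizer can be duplicated and moved into worker threads, while tokenize_batch handles the simple case of many inputs in one call. A minimal sketch, assuming a trained tokenizer named tokenizer, the Tokenizer trait in scope, and an enclosing function that returns this crate's Result:

use std::thread;

// `tokenizer` is assumed to be a trained BpeTokenizer.
let shared: Box<dyn Tokenizer + Send + Sync> = tokenizer.clone_box();

// Tokenize several texts in one call.
let batches = shared.tokenize_batch(&["first text", "second text"])?;
assert_eq!(batches.len(), 2);

// Clone the boxed tokenizer for each worker when processing in parallel.
let worker = shared.clone_box();
let handle = thread::spawn(move || worker.tokenize("text handled off-thread"));
let tokens = handle.join().expect("worker thread panicked")?;
println!("{tokens:?}");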

Auto Trait Implementations

Blanket Implementations

impl<T> Any for T
where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more

impl<T> Borrow<T> for T
where T: ?Sized,

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more

impl<T> BorrowMut<T> for T
where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more

impl<T> CloneToUninit for T
where T: Clone,

unsafe fn clone_to_uninit(&self, dest: *mut u8)

🔬 This is a nightly-only experimental API. (clone_to_uninit)
Performs copy-assignment from self to dest. Read more

impl<T> From<T> for T

fn from(t: T) -> T

Returns the argument unchanged.

impl<T, U> Into<U> for T
where U: From<T>,

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

impl<T> IntoEither for T

fn into_either(self, into_left: bool) -> Either<Self, Self>

Converts self into a Left variant of Either<Self, Self> if into_left is true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more

impl<T> Pointable for T

const ALIGN: usize

The alignment of the pointer.

type Init = T

The type for initializers.

unsafe fn init(init: <T as Pointable>::Init) -> usize

Initializes a value with the given initializer. Read more

unsafe fn deref<'a>(ptr: usize) -> &'a T

Dereferences the given pointer. Read more

unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T

Mutably dereferences the given pointer. Read more

unsafe fn drop(ptr: usize)

Drops the object pointed to by the given pointer. Read more

impl<T> Same for T

type Output = T

Should always be Self

impl<SS, SP> SupersetOf<SS> for SP
where SS: SubsetOf<SP>,

fn to_subset(&self) -> Option<SS>

The inverse inclusion map: attempts to construct self from the equivalent element of its superset. Read more

fn is_in_subset(&self) -> bool

Checks if self is actually part of its subset T (and can be converted to it).

fn to_subset_unchecked(&self) -> SS

Use with care! Same as self.to_subset but without any property checks. Always succeeds.

fn from_subset(element: &SS) -> SP

The inclusion map: converts self to the equivalent element of its superset.

impl<T> ToOwned for T
where T: Clone,

type Owned = T

The resulting type after obtaining ownership.

fn to_owned(&self) -> T

Creates owned data from borrowed data, usually by cloning. Read more

fn clone_into(&self, target: &mut T)

Uses borrowed data to replace owned data, usually by cloning. Read more

impl<T, U> TryFrom<U> for T
where U: Into<T>,

type Error = Infallible

The type returned in the event of a conversion error.

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

fn vzip(self) -> V