Struct BpeTokenizer

pub struct BpeTokenizer { /* private fields */ }

A Byte Pair Encoding (BPE) tokenizer

BPE is a subword tokenization algorithm that iteratively merges the most frequent adjacent pairs of tokens (initially bytes or characters) into new, larger tokens.
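To make the merge procedure concrete, the following is a minimal, self-contained sketch of the core training loop, independent of this crate's implementation: count adjacent token pairs, merge the most frequent pair into a single token, and repeat.

use std::collections::HashMap;

// Find the most frequent adjacent pair of tokens, if any.
fn most_frequent_pair(tokens: &[String]) -> Option<(String, String)> {
    let mut counts: HashMap<(String, String), usize> = HashMap::new();
    for pair in tokens.windows(2) {
        *counts.entry((pair[0].clone(), pair[1].clone())).or_insert(0) += 1;
    }
    counts.into_iter().max_by_key(|&(_, count)| count).map(|(pair, _)| pair)
}

// Replace every occurrence of `pair` with the concatenated token.
fn merge_pair(tokens: Vec<String>, pair: &(String, String)) -> Vec<String> {
    let mut merged = Vec::with_capacity(tokens.len());
    let mut i = 0;
    while i < tokens.len() {
        if i + 1 < tokens.len() && tokens[i] == pair.0 && tokens[i + 1] == pair.1 {
            merged.push(format!("{}{}", pair.0, pair.1));
            i += 2;
        } else {
            merged.push(tokens[i].clone());
            i += 1;
        }
    }
    merged
}

fn main() {
    // Start from characters; each merge step adds one new subword token.
    let mut tokens: Vec<String> = "abababc".chars().map(|c| c.to_string()).collect();
    for _ in 0..2 {
        if let Some(pair) = most_frequent_pair(&tokens) {
            tokens = merge_pair(tokens, &pair);
        }
    }
    // First merge: "a"+"b" -> "ab"; second merge: "ab"+"ab" -> "abab".
    println!("{tokens:?}"); // ["abab", "ab", "c"]
}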

Implementations

impl BpeTokenizer

pub fn new(config: BpeConfig) -> Self

Create a new BPE tokenizer with the given configuration

Examples found in repository: examples/bpe_tokenizer_example.rs (line 25)
5fn main() -> Result<()> {
6    // Example corpus for training the tokenizer
7    let corpus = [
8        "this is a test sentence for bpe tokenization",
9        "another test sentence with some overlapping words",
10        "bpe works by merging common character pairs",
11        "the algorithm builds a vocabulary of subword units",
12        "these subword tokens can handle out-of-vocabulary words",
13    ];
14
15    println!("Training BPE tokenizer...");
16
17    // Create a BPE tokenizer with custom configuration
18    let config = BpeConfig {
19        vocab_size: 100,
20        min_frequency: 1,
21        special_tokens: vec!["<pad>".to_string(), "<unk>".to_string()],
22        ..Default::default()
23    };
24
25    let mut tokenizer = BpeTokenizer::new(config);
26
27    // Train the tokenizer on the corpus
28    tokenizer.train(&corpus)?;
29
30    println!("Vocabulary size: {}", tokenizer.vocab_size());
31
32    // Test the tokenizer on a new sentence
33    let test_text = "this is an unseen sentence with some new words";
34    let tokens = tokenizer.tokenize(test_text)?;
35
36    println!("\nInput text: {test_text}");
37    println!("Tokenized: {tokens:?}");
38
39    // Save the vocabulary for later use
40    let vocab_path = Path::new("bpe_vocab.json");
41    tokenizer.save_vocabulary(vocab_path)?;
42    println!("\nVocabulary saved to: {vocab_path:?}");
43
44    // Create a new tokenizer and load the saved vocabulary
45    let mut new_tokenizer = BpeTokenizer::with_defaults();
46    new_tokenizer.load_vocabulary(vocab_path)?;
47
48    // Test that the loaded tokenizer produces the same tokens
49    let new_tokens = new_tokenizer.tokenize(test_text)?;
50    println!("\nTokenized with loaded vocabulary: {new_tokens:?}");
51    assert_eq!(tokens, new_tokens);
52
53    // Clean up the vocabulary file
54    std::fs::remove_file(vocab_path)?;
55
56    Ok(())
57}
More examples: examples/bpe_tokenization_demo.rs (lines 18-30)
4fn main() -> Result<()> {
5    println!("Byte Pair Encoding (BPE) Tokenization Demo");
6    println!("===========================================\n");
7
8    // Create a simple corpus for training
9    let corpus = [
10        "Hello, this is a demonstration of BPE tokenization.",
11        "BPE learns subword units by iteratively merging the most frequent pairs.",
12        "It is particularly useful for languages with rich morphology.",
13        "Words like 'uncommonness' can be broken into 'un', 'common', 'ness'.",
14        "This improves handling of rare and out-of-vocabulary words.",
15    ];
16
17    // Configure and train the BPE tokenizer
18    let mut tokenizer = BpeTokenizer::new(BpeConfig {
19        vocab_size: 100,  // Small vocabulary for demonstration
20        min_frequency: 2, // Only merge pairs that appear at least twice
21        special_tokens: vec![
22            // Add special tokens
23            "<PAD>".to_string(),
24            "<UNK>".to_string(),
25            "<BOS>".to_string(),
26            "<EOS>".to_string(),
27        ],
28        character_level: true, // Start with characters (not words)
29        lowercase: true,       // Convert all text to lowercase
30    });
31
32    println!("Training BPE tokenizer on a small corpus...");
33    tokenizer.train(&corpus)?;
34
35    // Display vocabulary information
36    let vocab_size = tokenizer.vocab_size();
37    println!("Learned vocabulary size: {vocab_size}\n");
38
39    // Tokenize some examples
40    let examples = [
41        "Hello world!",
42        "uncommonness",
43        "tokenization demonstration",
44        "Out-of-vocabulary handling",
45    ];
46
47    for example in &examples {
48        let tokens = tokenizer.tokenize(example)?;
49        println!("Original: \"{example}\"");
50        println!("Tokenized: {tokens:?}");
51        println!("Token count: {}\n", tokens.len());
52    }
53
54    // Save the tokenizer's vocabulary to a file
55    let vocab_path = "bpe_vocab.txt";
56    tokenizer.save_vocabulary(vocab_path)?;
57    println!("Saved vocabulary to {vocab_path}");
58
59    // Load the vocabulary and tokenize again
60    let mut new_tokenizer = BpeTokenizer::with_defaults();
61    new_tokenizer.load_vocabulary(vocab_path)?;
62
63    let test_text = "Hello, demonstrating vocabulary loading!";
64    let tokens = new_tokenizer.tokenize(test_text)?;
65    println!("\nTokenization after loading vocabulary:");
66    println!("Original: \"{test_text}\"");
67    println!("Tokenized: {tokens:?}");
68
69    Ok(())
70}
pub fn with_defaults() -> Self

Create a new BPE tokenizer with default configuration

Examples found in repository: examples/bpe_tokenizer_example.rs (line 45) and examples/bpe_tokenization_demo.rs (line 60); see the full listings under new above.

pub fn vocab_size(&self) -> usize

Get the vocabulary size

Examples found in repository: examples/bpe_tokenizer_example.rs (line 30) and examples/bpe_tokenization_demo.rs (line 36); see the full listings under new above.

pub fn has_vocabulary(&self) -> bool

Check if the tokenizer has a vocabulary

pub fn vocabulary(&self) -> Option<&BpeVocabulary>

Get a reference to the tokenizer’s vocabulary

pub fn set_vocabulary(&mut self, vocabulary: BpeVocabulary)

Set the tokenizer’s vocabulary
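A brief sketch of how these three accessors compose, assuming a trained tokenizer, a second fresh tokenizer named other, and that BpeVocabulary implements Clone (an assumption, not confirmed by these docs):

// Hypothetical usage: `tokenizer` is trained, `other` is a fresh BpeTokenizer,
// and BpeVocabulary is assumed to implement Clone.
assert!(tokenizer.has_vocabulary());
if let Some(vocab) = tokenizer.vocabulary() {
    // Transfer the learned vocabulary without retraining.
    other.set_vocabulary(vocab.clone());
    assert!(other.has_vocabulary());
}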

pub fn save_vocabulary(&self, path: impl AsRef<Path>) -> Result<()>

Save the tokenizer’s vocabulary to a file

Examples found in repository: examples/bpe_tokenizer_example.rs (line 41) and examples/bpe_tokenization_demo.rs (line 56); see the full listings under new above.

pub fn load_vocabulary(&mut self, path: impl AsRef<Path>) -> Result<()>

Load the tokenizer’s vocabulary from a file

Examples found in repository: examples/bpe_tokenizer_example.rs (line 46) and examples/bpe_tokenization_demo.rs (line 61); see the full listings under new above.

pub fn train(&mut self, corpus: &[&str]) -> Result<()>

Train the BPE tokenizer on a corpus

Examples found in repository: examples/bpe_tokenizer_example.rs (line 28) and examples/bpe_tokenization_demo.rs (line 33); see the full listings under new above.

Trait Implementations

impl Clone for BpeTokenizer

fn clone(&self) -> BpeTokenizer

Returns a duplicate of the value. Read more

1.0.0 · fn clone_from(&mut self, source: &Self)

Performs copy-assignment from source. Read more

impl Debug for BpeTokenizer

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Formats the value using the given formatter. Read more

impl Tokenizer for BpeTokenizer

fn tokenize(&self, text: &str) -> Result<Vec<String>>

Tokenize the input text into tokens

fn clone_box(&self) -> Box<dyn Tokenizer + Send + Sync>

Clone the tokenizer (for use in parallel processing)

fn tokenize_batch(&self, texts: &[&str]) -> Result<Vec<Vec<String>>>

Tokenize a batch of texts
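Because clone_box returns a boxed trait object that is Send + Sync, the tokenizer can be duplicated and moved into worker threads, while tokenize_batch handles the simple case of many inputs in one call. A minimal sketch, assuming a trained tokenizer named tokenizer, the Tokenizer trait in scope, and an enclosing function that returns this crate's Result:

use std::thread;

// `tokenizer` is assumed to be a trained BpeTokenizer.
let shared: Box<dyn Tokenizer + Send + Sync> = tokenizer.clone_box();

// Tokenize several texts in one call.
let batches = shared.tokenize_batch(&["first text", "second text"])?;
assert_eq!(batches.len(), 2);

// Clone the boxed tokenizer for each worker when processing in parallel.
let worker = shared.clone_box();
let handle = thread::spawn(move || worker.tokenize("text handled off-thread"));
let tokens = handle.join().expect("worker thread panicked")?;
println!("{tokens:?}");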

Auto Trait Implementations

Blanket Implementations

impl<T> Any for T
where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more

impl<T> Borrow<T> for T
where T: ?Sized,

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more

impl<T> BorrowMut<T> for T
where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more

impl<T> CloneToUninit for T
where T: Clone,

unsafe fn clone_to_uninit(&self, dest: *mut u8)

🔬 This is a nightly-only experimental API. (clone_to_uninit)
Performs copy-assignment from self to dest. Read more

impl<T> From<T> for T

fn from(t: T) -> T

Returns the argument unchanged.

impl<T, U> Into<U> for T
where U: From<T>,

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

impl<T> IntoEither for T

fn into_either(self, into_left: bool) -> Either<Self, Self>

Converts self into a Left variant of Either<Self, Self> if into_left is true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more

impl<T> Pointable for T

const ALIGN: usize

The alignment of the pointer.

type Init = T

The type for initializers.

unsafe fn init(init: <T as Pointable>::Init) -> usize

Initializes a value with the given initializer. Read more

unsafe fn deref<'a>(ptr: usize) -> &'a T

Dereferences the given pointer. Read more

unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T

Mutably dereferences the given pointer. Read more

unsafe fn drop(ptr: usize)

Drops the object pointed to by the given pointer. Read more

impl<T> Same for T

type Output = T

Should always be Self

impl<SS, SP> SupersetOf<SS> for SP
where SS: SubsetOf<SP>,

fn to_subset(&self) -> Option<SS>

The inverse inclusion map: attempts to construct self from the equivalent element of its superset. Read more

fn is_in_subset(&self) -> bool

Checks if self is actually part of its subset T (and can be converted to it).

fn to_subset_unchecked(&self) -> SS

Use with care! Same as self.to_subset but without any property checks. Always succeeds.

fn from_subset(element: &SS) -> SP

The inclusion map: converts self to the equivalent element of its superset.

impl<T> ToOwned for T
where T: Clone,

type Owned = T

The resulting type after obtaining ownership.

fn to_owned(&self) -> T

Creates owned data from borrowed data, usually by cloning. Read more

fn clone_into(&self, target: &mut T)

Uses borrowed data to replace owned data, usually by cloning. Read more

impl<T, U> TryFrom<U> for T
where U: Into<T>,

type Error = Infallible

The type returned in the event of a conversion error.

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

fn vzip(self) -> V