//! Example: `bpe_tokenization_demo/bpe_tokenization_demo.rs`
//!
//! Demonstrates training a Byte Pair Encoding (BPE) tokenizer on a tiny
//! corpus, tokenizing sample strings, and round-tripping the learned
//! vocabulary through a file.

1use scirs2_text::{BpeConfig, BpeTokenizer, Result, Tokenizer};
2
3#[allow(dead_code)]
4fn main() -> Result<()> {
5    println!("Byte Pair Encoding (BPE) Tokenization Demo");
6    println!("===========================================\n");
7
8    // Create a simple corpus for training
9    let corpus = [
10        "Hello, this is a demonstration of BPE tokenization.",
11        "BPE learns subword units by iteratively merging the most frequent pairs.",
12        "It is particularly useful for languages with rich morphology.",
13        "Words like 'uncommonness' can be broken into 'un', 'common', 'ness'.",
14        "This improves handling of rare and out-of-vocabulary words.",
15    ];
16
17    // Configure and train the BPE tokenizer
18    let mut tokenizer = BpeTokenizer::new(BpeConfig {
19        vocab_size: 100,  // Small vocabulary for demonstration
20        min_frequency: 2, // Only merge pairs that appear at least twice
21        special_tokens: vec![
22            // Add special tokens
23            "<PAD>".to_string(),
24            "<UNK>".to_string(),
25            "<BOS>".to_string(),
26            "<EOS>".to_string(),
27        ],
28        character_level: true, // Start with characters (not words)
29        lowercase: true,       // Convert all text to lowercase
30    });
31
32    println!("Training BPE tokenizer on a small corpus...");
33    tokenizer.train(&corpus)?;
34
35    // Display vocabulary information
36    let vocab_size = tokenizer.vocab_size();
37    println!("Learned vocabulary size: {vocab_size}\n");
38
39    // Tokenize some examples
40    let examples = [
41        "Hello world!",
42        "uncommonness",
43        "tokenization demonstration",
44        "Out-of-vocabulary handling",
45    ];
46
47    for example in &examples {
48        let tokens = tokenizer.tokenize(example)?;
49        println!("Original: \"{example}\"");
50        println!("Tokenized: {tokens:?}");
51        println!("Token count: {}\n", tokens.len());
52    }
53
54    // Save the tokenizer's vocabulary to a file
55    let vocab_path = "bpe_vocab.txt";
56    tokenizer.save_vocabulary(vocab_path)?;
57    println!("Saved vocabulary to {vocab_path}");
58
59    // Load the vocabulary and tokenize again
60    let mut new_tokenizer = BpeTokenizer::with_defaults();
61    new_tokenizer.load_vocabulary(vocab_path)?;
62
63    let testtext = "Hello, demonstrating vocabulary loading!";
64    let tokens = new_tokenizer.tokenize(testtext)?;
65    println!("\nTokenization after loading vocabulary:");
66    println!("Original: \"{testtext}\"");
67    println!("Tokenized: {tokens:?}");
68
69    Ok(())
70}