use scirs2_text::{BpeConfig, BpeTokenizer, Result, Tokenizer};

3#[allow(dead_code)]
4fn main() -> Result<()> {
5 println!("Byte Pair Encoding (BPE) Tokenization Demo");
6 println!("===========================================\n");
7
8 let corpus = [
10 "Hello, this is a demonstration of BPE tokenization.",
11 "BPE learns subword units by iteratively merging the most frequent pairs.",
12 "It is particularly useful for languages with rich morphology.",
13 "Words like 'uncommonness' can be broken into 'un', 'common', 'ness'.",
14 "This improves handling of rare and out-of-vocabulary words.",
15 ];
16
17 let mut tokenizer = BpeTokenizer::new(BpeConfig {
19 vocab_size: 100, min_frequency: 2, special_tokens: vec![
22 "<PAD>".to_string(),
24 "<UNK>".to_string(),
25 "<BOS>".to_string(),
26 "<EOS>".to_string(),
27 ],
28 character_level: true, lowercase: true, });
31
32 println!("Training BPE tokenizer on a small corpus...");
33 tokenizer.train(&corpus)?;
34
35 let vocab_size = tokenizer.vocab_size();
37 println!("Learned vocabulary size: {vocab_size}\n");
38
39 let examples = [
41 "Hello world!",
42 "uncommonness",
43 "tokenization demonstration",
44 "Out-of-vocabulary handling",
45 ];
46
47 for example in &examples {
48 let tokens = tokenizer.tokenize(example)?;
49 println!("Original: \"{example}\"");
50 println!("Tokenized: {tokens:?}");
51 println!("Token count: {}\n", tokens.len());
52 }
53
54 let vocab_path = "bpe_vocab.txt";
56 tokenizer.save_vocabulary(vocab_path)?;
57 println!("Saved vocabulary to {vocab_path}");
58
59 let mut new_tokenizer = BpeTokenizer::with_defaults();
61 new_tokenizer.load_vocabulary(vocab_path)?;
62
63 let testtext = "Hello, demonstrating vocabulary loading!";
64 let tokens = new_tokenizer.tokenize(testtext)?;
65 println!("\nTokenization after loading vocabulary:");
66 println!("Original: \"{testtext}\"");
67 println!("Tokenized: {tokens:?}");
68
69 Ok(())
70}