bpe_tokenizer_example/
bpe_tokenizer_example.rs1use scirs2_text::{BpeConfig, BpeTokenizer, Result, Tokenizer};
2use std::path::Path;
3
4#[allow(dead_code)]
5fn main() -> Result<()> {
6 let corpus = [
8 "this is a test sentence for bpe tokenization",
9 "another test sentence with some overlapping words",
10 "bpe works by merging common character pairs",
11 "the algorithm builds a vocabulary of subword units",
12 "these subword tokens can handle out-of-vocabulary words",
13 ];
14
15 println!("Training BPE tokenizer...");
16
17 let config = BpeConfig {
19 vocab_size: 100,
20 min_frequency: 1,
21 special_tokens: vec!["<pad>".to_string(), "<unk>".to_string()],
22 ..Default::default()
23 };
24
25 let mut tokenizer = BpeTokenizer::new(config);
26
27 tokenizer.train(&corpus)?;
29
30 println!("Vocabulary size: {}", tokenizer.vocab_size());
31
32 let testtext = "this is an unseen sentence with some new words";
34 let tokens = tokenizer.tokenize(testtext)?;
35
36 println!("\nInput text: {testtext}");
37 println!("Tokenized: {tokens:?}");
38
39 let vocab_path = Path::new("bpe_vocab.json");
41 tokenizer.save_vocabulary(vocab_path)?;
42 println!("\nVocabulary saved to: {vocab_path:?}");
43
44 let mut new_tokenizer = BpeTokenizer::with_defaults();
46 new_tokenizer.load_vocabulary(vocab_path)?;
47
48 let new_tokens = new_tokenizer.tokenize(testtext)?;
50 println!("\nTokenized with loaded vocabulary: {new_tokens:?}");
51 assert_eq!(tokens, new_tokens);
52
53 std::fs::remove_file(vocab_path)?;
55
56 Ok(())
57}