bpe_tokenizer_example/
bpe_tokenizer_example.rs

1use scirs2_text::{BpeConfig, BpeTokenizer, Result, Tokenizer};
2use std::path::Path;
3
4#[allow(dead_code)]
5fn main() -> Result<()> {
6    // Example corpus for training the tokenizer
7    let corpus = [
8        "this is a test sentence for bpe tokenization",
9        "another test sentence with some overlapping words",
10        "bpe works by merging common character pairs",
11        "the algorithm builds a vocabulary of subword units",
12        "these subword tokens can handle out-of-vocabulary words",
13    ];
14
15    println!("Training BPE tokenizer...");
16
17    // Create a BPE tokenizer with custom configuration
18    let config = BpeConfig {
19        vocab_size: 100,
20        min_frequency: 1,
21        special_tokens: vec!["<pad>".to_string(), "<unk>".to_string()],
22        ..Default::default()
23    };
24
25    let mut tokenizer = BpeTokenizer::new(config);
26
27    // Train the tokenizer on the corpus
28    tokenizer.train(&corpus)?;
29
30    println!("Vocabulary size: {}", tokenizer.vocab_size());
31
32    // Test the tokenizer on a new sentence
33    let testtext = "this is an unseen sentence with some new words";
34    let tokens = tokenizer.tokenize(testtext)?;
35
36    println!("\nInput text: {testtext}");
37    println!("Tokenized: {tokens:?}");
38
39    // Save the vocabulary for later use
40    let vocab_path = Path::new("bpe_vocab.json");
41    tokenizer.save_vocabulary(vocab_path)?;
42    println!("\nVocabulary saved to: {vocab_path:?}");
43
44    // Create a new tokenizer and load the saved vocabulary
45    let mut new_tokenizer = BpeTokenizer::with_defaults();
46    new_tokenizer.load_vocabulary(vocab_path)?;
47
48    // Test that the loaded tokenizer produces the same tokens
49    let new_tokens = new_tokenizer.tokenize(testtext)?;
50    println!("\nTokenized with loaded vocabulary: {new_tokens:?}");
51    assert_eq!(tokens, new_tokens);
52
53    // Clean up the vocabulary file
54    std::fs::remove_file(vocab_path)?;
55
56    Ok(())
57}