use scirs2_text::{BpeConfig, BpeTokenizer, Result, Tokenizer};
use std::path::Path;
#[allow(dead_code)] // example binary; silences lints when built as part of the crate's example set
fn main() -> Result<()> {
    // Small training corpus; the overlapping words across sentences give the
    // BPE trainer frequent character pairs to merge into subword units.
    let corpus = [
        "this is a test sentence for bpe tokenization",
        "another test sentence with some overlapping words",
        "bpe works by merging common character pairs",
        "the algorithm builds a vocabulary of subword units",
        "these subword tokens can handle out-of-vocabulary words",
    ];

    println!("Training BPE tokenizer...");
    let config = BpeConfig {
        vocab_size: 100,  // cap on the learned (base + merged) vocabulary
        min_frequency: 1, // merge any pair seen at least once in this tiny corpus
        special_tokens: vec!["<pad>".to_string(), "<unk>".to_string()],
        ..Default::default()
    };
    let mut tokenizer = BpeTokenizer::new(config);
    tokenizer.train(&corpus)?;
    println!("Vocabulary size: {}", tokenizer.vocab_size());

    // Tokenize a sentence containing words not seen verbatim during training;
    // BPE should fall back to subword pieces rather than failing.
    let test_text = "this is an unseen sentence with some new words";
    let tokens = tokenizer.tokenize(test_text)?;
    println!("\nInput text: {test_text}");
    println!("Tokenized: {tokens:?}");

    // Round-trip the vocabulary through disk: save, reload into a fresh
    // tokenizer, and verify the loaded tokenizer produces identical output.
    let vocab_path = Path::new("bpe_vocab.json");
    tokenizer.save_vocabulary(vocab_path)?;
    println!("\nVocabulary saved to: {vocab_path:?}");

    // Defer error propagation for the load + re-tokenize steps so the
    // temporary vocabulary file is removed even when one of them fails
    // (previously a failure here leaked bpe_vocab.json on disk).
    let mut new_tokenizer = BpeTokenizer::with_defaults();
    let round_trip = new_tokenizer
        .load_vocabulary(vocab_path)
        .and_then(|_| new_tokenizer.tokenize(test_text));
    std::fs::remove_file(vocab_path)?;
    let new_tokens = round_trip?;

    println!("\nTokenized with loaded vocabulary: {new_tokens:?}");
    // The reloaded vocabulary must reproduce the original tokenization exactly.
    assert_eq!(tokens, new_tokens);
    Ok(())
}