use tokie::Tokenizer;
fn main() {
let mut tokenizer = Tokenizer::from_pretrained("bert-base-uncased").unwrap();
tokenizer.enable_truncation(tokie::TruncationParams {
max_length: 10,
..Default::default()
});
let enc = tokenizer.encode("This is a longer sentence that will be truncated to fit", true);
println!("Truncated to 10 tokens: {:?}", enc.ids);
println!("Length: {}", enc.ids.len());
tokenizer.enable_padding(tokie::PaddingParams {
strategy: tokie::PaddingStrategy::Fixed(16),
pad_id: tokenizer.pad_token_id().unwrap_or(0),
..Default::default()
});
let texts = &["Hello world", "Short", "A much longer sentence for testing purposes"];
let batch = tokenizer.encode_batch(texts, true);
println!("\nBatch encoding (truncated to 10, padded to 16):");
for (i, enc) in batch.iter().enumerate() {
println!(" [{}] ids={:?}", i, enc.ids);
println!(" mask={:?}", enc.attention_mask);
}
assert!(batch.iter().all(|e| e.len() == 16));
println!("\nAll sequences padded to length 16");
tokenizer.no_padding(); let pair = tokenizer.encode_pair("What is machine learning?", "Machine learning is a subset of AI.", true);
println!("\nPair encoding:");
println!(" ids: {:?}", pair.ids);
println!(" type_ids: {:?}", pair.type_ids);
println!(" Length: {} (truncated to max 10)", pair.ids.len());
tokenizer.no_truncation();
tokenizer.no_padding();
let enc = tokenizer.encode("Back to normal — no truncation or padding", true);
println!("\nAfter reset: {} tokens", enc.ids.len());
}