tokenizers 0.22.2

Provides an implementation of today's most used tokenizers, with a focus on performance and versatility.
Documentation

Example — batch-encode a text file, line by line, with a pretrained tokenizer:
use tokenizers::Tokenizer;

/// Example: download a pretrained tokenizer from the Hugging Face Hub and
/// batch-encode a local text file, one input per line.
///
/// Returns an error if the model download, the file read, or the encoding
/// step fails.
fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
    // Fetch the tokenizer definition for the given model id from the Hub.
    let tokenizer = Tokenizer::from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", None)?;

    // Read the whole corpus and treat each line as one input sequence.
    let contents = std::fs::read_to_string("data/big.txt")?;
    let inputs: Vec<_> = contents.lines().collect();

    // Encode the entire batch at once; `false` = do not add special tokens.
    tokenizer.encode_batch_char_offsets(inputs, false)?;
    Ok(())
}