tokenizers 0.23.1

Provides an implementation of today's most used tokenizers, with a focus on performance and versatility.
Documentation
use std::time::{Duration, Instant};

use std::hint::black_box;

use tokenizers::{
    Decoder, EncodeInput, Model, Normalizer, PostProcessor, PreTokenizer, TokenizerImpl, Trainer,
};

/// Times `tokenizer.encode` over every entry of `lines`, repeated `iters` times.
///
/// Each input is cloned *before* the clock is started, so the copy is not part
/// of the measurement; the timed region covers only the `encode` statement
/// (called with `false` as its second argument). The per-call timings are
/// summed and returned.
///
/// # Panics
///
/// Panics if the accumulated [`Duration`] overflows.
// NOTE(review): presumably this module is shared by several benchmark targets
// and not all of them call every helper, hence the lint suppression — confirm.
#[allow(dead_code)]
pub fn iter_bench_encode<M, N, PT, PP, D>(
    iters: u64,
    tokenizer: &TokenizerImpl<M, N, PT, PP, D>,
    lines: &[EncodeInput],
) -> Duration
where
    M: Model,
    N: Normalizer,
    PT: PreTokenizer,
    PP: PostProcessor,
    D: Decoder,
{
    (0..iters)
        .flat_map(|_| lines.iter())
        .fold(Duration::ZERO, |total, line| {
            // `encode` consumes its input, so hand it an owned copy made
            // outside the timed region.
            let owned = line.clone();
            let timer = Instant::now();
            // `black_box` keeps the optimizer from discarding the unused result.
            let _ = black_box(tokenizer.encode(owned, false));
            total.checked_add(timer.elapsed()).unwrap()
        })
}

/// Times `tokenizer.encode_batch` over every batch in `batches`, repeated
/// `iters` times.
///
/// Each batch is cloned *before* the clock is started, so the copy is not part
/// of the measurement; the timed region covers only the `encode_batch`
/// statement (called with `false` as its second argument). The per-call
/// timings are summed and returned.
///
/// # Panics
///
/// Panics if the accumulated [`Duration`] overflows.
// NOTE(review): presumably this module is shared by several benchmark targets
// and not all of them call every helper, hence the lint suppression — confirm.
#[allow(dead_code)]
pub fn iter_bench_encode_batch<M, N, PT, PP, D>(
    iters: u64,
    tokenizer: &TokenizerImpl<M, N, PT, PP, D>,
    batches: &[Vec<EncodeInput>],
) -> Duration
where
    M: Model + Send + Sync,
    N: Normalizer + Send + Sync,
    PT: PreTokenizer + Send + Sync,
    PP: PostProcessor + Send + Sync,
    D: Decoder + Send + Sync,
{
    (0..iters)
        .flat_map(|_| batches.iter())
        .fold(Duration::ZERO, |total, batch| {
            // `encode_batch` takes the batch by value, so copy it outside the
            // timed region.
            let owned = batch.clone();
            let timer = Instant::now();
            // `black_box` keeps the optimizer from discarding the unused result.
            let _ = black_box(tokenizer.encode_batch(owned, false));
            total.checked_add(timer.elapsed()).unwrap()
        })
}

/// Times `tokenizer.train_from_files` with `trainer` on `files`, repeated
/// `iters` times, and returns the summed duration of the training calls.
///
/// `train_from_files` takes the file list by value, so a fresh copy is made
/// for every iteration. The copy is made *before* the clock is started so that
/// it is not counted in the measurement, matching how the encode helpers in
/// this file clone their inputs outside the timed region.
///
/// # Panics
///
/// Panics if `train_from_files` returns an error, or if the accumulated
/// [`Duration`] overflows.
// NOTE(review): presumably this module is shared by several benchmark targets
// and not all of them call every helper, hence the lint suppression — confirm.
#[allow(dead_code)]
pub fn iter_bench_train<T, M, N, PT, PP, D>(
    iters: u64,
    tokenizer: &mut TokenizerImpl<M, N, PT, PP, D>,
    trainer: &mut T,
    files: Vec<String>,
) -> Duration
where
    T: Trainer<Model = M> + Sync,
    M: Model + Send + Sync,
    N: Normalizer + Send + Sync,
    PT: PreTokenizer + Send + Sync,
    PP: PostProcessor + Send + Sync,
    D: Decoder + Send + Sync,
{
    let mut duration = Duration::new(0, 0);
    for _i in 0..iters {
        // Prepare the owned argument first; only the training call is timed.
        let paths = files.clone();
        let start = Instant::now();
        tokenizer.train_from_files(trainer, paths).unwrap();
        duration = duration.checked_add(start.elapsed()).unwrap();
    }
    duration
}