use std::time::Instant;
use crate::tokenizers::{Tokenizer, benchmark::dev::traits::{
TokenizerDevelopmentResult, TokenizerVocabMetrics, TokenizerInferenceMetrics,
}};
/// Stateless driver for the tokenizer development benchmark.
///
/// The type is a unit struct: it holds no configuration and no data between
/// runs. All work happens in [`TokenizerDevelopmentBencher::run`], which trains
/// a tokenizer and gathers training, vocabulary and inference metrics.
pub struct TokenizerDevelopmentBencher;
impl TokenizerDevelopmentBencher {
    /// Creates a bencher. The type is a unit struct, so there is nothing to configure.
    pub fn new() -> Self {
        Self
    }

    /// Trains `tokenizer` and collects training, vocabulary and inference
    /// metrics into a single [`TokenizerDevelopmentResult`].
    ///
    /// The run proceeds in three steps:
    ///
    /// 1. `tokenizer.train(data_path, corpus_name)` is called and its metrics kept.
    /// 2. The vocabulary returned by `vocab_tokens()` is summarised (size and
    ///    average / max / min of each token's `len()`).
    /// 3. The file at `data_path` is read into memory, then encoded and decoded
    ///    once each while timing both calls with [`Instant`].
    ///
    /// # Parameters
    ///
    /// * `tokenizer` - tokenizer to train and measure; it is mutated by training.
    /// * `data_path` - passed to `train` and also read as a single UTF-8 text
    ///   file for the encode/decode timing.
    /// * `corpus_name` - passed to `train` and copied into the result.
    ///
    /// # Degenerate measurements
    ///
    /// If encoding produces no tokens, the per-token averages and throughputs
    /// are reported as `0.0`. A throughput is also reported as `0.0` when the
    /// clock measured zero elapsed time. This keeps `inf` / `NaN` out of the
    /// returned metrics.
    ///
    /// # Panics
    ///
    /// Panics with `"Training failed"` if `train` returns an error, and with
    /// `"Failed to read corpus"` if `data_path` cannot be read as UTF-8 text.
    pub fn run<T: Tokenizer>(
        &self,
        tokenizer: &mut T,
        data_path: &str,
        corpus_name: &str,
    ) -> TokenizerDevelopmentResult {
        let training_metrics = tokenizer
            .train(data_path, corpus_name)
            .expect("Training failed");

        // Vocabulary statistics. Lengths are whatever each token's `len()`
        // reports. NOTE(review): for string tokens that is bytes, not
        // characters; confirm against the `Tokenizer` trait.
        let tokens = tokenizer.vocab_tokens();
        let lengths: Vec<usize> = tokens.iter().map(|t| t.len()).collect();
        let vocab_size = lengths.len();
        let avg_token_length = if vocab_size > 0 {
            lengths.iter().sum::<usize>() as f64 / vocab_size as f64
        } else {
            0.0
        };
        let vocab_metrics = TokenizerVocabMetrics {
            vocab_size,
            avg_token_length,
            max_token_length: lengths.iter().copied().max().unwrap_or(0),
            min_token_length: lengths.iter().copied().min().unwrap_or(0),
        };

        // The corpus is loaded before the timers start so that file I/O is not
        // counted as encode time.
        let corpus_text = std::fs::read_to_string(data_path).expect("Failed to read corpus");

        let encode_start = Instant::now();
        let encoded = tokenizer.encode(&corpus_text);
        let encode_elapsed_ns = encode_start.elapsed().as_nanos() as f64;

        // Only the duration of decoding matters here; its output is discarded.
        let decode_start = Instant::now();
        let _ = tokenizer.decode(&encoded);
        let decode_elapsed_ns = decode_start.elapsed().as_nanos() as f64;

        let token_count = encoded.len();

        // Average nanoseconds spent per token. Without the guard an empty
        // encoding would divide by zero and yield `inf` or `NaN`.
        let ns_per_token = |elapsed_ns: f64| -> f64 {
            if token_count == 0 {
                0.0
            } else {
                elapsed_ns / token_count as f64
            }
        };
        // Tokens processed per second. A zero elapsed time (coarse clock) would
        // otherwise produce `inf`, or `NaN` when there are also zero tokens.
        let tokens_per_sec = |elapsed_ns: f64| -> f64 {
            if token_count == 0 || elapsed_ns <= 0.0 {
                0.0
            } else {
                token_count as f64 / (elapsed_ns / 1e9)
            }
        };

        let inference_metrics = TokenizerInferenceMetrics {
            avg_encode_time_ns: ns_per_token(encode_elapsed_ns),
            avg_decode_time_ns: ns_per_token(decode_elapsed_ns),
            encode_throughput_tokens_per_sec: tokens_per_sec(encode_elapsed_ns),
            decode_throughput_tokens_per_sec: tokens_per_sec(decode_elapsed_ns),
            corpus_token_count: token_count,
        };

        TokenizerDevelopmentResult {
            // The tokenizer's name is what identifies the version under test.
            version: tokenizer.name().to_string(),
            corpus_name: corpus_name.to_string(),
            corpus_size_bytes: tokenizer.corpus().total_size_bytes(),
            training_metrics,
            vocab_metrics,
            inference_metrics,
        }
    }
}