#![expect(
clippy::stable_sort_primitive,
clippy::cast_precision_loss,
reason = "example utility -- clarity over micro-optimization"
)]
use std::path::Path;
/// Divisor applied to a chunk's character count to approximate its token count.
const CHARS_PER_TOKEN: f64 = 4.0;
/// Prints approximate token-count statistics for the chunks of a source tree.
///
/// The directory to scan is the first command-line argument (`.` when absent).
/// Every collected file whose extension has a language config is read and
/// chunked with the default chunk config; files without a config, or that
/// cannot be read into a string, are skipped. The program then prints the
/// total chunk count, selected percentiles, the maximum, and a histogram.
fn main() {
    let dir = std::env::args().nth(1).unwrap_or_else(|| ".".into());
    let files = ripvec_core::walk::collect_files(Path::new(&dir), None);
    // The config does not depend on the file, so build it once up front.
    let cfg = ripvec_core::chunk::ChunkConfig::default();

    let mut token_counts: Vec<usize> = Vec::new();
    for path in &files {
        let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
        let Some(lang) = ripvec_core::languages::config_for_extension(ext) else {
            continue;
        };
        let Ok(source) = std::fs::read_to_string(path) else {
            continue;
        };
        let chunks = ripvec_core::chunk::chunk_file(path, &source, &lang, &cfg);
        for chunk in &chunks {
            token_counts.push(approx_token_count(chunk.content.chars().count()));
        }
    }

    if token_counts.is_empty() {
        println!("No chunks found");
        return;
    }

    // Deliberately the stable `sort`: the file-level
    // `#![expect(clippy::stable_sort_primitive)]` anticipates this call.
    token_counts.sort();
    let n = token_counts.len();

    println!("Total chunks: {n}");
    for (label, percent) in [
        ("P25", 25),
        ("P50", 50),
        ("P75", 75),
        ("P90", 90),
        ("P95", 95),
        ("P99", 99),
    ] {
        println!("{label}: {}", percentile(&token_counts, percent));
    }
    println!("Max: {}", token_counts[n - 1]);
    println!();

    print_histogram(&token_counts);
}

/// Approximates the token count of a chunk holding `char_count` characters.
///
/// Divides by `CHARS_PER_TOKEN`, rounds up, and never returns less than 1.
#[expect(
    clippy::cast_possible_truncation,
    clippy::cast_sign_loss,
    reason = "ceil of positive quotient is in 0..=char_count; fits in usize"
)]
fn approx_token_count(char_count: usize) -> usize {
    ((char_count as f64 / CHARS_PER_TOKEN).ceil() as usize).max(1)
}

/// Returns the element at index `percent * len / 100` of an ascending slice.
///
/// Panics if `sorted` is empty or `percent >= 100`, since the index would be
/// out of bounds.
fn percentile(sorted: &[usize], percent: usize) -> usize {
    sorted[percent * sorted.len() / 100]
}

/// Prints how many values fall into each half-open power-of-two bucket.
///
/// `sorted` must be in ascending order: bucket boundaries are located with
/// `partition_point` instead of rescanning the whole slice per bucket.
/// Percentages are relative to `sorted.len()`.
fn print_histogram(sorted: &[usize]) {
    let edges = [0usize, 16, 32, 64, 128, 256, 512, 1024, usize::MAX];
    let total = sorted.len();
    for w in edges.windows(2) {
        let (lo, hi) = (w[0], w[1]);
        // Values in [lo, hi) occupy the index range [start, end).
        let start = sorted.partition_point(|&t| t < lo);
        let end = sorted.partition_point(|&t| t < hi);
        let count = end - start;
        let pct = count as f64 / total as f64 * 100.0;
        if hi == usize::MAX {
            println!("{lo:>5}+ : {count:>6} ({pct:.1}%)");
        } else {
            println!("{lo:>5}-{hi:<5}: {count:>6} ({pct:.1}%)");
        }
    }
}