ripvec-core 3.1.1

//! Dump approximate token-length distribution for chunks in a directory.
//!
//! The BGE-small tokenizer was removed in v3.0.0 (surgery/remove-transformer-engines).
//! Token count is approximated by char count divided by an empirical median
//! chars-per-token constant (4.0 for Latin-script BPE tokenizers such as the
//! Model2Vec BPE tokenizer used by potion-base-32M / potion-code-16M).
//! This is a diagnostic utility: the approximation is accurate enough for
//! distribution analysis (identifying outlier chunks, tuning chunk size
//! defaults) without requiring a live tokenizer load or a network download.
//!
//! @Parnas (1972): hiding the doomed tokenizer behind an approximation removes
//! the surface dependency on the transformer-engine family cleanly.

#![expect(
    clippy::stable_sort_primitive,
    clippy::cast_precision_loss,
    reason = "example utility -- clarity over micro-optimization"
)]

use std::path::Path;

/// Empirical median chars per BPE token for Latin-script source code.
/// Derived from potion-base-32M vocab statistics (median token length ~4 chars).
///
/// Used as the divisor when a chunk's character count is converted into an
/// approximate token count; it is an estimate, not an exact tokenizer result.
const CHARS_PER_TOKEN: f64 = 4.0;

/// Prints approximate token-length statistics for the chunks of a directory.
///
/// The directory is the first command-line argument and defaults to `"."`.
/// Every file returned by `ripvec_core::walk::collect_files` is considered;
/// files whose extension has no language configuration, or whose contents
/// cannot be read as UTF-8 text, are skipped silently (best-effort scan).
/// Output is a percentile summary followed by a bucketed histogram, or the
/// single line `No chunks found` when nothing was chunked.
fn main() {
    let dir = std::env::args().nth(1).unwrap_or_else(|| ".".into());
    let files = ripvec_core::walk::collect_files(Path::new(&dir), None);
    // The chunking configuration does not depend on the file, so build it once
    // instead of once per iteration.
    let cfg = ripvec_core::chunk::ChunkConfig::default();
    let mut token_counts: Vec<usize> = Vec::new();

    for path in &files {
        let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
        let Some(lang) = ripvec_core::languages::config_for_extension(ext) else {
            continue;
        };
        let Ok(source) = std::fs::read_to_string(path) else {
            continue;
        };
        let chunks = ripvec_core::chunk::chunk_file(path, &source, &lang, &cfg);
        for chunk in &chunks {
            token_counts.push(approx_token_count(chunk.content.chars().count()));
        }
    }

    // Plain `sort` is kept on purpose: the crate-level
    // `expect(clippy::stable_sort_primitive)` requires the lint to fire here.
    token_counts.sort();
    let n = token_counts.len();
    if n == 0 {
        println!("No chunks found");
        return;
    }
    println!("Total chunks: {n}");
    print_percentiles(&token_counts);
    println!();
    print_histogram(&token_counts);
}

/// Converts a chunk's character count into an approximate token count.
///
/// The count is divided by [`CHARS_PER_TOKEN`] and rounded up. Rounding up
/// alone maps a zero-length chunk to 0, so the result is additionally clamped
/// to a minimum of 1: every chunk registers as at least one token.
fn approx_token_count(char_count: usize) -> usize {
    #[expect(
        clippy::cast_possible_truncation,
        clippy::cast_sign_loss,
        reason = "ceil of a non-negative quotient is in 0..=char_count; fits in usize"
    )]
    let rounded_up = (char_count as f64 / CHARS_PER_TOKEN).ceil() as usize;
    rounded_up.max(1)
}

/// Prints the P25/P50/P75/P90/P95/P99 values and the maximum of `sorted`.
///
/// Each percentile is read at index `numerator * len / denominator`, so the
/// slice must already be in ascending order.
///
/// # Panics
///
/// Panics if `sorted` is empty.
fn print_percentiles(sorted: &[usize]) {
    // (label, numerator, denominator) of the rank fraction for each line.
    const RANKS: [(&str, usize, usize); 6] = [
        ("P25", 1, 4),
        ("P50", 1, 2),
        ("P75", 3, 4),
        ("P90", 9, 10),
        ("P95", 95, 100),
        ("P99", 99, 100),
    ];

    let n = sorted.len();
    for (label, numerator, denominator) in RANKS {
        println!("{label}: {}", sorted[numerator * n / denominator]);
    }
    println!("Max: {}", sorted[n - 1]);
}

/// Prints how many values of `sorted` fall into each half-open bucket
/// `[lower, upper)`, together with the share of the total as a percentage.
///
/// `sorted` must be in ascending order and non-empty (an empty slice would
/// make every percentage `NaN`). Because the input is sorted, each bucket
/// boundary is located with a binary search instead of a full rescan.
fn print_histogram(sorted: &[usize]) {
    const BUCKET_EDGES: [usize; 9] = [0, 16, 32, 64, 128, 256, 512, 1024, usize::MAX];

    let total = sorted.len() as f64;
    // Index of the first value belonging to the current bucket. The first
    // edge is 0, so no value can lie below it.
    let mut start = 0;
    for edge in BUCKET_EDGES.windows(2) {
        let (lower, upper) = (edge[0], edge[1]);
        // Everything before `end` is strictly below `upper`.
        let end = sorted.partition_point(|&t| t < upper);
        let count = end - start;
        start = end;

        let pct = count as f64 / total * 100.0;
        if upper == usize::MAX {
            println!("{:>5}+     : {:>6} ({:.1}%)", lower, count, pct);
        } else {
            println!("{:>5}-{:<5}: {:>6} ({:.1}%)", lower, upper, count, pct);
        }
    }
}