Skip to main content

vocabulary_audit/
vocabulary_audit.rs

//! §8b discipline check: every standard-vocabulary primitive name must be a
//! SINGLE BPE token; otherwise the extra tokens give back the saving the
//! abstraction was meant to buy. Audits MechGen's registered SWE vocabulary
//! (resolve.rs) against the real cl100k + o200k BPE.
//!
//!   cargo run -p agentic-eval --example vocabulary_audit --features real-tokens
6
7use agentic_eval::tokens::Model;
8
/// MechGen's standard SWE vocabulary (resolve.rs register_builtins, §8).
///
/// Each entry is a primitive name that `main` checks for being a single BPE
/// token. Offenders are reported in the order they appear here, so append new
/// names next to related ones.
const VOCAB: &[&str] = &[
    "map", "filter", "fold", "reduce", "sum", "len", "sort", "reverse", "zip",
    "freq", "first", "last", "count", "any", "all", "find", "take", "range",
    "keys", "values", "flatten", "group", "scan", "contains", "min", "max", "abs",
    // string / text vocabulary
    "split", "join", "chars", "words", "lines", "upper", "lower",
];
17
18fn main() {
19    let cl = Model::OpenAiGpt4;
20    let o2 = Model::OpenAiGpt4o;
21    println!("=== Standard-vocabulary tokenizer audit (§8b) ===");
22    println!(
23        "tokenizer: {}   names: {}\n",
24        if cl.is_exact() { "REAL tiktoken (exact)" } else { "HEURISTIC — rerun with --features real-tokens" },
25        VOCAB.len()
26    );
27
28    // Agents emit a name with a leading space; BPE is space-aware.
29    let mut single = 0usize;
30    let mut offenders: Vec<(&str, usize, usize)> = Vec::new();
31    for &name in VOCAB {
32        let ctx = format!(" {name}");
33        let (c, o) = (cl.count(&ctx), o2.count(&ctx));
34        if c <= 1 && o <= 1 {
35            single += 1;
36        } else {
37            offenders.push((name, c, o));
38        }
39    }
40
41    println!("SINGLE BPE TOKEN (both tokenizers): {single}/{}", VOCAB.len());
42    if offenders.is_empty() {
43        println!("  ✓ every vocabulary name is a single token — the §8b discipline holds.");
44    } else {
45        println!("\nOFFENDERS (rename or drop — a multi-token name negates the saving):");
46        for (n, c, o) in &offenders {
47            println!("  {n:<12} cl100k {c}  o200k {o}");
48        }
49    }
50
51    println!("\nWHY IT MATTERS");
52    println!("  The vocabulary's win is naming an intent in ~1 token. A 2-token name (e.g.");
53    println!("  `frequencies` = 'frequ'+'encies') halves that. Picking `freq`/`map`/`fold` over");
54    println!("  `frequencies`/`transform`/`accumulate` is tokenizer co-design, audited here.");
55}