vocabulary_audit/
vocabulary_audit.rs1use agentic_eval::tokens::Model;
8
/// Names audited by `main`.
///
/// Each entry is tokenized with a leading space and is expected to cost at most
/// one token under both tokenizers; anything longer is reported as an offender.
const VOCAB: &[&str] = &[
    "map", "filter", "fold", "reduce", "sum", "len",
    "sort", "reverse", "zip", "freq", "first", "last",
    "count", "any", "all", "find", "take", "range",
    "keys", "values", "flatten", "group", "scan", "contains",
    "min", "max", "abs", "split", "join", "chars",
    "words", "lines", "upper", "lower",
];
17
18fn main() {
19 let cl = Model::OpenAiGpt4;
20 let o2 = Model::OpenAiGpt4o;
21 println!("=== Standard-vocabulary tokenizer audit (§8b) ===");
22 println!(
23 "tokenizer: {} names: {}\n",
24 if cl.is_exact() { "REAL tiktoken (exact)" } else { "HEURISTIC — rerun with --features real-tokens" },
25 VOCAB.len()
26 );
27
28 let mut single = 0usize;
30 let mut offenders: Vec<(&str, usize, usize)> = Vec::new();
31 for &name in VOCAB {
32 let ctx = format!(" {name}");
33 let (c, o) = (cl.count(&ctx), o2.count(&ctx));
34 if c <= 1 && o <= 1 {
35 single += 1;
36 } else {
37 offenders.push((name, c, o));
38 }
39 }
40
41 println!("SINGLE BPE TOKEN (both tokenizers): {single}/{}", VOCAB.len());
42 if offenders.is_empty() {
43 println!(" ✓ every vocabulary name is a single token — the §8b discipline holds.");
44 } else {
45 println!("\nOFFENDERS (rename or drop — a multi-token name negates the saving):");
46 for (n, c, o) in &offenders {
47 println!(" {n:<12} cl100k {c} o200k {o}");
48 }
49 }
50
51 println!("\nWHY IT MATTERS");
52 println!(" The vocabulary's win is naming an intent in ~1 token. A 2-token name (e.g.");
53 println!(" `frequencies` = 'frequ'+'encies') halves that. Picking `freq`/`map`/`fold` over");
54 println!(" `frequencies`/`transform`/`accumulate` is tokenizer co-design, audited here.");
55}