1use agentic_eval::tokens::Model;
10
/// Keyword surface that `main` audits, one entry per keyword.
///
/// Entries are kept in strictly ascending byte order (capitalised forms first)
/// with no duplicates; the rows below are grouped by initial letter purely for
/// readability.
const KEYWORDS: &[&str] = &[
    // Capitalised forms.
    "C", "D", "E", "Err", "I", "M", "None", "Ok", "S", "Some", "T", "U", "Y", "Z",
    // a – e
    "af", "agent", "async",
    "break",
    "const", "continue",
    "data", "defer", "df",
    "effect", "else", "enum", "evolve", "extend", "extern",
    // f – l
    "f", "fact", "fitness", "fn", "for", "forward", "fx",
    "gd", "genome", "grad", "grammar_extension", "guard",
    "handle", "hx",
    "if", "impl", "in", "is",
    "kb",
    "layer", "let", "loop",
    // m – r
    "m", "match", "mod", "mut", "mutate",
    "net",
    "or",
    "param", "pipeline", "pub",
    "query",
    "ret", "return", "rule",
    // s
    "select", "sp", "spec", "static", "struct", "sw", "swarm",
    "swarm_fan_out", "swarm_map_reduce", "swarm_pipeline", "swarm_race", "swarm_saga",
    // t – y
    "tensor", "train", "trait", "type",
    "u", "uf", "unsafe", "use",
    "v", "val", "var",
    "where", "while",
    "xd", "xn",
    "yield", "yl",
];
24
25fn main() {
26 let cl = Model::OpenAiGpt4;
27 let o2 = Model::OpenAiGpt4o;
28 println!("=== MechGen keyword tokenizer audit (migration step 4) ===");
29 println!(
30 "tokenizer: {} keywords: {}\n",
31 if cl.is_exact() { "REAL tiktoken (exact)" } else { "HEURISTIC — rerun with --features real-tokens" },
32 KEYWORDS.len()
33 );
34
35 let mut offenders: Vec<(&str, usize, usize)> = Vec::new();
39 let mut single = 0usize;
40 for &kw in KEYWORDS {
41 let ctx = format!(" {kw}");
42 let c = cl.count(&ctx);
43 let o = o2.count(&ctx);
44 if c <= 1 && o <= 1 {
45 single += 1;
46 } else {
47 offenders.push((kw, c, o));
48 }
49 }
50
51 println!("SINGLE BPE TOKEN (both tokenizers): {single}/{}", KEYWORDS.len());
52 println!("\nOFFENDERS (>1 token in cl100k or o200k):");
53 if offenders.is_empty() {
54 println!(" (none)");
55 } else {
56 offenders.sort_by(|a, b| b.1.cmp(&a.1));
57 for (kw, c, o) in &offenders {
58 println!(" {kw:<20} cl100k {c} o200k {o}");
59 }
60 }
61
62 println!("\nVERDICT");
63 println!(
64 " {}/{} keywords are already single-token (the agent-mode single/double-char forms",
65 single, KEYWORDS.len()
66 );
67 println!(" f/m/v/u/… and common words if/for/match/… cost exactly one token).");
68 if !offenders.is_empty() {
69 println!(" The {} offenders are compound/rare words (snake_case splits on `_`); each should", offenders.len());
70 println!(" get a single-token agent-mode alias. They are specialized (swarm combinators,");
71 println!(" grammar extension) — rare in practice, so the realized token cost is small, but");
72 println!(" the surface is not yet uniformly single-token. This is the concrete step-4 work-list.");
73 }
74}