Skip to main content

keyword_audit/
keyword_audit.rs

1//! Migration step 4 (AB_INITIO_DESIGN.md §6): audit every MechGen reserved word
2//! against the real BPE tokenizers (cl100k + o200k). A token-efficient surface
3//! wants every keyword to be a **single BPE token**; this finds the offenders so
4//! they can get a single-token agent-mode form. The analogue of the ontology
5//! drift-guard, but for tokenizer alignment.
6//!
7//!   cargo run -p agentic-eval --example keyword_audit --features real-tokens
8
9use agentic_eval::tokens::Model;
10
/// The reserved words of MechGen, mirroring `KEYWORDS` in
/// `prototype/src/lexer.rs`.
///
/// Entries are kept in strictly ascending byte order (capitalised words first,
/// then lower-case ones), which also means the list holds no duplicates.
const KEYWORDS: &[&str] = &[
    // Capitalised words.
    "C", "D", "E", "Err", "I", "M", "None", "Ok", "S", "Some", "T", "U", "Y", "Z",
    // a – d
    "af", "agent", "async", "break", "const", "continue", "data", "defer", "df",
    // e – f
    "effect", "else", "enum", "evolve", "extend", "extern",
    "f", "fact", "fitness", "fn", "for", "forward", "fx",
    // g – l
    "gd", "genome", "grad", "grammar_extension", "guard",
    "handle", "hx", "if", "impl", "in", "is", "kb", "layer", "let", "loop",
    // m – r
    "m", "match", "mod", "mut", "mutate", "net", "or",
    "param", "pipeline", "pub", "query", "ret", "return", "rule",
    // s
    "select", "sp", "spec", "static", "struct", "sw", "swarm",
    "swarm_fan_out", "swarm_map_reduce", "swarm_pipeline", "swarm_race", "swarm_saga",
    // t – z
    "tensor", "train", "trait", "type", "u", "uf", "unsafe", "use",
    "v", "val", "var", "where", "while", "xd", "xn", "yield", "yl",
];
24
25fn main() {
26    let cl = Model::OpenAiGpt4;
27    let o2 = Model::OpenAiGpt4o;
28    println!("=== MechGen keyword tokenizer audit (migration step 4) ===");
29    println!(
30        "tokenizer: {}   keywords: {}\n",
31        if cl.is_exact() { "REAL tiktoken (exact)" } else { "HEURISTIC — rerun with --features real-tokens" },
32        KEYWORDS.len()
33    );
34
35    // A keyword usually appears with a leading space in code; BPE is space-aware,
36    // so " return" can differ from "return". Audit the in-context form (leading
37    // space) — that is what an agent actually emits.
38    let mut offenders: Vec<(&str, usize, usize)> = Vec::new();
39    let mut single = 0usize;
40    for &kw in KEYWORDS {
41        let ctx = format!(" {kw}");
42        let c = cl.count(&ctx);
43        let o = o2.count(&ctx);
44        if c <= 1 && o <= 1 {
45            single += 1;
46        } else {
47            offenders.push((kw, c, o));
48        }
49    }
50
51    println!("SINGLE BPE TOKEN (both tokenizers): {single}/{}", KEYWORDS.len());
52    println!("\nOFFENDERS (>1 token in cl100k or o200k):");
53    if offenders.is_empty() {
54        println!("  (none)");
55    } else {
56        offenders.sort_by(|a, b| b.1.cmp(&a.1));
57        for (kw, c, o) in &offenders {
58            println!("  {kw:<20} cl100k {c}  o200k {o}");
59        }
60    }
61
62    println!("\nVERDICT");
63    println!(
64        "  {}/{} keywords are already single-token (the agent-mode single/double-char forms",
65        single, KEYWORDS.len()
66    );
67    println!("  f/m/v/u/… and common words if/for/match/… cost exactly one token).");
68    if !offenders.is_empty() {
69        println!("  The {} offenders are compound/rare words (snake_case splits on `_`); each should", offenders.len());
70        println!("  get a single-token agent-mode alias. They are specialized (swarm combinators,");
71        println!("  grammar extension) — rare in practice, so the realized token cost is small, but");
72        println!("  the surface is not yet uniformly single-token. This is the concrete step-4 work-list.");
73    }
74}