Skip to main content

design_tokens/
design_tokens.rs

1//! Measuring the *design levers* for token efficiency with the real BPE
2//! tokenizers (cl100k + o200k). For each task, three semantically-equivalent
3//! programs at different ceremony levels:
4//!   A = ceremony-heavy (explicit types, braces, semicolons, imports, wrappers)
5//!   B = current-MechGen-ish (sigils, `val/var`, partial inference)
6//!   C = ab-initio (maximal inference, layout instead of braces/`;`, ambient
7//!       builtins, terse safety sigils — safety/types live in the compiler, not
8//!       the token budget)
9//!
10//! The point: the *payload* (names/ops/literals) is a floor, but the *ceremony*
11//! is real tokens you can design away. C must cost the fewest tokens while still
12//! denoting the same program (a compiler infers the rest).
13//!
14//!   cargo run -p agentic-eval --example design_tokens --features real-tokens
15
16use agentic_eval::tokens::Model;
17
18fn main() {
19    let cl = Model::OpenAiGpt4;
20    let o2 = Model::OpenAiGpt4o;
21    println!("=== Token-efficiency design levers (real cl100k + o200k BPE) ===");
22    println!(
23        "tokenizer: {}\n",
24        if cl.is_exact() { "REAL tiktoken (exact)" } else { "HEURISTIC — rerun with --features real-tokens" }
25    );
26
27    // Each task: (name, ceremony-heavy, current-ish, ab-initio).
28    let tasks: &[(&str, &str, &str, &str)] = &[
29        (
30            "word-count",
31            // A — ceremony-heavy
32            "use std::collections::HashMap;\n\nfn count_words(text: &str) -> HashMap<String, u32> {\n    let mut counts: HashMap<String, u32> = HashMap::new();\n    for word in text.split_whitespace() {\n        *counts.entry(word.to_string()).or_insert(0) += 1;\n    }\n    counts\n}",
33            // B — current-MechGen-ish (sigils, var, some inference)
34            "fn count_words(text: &str) -> {s: u32} {\n    var counts = {s: u32}.new()\n    for word in text.split() {\n        counts.entry(word).or(0) += 1\n    }\n    counts\n}",
35            // C — ab-initio (inference + layout + ambient builtins)
36            "count_words text =\n  counts = {}\n  for w in split text\n    counts[w] += 1\n  counts",
37        ),
38        (
39            "factorial",
40            "fn factorial(n: u64) -> u64 {\n    if n <= 1 {\n        return 1;\n    }\n    n * factorial(n - 1)\n}",
41            "fn factorial(n: u64) -> u64 {\n    if n <= 1 { 1 } else { n * factorial(n - 1) }\n}",
42            "fact n =\n  if n <= 1: 1\n  else: n * fact (n - 1)",
43        ),
44        (
45            "safe-divide", // returns optional/result — safety ceremony vs sigil
46            "fn safe_div(a: i32, b: i32) -> Option<i32> {\n    if b == 0 {\n        return None;\n    }\n    Some(a / b)\n}",
47            "fn safe_div(a: i32, b: i32) -> ?i32 {\n    if b == 0 { none } else { a / b }\n}",
48            "div a b =\n  if b == 0: none\n  else: a / b",
49        ),
50    ];
51
52    println!("{:<13} {:>4} {:>9} {:>8} {:>7}", "task", "form", "cl100k", "o200k", "chars");
53    let (mut a_cl, mut b_cl, mut c_cl) = (0, 0, 0);
54    let (mut a_o, mut b_o, mut c_o) = (0, 0, 0);
55    for (name, a, b, c) in tasks {
56        let row = |label: &str, s: &str| {
57            println!("{:<13} {:>4} {:>9} {:>8} {:>7}", "", label, cl.count(s), o2.count(s), s.chars().count());
58        };
59        println!("[{name}]");
60        row("A heavy", a);
61        row("B curr", b);
62        row("C abinit", c);
63        a_cl += cl.count(a); b_cl += cl.count(b); c_cl += cl.count(c);
64        a_o += o2.count(a); b_o += o2.count(b); c_o += o2.count(c);
65    }
66
67    println!("\nTOTALS (3 tasks):");
68    println!("  A ceremony-heavy   cl100k {a_cl:>3}   o200k {a_o:>3}   (baseline)");
69    println!("  B current-ish      cl100k {b_cl:>3} ({:.0}%)   o200k {b_o:>3} ({:.0}%)", 100.0 * b_cl as f64 / a_cl as f64, 100.0 * b_o as f64 / a_o as f64);
70    println!("  C ab-initio        cl100k {c_cl:>3} ({:.0}%)   o200k {c_o:>3} ({:.0}%)", 100.0 * c_cl as f64 / a_cl as f64, 100.0 * c_o as f64 / a_o as f64);
71    println!("\n  → ab-initio cuts ~{:.0}% of cl100k tokens vs ceremony-heavy by REMOVING ceremony",
72        100.0 * (1.0 - c_cl as f64 / a_cl as f64));
73    println!("    (types/mutability/return/imports inferred; layout replaces braces+`;`; terse safety");
74    println!("    sigils; ambient builtins). The remaining tokens are the irreducible payload —");
75    println!("    names/ops/literals — which no design can remove. That residue IS the token floor.");
76}