pub enum Model {
OpenAiGpt4,
OpenAiGpt4o,
AnthropicClaude,
Heuristic,
}
A popular agentic AI system, identified by its tokenizer family.
Variants§
OpenAiGpt4
OpenAI GPT-4 / GPT-3.5-turbo family — `cl100k_base` BPE.
OpenAiGpt4o
OpenAI GPT-4o / o-series family — `o200k_base` BPE.
AnthropicClaude
Anthropic Claude. Approximation: Anthropic publishes no offline
tokenizer crate, so this uses the shared `heuristic_tokens` estimate (the
same as `Model::Heuristic`) and must be read as an estimate, not an exact
count. `Model::is_exact` returns `false` for it.
Heuristic
A tokenizer-agnostic labeled heuristic (no model-specific BPE).
Implementations§
impl Model
pub fn name(self) -> &'static str
A human-readable label for the model/tokenizer (e.g. for report output).
Examples found in repository?
12fn main() {
13 println!("agentic-eval — four-axis evaluation of programs for agentic AI\n");
14
15 // Two encodings of "read a file and keep the large entries".
16 let legible = Program::new(
17 "legible",
18 r#"ls("./src") | where(fn(f) => f.size > 1000) | map(fn(f) => f.name)"#,
19 )
20 .with_standing_context("ls/where/map are standard, high-probability names")
21 .with_output("name\nfoo.rs\nbar.rs");
22 let cipher = Program::new("cipher", r#"l./src|w~.size>1k|m~.name"#)
23 .with_standing_context("single-letter+sigil cheatsheet line; ".repeat(120))
24 .with_output("name\nfoo.rs\nbar.rs")
25 .with_retries(8); // terse cipher is mis-emitted more often
26
27 // ── 1. Token efficiency ──────────────────────────────────────────────
28 println!("[1] Token efficiency (amortized over 30 turns):");
29 for model in [
30 Model::OpenAiGpt4,
31 Model::OpenAiGpt4o,
32 Model::AnthropicClaude,
33 ] {
34 let cmp = compare(&legible, &cipher, model, 30);
35 println!(
36 " {:<28} legible={:>6} cipher={:>6} → {} wins ({:.2}x){}",
37 model.name(),
38 cmp.a_total,
39 cmp.b_total,
40 if cmp.winner_is_a { "legible" } else { "cipher" },
41 cmp.ratio,
42 if model.is_exact() { "" } else { " [est]" },
43 );
44 }
45
46 // ── 2. Determinism ───────────────────────────────────────────────────
47 // A canonical renderer (byte-stable) vs. one that embeds a timestamp.
48 let canonical = assess_determinism(5, || "name\nfoo.rs\nbar.rs".to_string());
49 let mut t = 0u64;
50 let noisy = assess_determinism(5, || {
51 t += 1;
52 format!("name\nfoo.rs\nbar.rs # at {t}")
53 });
54 println!("\n[2] Determinism:");
55 println!(
56 " canonical output : deterministic={} ({} distinct / {} runs)",
57 canonical.deterministic, canonical.distinct, canonical.runs
58 );
59 println!(
60 " timestamped output: deterministic={} ({} distinct / {} runs)",
61 noisy.deterministic, noisy.distinct, noisy.runs
62 );
63
64 // ── 3. Reliability ───────────────────────────────────────────────────
65 // The legible form parses on all 6 sample invocations; the cipher mis-parses
66 // twice but at least returns a structured error once.
67 let samples = [0, 1, 2, 3, 4, 5];
68 let legible_rel = assess_reliability(&samples, |_| Outcome::ok());
69 let cipher_rel = assess_reliability(&samples, |&i| match i {
70 4 => Outcome::structured_failure(),
71 5 => Outcome::opaque_failure(),
72 _ => Outcome::ok(),
73 });
74 println!("\n[3] Reliability:");
75 println!(
76 " legible: pass {:.0}% actionable {:.0}%",
77 legible_rel.pass_rate * 100.0,
78 legible_rel.actionable_rate * 100.0
79 );
80 println!(
81 " cipher : pass {:.0}% actionable {:.0}%",
82 cipher_rel.pass_rate * 100.0,
83 cipher_rel.actionable_rate * 100.0
84 );
85
86 // ── 4. Safety ────────────────────────────────────────────────────────
87 // The task reads + lists (ReadLocal); a "rm the small files" variant adds a
88 // Destructive effect. Score the gating under the agent policy.
89 let read_only = assess_safety(&[Effect::ReadLocal, Effect::WriteLocal], Mode::Agent);
90 let destructive = assess_safety(
91 &[Effect::ReadLocal, Effect::Destructive, Effect::Exec],
92 Mode::Agent,
93 );
94 println!("\n[4] Safety (agent policy):");
95 println!(
96 " read+write task : grade {} (bounded={}, {} approval-gated)",
97 read_only.grade, read_only.bounded, read_only.approval_gated
98 );
99 println!(
100 " rm+exec task : grade {} (bounded={}, {} approval-gated, {} denied)",
101 destructive.grade, destructive.bounded, destructive.approval_gated, destructive.denied
102 );
103
104 println!("\nSummary: the legible form is competitive-or-cheaper on tokens once standing");
105 println!("context counts, more deterministic and reliable to parse, and the agent policy");
106 println!("bounds the blast radius of even the destructive variant.");
107}

pub fn from_name(name: &str) -> Option<Model>
Parse a model from a short identifier (case-insensitive), for CLI/config
use. Accepts common aliases: `gpt4`/`gpt-4`/`cl100k`; `gpt4o`/`gpt-4o`/`o200k`;
`claude`/`anthropic`; `heuristic`/`heur`. Returns `None` otherwise.
pub fn is_exact(self) -> bool
Whether this model’s count is exact (a real BPE) in this build, vs. an
estimate. OpenAI families are exact only with `--features real-tokens`.
Examples found in repository?
11fn main() {
12 let cl = Model::OpenAiGpt4;
13 let o2 = Model::OpenAiGpt4o;
14 println!(
15 "tokenizer exact: cl100k={} o200k={}",
16 cl.is_exact(),
17 o2.is_exact()
18 );
19 println!("{:>7} {:>7} file", "cl100k", "o200k");
20 for path in std::env::args().skip(1) {
21 match fs::read_to_string(&path) {
22 Ok(s) => println!("{:>7} {:>7} {}", cl.count(&s), o2.count(&s), path),
23 Err(e) => println!(" ERR ERR {path}: {e}"),
24 }
25 }
26}

More examples
18fn main() {
19 let cl = Model::OpenAiGpt4;
20 let o2 = Model::OpenAiGpt4o;
21 println!("=== Standard-vocabulary tokenizer audit (§8b) ===");
22 println!(
23 "tokenizer: {} names: {}\n",
24 if cl.is_exact() { "REAL tiktoken (exact)" } else { "HEURISTIC — rerun with --features real-tokens" },
25 VOCAB.len()
26 );
27
28 // Agents emit a name with a leading space; BPE is space-aware.
29 let mut single = 0usize;
30 let mut offenders: Vec<(&str, usize, usize)> = Vec::new();
31 for &name in VOCAB {
32 let ctx = format!(" {name}");
33 let (c, o) = (cl.count(&ctx), o2.count(&ctx));
34 if c <= 1 && o <= 1 {
35 single += 1;
36 } else {
37 offenders.push((name, c, o));
38 }
39 }
40
41 println!("SINGLE BPE TOKEN (both tokenizers): {single}/{}", VOCAB.len());
42 if offenders.is_empty() {
43 println!(" ✓ every vocabulary name is a single token — the §8b discipline holds.");
44 } else {
45 println!("\nOFFENDERS (rename or drop — a multi-token name negates the saving):");
46 for (n, c, o) in &offenders {
47 println!(" {n:<12} cl100k {c} o200k {o}");
48 }
49 }
50
51 println!("\nWHY IT MATTERS");
52 println!(" The vocabulary's win is naming an intent in ~1 token. A 2-token name (e.g.");
53 println!(" `frequencies` = 'frequ'+'encies') halves that. Picking `freq`/`map`/`fold` over");
54 println!(" `frequencies`/`transform`/`accumulate` is tokenizer co-design, audited here.");
55}

25fn main() {
26 let cl = Model::OpenAiGpt4;
27 let o2 = Model::OpenAiGpt4o;
28 println!("=== MechGen keyword tokenizer audit (migration step 4) ===");
29 println!(
30 "tokenizer: {} keywords: {}\n",
31 if cl.is_exact() { "REAL tiktoken (exact)" } else { "HEURISTIC — rerun with --features real-tokens" },
32 KEYWORDS.len()
33 );
34
35 // A keyword usually appears with a leading space in code; BPE is space-aware,
36 // so " return" can differ from "return". Audit the in-context form (leading
37 // space) — that is what an agent actually emits.
38 let mut offenders: Vec<(&str, usize, usize)> = Vec::new();
39 let mut single = 0usize;
40 for &kw in KEYWORDS {
41 let ctx = format!(" {kw}");
42 let c = cl.count(&ctx);
43 let o = o2.count(&ctx);
44 if c <= 1 && o <= 1 {
45 single += 1;
46 } else {
47 offenders.push((kw, c, o));
48 }
49 }
50
51 println!("SINGLE BPE TOKEN (both tokenizers): {single}/{}", KEYWORDS.len());
52 println!("\nOFFENDERS (>1 token in cl100k or o200k):");
53 if offenders.is_empty() {
54 println!(" (none)");
55 } else {
56 offenders.sort_by(|a, b| b.1.cmp(&a.1));
57 for (kw, c, o) in &offenders {
58 println!(" {kw:<20} cl100k {c} o200k {o}");
59 }
60 }
61
62 println!("\nVERDICT");
63 println!(
64 " {}/{} keywords are already single-token (the agent-mode single/double-char forms",
65 single, KEYWORDS.len()
66 );
67 println!(" f/m/v/u/… and common words if/for/match/… cost exactly one token).");
68 if !offenders.is_empty() {
69 println!(" The {} offenders are compound/rare words (snake_case splits on `_`); each should", offenders.len());
70 println!(" get a single-token agent-mode alias. They are specialized (swarm combinators,");
71 println!(" grammar extension) — rare in practice, so the realized token cost is small, but");
72 println!(" the surface is not yet uniformly single-token. This is the concrete step-4 work-list.");
73 }
74}

16fn main() {
17 let cl = Model::OpenAiGpt4;
18 let o2 = Model::OpenAiGpt4o;
19 println!("=== Abstraction as the post-floor token lever (real BPE) ===");
20 println!(
21 "tokenizer: {}\n",
22 if cl.is_exact() { "REAL tiktoken (exact)" } else { "HEURISTIC — rerun with --features real-tokens" }
23 );
24
25 // (intent, hand-rolled [compiles today], with-vocabulary [proposed primitive])
26 let cases: &[(&str, &str, &str)] = &[
27 (
28 "sum a list",
29 "f sum(xs)\n var t = 0\n for x in xs\n t = t + x\n t",
30 "f sum(xs)\n fold(xs, 0, +)",
31 ),
32 (
33 "word frequencies",
34 "f wc(ws)\n var m = {}\n for w in ws\n m[w] = m[w] + 1\n m",
35 "f wc(ws)\n freq(ws)",
36 ),
37 (
38 "evens, doubled",
39 "f f(xs)\n var out = []\n for x in xs\n if x % 2 == 0\n out.push(x * 2)\n out",
40 "f f(xs)\n xs | filter even | map double",
41 ),
42 (
43 "max of a list",
44 "f max(xs)\n var m = xs[0]\n for x in xs\n if x > m\n m = x\n m",
45 "f max(xs)\n reduce(xs, max)",
46 ),
47 ];
48
49 println!("{:<18} {:>9} {:>9} {:>7}", "intent", "handrolled", "vocab", "saved");
50 let (mut h_cl, mut v_cl, mut h_o, mut v_o) = (0, 0, 0, 0);
51 for (name, hand, vocab) in cases {
52 let (h, v) = (cl.count(hand), cl.count(vocab));
53 println!("{name:<18} {h:>9} {v:>9} {:>6}%", 100 - 100 * v / h);
54 h_cl += h; v_cl += v;
55 h_o += o2.count(hand); v_o += o2.count(vocab);
56 }
57 println!("\nTOTAL cl100k {h_cl} → {v_cl} ({}% saved) o200k {h_o} → {v_o} ({}% saved)",
58 100 - 100 * v_cl / h_cl, 100 - 100 * v_o / h_o);
59
60 println!("\nFINDING");
61 println!(" At the surface floor, abstraction is the only per-call token lever left, and it is");
62 println!(" POSITIVE-SUM: a single-token, total, capability-typed primitive (a) cuts payload");
63 println!(" tokens (above), (b) RAISES reliability (no hand-rolled off-by-one / empty-list bug),");
64 println!(" and (c) preserves safety (the primitive's effect rides its type to the boundary).");
65 println!(" Encoding tricks (binary, dense UTF-8) and layout were all token-neutral-or-worse —");
66 println!(" vocabulary is the one that pays. The discipline: name primitives as single BPE tokens,");
67 println!(" make them total, and choose them by the empirical frequency of SWE intents.");
68}

87fn main() {
88 let exact = Model::OpenAiGpt4.is_exact();
89 println!("=== MechGen 'digital rain' vs token streams (cl100k + o200k BPE) ===");
90 println!(
91 "tokenizer: {}\n",
92 if exact { "REAL tiktoken (exact)" } else { "HEURISTIC (CJK undercounted — rerun with --features real-tokens)" }
93 );
94
95 let samples: &[(&str, &str)] = &[
96 ("net", "net MLP { layer fc1: Linear(8, 16); layer act: ReLU; layer fc2: Linear(16, 4); forward { fc1 } }"),
97 ("fn", "fn factorial(n: u64) -> u64 { if n <= 1 { return 1; } n * factorial(n - 1) }"),
98 ("kb", "kb Family { fact parent(alice, bob); fact parent(bob, carol); rule gp(x: i32, z: i32) where parent(x, y), parent(y, z) { x } }"),
99 ];
100
101 println!("{:<5} {:>10} {:>22} {:>22} {:>14}", "kind", "form", "cl100k tok", "o200k tok", "chars");
102 let cl = Model::OpenAiGpt4;
103 let o2 = Model::OpenAiGpt4o;
104 let (mut s_cl, mut s_o, mut r_cl, mut r_o, mut rl_cl, mut rl_o) = (0, 0, 0, 0, 0, 0);
105 for (name, src) in samples {
106 let (stream, legend) = rain(src);
107 let rl = format!("{legend}\n{stream}"); // rain + legend (one-off, reversible)
108 let bytes = src.as_bytes();
109 let base = b64(bytes);
110 let row = |label: &str, s: &str| {
111 println!("{:<5} {:>10} {:>22} {:>22} {:>14}", "", label, cl.count(s), o2.count(s), chars(s));
112 };
113 println!("[{name}]");
114 row("source", src);
115 row("rain(stream)", &stream);
116 row("rain+legend", &rl);
117 row("base64(bytes)", &base);
118 s_cl += cl.count(src); s_o += o2.count(src);
119 r_cl += cl.count(&stream); r_o += o2.count(&stream);
120 rl_cl += cl.count(&rl); rl_o += o2.count(&rl);
121 }
122
123 println!("\nTOTALS (3 samples) — token ratio vs source (>1.0 = rain is WORSE):");
124 println!(" source cl100k {s_cl:>4} o200k {s_o:>4}");
125 println!(" rain(stream) cl100k {r_cl:>4} ({:.2}x) o200k {r_o:>4} ({:.2}x)", r_cl as f64 / s_cl as f64, r_o as f64 / s_o as f64);
126 println!(" rain+legend cl100k {rl_cl:>4} ({:.2}x) o200k {rl_o:>4} ({:.2}x)", rl_cl as f64 / s_cl as f64, rl_o as f64 / s_o as f64);
127
128 println!("\nVERDICT");
129 println!(" Digital rain shrinks CHARACTERS (~3x — the Matrix look) but the dense glyph");
130 println!(" stream costs MORE BPE tokens than the ASCII source (ratios above), because the");
131 println!(" tokenizer splits each rare multi-byte glyph into several tokens. Adding the legend");
132 println!(" (needed for reversibility) makes a one-off snippet far worse still. Even an");
133 println!(" amortized shared codebook (stream only) does not beat source on tokens.");
134 println!(" This is the project's token-floor finding, re-confirmed on the dense-symbol idea:");
135 println!(" an LLM emits TOKENS, not glyphs/bytes — the information (names/ops/dims) is the floor.");
136 if !exact {
137 println!("\n (heuristic run: it counts ~1 token/char and UNDER-counts CJK — the real cl100k/o200k");
138 println!(" gap is larger. Rerun with --features real-tokens for the exact, even-worse numbers.)");
139 }
140}

18fn main() {
19 let cl = Model::OpenAiGpt4;
20 let o2 = Model::OpenAiGpt4o;
21 println!("=== Token-efficiency design levers (real cl100k + o200k BPE) ===");
22 println!(
23 "tokenizer: {}\n",
24 if cl.is_exact() { "REAL tiktoken (exact)" } else { "HEURISTIC — rerun with --features real-tokens" }
25 );
26
27 // Each task: (name, ceremony-heavy, current-ish, ab-initio).
28 let tasks: &[(&str, &str, &str, &str)] = &[
29 (
30 "word-count",
31 // A — ceremony-heavy
32 "use std::collections::HashMap;\n\nfn count_words(text: &str) -> HashMap<String, u32> {\n let mut counts: HashMap<String, u32> = HashMap::new();\n for word in text.split_whitespace() {\n *counts.entry(word.to_string()).or_insert(0) += 1;\n }\n counts\n}",
33 // B — current-MechGen-ish (sigils, var, some inference)
34 "fn count_words(text: &str) -> {s: u32} {\n var counts = {s: u32}.new()\n for word in text.split() {\n counts.entry(word).or(0) += 1\n }\n counts\n}",
35 // C — ab-initio (inference + layout + ambient builtins)
36 "count_words text =\n counts = {}\n for w in split text\n counts[w] += 1\n counts",
37 ),
38 (
39 "factorial",
40 "fn factorial(n: u64) -> u64 {\n if n <= 1 {\n return 1;\n }\n n * factorial(n - 1)\n}",
41 "fn factorial(n: u64) -> u64 {\n if n <= 1 { 1 } else { n * factorial(n - 1) }\n}",
42 "fact n =\n if n <= 1: 1\n else: n * fact (n - 1)",
43 ),
44 (
45 "safe-divide", // returns optional/result — safety ceremony vs sigil
46 "fn safe_div(a: i32, b: i32) -> Option<i32> {\n if b == 0 {\n return None;\n }\n Some(a / b)\n}",
47 "fn safe_div(a: i32, b: i32) -> ?i32 {\n if b == 0 { none } else { a / b }\n}",
48 "div a b =\n if b == 0: none\n else: a / b",
49 ),
50 ];
51
52 println!("{:<13} {:>4} {:>9} {:>8} {:>7}", "task", "form", "cl100k", "o200k", "chars");
53 let (mut a_cl, mut b_cl, mut c_cl) = (0, 0, 0);
54 let (mut a_o, mut b_o, mut c_o) = (0, 0, 0);
55 for (name, a, b, c) in tasks {
56 let row = |label: &str, s: &str| {
57 println!("{:<13} {:>4} {:>9} {:>8} {:>7}", "", label, cl.count(s), o2.count(s), s.chars().count());
58 };
59 println!("[{name}]");
60 row("A heavy", a);
61 row("B curr", b);
62 row("C abinit", c);
63 a_cl += cl.count(a); b_cl += cl.count(b); c_cl += cl.count(c);
64 a_o += o2.count(a); b_o += o2.count(b); c_o += o2.count(c);
65 }
66
67 println!("\nTOTALS (3 tasks):");
68 println!(" A ceremony-heavy cl100k {a_cl:>3} o200k {a_o:>3} (baseline)");
69 println!(" B current-ish cl100k {b_cl:>3} ({:.0}%) o200k {b_o:>3} ({:.0}%)", 100.0 * b_cl as f64 / a_cl as f64, 100.0 * b_o as f64 / a_o as f64);
70 println!(" C ab-initio cl100k {c_cl:>3} ({:.0}%) o200k {c_o:>3} ({:.0}%)", 100.0 * c_cl as f64 / a_cl as f64, 100.0 * c_o as f64 / a_o as f64);
71 println!("\n → ab-initio cuts ~{:.0}% of cl100k tokens vs ceremony-heavy by REMOVING ceremony",
72 100.0 * (1.0 - c_cl as f64 / a_cl as f64));
73 println!(" (types/mutability/return/imports inferred; layout replaces braces+`;`; terse safety");
74 println!(" sigils; ambient builtins). The remaining tokens are the irreducible payload —");
75 println!(" names/ops/literals — which no design can remove. That residue IS the token floor.");
76}

pub fn count(self, text: &str) -> usize
Count the tokens in `text` under this model.
Examples found in repository?
11fn main() {
12 let cl = Model::OpenAiGpt4;
13 let o2 = Model::OpenAiGpt4o;
14 println!(
15 "tokenizer exact: cl100k={} o200k={}",
16 cl.is_exact(),
17 o2.is_exact()
18 );
19 println!("{:>7} {:>7} file", "cl100k", "o200k");
20 for path in std::env::args().skip(1) {
21 match fs::read_to_string(&path) {
22 Ok(s) => println!("{:>7} {:>7} {}", cl.count(&s), o2.count(&s), path),
23 Err(e) => println!(" ERR ERR {path}: {e}"),
24 }
25 }
26}

More examples
18fn main() {
19 let cl = Model::OpenAiGpt4;
20 let o2 = Model::OpenAiGpt4o;
21 println!("=== Standard-vocabulary tokenizer audit (§8b) ===");
22 println!(
23 "tokenizer: {} names: {}\n",
24 if cl.is_exact() { "REAL tiktoken (exact)" } else { "HEURISTIC — rerun with --features real-tokens" },
25 VOCAB.len()
26 );
27
28 // Agents emit a name with a leading space; BPE is space-aware.
29 let mut single = 0usize;
30 let mut offenders: Vec<(&str, usize, usize)> = Vec::new();
31 for &name in VOCAB {
32 let ctx = format!(" {name}");
33 let (c, o) = (cl.count(&ctx), o2.count(&ctx));
34 if c <= 1 && o <= 1 {
35 single += 1;
36 } else {
37 offenders.push((name, c, o));
38 }
39 }
40
41 println!("SINGLE BPE TOKEN (both tokenizers): {single}/{}", VOCAB.len());
42 if offenders.is_empty() {
43 println!(" ✓ every vocabulary name is a single token — the §8b discipline holds.");
44 } else {
45 println!("\nOFFENDERS (rename or drop — a multi-token name negates the saving):");
46 for (n, c, o) in &offenders {
47 println!(" {n:<12} cl100k {c} o200k {o}");
48 }
49 }
50
51 println!("\nWHY IT MATTERS");
52 println!(" The vocabulary's win is naming an intent in ~1 token. A 2-token name (e.g.");
53 println!(" `frequencies` = 'frequ'+'encies') halves that. Picking `freq`/`map`/`fold` over");
54 println!(" `frequencies`/`transform`/`accumulate` is tokenizer co-design, audited here.");
55}

25fn main() {
26 let cl = Model::OpenAiGpt4;
27 let o2 = Model::OpenAiGpt4o;
28 println!("=== MechGen keyword tokenizer audit (migration step 4) ===");
29 println!(
30 "tokenizer: {} keywords: {}\n",
31 if cl.is_exact() { "REAL tiktoken (exact)" } else { "HEURISTIC — rerun with --features real-tokens" },
32 KEYWORDS.len()
33 );
34
35 // A keyword usually appears with a leading space in code; BPE is space-aware,
36 // so " return" can differ from "return". Audit the in-context form (leading
37 // space) — that is what an agent actually emits.
38 let mut offenders: Vec<(&str, usize, usize)> = Vec::new();
39 let mut single = 0usize;
40 for &kw in KEYWORDS {
41 let ctx = format!(" {kw}");
42 let c = cl.count(&ctx);
43 let o = o2.count(&ctx);
44 if c <= 1 && o <= 1 {
45 single += 1;
46 } else {
47 offenders.push((kw, c, o));
48 }
49 }
50
51 println!("SINGLE BPE TOKEN (both tokenizers): {single}/{}", KEYWORDS.len());
52 println!("\nOFFENDERS (>1 token in cl100k or o200k):");
53 if offenders.is_empty() {
54 println!(" (none)");
55 } else {
56 offenders.sort_by(|a, b| b.1.cmp(&a.1));
57 for (kw, c, o) in &offenders {
58 println!(" {kw:<20} cl100k {c} o200k {o}");
59 }
60 }
61
62 println!("\nVERDICT");
63 println!(
64 " {}/{} keywords are already single-token (the agent-mode single/double-char forms",
65 single, KEYWORDS.len()
66 );
67 println!(" f/m/v/u/… and common words if/for/match/… cost exactly one token).");
68 if !offenders.is_empty() {
69 println!(" The {} offenders are compound/rare words (snake_case splits on `_`); each should", offenders.len());
70 println!(" get a single-token agent-mode alias. They are specialized (swarm combinators,");
71 println!(" grammar extension) — rare in practice, so the realized token cost is small, but");
72 println!(" the surface is not yet uniformly single-token. This is the concrete step-4 work-list.");
73 }
74}

16fn main() {
17 let cl = Model::OpenAiGpt4;
18 let o2 = Model::OpenAiGpt4o;
19 println!("=== Abstraction as the post-floor token lever (real BPE) ===");
20 println!(
21 "tokenizer: {}\n",
22 if cl.is_exact() { "REAL tiktoken (exact)" } else { "HEURISTIC — rerun with --features real-tokens" }
23 );
24
25 // (intent, hand-rolled [compiles today], with-vocabulary [proposed primitive])
26 let cases: &[(&str, &str, &str)] = &[
27 (
28 "sum a list",
29 "f sum(xs)\n var t = 0\n for x in xs\n t = t + x\n t",
30 "f sum(xs)\n fold(xs, 0, +)",
31 ),
32 (
33 "word frequencies",
34 "f wc(ws)\n var m = {}\n for w in ws\n m[w] = m[w] + 1\n m",
35 "f wc(ws)\n freq(ws)",
36 ),
37 (
38 "evens, doubled",
39 "f f(xs)\n var out = []\n for x in xs\n if x % 2 == 0\n out.push(x * 2)\n out",
40 "f f(xs)\n xs | filter even | map double",
41 ),
42 (
43 "max of a list",
44 "f max(xs)\n var m = xs[0]\n for x in xs\n if x > m\n m = x\n m",
45 "f max(xs)\n reduce(xs, max)",
46 ),
47 ];
48
49 println!("{:<18} {:>9} {:>9} {:>7}", "intent", "handrolled", "vocab", "saved");
50 let (mut h_cl, mut v_cl, mut h_o, mut v_o) = (0, 0, 0, 0);
51 for (name, hand, vocab) in cases {
52 let (h, v) = (cl.count(hand), cl.count(vocab));
53 println!("{name:<18} {h:>9} {v:>9} {:>6}%", 100 - 100 * v / h);
54 h_cl += h; v_cl += v;
55 h_o += o2.count(hand); v_o += o2.count(vocab);
56 }
57 println!("\nTOTAL cl100k {h_cl} → {v_cl} ({}% saved) o200k {h_o} → {v_o} ({}% saved)",
58 100 - 100 * v_cl / h_cl, 100 - 100 * v_o / h_o);
59
60 println!("\nFINDING");
61 println!(" At the surface floor, abstraction is the only per-call token lever left, and it is");
62 println!(" POSITIVE-SUM: a single-token, total, capability-typed primitive (a) cuts payload");
63 println!(" tokens (above), (b) RAISES reliability (no hand-rolled off-by-one / empty-list bug),");
64 println!(" and (c) preserves safety (the primitive's effect rides its type to the boundary).");
65 println!(" Encoding tricks (binary, dense UTF-8) and layout were all token-neutral-or-worse —");
66 println!(" vocabulary is the one that pays. The discipline: name primitives as single BPE tokens,");
67 println!(" make them total, and choose them by the empirical frequency of SWE intents.");
68}

87fn main() {
88 let exact = Model::OpenAiGpt4.is_exact();
89 println!("=== MechGen 'digital rain' vs token streams (cl100k + o200k BPE) ===");
90 println!(
91 "tokenizer: {}\n",
92 if exact { "REAL tiktoken (exact)" } else { "HEURISTIC (CJK undercounted — rerun with --features real-tokens)" }
93 );
94
95 let samples: &[(&str, &str)] = &[
96 ("net", "net MLP { layer fc1: Linear(8, 16); layer act: ReLU; layer fc2: Linear(16, 4); forward { fc1 } }"),
97 ("fn", "fn factorial(n: u64) -> u64 { if n <= 1 { return 1; } n * factorial(n - 1) }"),
98 ("kb", "kb Family { fact parent(alice, bob); fact parent(bob, carol); rule gp(x: i32, z: i32) where parent(x, y), parent(y, z) { x } }"),
99 ];
100
101 println!("{:<5} {:>10} {:>22} {:>22} {:>14}", "kind", "form", "cl100k tok", "o200k tok", "chars");
102 let cl = Model::OpenAiGpt4;
103 let o2 = Model::OpenAiGpt4o;
104 let (mut s_cl, mut s_o, mut r_cl, mut r_o, mut rl_cl, mut rl_o) = (0, 0, 0, 0, 0, 0);
105 for (name, src) in samples {
106 let (stream, legend) = rain(src);
107 let rl = format!("{legend}\n{stream}"); // rain + legend (one-off, reversible)
108 let bytes = src.as_bytes();
109 let base = b64(bytes);
110 let row = |label: &str, s: &str| {
111 println!("{:<5} {:>10} {:>22} {:>22} {:>14}", "", label, cl.count(s), o2.count(s), chars(s));
112 };
113 println!("[{name}]");
114 row("source", src);
115 row("rain(stream)", &stream);
116 row("rain+legend", &rl);
117 row("base64(bytes)", &base);
118 s_cl += cl.count(src); s_o += o2.count(src);
119 r_cl += cl.count(&stream); r_o += o2.count(&stream);
120 rl_cl += cl.count(&rl); rl_o += o2.count(&rl);
121 }
122
123 println!("\nTOTALS (3 samples) — token ratio vs source (>1.0 = rain is WORSE):");
124 println!(" source cl100k {s_cl:>4} o200k {s_o:>4}");
125 println!(" rain(stream) cl100k {r_cl:>4} ({:.2}x) o200k {r_o:>4} ({:.2}x)", r_cl as f64 / s_cl as f64, r_o as f64 / s_o as f64);
126 println!(" rain+legend cl100k {rl_cl:>4} ({:.2}x) o200k {rl_o:>4} ({:.2}x)", rl_cl as f64 / s_cl as f64, rl_o as f64 / s_o as f64);
127
128 println!("\nVERDICT");
129 println!(" Digital rain shrinks CHARACTERS (~3x — the Matrix look) but the dense glyph");
130 println!(" stream costs MORE BPE tokens than the ASCII source (ratios above), because the");
131 println!(" tokenizer splits each rare multi-byte glyph into several tokens. Adding the legend");
132 println!(" (needed for reversibility) makes a one-off snippet far worse still. Even an");
133 println!(" amortized shared codebook (stream only) does not beat source on tokens.");
134 println!(" This is the project's token-floor finding, re-confirmed on the dense-symbol idea:");
135 println!(" an LLM emits TOKENS, not glyphs/bytes — the information (names/ops/dims) is the floor.");
136 if !exact {
137 println!("\n (heuristic run: it counts ~1 token/char and UNDER-counts CJK — the real cl100k/o200k");
138 println!(" gap is larger. Rerun with --features real-tokens for the exact, even-worse numbers.)");
139 }
140}

18fn main() {
19 let cl = Model::OpenAiGpt4;
20 let o2 = Model::OpenAiGpt4o;
21 println!("=== Token-efficiency design levers (real cl100k + o200k BPE) ===");
22 println!(
23 "tokenizer: {}\n",
24 if cl.is_exact() { "REAL tiktoken (exact)" } else { "HEURISTIC — rerun with --features real-tokens" }
25 );
26
27 // Each task: (name, ceremony-heavy, current-ish, ab-initio).
28 let tasks: &[(&str, &str, &str, &str)] = &[
29 (
30 "word-count",
31 // A — ceremony-heavy
32 "use std::collections::HashMap;\n\nfn count_words(text: &str) -> HashMap<String, u32> {\n let mut counts: HashMap<String, u32> = HashMap::new();\n for word in text.split_whitespace() {\n *counts.entry(word.to_string()).or_insert(0) += 1;\n }\n counts\n}",
33 // B — current-MechGen-ish (sigils, var, some inference)
34 "fn count_words(text: &str) -> {s: u32} {\n var counts = {s: u32}.new()\n for word in text.split() {\n counts.entry(word).or(0) += 1\n }\n counts\n}",
35 // C — ab-initio (inference + layout + ambient builtins)
36 "count_words text =\n counts = {}\n for w in split text\n counts[w] += 1\n counts",
37 ),
38 (
39 "factorial",
40 "fn factorial(n: u64) -> u64 {\n if n <= 1 {\n return 1;\n }\n n * factorial(n - 1)\n}",
41 "fn factorial(n: u64) -> u64 {\n if n <= 1 { 1 } else { n * factorial(n - 1) }\n}",
42 "fact n =\n if n <= 1: 1\n else: n * fact (n - 1)",
43 ),
44 (
45 "safe-divide", // returns optional/result — safety ceremony vs sigil
46 "fn safe_div(a: i32, b: i32) -> Option<i32> {\n if b == 0 {\n return None;\n }\n Some(a / b)\n}",
47 "fn safe_div(a: i32, b: i32) -> ?i32 {\n if b == 0 { none } else { a / b }\n}",
48 "div a b =\n if b == 0: none\n else: a / b",
49 ),
50 ];
51
52 println!("{:<13} {:>4} {:>9} {:>8} {:>7}", "task", "form", "cl100k", "o200k", "chars");
53 let (mut a_cl, mut b_cl, mut c_cl) = (0, 0, 0);
54 let (mut a_o, mut b_o, mut c_o) = (0, 0, 0);
55 for (name, a, b, c) in tasks {
56 let row = |label: &str, s: &str| {
57 println!("{:<13} {:>4} {:>9} {:>8} {:>7}", "", label, cl.count(s), o2.count(s), s.chars().count());
58 };
59 println!("[{name}]");
60 row("A heavy", a);
61 row("B curr", b);
62 row("C abinit", c);
63 a_cl += cl.count(a); b_cl += cl.count(b); c_cl += cl.count(c);
64 a_o += o2.count(a); b_o += o2.count(b); c_o += o2.count(c);
65 }
66
67 println!("\nTOTALS (3 tasks):");
68 println!(" A ceremony-heavy cl100k {a_cl:>3} o200k {a_o:>3} (baseline)");
69 println!(" B current-ish cl100k {b_cl:>3} ({:.0}%) o200k {b_o:>3} ({:.0}%)", 100.0 * b_cl as f64 / a_cl as f64, 100.0 * b_o as f64 / a_o as f64);
70 println!(" C ab-initio cl100k {c_cl:>3} ({:.0}%) o200k {c_o:>3} ({:.0}%)", 100.0 * c_cl as f64 / a_cl as f64, 100.0 * c_o as f64 / a_o as f64);
71 println!("\n → ab-initio cuts ~{:.0}% of cl100k tokens vs ceremony-heavy by REMOVING ceremony",
72 100.0 * (1.0 - c_cl as f64 / a_cl as f64));
73 println!(" (types/mutability/return/imports inferred; layout replaces braces+`;`; terse safety");
74 println!(" sigils; ambient builtins). The remaining tokens are the irreducible payload —");
75 println!(" names/ops/literals — which no design can remove. That residue IS the token floor.");
76}