Skip to main content

Model

Enum Model 

Source
pub enum Model {
    OpenAiGpt4,
    OpenAiGpt4o,
    AnthropicClaude,
    Heuristic,
}
Expand description

A popular agentic AI system, identified by its tokenizer family.

Variants§

§

OpenAiGpt4

OpenAI GPT-4 / GPT-3.5-turbo family — cl100k_base BPE.

§

OpenAiGpt4o

OpenAI GPT-4o / o-series family — o200k_base BPE.

§

AnthropicClaude

Anthropic Claude. Approximation: Anthropic publishes no offline tokenizer crate, so this uses the shared heuristic_tokens estimate (the same as Model::Heuristic) and must be read as an estimate, not an exact count. Model::is_exact returns false for it.

§

Heuristic

A tokenizer-agnostic labeled heuristic (no model-specific BPE).

Implementations§

Source§

impl Model

Source

pub fn name(self) -> &'static str

A human-readable label for the model/tokenizer (e.g. for report output).

Examples found in repository?
examples/evaluate.rs (line 37)
12fn main() {
13    println!("agentic-eval — four-axis evaluation of programs for agentic AI\n");
14
15    // Two encodings of "read a file and keep the large entries".
16    let legible = Program::new(
17        "legible",
18        r#"ls("./src") | where(fn(f) => f.size > 1000) | map(fn(f) => f.name)"#,
19    )
20    .with_standing_context("ls/where/map are standard, high-probability names")
21    .with_output("name\nfoo.rs\nbar.rs");
22    let cipher = Program::new("cipher", r#"l./src|w~.size>1k|m~.name"#)
23        .with_standing_context("single-letter+sigil cheatsheet line; ".repeat(120))
24        .with_output("name\nfoo.rs\nbar.rs")
25        .with_retries(8); // terse cipher is mis-emitted more often
26
27    // ── 1. Token efficiency ──────────────────────────────────────────────
28    println!("[1] Token efficiency (amortized over 30 turns):");
29    for model in [
30        Model::OpenAiGpt4,
31        Model::OpenAiGpt4o,
32        Model::AnthropicClaude,
33    ] {
34        let cmp = compare(&legible, &cipher, model, 30);
35        println!(
36            "  {:<28} legible={:>6}  cipher={:>6}  → {} wins ({:.2}x){}",
37            model.name(),
38            cmp.a_total,
39            cmp.b_total,
40            if cmp.winner_is_a { "legible" } else { "cipher" },
41            cmp.ratio,
42            if model.is_exact() { "" } else { " [est]" },
43        );
44    }
45
46    // ── 2. Determinism ───────────────────────────────────────────────────
47    // A canonical renderer (byte-stable) vs. one that embeds a timestamp.
48    let canonical = assess_determinism(5, || "name\nfoo.rs\nbar.rs".to_string());
49    let mut t = 0u64;
50    let noisy = assess_determinism(5, || {
51        t += 1;
52        format!("name\nfoo.rs\nbar.rs  # at {t}")
53    });
54    println!("\n[2] Determinism:");
55    println!(
56        "  canonical output : deterministic={} ({} distinct / {} runs)",
57        canonical.deterministic, canonical.distinct, canonical.runs
58    );
59    println!(
60        "  timestamped output: deterministic={} ({} distinct / {} runs)",
61        noisy.deterministic, noisy.distinct, noisy.runs
62    );
63
64    // ── 3. Reliability ───────────────────────────────────────────────────
65    // The legible form parses on all 6 sample invocations; the cipher mis-parses
66    // twice but at least returns a structured error once.
67    let samples = [0, 1, 2, 3, 4, 5];
68    let legible_rel = assess_reliability(&samples, |_| Outcome::ok());
69    let cipher_rel = assess_reliability(&samples, |&i| match i {
70        4 => Outcome::structured_failure(),
71        5 => Outcome::opaque_failure(),
72        _ => Outcome::ok(),
73    });
74    println!("\n[3] Reliability:");
75    println!(
76        "  legible: pass {:.0}%  actionable {:.0}%",
77        legible_rel.pass_rate * 100.0,
78        legible_rel.actionable_rate * 100.0
79    );
80    println!(
81        "  cipher : pass {:.0}%  actionable {:.0}%",
82        cipher_rel.pass_rate * 100.0,
83        cipher_rel.actionable_rate * 100.0
84    );
85
86    // ── 4. Safety ────────────────────────────────────────────────────────
87    // The task reads + lists (ReadLocal); a "rm the small files" variant adds a
88    // Destructive effect. Score the gating under the agent policy.
89    let read_only = assess_safety(&[Effect::ReadLocal, Effect::WriteLocal], Mode::Agent);
90    let destructive = assess_safety(
91        &[Effect::ReadLocal, Effect::Destructive, Effect::Exec],
92        Mode::Agent,
93    );
94    println!("\n[4] Safety (agent policy):");
95    println!(
96        "  read+write task : grade {} (bounded={}, {} approval-gated)",
97        read_only.grade, read_only.bounded, read_only.approval_gated
98    );
99    println!(
100        "  rm+exec task    : grade {} (bounded={}, {} approval-gated, {} denied)",
101        destructive.grade, destructive.bounded, destructive.approval_gated, destructive.denied
102    );
103
104    println!("\nSummary: the legible form is competitive-or-cheaper on tokens once standing");
105    println!("context counts, more deterministic and reliable to parse, and the agent policy");
106    println!("bounds the blast radius of even the destructive variant.");
107}
Source

pub fn all() -> [Model; 4]

Every model this build can count tokens for (exactly or approximately).

Source

pub fn from_name(name: &str) -> Option<Model>

Parse a model from a short identifier (case-insensitive), for CLI/config use. Accepts common aliases: gpt4/gpt-4/cl100k; gpt4o/gpt-4o/o200k; claude/anthropic; heuristic/heur. Returns None for any other input.

Source

pub fn is_exact(self) -> bool

Whether this model’s count is exact (a real BPE) in this build, as opposed to an estimate. The OpenAI families are exact only when the crate is built with --features real-tokens.

Examples found in repository?
examples/tokens_of.rs (line 16)
11fn main() {
12    let cl = Model::OpenAiGpt4;
13    let o2 = Model::OpenAiGpt4o;
14    println!(
15        "tokenizer exact: cl100k={} o200k={}",
16        cl.is_exact(),
17        o2.is_exact()
18    );
19    println!("{:>7} {:>7}   file", "cl100k", "o200k");
20    for path in std::env::args().skip(1) {
21        match fs::read_to_string(&path) {
22            Ok(s) => println!("{:>7} {:>7}   {}", cl.count(&s), o2.count(&s), path),
23            Err(e) => println!("    ERR     ERR   {path}: {e}"),
24        }
25    }
26}
More examples
Hide additional examples
examples/vocabulary_audit.rs (line 24)
18fn main() {
19    let cl = Model::OpenAiGpt4;
20    let o2 = Model::OpenAiGpt4o;
21    println!("=== Standard-vocabulary tokenizer audit (§8b) ===");
22    println!(
23        "tokenizer: {}   names: {}\n",
24        if cl.is_exact() { "REAL tiktoken (exact)" } else { "HEURISTIC — rerun with --features real-tokens" },
25        VOCAB.len()
26    );
27
28    // Agents emit a name with a leading space; BPE is space-aware.
29    let mut single = 0usize;
30    let mut offenders: Vec<(&str, usize, usize)> = Vec::new();
31    for &name in VOCAB {
32        let ctx = format!(" {name}");
33        let (c, o) = (cl.count(&ctx), o2.count(&ctx));
34        if c <= 1 && o <= 1 {
35            single += 1;
36        } else {
37            offenders.push((name, c, o));
38        }
39    }
40
41    println!("SINGLE BPE TOKEN (both tokenizers): {single}/{}", VOCAB.len());
42    if offenders.is_empty() {
43        println!("  ✓ every vocabulary name is a single token — the §8b discipline holds.");
44    } else {
45        println!("\nOFFENDERS (rename or drop — a multi-token name negates the saving):");
46        for (n, c, o) in &offenders {
47            println!("  {n:<12} cl100k {c}  o200k {o}");
48        }
49    }
50
51    println!("\nWHY IT MATTERS");
52    println!("  The vocabulary's win is naming an intent in ~1 token. A 2-token name (e.g.");
53    println!("  `frequencies` = 'frequ'+'encies') halves that. Picking `freq`/`map`/`fold` over");
54    println!("  `frequencies`/`transform`/`accumulate` is tokenizer co-design, audited here.");
55}
examples/keyword_audit.rs (line 31)
25fn main() {
26    let cl = Model::OpenAiGpt4;
27    let o2 = Model::OpenAiGpt4o;
28    println!("=== MechGen keyword tokenizer audit (migration step 4) ===");
29    println!(
30        "tokenizer: {}   keywords: {}\n",
31        if cl.is_exact() { "REAL tiktoken (exact)" } else { "HEURISTIC — rerun with --features real-tokens" },
32        KEYWORDS.len()
33    );
34
35    // A keyword usually appears with a leading space in code; BPE is space-aware,
36    // so " return" can differ from "return". Audit the in-context form (leading
37    // space) — that is what an agent actually emits.
38    let mut offenders: Vec<(&str, usize, usize)> = Vec::new();
39    let mut single = 0usize;
40    for &kw in KEYWORDS {
41        let ctx = format!(" {kw}");
42        let c = cl.count(&ctx);
43        let o = o2.count(&ctx);
44        if c <= 1 && o <= 1 {
45            single += 1;
46        } else {
47            offenders.push((kw, c, o));
48        }
49    }
50
51    println!("SINGLE BPE TOKEN (both tokenizers): {single}/{}", KEYWORDS.len());
52    println!("\nOFFENDERS (>1 token in cl100k or o200k):");
53    if offenders.is_empty() {
54        println!("  (none)");
55    } else {
56        offenders.sort_by(|a, b| b.1.cmp(&a.1));
57        for (kw, c, o) in &offenders {
58            println!("  {kw:<20} cl100k {c}  o200k {o}");
59        }
60    }
61
62    println!("\nVERDICT");
63    println!(
64        "  {}/{} keywords are already single-token (the agent-mode single/double-char forms",
65        single, KEYWORDS.len()
66    );
67    println!("  f/m/v/u/… and common words if/for/match/… cost exactly one token).");
68    if !offenders.is_empty() {
69        println!("  The {} offenders are compound/rare words (snake_case splits on `_`); each should", offenders.len());
70        println!("  get a single-token agent-mode alias. They are specialized (swarm combinators,");
71        println!("  grammar extension) — rare in practice, so the realized token cost is small, but");
72        println!("  the surface is not yet uniformly single-token. This is the concrete step-4 work-list.");
73    }
74}
examples/abstraction_tokens.rs (line 22)
16fn main() {
17    let cl = Model::OpenAiGpt4;
18    let o2 = Model::OpenAiGpt4o;
19    println!("=== Abstraction as the post-floor token lever (real BPE) ===");
20    println!(
21        "tokenizer: {}\n",
22        if cl.is_exact() { "REAL tiktoken (exact)" } else { "HEURISTIC — rerun with --features real-tokens" }
23    );
24
25    // (intent, hand-rolled [compiles today], with-vocabulary [proposed primitive])
26    let cases: &[(&str, &str, &str)] = &[
27        (
28            "sum a list",
29            "f sum(xs)\n  var t = 0\n  for x in xs\n    t = t + x\n  t",
30            "f sum(xs)\n  fold(xs, 0, +)",
31        ),
32        (
33            "word frequencies",
34            "f wc(ws)\n  var m = {}\n  for w in ws\n    m[w] = m[w] + 1\n  m",
35            "f wc(ws)\n  freq(ws)",
36        ),
37        (
38            "evens, doubled",
39            "f f(xs)\n  var out = []\n  for x in xs\n    if x % 2 == 0\n      out.push(x * 2)\n  out",
40            "f f(xs)\n  xs | filter even | map double",
41        ),
42        (
43            "max of a list",
44            "f max(xs)\n  var m = xs[0]\n  for x in xs\n    if x > m\n      m = x\n  m",
45            "f max(xs)\n  reduce(xs, max)",
46        ),
47    ];
48
49    println!("{:<18} {:>9} {:>9} {:>7}", "intent", "handrolled", "vocab", "saved");
50    let (mut h_cl, mut v_cl, mut h_o, mut v_o) = (0, 0, 0, 0);
51    for (name, hand, vocab) in cases {
52        let (h, v) = (cl.count(hand), cl.count(vocab));
53        println!("{name:<18} {h:>9} {v:>9} {:>6}%", 100 - 100 * v / h);
54        h_cl += h; v_cl += v;
55        h_o += o2.count(hand); v_o += o2.count(vocab);
56    }
57    println!("\nTOTAL  cl100k {h_cl} → {v_cl} ({}% saved)   o200k {h_o} → {v_o} ({}% saved)",
58        100 - 100 * v_cl / h_cl, 100 - 100 * v_o / h_o);
59
60    println!("\nFINDING");
61    println!("  At the surface floor, abstraction is the only per-call token lever left, and it is");
62    println!("  POSITIVE-SUM: a single-token, total, capability-typed primitive (a) cuts payload");
63    println!("  tokens (above), (b) RAISES reliability (no hand-rolled off-by-one / empty-list bug),");
64    println!("  and (c) preserves safety (the primitive's effect rides its type to the boundary).");
65    println!("  Encoding tricks (binary, dense UTF-8) and layout were all token-neutral-or-worse —");
66    println!("  vocabulary is the one that pays. The discipline: name primitives as single BPE tokens,");
67    println!("  make them total, and choose them by the empirical frequency of SWE intents.");
68}
examples/rain_tokens.rs (line 88)
87fn main() {
88    let exact = Model::OpenAiGpt4.is_exact();
89    println!("=== MechGen 'digital rain' vs token streams (cl100k + o200k BPE) ===");
90    println!(
91        "tokenizer: {}\n",
92        if exact { "REAL tiktoken (exact)" } else { "HEURISTIC (CJK undercounted — rerun with --features real-tokens)" }
93    );
94
95    let samples: &[(&str, &str)] = &[
96        ("net", "net MLP { layer fc1: Linear(8, 16); layer act: ReLU; layer fc2: Linear(16, 4); forward { fc1 } }"),
97        ("fn", "fn factorial(n: u64) -> u64 { if n <= 1 { return 1; } n * factorial(n - 1) }"),
98        ("kb", "kb Family { fact parent(alice, bob); fact parent(bob, carol); rule gp(x: i32, z: i32) where parent(x, y), parent(y, z) { x } }"),
99    ];
100
101    println!("{:<5} {:>10} {:>22} {:>22} {:>14}", "kind", "form", "cl100k tok", "o200k tok", "chars");
102    let cl = Model::OpenAiGpt4;
103    let o2 = Model::OpenAiGpt4o;
104    let (mut s_cl, mut s_o, mut r_cl, mut r_o, mut rl_cl, mut rl_o) = (0, 0, 0, 0, 0, 0);
105    for (name, src) in samples {
106        let (stream, legend) = rain(src);
107        let rl = format!("{legend}\n{stream}"); // rain + legend (one-off, reversible)
108        let bytes = src.as_bytes();
109        let base = b64(bytes);
110        let row = |label: &str, s: &str| {
111            println!("{:<5} {:>10} {:>22} {:>22} {:>14}", "", label, cl.count(s), o2.count(s), chars(s));
112        };
113        println!("[{name}]");
114        row("source", src);
115        row("rain(stream)", &stream);
116        row("rain+legend", &rl);
117        row("base64(bytes)", &base);
118        s_cl += cl.count(src); s_o += o2.count(src);
119        r_cl += cl.count(&stream); r_o += o2.count(&stream);
120        rl_cl += cl.count(&rl); rl_o += o2.count(&rl);
121    }
122
123    println!("\nTOTALS (3 samples) — token ratio vs source (>1.0 = rain is WORSE):");
124    println!("  source            cl100k {s_cl:>4}   o200k {s_o:>4}");
125    println!("  rain(stream)      cl100k {r_cl:>4} ({:.2}x)   o200k {r_o:>4} ({:.2}x)", r_cl as f64 / s_cl as f64, r_o as f64 / s_o as f64);
126    println!("  rain+legend       cl100k {rl_cl:>4} ({:.2}x)   o200k {rl_o:>4} ({:.2}x)", rl_cl as f64 / s_cl as f64, rl_o as f64 / s_o as f64);
127
128    println!("\nVERDICT");
129    println!("  Digital rain shrinks CHARACTERS (~3x — the Matrix look) but the dense glyph");
130    println!("  stream costs MORE BPE tokens than the ASCII source (ratios above), because the");
131    println!("  tokenizer splits each rare multi-byte glyph into several tokens. Adding the legend");
132    println!("  (needed for reversibility) makes a one-off snippet far worse still. Even an");
133    println!("  amortized shared codebook (stream only) does not beat source on tokens.");
134    println!("  This is the project's token-floor finding, re-confirmed on the dense-symbol idea:");
135    println!("  an LLM emits TOKENS, not glyphs/bytes — the information (names/ops/dims) is the floor.");
136    if !exact {
137        println!("\n  (heuristic run: it counts ~1 token/char and UNDER-counts CJK — the real cl100k/o200k");
138        println!("   gap is larger. Rerun with --features real-tokens for the exact, even-worse numbers.)");
139    }
140}
examples/design_tokens.rs (line 24)
18fn main() {
19    let cl = Model::OpenAiGpt4;
20    let o2 = Model::OpenAiGpt4o;
21    println!("=== Token-efficiency design levers (real cl100k + o200k BPE) ===");
22    println!(
23        "tokenizer: {}\n",
24        if cl.is_exact() { "REAL tiktoken (exact)" } else { "HEURISTIC — rerun with --features real-tokens" }
25    );
26
27    // Each task: (name, ceremony-heavy, current-ish, ab-initio).
28    let tasks: &[(&str, &str, &str, &str)] = &[
29        (
30            "word-count",
31            // A — ceremony-heavy
32            "use std::collections::HashMap;\n\nfn count_words(text: &str) -> HashMap<String, u32> {\n    let mut counts: HashMap<String, u32> = HashMap::new();\n    for word in text.split_whitespace() {\n        *counts.entry(word.to_string()).or_insert(0) += 1;\n    }\n    counts\n}",
33            // B — current-MechGen-ish (sigils, var, some inference)
34            "fn count_words(text: &str) -> {s: u32} {\n    var counts = {s: u32}.new()\n    for word in text.split() {\n        counts.entry(word).or(0) += 1\n    }\n    counts\n}",
35            // C — ab-initio (inference + layout + ambient builtins)
36            "count_words text =\n  counts = {}\n  for w in split text\n    counts[w] += 1\n  counts",
37        ),
38        (
39            "factorial",
40            "fn factorial(n: u64) -> u64 {\n    if n <= 1 {\n        return 1;\n    }\n    n * factorial(n - 1)\n}",
41            "fn factorial(n: u64) -> u64 {\n    if n <= 1 { 1 } else { n * factorial(n - 1) }\n}",
42            "fact n =\n  if n <= 1: 1\n  else: n * fact (n - 1)",
43        ),
44        (
45            "safe-divide", // returns optional/result — safety ceremony vs sigil
46            "fn safe_div(a: i32, b: i32) -> Option<i32> {\n    if b == 0 {\n        return None;\n    }\n    Some(a / b)\n}",
47            "fn safe_div(a: i32, b: i32) -> ?i32 {\n    if b == 0 { none } else { a / b }\n}",
48            "div a b =\n  if b == 0: none\n  else: a / b",
49        ),
50    ];
51
52    println!("{:<13} {:>4} {:>9} {:>8} {:>7}", "task", "form", "cl100k", "o200k", "chars");
53    let (mut a_cl, mut b_cl, mut c_cl) = (0, 0, 0);
54    let (mut a_o, mut b_o, mut c_o) = (0, 0, 0);
55    for (name, a, b, c) in tasks {
56        let row = |label: &str, s: &str| {
57            println!("{:<13} {:>4} {:>9} {:>8} {:>7}", "", label, cl.count(s), o2.count(s), s.chars().count());
58        };
59        println!("[{name}]");
60        row("A heavy", a);
61        row("B curr", b);
62        row("C abinit", c);
63        a_cl += cl.count(a); b_cl += cl.count(b); c_cl += cl.count(c);
64        a_o += o2.count(a); b_o += o2.count(b); c_o += o2.count(c);
65    }
66
67    println!("\nTOTALS (3 tasks):");
68    println!("  A ceremony-heavy   cl100k {a_cl:>3}   o200k {a_o:>3}   (baseline)");
69    println!("  B current-ish      cl100k {b_cl:>3} ({:.0}%)   o200k {b_o:>3} ({:.0}%)", 100.0 * b_cl as f64 / a_cl as f64, 100.0 * b_o as f64 / a_o as f64);
70    println!("  C ab-initio        cl100k {c_cl:>3} ({:.0}%)   o200k {c_o:>3} ({:.0}%)", 100.0 * c_cl as f64 / a_cl as f64, 100.0 * c_o as f64 / a_o as f64);
71    println!("\n  → ab-initio cuts ~{:.0}% of cl100k tokens vs ceremony-heavy by REMOVING ceremony",
72        100.0 * (1.0 - c_cl as f64 / a_cl as f64));
73    println!("    (types/mutability/return/imports inferred; layout replaces braces+`;`; terse safety");
74    println!("    sigils; ambient builtins). The remaining tokens are the irreducible payload —");
75    println!("    names/ops/literals — which no design can remove. That residue IS the token floor.");
76}
Source

pub fn count(self, text: &str) -> usize

Count the tokens in text under this model.

Examples found in repository?
examples/tokens_of.rs (line 22)
11fn main() {
12    let cl = Model::OpenAiGpt4;
13    let o2 = Model::OpenAiGpt4o;
14    println!(
15        "tokenizer exact: cl100k={} o200k={}",
16        cl.is_exact(),
17        o2.is_exact()
18    );
19    println!("{:>7} {:>7}   file", "cl100k", "o200k");
20    for path in std::env::args().skip(1) {
21        match fs::read_to_string(&path) {
22            Ok(s) => println!("{:>7} {:>7}   {}", cl.count(&s), o2.count(&s), path),
23            Err(e) => println!("    ERR     ERR   {path}: {e}"),
24        }
25    }
26}
More examples
Hide additional examples
examples/vocabulary_audit.rs (line 33)
18fn main() {
19    let cl = Model::OpenAiGpt4;
20    let o2 = Model::OpenAiGpt4o;
21    println!("=== Standard-vocabulary tokenizer audit (§8b) ===");
22    println!(
23        "tokenizer: {}   names: {}\n",
24        if cl.is_exact() { "REAL tiktoken (exact)" } else { "HEURISTIC — rerun with --features real-tokens" },
25        VOCAB.len()
26    );
27
28    // Agents emit a name with a leading space; BPE is space-aware.
29    let mut single = 0usize;
30    let mut offenders: Vec<(&str, usize, usize)> = Vec::new();
31    for &name in VOCAB {
32        let ctx = format!(" {name}");
33        let (c, o) = (cl.count(&ctx), o2.count(&ctx));
34        if c <= 1 && o <= 1 {
35            single += 1;
36        } else {
37            offenders.push((name, c, o));
38        }
39    }
40
41    println!("SINGLE BPE TOKEN (both tokenizers): {single}/{}", VOCAB.len());
42    if offenders.is_empty() {
43        println!("  ✓ every vocabulary name is a single token — the §8b discipline holds.");
44    } else {
45        println!("\nOFFENDERS (rename or drop — a multi-token name negates the saving):");
46        for (n, c, o) in &offenders {
47            println!("  {n:<12} cl100k {c}  o200k {o}");
48        }
49    }
50
51    println!("\nWHY IT MATTERS");
52    println!("  The vocabulary's win is naming an intent in ~1 token. A 2-token name (e.g.");
53    println!("  `frequencies` = 'frequ'+'encies') halves that. Picking `freq`/`map`/`fold` over");
54    println!("  `frequencies`/`transform`/`accumulate` is tokenizer co-design, audited here.");
55}
examples/keyword_audit.rs (line 42)
25fn main() {
26    let cl = Model::OpenAiGpt4;
27    let o2 = Model::OpenAiGpt4o;
28    println!("=== MechGen keyword tokenizer audit (migration step 4) ===");
29    println!(
30        "tokenizer: {}   keywords: {}\n",
31        if cl.is_exact() { "REAL tiktoken (exact)" } else { "HEURISTIC — rerun with --features real-tokens" },
32        KEYWORDS.len()
33    );
34
35    // A keyword usually appears with a leading space in code; BPE is space-aware,
36    // so " return" can differ from "return". Audit the in-context form (leading
37    // space) — that is what an agent actually emits.
38    let mut offenders: Vec<(&str, usize, usize)> = Vec::new();
39    let mut single = 0usize;
40    for &kw in KEYWORDS {
41        let ctx = format!(" {kw}");
42        let c = cl.count(&ctx);
43        let o = o2.count(&ctx);
44        if c <= 1 && o <= 1 {
45            single += 1;
46        } else {
47            offenders.push((kw, c, o));
48        }
49    }
50
51    println!("SINGLE BPE TOKEN (both tokenizers): {single}/{}", KEYWORDS.len());
52    println!("\nOFFENDERS (>1 token in cl100k or o200k):");
53    if offenders.is_empty() {
54        println!("  (none)");
55    } else {
56        offenders.sort_by(|a, b| b.1.cmp(&a.1));
57        for (kw, c, o) in &offenders {
58            println!("  {kw:<20} cl100k {c}  o200k {o}");
59        }
60    }
61
62    println!("\nVERDICT");
63    println!(
64        "  {}/{} keywords are already single-token (the agent-mode single/double-char forms",
65        single, KEYWORDS.len()
66    );
67    println!("  f/m/v/u/… and common words if/for/match/… cost exactly one token).");
68    if !offenders.is_empty() {
69        println!("  The {} offenders are compound/rare words (snake_case splits on `_`); each should", offenders.len());
70        println!("  get a single-token agent-mode alias. They are specialized (swarm combinators,");
71        println!("  grammar extension) — rare in practice, so the realized token cost is small, but");
72        println!("  the surface is not yet uniformly single-token. This is the concrete step-4 work-list.");
73    }
74}
examples/abstraction_tokens.rs (line 52)
16fn main() {
17    let cl = Model::OpenAiGpt4;
18    let o2 = Model::OpenAiGpt4o;
19    println!("=== Abstraction as the post-floor token lever (real BPE) ===");
20    println!(
21        "tokenizer: {}\n",
22        if cl.is_exact() { "REAL tiktoken (exact)" } else { "HEURISTIC — rerun with --features real-tokens" }
23    );
24
25    // (intent, hand-rolled [compiles today], with-vocabulary [proposed primitive])
26    let cases: &[(&str, &str, &str)] = &[
27        (
28            "sum a list",
29            "f sum(xs)\n  var t = 0\n  for x in xs\n    t = t + x\n  t",
30            "f sum(xs)\n  fold(xs, 0, +)",
31        ),
32        (
33            "word frequencies",
34            "f wc(ws)\n  var m = {}\n  for w in ws\n    m[w] = m[w] + 1\n  m",
35            "f wc(ws)\n  freq(ws)",
36        ),
37        (
38            "evens, doubled",
39            "f f(xs)\n  var out = []\n  for x in xs\n    if x % 2 == 0\n      out.push(x * 2)\n  out",
40            "f f(xs)\n  xs | filter even | map double",
41        ),
42        (
43            "max of a list",
44            "f max(xs)\n  var m = xs[0]\n  for x in xs\n    if x > m\n      m = x\n  m",
45            "f max(xs)\n  reduce(xs, max)",
46        ),
47    ];
48
49    println!("{:<18} {:>9} {:>9} {:>7}", "intent", "handrolled", "vocab", "saved");
50    let (mut h_cl, mut v_cl, mut h_o, mut v_o) = (0, 0, 0, 0);
51    for (name, hand, vocab) in cases {
52        let (h, v) = (cl.count(hand), cl.count(vocab));
53        println!("{name:<18} {h:>9} {v:>9} {:>6}%", 100 - 100 * v / h);
54        h_cl += h; v_cl += v;
55        h_o += o2.count(hand); v_o += o2.count(vocab);
56    }
57    println!("\nTOTAL  cl100k {h_cl} → {v_cl} ({}% saved)   o200k {h_o} → {v_o} ({}% saved)",
58        100 - 100 * v_cl / h_cl, 100 - 100 * v_o / h_o);
59
60    println!("\nFINDING");
61    println!("  At the surface floor, abstraction is the only per-call token lever left, and it is");
62    println!("  POSITIVE-SUM: a single-token, total, capability-typed primitive (a) cuts payload");
63    println!("  tokens (above), (b) RAISES reliability (no hand-rolled off-by-one / empty-list bug),");
64    println!("  and (c) preserves safety (the primitive's effect rides its type to the boundary).");
65    println!("  Encoding tricks (binary, dense UTF-8) and layout were all token-neutral-or-worse —");
66    println!("  vocabulary is the one that pays. The discipline: name primitives as single BPE tokens,");
67    println!("  make them total, and choose them by the empirical frequency of SWE intents.");
68}
examples/rain_tokens.rs (line 111)
87fn main() {
88    let exact = Model::OpenAiGpt4.is_exact();
89    println!("=== MechGen 'digital rain' vs token streams (cl100k + o200k BPE) ===");
90    println!(
91        "tokenizer: {}\n",
92        if exact { "REAL tiktoken (exact)" } else { "HEURISTIC (CJK undercounted — rerun with --features real-tokens)" }
93    );
94
95    let samples: &[(&str, &str)] = &[
96        ("net", "net MLP { layer fc1: Linear(8, 16); layer act: ReLU; layer fc2: Linear(16, 4); forward { fc1 } }"),
97        ("fn", "fn factorial(n: u64) -> u64 { if n <= 1 { return 1; } n * factorial(n - 1) }"),
98        ("kb", "kb Family { fact parent(alice, bob); fact parent(bob, carol); rule gp(x: i32, z: i32) where parent(x, y), parent(y, z) { x } }"),
99    ];
100
101    println!("{:<5} {:>10} {:>22} {:>22} {:>14}", "kind", "form", "cl100k tok", "o200k tok", "chars");
102    let cl = Model::OpenAiGpt4;
103    let o2 = Model::OpenAiGpt4o;
104    let (mut s_cl, mut s_o, mut r_cl, mut r_o, mut rl_cl, mut rl_o) = (0, 0, 0, 0, 0, 0);
105    for (name, src) in samples {
106        let (stream, legend) = rain(src);
107        let rl = format!("{legend}\n{stream}"); // rain + legend (one-off, reversible)
108        let bytes = src.as_bytes();
109        let base = b64(bytes);
110        let row = |label: &str, s: &str| {
111            println!("{:<5} {:>10} {:>22} {:>22} {:>14}", "", label, cl.count(s), o2.count(s), chars(s));
112        };
113        println!("[{name}]");
114        row("source", src);
115        row("rain(stream)", &stream);
116        row("rain+legend", &rl);
117        row("base64(bytes)", &base);
118        s_cl += cl.count(src); s_o += o2.count(src);
119        r_cl += cl.count(&stream); r_o += o2.count(&stream);
120        rl_cl += cl.count(&rl); rl_o += o2.count(&rl);
121    }
122
123    println!("\nTOTALS (3 samples) — token ratio vs source (>1.0 = rain is WORSE):");
124    println!("  source            cl100k {s_cl:>4}   o200k {s_o:>4}");
125    println!("  rain(stream)      cl100k {r_cl:>4} ({:.2}x)   o200k {r_o:>4} ({:.2}x)", r_cl as f64 / s_cl as f64, r_o as f64 / s_o as f64);
126    println!("  rain+legend       cl100k {rl_cl:>4} ({:.2}x)   o200k {rl_o:>4} ({:.2}x)", rl_cl as f64 / s_cl as f64, rl_o as f64 / s_o as f64);
127
128    println!("\nVERDICT");
129    println!("  Digital rain shrinks CHARACTERS (~3x — the Matrix look) but the dense glyph");
130    println!("  stream costs MORE BPE tokens than the ASCII source (ratios above), because the");
131    println!("  tokenizer splits each rare multi-byte glyph into several tokens. Adding the legend");
132    println!("  (needed for reversibility) makes a one-off snippet far worse still. Even an");
133    println!("  amortized shared codebook (stream only) does not beat source on tokens.");
134    println!("  This is the project's token-floor finding, re-confirmed on the dense-symbol idea:");
135    println!("  an LLM emits TOKENS, not glyphs/bytes — the information (names/ops/dims) is the floor.");
136    if !exact {
137        println!("\n  (heuristic run: it counts ~1 token/char and UNDER-counts CJK — the real cl100k/o200k");
138        println!("   gap is larger. Rerun with --features real-tokens for the exact, even-worse numbers.)");
139    }
140}
examples/design_tokens.rs (line 57)
18fn main() {
19    let cl = Model::OpenAiGpt4;
20    let o2 = Model::OpenAiGpt4o;
21    println!("=== Token-efficiency design levers (real cl100k + o200k BPE) ===");
22    println!(
23        "tokenizer: {}\n",
24        if cl.is_exact() { "REAL tiktoken (exact)" } else { "HEURISTIC — rerun with --features real-tokens" }
25    );
26
27    // Each task: (name, ceremony-heavy, current-ish, ab-initio).
28    let tasks: &[(&str, &str, &str, &str)] = &[
29        (
30            "word-count",
31            // A — ceremony-heavy
32            "use std::collections::HashMap;\n\nfn count_words(text: &str) -> HashMap<String, u32> {\n    let mut counts: HashMap<String, u32> = HashMap::new();\n    for word in text.split_whitespace() {\n        *counts.entry(word.to_string()).or_insert(0) += 1;\n    }\n    counts\n}",
33            // B — current-MechGen-ish (sigils, var, some inference)
34            "fn count_words(text: &str) -> {s: u32} {\n    var counts = {s: u32}.new()\n    for word in text.split() {\n        counts.entry(word).or(0) += 1\n    }\n    counts\n}",
35            // C — ab-initio (inference + layout + ambient builtins)
36            "count_words text =\n  counts = {}\n  for w in split text\n    counts[w] += 1\n  counts",
37        ),
38        (
39            "factorial",
40            "fn factorial(n: u64) -> u64 {\n    if n <= 1 {\n        return 1;\n    }\n    n * factorial(n - 1)\n}",
41            "fn factorial(n: u64) -> u64 {\n    if n <= 1 { 1 } else { n * factorial(n - 1) }\n}",
42            "fact n =\n  if n <= 1: 1\n  else: n * fact (n - 1)",
43        ),
44        (
45            "safe-divide", // returns optional/result — safety ceremony vs sigil
46            "fn safe_div(a: i32, b: i32) -> Option<i32> {\n    if b == 0 {\n        return None;\n    }\n    Some(a / b)\n}",
47            "fn safe_div(a: i32, b: i32) -> ?i32 {\n    if b == 0 { none } else { a / b }\n}",
48            "div a b =\n  if b == 0: none\n  else: a / b",
49        ),
50    ];
51
52    println!("{:<13} {:>4} {:>9} {:>8} {:>7}", "task", "form", "cl100k", "o200k", "chars");
53    let (mut a_cl, mut b_cl, mut c_cl) = (0, 0, 0);
54    let (mut a_o, mut b_o, mut c_o) = (0, 0, 0);
55    for (name, a, b, c) in tasks {
56        let row = |label: &str, s: &str| {
57            println!("{:<13} {:>4} {:>9} {:>8} {:>7}", "", label, cl.count(s), o2.count(s), s.chars().count());
58        };
59        println!("[{name}]");
60        row("A heavy", a);
61        row("B curr", b);
62        row("C abinit", c);
63        a_cl += cl.count(a); b_cl += cl.count(b); c_cl += cl.count(c);
64        a_o += o2.count(a); b_o += o2.count(b); c_o += o2.count(c);
65    }
66
67    println!("\nTOTALS (3 tasks):");
68    println!("  A ceremony-heavy   cl100k {a_cl:>3}   o200k {a_o:>3}   (baseline)");
69    println!("  B current-ish      cl100k {b_cl:>3} ({:.0}%)   o200k {b_o:>3} ({:.0}%)", 100.0 * b_cl as f64 / a_cl as f64, 100.0 * b_o as f64 / a_o as f64);
70    println!("  C ab-initio        cl100k {c_cl:>3} ({:.0}%)   o200k {c_o:>3} ({:.0}%)", 100.0 * c_cl as f64 / a_cl as f64, 100.0 * c_o as f64 / a_o as f64);
71    println!("\n  → ab-initio cuts ~{:.0}% of cl100k tokens vs ceremony-heavy by REMOVING ceremony",
72        100.0 * (1.0 - c_cl as f64 / a_cl as f64));
73    println!("    (types/mutability/return/imports inferred; layout replaces braces+`;`; terse safety");
74    println!("    sigils; ambient builtins). The remaining tokens are the irreducible payload —");
75    println!("    names/ops/literals — which no design can remove. That residue IS the token floor.");
76}

Trait Implementations§

Source§

impl Clone for Model

Source§

fn clone(&self) -> Model

Returns a duplicate of the value. Read more
1.0.0 (const: unstable) · Source§

fn clone_from(&mut self, source: &Self)

Performs copy-assignment from source. Read more
Source§

impl Copy for Model

Source§

impl Debug for Model

Source§

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Formats the value using the given formatter. Read more
Source§

impl Eq for Model

Source§

impl PartialEq for Model

Source§

fn eq(&self, other: &Model) -> bool

Tests whether self and other are equal; this method is used by the == operator.
1.0.0 (const: unstable) · Source§

fn ne(&self, other: &Rhs) -> bool

Tests whether self and other are not equal; this method is used by the != operator. The default implementation is almost always sufficient and should not be overridden without a very good reason.
Source§

impl Serialize for Model

Source§

fn serialize<__S>(&self, __serializer: __S) -> Result<__S::Ok, __S::Error>
where __S: Serializer,

Serialize this value into the given Serde serializer. Read more
Source§

impl StructuralPartialEq for Model

Auto Trait Implementations§

§

impl Freeze for Model

§

impl RefUnwindSafe for Model

§

impl Send for Model

§

impl Sync for Model

§

impl Unpin for Model

§

impl UnsafeUnpin for Model

§

impl UnwindSafe for Model

Blanket Implementations§

Source§

impl<T> Any for T
where T: 'static + ?Sized,

Source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
Source§

impl<T> Borrow<T> for T
where T: ?Sized,

Source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
Source§

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
Source§

impl<T> CloneToUninit for T
where T: Clone,

Source§

unsafe fn clone_to_uninit(&self, dest: *mut u8)

🔬 This is a nightly-only experimental API. (clone_to_uninit)
Performs copy-assignment from self to dest. Read more
Source§

impl<T> From<T> for T

Source§

fn from(t: T) -> T

Returns the argument unchanged.

Source§

impl<T, U> Into<U> for T
where U: From<T>,

Source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source§

impl<T> ToOwned for T
where T: Clone,

Source§

type Owned = T

The resulting type after obtaining ownership.
Source§

fn to_owned(&self) -> T

Creates owned data from borrowed data, usually by cloning. Read more
Source§

fn clone_into(&self, target: &mut T)

Uses borrowed data to replace owned data, usually by cloning. Read more
Source§

impl<T, U> TryFrom<U> for T
where U: Into<T>,

Source§

type Error = Infallible

The type returned in the event of a conversion error.
Source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
Source§

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

Source§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
Source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.