swe_token_benchmark/
swe_token_benchmark.rs

1//! Extensive agentic-SWE **token** benchmark: the same three tasks written
2//! idiomatically in six languages, measured with the REAL cl100k + o200k BPE.
3//! MechGen uses its landed ab-initio surface (inferred signatures, layout, no
4//! `;`) — every MechGen snippet here is accepted by the compiler (`--check`).
5//!
6//! This is the *objective* token axis (the other three agentic axes —
7//! determinism, reliability, safety — are in `swe_lang_profiles`). The token
8//! floor is the payload; this measures how close each language's surface gets to
9//! it on real SWE micro-tasks.
10//!
11//!   cargo run -p agentic-eval --example swe_token_benchmark --features real-tokens
12
13use agentic_eval::tokens::Model;
14
15fn main() {
16    let cl = Model::OpenAiGpt4;
17    let o2 = Model::OpenAiGpt4o;
18    println!("=== Agentic-SWE token benchmark — 6 languages × 3 tasks (real BPE) ===");
19    println!(
20        "tokenizer: {}\n",
21        if cl.is_exact() { "REAL tiktoken (exact)" } else { "HEURISTIC — rerun with --features real-tokens" }
22    );
23
24    // [language] = [factorial, sum-loop, point+dist2]
25    let langs: &[(&str, [&str; 3])] = &[
26        (
27            "MechGen",
28            [
29                "f factorial(n)\n  if n <= 1\n    1\n  else\n    n * factorial(n - 1)",
30                "f sum(xs)\n  var t = 0\n  for x in xs\n    t = t + x\n  t",
31                "S Point { x: f64, y: f64 }\nf dist2(p: Point)\n  p.x * p.x + p.y * p.y",
32            ],
33        ),
34        (
35            "Python",
36            [
37                "def factorial(n):\n    return 1 if n <= 1 else n * factorial(n - 1)",
38                "def sum_list(xs):\n    t = 0\n    for x in xs:\n        t += x\n    return t",
39                "from dataclasses import dataclass\n@dataclass\nclass Point:\n    x: float\n    y: float\ndef dist2(p):\n    return p.x * p.x + p.y * p.y",
40            ],
41        ),
42        (
43            "Rust",
44            [
45                "fn factorial(n: u64) -> u64 {\n    if n <= 1 { 1 } else { n * factorial(n - 1) }\n}",
46                "fn sum_list(xs: &[i64]) -> i64 {\n    let mut t = 0;\n    for x in xs {\n        t += x;\n    }\n    t\n}",
47                "struct Point {\n    x: f64,\n    y: f64,\n}\nfn dist2(p: &Point) -> f64 {\n    p.x * p.x + p.y * p.y\n}",
48            ],
49        ),
50        (
51            "Go",
52            [
53                "func factorial(n int) int {\n\tif n <= 1 {\n\t\treturn 1\n\t}\n\treturn n * factorial(n-1)\n}",
54                "func sumList(xs []int) int {\n\tt := 0\n\tfor _, x := range xs {\n\t\tt += x\n\t}\n\treturn t\n}",
55                "type Point struct {\n\tX, Y float64\n}\nfunc dist2(p Point) float64 {\n\treturn p.X*p.X + p.Y*p.Y\n}",
56            ],
57        ),
58        (
59            "TypeScript",
60            [
61                "function factorial(n: number): number {\n  return n <= 1 ? 1 : n * factorial(n - 1);\n}",
62                "function sumList(xs: number[]): number {\n  let t = 0;\n  for (const x of xs) {\n    t += x;\n  }\n  return t;\n}",
63                "interface Point {\n  x: number;\n  y: number;\n}\nfunction dist2(p: Point): number {\n  return p.x * p.x + p.y * p.y;\n}",
64            ],
65        ),
66        (
67            "Java",
68            [
69                "static long factorial(long n) {\n    return n <= 1 ? 1 : n * factorial(n - 1);\n}",
70                "static long sumList(long[] xs) {\n    long t = 0;\n    for (long x : xs) {\n        t += x;\n    }\n    return t;\n}",
71                "record Point(double x, double y) {}\nstatic double dist2(Point p) {\n    return p.x() * p.x() + p.y() * p.y();\n}",
72            ],
73        ),
74    ];
75
76    println!("{:<12} {:>9} {:>9} {:>9} {:>9}", "language", "factori", "sum", "point", "TOTAL cl");
77    let mut totals: Vec<(&str, usize, usize)> = Vec::new();
78    for (name, progs) in langs {
79        let c: Vec<usize> = progs.iter().map(|p| cl.count(p)).collect();
80        let o: usize = progs.iter().map(|p| o2.count(p)).sum();
81        let tot: usize = c.iter().sum();
82        println!("{name:<12} {:>9} {:>9} {:>9} {:>9}", c[0], c[1], c[2], tot);
83        totals.push((name, tot, o));
84    }
85
86    println!("\nRANK by total cl100k tokens (lower = terser):");
87    totals.sort_by_key(|t| t.1);
88    let best = totals[0].1 as f64;
89    let mg = totals.iter().find(|t| t.0 == "MechGen").unwrap().1;
90    for (i, (name, tot, o)) in totals.iter().enumerate() {
91        let mark = if *name == "MechGen" { "  ← landed ab-initio surface" } else { "" };
92        println!("  {}. {name:<11} {tot:>3} cl100k  {o:>3} o200k  ({:.2}x){mark}", i + 1, *tot as f64 / best);
93    }
94
95    println!("\nREADING");
96    let py = totals.iter().find(|t| t.0 == "Python").unwrap().1;
97    println!("  MechGen total {mg} cl100k vs Python {py}, Rust {}, Go {}, TS {}, Java {}.",
98        totals.iter().find(|t| t.0 == "Rust").unwrap().1,
99        totals.iter().find(|t| t.0 == "Go").unwrap().1,
100        totals.iter().find(|t| t.0 == "TypeScript").unwrap().1,
101        totals.iter().find(|t| t.0 == "Java").unwrap().1);
102    println!("  Every MechGen snippet compiles (--check). The terseness is from inference +");
103    println!("  `;`-removal (real, landed), NOT layout (token-neutral) — names/ops/literals are");
104    println!("  the irreducible payload floor that bounds all of them.");
105}
swe_token_benchmark/swe_token_benchmark.rs

swe_token_benchmark/
swe_token_benchmark.rs