pub fn compare_languages(a: Language, b: Language) -> LanguageComparison
Compare language `a` against baseline `b` across all four axes.
Examples found in repository?
examples/swe_languages.rs (line 80)
25fn main() {
26 println!("=== Agentic-SWE language comparison ===\n");
27
28 // ── 1. Canonical agentic fitness (unweighted mean) ────────────────────────
29 println!("CANONICAL agentic fitness (unweighted mean of the four axes)");
30 println!(
31 "{:<12} {:>7} {:>6} {:>6} {:>6} {:>6}",
32 "language", "fitness", "token", "determ", "reliab", "safety"
33 );
34 for p in rank_languages() {
35 let tag = match p.language {
36 Language::Ideal => " ← design-target ceiling (not a real language)",
37 Language::MechGen => " ← this project's language (bias-audited)",
38 _ => "",
39 };
40 println!(
41 "{:<12} {:>7.3} {:>6.2} {:>6.2} {:>6.2} {:>6.2}{}",
42 p.language.name(),
43 p.fitness(),
44 p.token_efficiency,
45 p.determinism,
46 p.reliability,
47 p.safety,
48 tag,
49 );
50 }
51
52 // ── 2. SWE-weighted ranking ───────────────────────────────────────────────
53 println!("\nSWE-WEIGHTED ranking (reliability .35, determinism .30, safety .20, token .15)");
54 let mut ranked: Vec<LanguageProfile> = Language::all().iter().map(|&l| profile(l)).collect();
55 ranked.sort_by(|a, b| swe_score(b).partial_cmp(&swe_score(a)).unwrap());
56 println!("{:<12} {:>9} vs canonical", "language", "swe-score");
57 for p in &ranked {
58 let delta = swe_score(p) - p.fitness();
59 let note = if p.language == Language::Ideal {
60 " (design target)"
61 } else {
62 ""
63 };
64 println!(
65 "{:<12} {:>9.3} {:+.3}{}",
66 p.language.name(),
67 swe_score(p),
68 delta,
69 note,
70 );
71 }
72 println!(
73 " (the SWE lens lifts strongly-typed/reproducible languages — Rust, MechGen, Go —\n \
74 and demotes terse-but-unsafe ones — Python, Bash — vs the unweighted mean.)"
75 );
76
77 // ── 3. Head-to-head: MechGen vs the popular real languages ─────────────────
78 println!("\nHEAD-TO-HEAD (positive = MechGen fits agentic SWE better)");
79 for other in [Language::Rust, Language::Python, Language::Go, Language::TypeScript] {
80 let c = compare_languages(Language::MechGen, other);
81 print!("{c}");
82 }
83
84 // ── 4. Reading + honesty ──────────────────────────────────────────────────
85 let mg = profile(Language::MechGen);
86 let rust = profile(Language::Rust);
87 let py = profile(Language::Python);
88 println!("READING");
89 println!(
90 " Among IMPLEMENTED languages MechGen ranks #1 ({:.3}); only the `ideal` DESIGN TARGET\n \
91 ({:.3}, token-floored, unreachable for any text language) sits above it.",
92 mg.fitness(),
93 profile(Language::Ideal).fitness()
94 );
95 println!(
96 " Under the SWE weighting it stays #1 among real languages: its reliability ({:.2}) and\n \
97 determinism ({:.2}) — sound effects, exhaustiveness, machine-readable fixes, byte-stable IR —\n \
98 are exactly what the build→test→debug loop rewards.",
99 mg.reliability, mg.determinism
100 );
101 println!("\nHONESTY (this is the project's own language — bias is the risk):");
102 println!(
103 " • Scores move on EVIDENCE, both ways: token was corrected DOWN 0.73→0.60 (a C/Go\n \
104 head-to-head exposed the old surface as MORE verbose), then RAISED 0.60→0.80 after the\n \
105 ab-initio migration LANDED type inference + `;`-removal (1166 tests green) and measured\n \
106 #1 of six on the real-BPE swe_token_benchmark. Composite was also corrected 0.95→0.865.");
107 println!(
108 " • Falsifiable guards still hold: token ({:.2}) ≤ Python ({:.2}) — measured tersest but\n \
109 scored at parity, not above; reliability ({:.2}) ≤ Rust ({:.2}); no axis ≥ 0.98 (prototype).",
110 mg.token_efficiency, py.token_efficiency, mg.reliability, rust.reliability
111 );
112 println!(
113 " • The advantage is real on the axes SWE cares about (reliability/determinism/safety),\n \
114 NOT on tokens — and it is a young prototype vs battle-tested Rust on correctness maturity."
115 );
116}