use agentic_eval::languages::{compare_languages, profile, rank_languages, Language, LanguageProfile};
fn swe_score(p: &LanguageProfile) -> f64 {
0.35 * p.reliability + 0.30 * p.determinism + 0.20 * p.safety + 0.15 * p.token_efficiency
}
fn main() {
println!("=== Agentic-SWE language comparison ===\n");
println!("CANONICAL agentic fitness (unweighted mean of the four axes)");
println!(
"{:<12} {:>7} {:>6} {:>6} {:>6} {:>6}",
"language", "fitness", "token", "determ", "reliab", "safety"
);
for p in rank_languages() {
let tag = match p.language {
Language::Ideal => " ← design-target ceiling (not a real language)",
Language::MechGen => " ← this project's language (bias-audited)",
_ => "",
};
println!(
"{:<12} {:>7.3} {:>6.2} {:>6.2} {:>6.2} {:>6.2}{}",
p.language.name(),
p.fitness(),
p.token_efficiency,
p.determinism,
p.reliability,
p.safety,
tag,
);
}
println!("\nSWE-WEIGHTED ranking (reliability .35, determinism .30, safety .20, token .15)");
let mut ranked: Vec<LanguageProfile> = Language::all().iter().map(|&l| profile(l)).collect();
ranked.sort_by(|a, b| swe_score(b).partial_cmp(&swe_score(a)).unwrap());
println!("{:<12} {:>9} vs canonical", "language", "swe-score");
for p in &ranked {
let delta = swe_score(p) - p.fitness();
let note = if p.language == Language::Ideal {
" (design target)"
} else {
""
};
println!(
"{:<12} {:>9.3} {:+.3}{}",
p.language.name(),
swe_score(p),
delta,
note,
);
}
println!(
" (the SWE lens lifts strongly-typed/reproducible languages — Rust, MechGen, Go —\n \
and demotes terse-but-unsafe ones — Python, Bash — vs the unweighted mean.)"
);
println!("\nHEAD-TO-HEAD (positive = MechGen fits agentic SWE better)");
for other in [Language::Rust, Language::Python, Language::Go, Language::TypeScript] {
let c = compare_languages(Language::MechGen, other);
print!("{c}");
}
let mg = profile(Language::MechGen);
let rust = profile(Language::Rust);
let py = profile(Language::Python);
println!("READING");
println!(
" Among IMPLEMENTED languages MechGen ranks #1 ({:.3}); only the `ideal` DESIGN TARGET\n \
({:.3}, token-floored, unreachable for any text language) sits above it.",
mg.fitness(),
profile(Language::Ideal).fitness()
);
println!(
" Under the SWE weighting it stays #1 among real languages: its reliability ({:.2}) and\n \
determinism ({:.2}) — sound effects, exhaustiveness, machine-readable fixes, byte-stable IR —\n \
are exactly what the build→test→debug loop rewards.",
mg.reliability, mg.determinism
);
println!("\nHONESTY (this is the project's own language — bias is the risk):");
println!(
" • Scores move on EVIDENCE, both ways: token was corrected DOWN 0.73→0.60 (a C/Go\n \
head-to-head exposed the old surface as MORE verbose), then RAISED 0.60→0.80 after the\n \
ab-initio migration LANDED type inference + `;`-removal (1166 tests green) and measured\n \
#1 of six on the real-BPE swe_token_benchmark. Composite was also corrected 0.95→0.865.");
println!(
" • Falsifiable guards still hold: token ({:.2}) ≤ Python ({:.2}) — measured tersest but\n \
scored at parity, not above; reliability ({:.2}) ≤ Rust ({:.2}); no axis ≥ 0.98 (prototype).",
mg.token_efficiency, py.token_efficiency, mg.reliability, rust.reliability
);
println!(
" • The advantage is real on the axes SWE cares about (reliability/determinism/safety),\n \
NOT on tokens — and it is a young prototype vs battle-tested Rust on correctness maturity."
);
}