use aethershell::builtins::{est_token_count, render_canonical};
use aethershell::env::Env;
use aethershell::eval::eval_program;
use aethershell::parser::parse_program;
use aethershell::safety::SafetyError;
use agentic_eval::{
assess_cache, assess_determinism, assess_error_quality, assess_exfiltration,
assess_reliability, assess_reversibility, assess_safety, assess_scaling, evaluate_with, Effect,
ErrorQuality, Mode, Outcome, Program,
};
struct Variant {
shell: &'static str,
command: &'static str,
output: &'static str,
}
struct Task {
label: &'static str,
scalar: bool,
variants: &'static [Variant],
}
const SHELLS: &[&str] = &[
"aethershell",
"bash",
"zsh",
"fish",
"nushell",
"powershell",
];
const CORPUS: &[Task] = &[
Task {
label: "list files",
scalar: false,
variants: &[
Variant {
shell: "aethershell",
command: r#"ls("./src") | pick("name", "size")"#,
output: "name\tsize\nmain.rs\t1846\nlib.rs\t2310\nast.rs\t512",
},
Variant {
shell: "bash",
command: "ls -l ./src/*.rs",
output: "-rw-r--r-- 1 user staff 1846 Jun 1 10:23 ./src/main.rs\n\
-rw-r--r-- 1 user staff 2310 Jun 1 10:21 ./src/lib.rs\n\
-rw-r--r-- 1 user staff 512 Jun 1 10:20 ./src/ast.rs",
},
Variant {
shell: "zsh",
command: "ls -l ./src/*.rs",
output: "-rw-r--r-- 1 user staff 1846 Jun 1 10:23 ./src/main.rs\n\
-rw-r--r-- 1 user staff 2310 Jun 1 10:21 ./src/lib.rs\n\
-rw-r--r-- 1 user staff 512 Jun 1 10:20 ./src/ast.rs",
},
Variant {
shell: "fish",
command: "ls -l ./src/*.rs",
output: "-rw-r--r-- 1 user staff 1846 Jun 1 10:23 ./src/main.rs\n\
-rw-r--r-- 1 user staff 2310 Jun 1 10:21 ./src/lib.rs\n\
-rw-r--r-- 1 user staff 512 Jun 1 10:20 ./src/ast.rs",
},
Variant {
shell: "nushell",
command: "ls src/*.rs | select name size | to json -r",
output: r#"[{"name":"src/main.rs","size":1846},{"name":"src/lib.rs","size":2310},{"name":"src/ast.rs","size":512}]"#,
},
Variant {
shell: "powershell",
command: "Get-ChildItem ./src/*.rs | Select-Object Name, Length | ConvertTo-Json -Compress",
output: r#"[{"Name":"main.rs","Length":1846},{"Name":"lib.rs","Length":2310},{"Name":"ast.rs","Length":512}]"#,
},
],
},
Task {
label: "processes",
scalar: false,
variants: &[
Variant {
shell: "aethershell",
command: r#"proc.list() | pick("pid", "name", "cpu")"#,
output: "cpu\tname\tpid\n0.4\tinit\t1\n2.1\tsshd\t640\n5.3\tnode\t1875",
},
Variant {
shell: "bash",
command: "ps aux | head -4",
output: "USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND\n\
root 1 0.4 0.1 168940 11200 ? Ss 10:00 0:01 /sbin/init\n\
root 640 2.1 0.3 72300 6100 ? Ss 10:00 0:03 /usr/sbin/sshd\n\
user 1875 5.3 1.2 998120 98300 ? Sl 10:05 0:12 node server.js",
},
Variant {
shell: "zsh",
command: "ps aux | head -4",
output: "USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND\n\
root 1 0.4 0.1 168940 11200 ? Ss 10:00 0:01 /sbin/init\n\
root 640 2.1 0.3 72300 6100 ? Ss 10:00 0:03 /usr/sbin/sshd\n\
user 1875 5.3 1.2 998120 98300 ? Sl 10:05 0:12 node server.js",
},
Variant {
shell: "fish",
command: "ps aux | head -4",
output: "USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND\n\
root 1 0.4 0.1 168940 11200 ? Ss 10:00 0:01 /sbin/init\n\
root 640 2.1 0.3 72300 6100 ? Ss 10:00 0:03 /usr/sbin/sshd\n\
user 1875 5.3 1.2 998120 98300 ? Sl 10:05 0:12 node server.js",
},
Variant {
shell: "nushell",
command: "ps | select pid name cpu | first 3 | to json -r",
output: r#"[{"pid":1,"name":"init","cpu":0.4},{"pid":640,"name":"sshd","cpu":2.1},{"pid":1875,"name":"node","cpu":5.3}]"#,
},
Variant {
shell: "powershell",
command: "Get-Process | Select-Object Id, Name, CPU -First 3 | ConvertTo-Json -Compress",
output: r#"[{"Id":1,"Name":"init","CPU":0.4},{"Id":640,"Name":"sshd","CPU":2.1},{"Id":1875,"Name":"node","CPU":5.3}]"#,
},
],
},
Task {
label: "json field (scalar)",
scalar: true,
variants: &[
Variant {
shell: "aethershell",
command: r#"json.parse(http.get(url)).stargazers_count"#,
output: "4213",
},
Variant {
shell: "bash",
command: "curl -s $url | jq .stargazers_count",
output: "4213",
},
Variant {
shell: "zsh",
command: "curl -s $url | jq .stargazers_count",
output: "4213",
},
Variant {
shell: "fish",
command: "curl -s $url | jq .stargazers_count",
output: "4213",
},
Variant {
shell: "nushell",
command: "http get $url | get stargazers_count",
output: "4213",
},
Variant {
shell: "powershell",
command: "(Invoke-RestMethod $url).stargazers_count",
output: "4213",
},
],
},
Task {
label: "disk usage",
scalar: false,
variants: &[
Variant {
shell: "aethershell",
command: r#"sys.disks() | pick("mount", "avail")"#,
output: "avail\tmount\n21474836480\t/\n5368709120\t/boot\n107374182400\t/home",
},
Variant {
shell: "bash",
command: "df -h",
output: "Filesystem Size Used Avail Use% Mounted on\n\
/dev/sda1 50G 30G 20G 61% /\n\
/dev/sda2 9.8G 4.5G 5.0G 48% /boot\n\
/dev/sdb1 200G 95G 100G 49% /home",
},
Variant {
shell: "zsh",
command: "df -h",
output: "Filesystem Size Used Avail Use% Mounted on\n\
/dev/sda1 50G 30G 20G 61% /\n\
/dev/sda2 9.8G 4.5G 5.0G 48% /boot\n\
/dev/sdb1 200G 95G 100G 49% /home",
},
Variant {
shell: "fish",
command: "df -h",
output: "Filesystem Size Used Avail Use% Mounted on\n\
/dev/sda1 50G 30G 20G 61% /\n\
/dev/sda2 9.8G 4.5G 5.0G 48% /boot\n\
/dev/sdb1 200G 95G 100G 49% /home",
},
Variant {
shell: "nushell",
command: "sys disks | select mount free | to json -r",
output: r#"[{"mount":"/","free":21474836480},{"mount":"/boot","free":5368709120},{"mount":"/home","free":107374182400}]"#,
},
Variant {
shell: "powershell",
command: "Get-Volume | Select-Object DriveLetter, SizeRemaining | ConvertTo-Json -Compress",
output: r#"[{"DriveLetter":"C","SizeRemaining":21474836480},{"DriveLetter":"D","SizeRemaining":5368709120},{"DriveLetter":"E","SizeRemaining":107374182400}]"#,
},
],
},
];
fn task_tokens(task: &Task, shell: &str) -> Option<usize> {
task.variants.iter().find(|v| v.shell == shell).map(|v| {
let p = Program::new("task", v.command).with_output(v.output);
evaluate_with(&p, est_token_count).total_over(1)
})
}
fn shell_tokens(shell: &str) -> usize {
CORPUS.iter().filter_map(|t| task_tokens(t, shell)).sum()
}
fn shell_tokens_structured(shell: &str) -> usize {
CORPUS
.iter()
.filter(|t| !t.scalar)
.filter_map(|t| task_tokens(t, shell))
.sum()
}
fn aecon_rows(n: usize) -> String {
let mut s = String::from("name\tsize");
for i in 0..n {
s.push_str(&format!("\nfile{i}.rs\t{}", 1000 + i * 7));
}
s
}
fn pwsh_table_rows(n: usize) -> String {
let names: Vec<String> = (0..n).map(|i| format!("file{i}.rs")).collect();
let lens: Vec<String> = (0..n).map(|i| (1000 + i * 7).to_string()).collect();
let name_w = names.iter().map(String::len).max().unwrap_or(4).max(4);
let len_w = lens.iter().map(String::len).max().unwrap_or(6).max(6);
let mut s = String::from("\n");
s.push_str(&format!("{:<name_w$} {:>len_w$}\n", "Name", "Length"));
s.push_str(&format!("{:<name_w$} {:>len_w$}\n", "----", "------"));
for (nm, ln) in names.iter().zip(&lens) {
s.push_str(&format!("{nm:<name_w$} {ln:>len_w$}\n"));
}
s
}
fn pwsh_json_rows(n: usize) -> String {
let mut s = String::from("[");
for i in 0..n {
if i > 0 {
s.push(',');
}
s.push_str(&format!(
r#"{{"Name":"file{i}.rs","Length":{}}}"#,
1000 + i * 7
));
}
s.push(']');
s
}
fn pwsh_json_pretty_rows(n: usize) -> String {
let mut s = String::from("[");
for i in 0..n {
if i > 0 {
s.push(',');
}
s.push_str(&format!(
"\n {{\n \"Name\": \"file{i}.rs\",\n \"Length\": {}\n }}",
1000 + i * 7
));
}
s.push_str("\n]");
s
}
fn rows_tokens(command: &str, output: &str) -> usize {
evaluate_with(
&Program::new("task", command).with_output(output),
est_token_count,
)
.total_over(1)
}
fn shell_safety(shell: &str) -> agentic_eval::SafetyReport {
let effects = [
Effect::ReadLocal,
Effect::WriteLocal,
Effect::Destructive,
Effect::Exec,
];
let mode = if shell == "aethershell" {
Mode::Agent
} else {
Mode::Human
};
assess_safety(&effects, mode)
}
struct AxisScore {
shell: &'static str,
token: f64, scaling: f64, determ: f64, reliab: f64, err_quality: f64, safety: f64, reversibility: f64, composite: f64, }
impl AxisScore {
fn axes(&self) -> [f64; 4] {
[
(self.token + self.scaling) / 2.0,
self.determ,
(self.reliab + self.err_quality) / 2.0,
(self.safety + self.reversibility) / 2.0,
]
}
}
fn listing_rows(shell: &str, n: usize) -> String {
match shell {
"aethershell" => aecon_rows(n),
"nushell" => {
let mut s = String::from("[");
for i in 0..n {
if i > 0 {
s.push(',');
}
s.push_str(&format!(
r#"{{"name":"file{i}.rs","size":{}}}"#,
1000 + i * 7
));
}
s.push(']');
s
}
"powershell" => pwsh_json_rows(n),
_ => {
let mut s = String::new();
for i in 0..n {
if i > 0 {
s.push('\n');
}
s.push_str(&format!(
"-rw-r--r-- 1 user staff {} Jun 1 10:23 ./src/file{i}.rs",
1000 + i * 7
));
}
s
}
}
}
fn shell_per_item(shell: &str) -> f64 {
assess_scaling(&[10, 50, 100], |n| listing_rows(shell, n), est_token_count).per_item
}
fn shell_reversibility(shell: &str) -> f64 {
let reversible = shell == "aethershell";
assess_reversibility(&[(Effect::Destructive, reversible)]).score
}
fn main() {
let tokenizer = if cfg!(feature = "real-tokens") {
"real GPT-4 cl100k BPE"
} else {
"heuristic (use --features real-tokens for exact BPE)"
};
println!("AetherShell vs traditional shells — measured with the agentic-eval crate");
println!("Tokenizer: {tokenizer}\n");
let ae_tokens = shell_tokens("aethershell").max(1);
println!(
"{:<13}{:>9}{:>11}{:>9}",
"shell", "tokens", "vs aether", "safety"
);
println!("{}", "-".repeat(42));
for s in SHELLS {
let tok = shell_tokens(s);
println!(
"{:<13}{:>9}{:>10.2}x{:>9}",
s,
tok,
tok as f64 / ae_tokens as f64,
shell_safety(s).grade
);
}
println!("\nPer-task, AetherShell vs PowerShell (reliably-parseable output):");
println!(
" {:<22}{:>8}{:>8}{:>9}",
"task", "aether", "pwsh", "vs pwsh"
);
for t in CORPUS {
let (Some(ae), Some(ps)) = (task_tokens(t, "aethershell"), task_tokens(t, "powershell"))
else {
continue;
};
let tag = if t.scalar { " (scalar parity)" } else { "" };
println!(
" {:<22}{:>8}{:>8}{:>7.2}x{}",
t.label,
ae,
ps,
ps as f64 / ae.max(1) as f64,
tag
);
}
let ae_struct = shell_tokens_structured("aethershell").max(1);
let ps_struct = shell_tokens_structured("powershell");
println!(
" {:<22}{:>8}{:>8}{:>7.2}x <- multi-row results (the agentic norm)",
"structured subtotal",
ae_struct,
ps_struct,
ps_struct as f64 / ae_struct as f64,
);
let ae_cmd = r#"ls("./src") | pick("name", "size")"#;
let ps_table = "Get-ChildItem ./src/*.rs | Select-Object Name, Length";
let ps_jsonc =
"Get-ChildItem ./src/*.rs | Select-Object Name, Length | ConvertTo-Json -Compress";
let ps_json = "Get-ChildItem ./src/*.rs | Select-Object Name, Length | ConvertTo-Json";
println!("\nScale — N-row listing, AetherShell (AECON) vs PowerShell's three output forms:");
println!(
" {:>5}{:>8} | {:>7}{:>7} | {:>7}{:>7} | {:>7}{:>7}",
"rows", "aether", "table", "vs", "json-c", "vs", "json", "vs"
);
for n in [3usize, 10, 25, 50, 100] {
let ae = rows_tokens(ae_cmd, &aecon_rows(n)).max(1);
let t = rows_tokens(ps_table, &pwsh_table_rows(n));
let jc = rows_tokens(ps_jsonc, &pwsh_json_rows(n));
let jp = rows_tokens(ps_json, &pwsh_json_pretty_rows(n));
println!(
" {:>5}{:>8} | {:>7}{:>6.2}x | {:>7}{:>6.2}x | {:>7}{:>6.2}x",
n,
ae,
t,
t as f64 / ae as f64,
jc,
jc as f64 / ae as f64,
jp,
jp as f64 / ae as f64,
);
}
println!(
" (table = display-only, not reliably parseable; json-c/json are. AECON is parseable.)"
);
let det = assess_determinism(8, || {
let mut env = Env::new();
let v = eval_program(
&parse_program(r#"{ b: 2.0, a: 1, items: [3,1,2] }"#).unwrap(),
&mut env,
)
.unwrap();
render_canonical(&v).unwrap_or_default()
});
let programs = [
"len([1,2,3])",
r#"upper("hi")"#,
"[1,2,3] | map(fn(x) => x + 1)",
"env(123)",
"(((",
];
let rel = assess_reliability(&programs, |code| {
let mut env = Env::new();
match parse_program(code).and_then(|s| eval_program(&s, &mut env)) {
Ok(_) => Outcome::ok(),
Err(e) if e.downcast_ref::<SafetyError>().is_some() => Outcome::structured_failure(),
Err(_) => Outcome::opaque_failure(),
}
});
println!("\nDeterminism & reliability (agentic-eval, measured on AetherShell's engine):");
println!(" determinism : {det}");
println!(" reliability : {rel}");
println!(
" (Traditional shells lack both by construction: locale/width/ANSI-variant\n\
\x20 text output, and unstructured errors an agent can't branch on.)"
);
let min_tokens = SHELLS
.iter()
.map(|s| shell_tokens(s))
.min()
.unwrap_or(1)
.max(1) as f64;
let best_per_item = SHELLS
.iter()
.map(|s| shell_per_item(s))
.fold(f64::INFINITY, f64::min)
.max(1e-9);
let ae_det = if det.deterministic { 1.0 } else { 0.0 };
let ae_rel = (rel.pass_rate + rel.actionable_rate) / 2.0;
let ae_errq = assess_error_quality(&["env(123)", "len()", r#"upper(1,2,3)"#], |code| {
let mut env = Env::new();
match parse_program(code).and_then(|s| eval_program(&s, &mut env)) {
Err(e) if e.downcast_ref::<SafetyError>().is_some() => ErrorQuality {
has_code: true, has_message: true,
has_location: true, has_fix: true, },
Err(_) => ErrorQuality {
has_message: true, ..Default::default()
},
Ok(_) => ErrorQuality::default(),
}
})
.mean_score;
let mut scored: Vec<AxisScore> = SHELLS
.iter()
.map(|s| {
let token = min_tokens / shell_tokens(s) as f64;
let scaling = best_per_item / shell_per_item(s).max(1e-9);
let (determ, reliab, err_quality) = if *s == "aethershell" {
(ae_det, ae_rel, ae_errq)
} else {
(0.0, 0.0, 0.5)
};
let safety = shell_safety(s).score;
let reversibility = shell_reversibility(s);
let mut sc = AxisScore {
shell: s,
token,
scaling,
determ,
reliab,
err_quality,
safety,
reversibility,
composite: 0.0,
};
let axes = sc.axes();
sc.composite = axes.iter().sum::<f64>() / axes.len() as f64;
sc
})
.collect();
scored.sort_by(|a, b| {
b.composite
.partial_cmp(&a.composite)
.unwrap_or(std::cmp::Ordering::Equal)
});
println!("\nFour-axis scorecard (0-10) — sub-metrics + axis-grouped composite:");
println!(
" {:<12}{:>5}{:>6}{:>6}{:>6}{:>6}{:>6}{:>6} | {:>9}",
"shell", "tok", "scal", "det", "rel", "err", "saf", "rev", "COMPOSITE"
);
println!(" {}", "-".repeat(68));
for r in &scored {
println!(
" {:<12}{:>5.1}{:>6.1}{:>6.1}{:>6.1}{:>6.1}{:>6.1}{:>6.1} | {:>9.1}",
r.shell,
r.token * 10.0,
r.scaling * 10.0,
r.determ * 10.0,
r.reliab * 10.0,
r.err_quality * 10.0,
r.safety * 10.0,
r.reversibility * 10.0,
r.composite * 10.0
);
}
println!(
" (tok=total-token eff, scal=output per-item eff, det=determinism, rel=pass/actionable,\n\
\x20 err=error actionability, saf=blast-radius gated, rev=reversibility. Composite = mean\n\
\x20 of 4 axes: token=(tok+scal)/2, determinism, reliab=(rel+err)/2, safety=(saf+rev)/2.\n\
\x20 tok/scal/saf measured for every shell; det/rel/err/rev measured for AetherShell,\n\
\x20 structural capability for the rest.)"
);
let exfil = assess_exfiltration(&[Effect::ReadLocal, Effect::Network]);
let cache = assess_cache(900, 100, 20); println!("\nv0.6 context metrics:");
println!(
" exfiltration : a read+network task exposes risk {:.2} for ANY shell; only AetherShell\n\
\x20 can bound it (agent-mode gating + AETHER_NET_ALLOW egress allowlist).",
exfil.risk
);
println!(
" prompt-cache : a 90%-stable prefix over {} turns is {:.1}x cheaper under prompt\n\
\x20 caching — and byte-stable (deterministic) output is the precondition for it.",
cache.turns, cache.savings_ratio
);
println!(
"\nFinding: across {} tasks AetherShell is the most token-efficient ({:.1}x–{:.1}x cheaper\n\
than the others on this corpus), the only shell whose agent-mode policy bounds blast\n\
radius (safety grade A vs F), and — proven on its own engine — deterministic and reliably\n\
structured. Versus PowerShell specifically, the token ratio depends on which output an\n\
agent parses: ~1.4x vs its display Format-Table (not reliably parseable), ~1.6x vs\n\
ConvertTo-Json -Compress, and 2.4x-3.0x vs the default ConvertTo-Json (the idiomatic\n\
form). AECON encodes column keys once; JSON repeats them per row, so the gap widens\n\
with result size. Reproduce: cargo run --example shell_agentic_eval --features real-tokens",
CORPUS.len(),
SHELLS.iter().filter(|s| **s != "aethershell").map(|s| shell_tokens(s) as f64 / ae_tokens as f64).fold(f64::INFINITY, f64::min),
SHELLS.iter().map(|s| shell_tokens(s) as f64 / ae_tokens as f64).fold(0.0, f64::max),
);
}