1use agentic_eval::determinism::assess_determinism;
8use agentic_eval::reliability::{assess_reliability, Outcome};
9use agentic_eval::safety::{assess_safety, Effect, Mode};
10use agentic_eval::tokens::{compare, Model, Program};
11
12fn main() {
13 println!("agentic-eval — four-axis evaluation of programs for agentic AI\n");
14
15 let legible = Program::new(
17 "legible",
18 r#"ls("./src") | where(fn(f) => f.size > 1000) | map(fn(f) => f.name)"#,
19 )
20 .with_standing_context("ls/where/map are standard, high-probability names")
21 .with_output("name\nfoo.rs\nbar.rs");
22 let cipher = Program::new("cipher", r#"l./src|w~.size>1k|m~.name"#)
23 .with_standing_context("single-letter+sigil cheatsheet line; ".repeat(120))
24 .with_output("name\nfoo.rs\nbar.rs")
25 .with_retries(8); println!("[1] Token efficiency (amortized over 30 turns):");
29 for model in [
30 Model::OpenAiGpt4,
31 Model::OpenAiGpt4o,
32 Model::AnthropicClaude,
33 ] {
34 let cmp = compare(&legible, &cipher, model, 30);
35 println!(
36 " {:<28} legible={:>6} cipher={:>6} → {} wins ({:.2}x){}",
37 model.name(),
38 cmp.a_total,
39 cmp.b_total,
40 if cmp.winner_is_a { "legible" } else { "cipher" },
41 cmp.ratio,
42 if model.is_exact() { "" } else { " [est]" },
43 );
44 }
45
46 let canonical = assess_determinism(5, || "name\nfoo.rs\nbar.rs".to_string());
49 let mut t = 0u64;
50 let noisy = assess_determinism(5, || {
51 t += 1;
52 format!("name\nfoo.rs\nbar.rs # at {t}")
53 });
54 println!("\n[2] Determinism:");
55 println!(
56 " canonical output : deterministic={} ({} distinct / {} runs)",
57 canonical.deterministic, canonical.distinct, canonical.runs
58 );
59 println!(
60 " timestamped output: deterministic={} ({} distinct / {} runs)",
61 noisy.deterministic, noisy.distinct, noisy.runs
62 );
63
64 let samples = [0, 1, 2, 3, 4, 5];
68 let legible_rel = assess_reliability(&samples, |_| Outcome::ok());
69 let cipher_rel = assess_reliability(&samples, |&i| match i {
70 4 => Outcome::structured_failure(),
71 5 => Outcome::opaque_failure(),
72 _ => Outcome::ok(),
73 });
74 println!("\n[3] Reliability:");
75 println!(
76 " legible: pass {:.0}% actionable {:.0}%",
77 legible_rel.pass_rate * 100.0,
78 legible_rel.actionable_rate * 100.0
79 );
80 println!(
81 " cipher : pass {:.0}% actionable {:.0}%",
82 cipher_rel.pass_rate * 100.0,
83 cipher_rel.actionable_rate * 100.0
84 );
85
86 let read_only = assess_safety(&[Effect::ReadLocal, Effect::WriteLocal], Mode::Agent);
90 let destructive = assess_safety(
91 &[Effect::ReadLocal, Effect::Destructive, Effect::Exec],
92 Mode::Agent,
93 );
94 println!("\n[4] Safety (agent policy):");
95 println!(
96 " read+write task : grade {} (bounded={}, {} approval-gated)",
97 read_only.grade, read_only.bounded, read_only.approval_gated
98 );
99 println!(
100 " rm+exec task : grade {} (bounded={}, {} approval-gated, {} denied)",
101 destructive.grade, destructive.bounded, destructive.approval_gated, destructive.denied
102 );
103
104 println!("\nSummary: the legible form is competitive-or-cheaper on tokens once standing");
105 println!("context counts, more deterministic and reliable to parse, and the agent policy");
106 println!("bounds the blast radius of even the destructive variant.");
107}