evaluate/
evaluate.rs

1//! Demonstrate agentic-eval across all four axes on two encodings of the same
2//! task: a legible form vs. a terse "cipher" form.
3//!
4//! Run: `cargo run -p agentic-eval --example evaluate`
5//!  or: `cargo run -p agentic-eval --example evaluate --features real-tokens`
6
7use agentic_eval::determinism::assess_determinism;
8use agentic_eval::reliability::{assess_reliability, Outcome};
9use agentic_eval::safety::{assess_safety, Effect, Mode};
10use agentic_eval::tokens::{compare, Model, Program};
11
12fn main() {
13    println!("agentic-eval — four-axis evaluation of programs for agentic AI\n");
14
15    // Two encodings of "read a file and keep the large entries".
16    let legible = Program::new(
17        "legible",
18        r#"ls("./src") | where(fn(f) => f.size > 1000) | map(fn(f) => f.name)"#,
19    )
20    .with_standing_context("ls/where/map are standard, high-probability names")
21    .with_output("name\nfoo.rs\nbar.rs");
22    let cipher = Program::new("cipher", r#"l./src|w~.size>1k|m~.name"#)
23        .with_standing_context("single-letter+sigil cheatsheet line; ".repeat(120))
24        .with_output("name\nfoo.rs\nbar.rs")
25        .with_retries(8); // terse cipher is mis-emitted more often
26
27    // ── 1. Token efficiency ──────────────────────────────────────────────
28    println!("[1] Token efficiency (amortized over 30 turns):");
29    for model in [
30        Model::OpenAiGpt4,
31        Model::OpenAiGpt4o,
32        Model::AnthropicClaude,
33    ] {
34        let cmp = compare(&legible, &cipher, model, 30);
35        println!(
36            "  {:<28} legible={:>6}  cipher={:>6}  → {} wins ({:.2}x){}",
37            model.name(),
38            cmp.a_total,
39            cmp.b_total,
40            if cmp.winner_is_a { "legible" } else { "cipher" },
41            cmp.ratio,
42            if model.is_exact() { "" } else { " [est]" },
43        );
44    }
45
46    // ── 2. Determinism ───────────────────────────────────────────────────
47    // A canonical renderer (byte-stable) vs. one that embeds a timestamp.
48    let canonical = assess_determinism(5, || "name\nfoo.rs\nbar.rs".to_string());
49    let mut t = 0u64;
50    let noisy = assess_determinism(5, || {
51        t += 1;
52        format!("name\nfoo.rs\nbar.rs  # at {t}")
53    });
54    println!("\n[2] Determinism:");
55    println!(
56        "  canonical output : deterministic={} ({} distinct / {} runs)",
57        canonical.deterministic, canonical.distinct, canonical.runs
58    );
59    println!(
60        "  timestamped output: deterministic={} ({} distinct / {} runs)",
61        noisy.deterministic, noisy.distinct, noisy.runs
62    );
63
64    // ── 3. Reliability ───────────────────────────────────────────────────
65    // The legible form parses on all 6 sample invocations; the cipher mis-parses
66    // twice but at least returns a structured error once.
67    let samples = [0, 1, 2, 3, 4, 5];
68    let legible_rel = assess_reliability(&samples, |_| Outcome::ok());
69    let cipher_rel = assess_reliability(&samples, |&i| match i {
70        4 => Outcome::structured_failure(),
71        5 => Outcome::opaque_failure(),
72        _ => Outcome::ok(),
73    });
74    println!("\n[3] Reliability:");
75    println!(
76        "  legible: pass {:.0}%  actionable {:.0}%",
77        legible_rel.pass_rate * 100.0,
78        legible_rel.actionable_rate * 100.0
79    );
80    println!(
81        "  cipher : pass {:.0}%  actionable {:.0}%",
82        cipher_rel.pass_rate * 100.0,
83        cipher_rel.actionable_rate * 100.0
84    );
85
86    // ── 4. Safety ────────────────────────────────────────────────────────
87    // The task reads + lists (ReadLocal); a "rm the small files" variant adds a
88    // Destructive effect. Score the gating under the agent policy.
89    let read_only = assess_safety(&[Effect::ReadLocal, Effect::WriteLocal], Mode::Agent);
90    let destructive = assess_safety(
91        &[Effect::ReadLocal, Effect::Destructive, Effect::Exec],
92        Mode::Agent,
93    );
94    println!("\n[4] Safety (agent policy):");
95    println!(
96        "  read+write task : grade {} (bounded={}, {} approval-gated)",
97        read_only.grade, read_only.bounded, read_only.approval_gated
98    );
99    println!(
100        "  rm+exec task    : grade {} (bounded={}, {} approval-gated, {} denied)",
101        destructive.grade, destructive.bounded, destructive.approval_gated, destructive.denied
102    );
103
104    println!("\nSummary: the legible form is competitive-or-cheaper on tokens once standing");
105    println!("context counts, more deterministic and reliable to parse, and the agent policy");
106    println!("bounds the blast radius of even the destructive variant.");
107}
evaluate/evaluate.rs

evaluate/
evaluate.rs