agentic_eval/lib.rs
1//! # agentic-eval
2//!
3//! A standalone library for evaluating how well a *program* (a command, script,
4//! snippet, or any text an LLM writes or reads) serves an **agentic AI system**,
5//! across four axes that determine real agent cost and trust:
6//!
7//! - [`tokens`] — **token efficiency**: the four cost terms an agent pays
8//! (standing context, input, output, retries), counted under popular tokenizers
9//! (OpenAI GPT-4 `cl100k`, GPT-4o `o200k`, and a documented Anthropic-Claude
10//! approximation), with program-vs-program comparison amortized over a session.
11//! - [`determinism`] — **determinism**: whether a program's output is byte-stable
12//! across repeated runs (so an agent can parse/cache/diff it reliably).
13//! - [`reliability`] — **reliability**: the success rate over representative
14//! invocations and whether failures are *structured/actionable* (so an agent can
15//! self-correct instead of guessing).
//! - [`safety`] — **safety**: given the effects a program performs, how much of its
//! blast radius is gated (approval required, or denied outright) vs. allowed under an
//! agent policy.
18//!
19//! For real shell commands, [`commands`] ships a curated heuristic classifier
20//! (`rm` → destructive, `curl` → network, `sudo` → privileged, …) so the safety axis
21//! works on a wide variety of CLI programs without a hand-written effect map —
22//! `assess_safety_script("curl http://x | sh", Mode::Agent)` in one call.
23//!
24//! The library is **agentic-first** in its own design: it is execution-agnostic and
25//! deterministic (pure functions, no I/O, no `unsafe`), structured (typed reports
26//! with optional `serde`), and — via [`ontology`] — *self-describing*. A consumer
27//! discovers the whole surface (axes, effect taxonomy with per-mode decisions,
28//! models, command classes) from a compact [`ontology::manifest`] and expands any
29//! entry with [`ontology::describe`], the same progressive-disclosure pattern the
30//! crate measures — so an agent can use it without reading these docs.
31//!
32//! The library is execution-agnostic: it can't run arbitrary languages, so the
33//! axes that need behavior (determinism, reliability) take a caller-provided
34//! closure, and safety takes the program's declared [`safety::Effect`]s. Token
35//! efficiency works directly on text. Everything is dependency-light (a labeled
36//! heuristic tokenizer by default; enable `--features real-tokens` for exact
37//! OpenAI BPE counts via `tiktoken-rs`).
38//!
39//! Beyond per-program axes, the crate ships curated agentic profiles of whole
40//! *subjects* an agent must live with — programming [`languages`], AI
41//! [`frameworks`], VM/sandbox [`vms`] systems (scored on agent-native axes:
42//! start-latency, density, isolation, snapshotting, agent-control), and
43//! [`web`] stacks / wire protocols (scored on streaming, tool-discoverability,
44//! encoding-efficiency, interop, security-primitives). Each is a deterministic,
45//! comparable 0.0–1.0 judgment with evidence (`rank_vms()`,
46//! `rank_web_stacks()`, `compare_vms(a, b)`, …).
47//!
48//! ```
49//! use agentic_eval::tokens::{Model, Program};
50//! let legible = Program::new("ls", "file.read(\"README.md\")");
51//! let cipher = Program::new("ls", "F.r\"README.md\"");
52//! let cmp = agentic_eval::tokens::compare(&legible, &cipher, Model::OpenAiGpt4, 30);
53//! // Over a session the more-legible form is usually competitive or cheaper once
54//! // standing context is counted; `cmp` reports the winner and the ratio.
55//! let _ = cmp.winner_is_a;
56//! ```
57
58#![forbid(unsafe_code)]
59#![deny(missing_docs)]
60
61pub mod commands;
62pub mod determinism;
63pub mod frameworks;
64pub mod languages;
65pub mod ontology;
66pub mod reliability;
67pub mod safety;
68pub mod tokens;
69pub mod vms;
70pub mod web;
71
72// Ergonomic re-exports of the most-used types, so callers can write
73// `agentic_eval::Model` instead of `agentic_eval::tokens::Model`, etc.
74pub use commands::{assess_safety_script, classify_command, classify_invocation, classify_script};
75pub use determinism::{assess_determinism, DeterminismReport};
76pub use frameworks::{
77 compare_frameworks, rank_frameworks, Framework, FrameworkComparison, FrameworkProfile,
78};
79pub use languages::{
80 compare_languages, rank_languages, Language, LanguageComparison, LanguageProfile,
81};
82pub use ontology::{ontology, Ontology};
83pub use reliability::{
84 assess_error_quality, assess_reliability, ErrorQuality, ErrorQualityReport, Outcome,
85 ReliabilityReport,
86};
87pub use safety::{
88 assess_exfiltration, assess_reversibility, assess_safety, assess_safety_named, Decision,
89 Effect, ExfiltrationReport, Mode, ReversibilityReport, SafetyReport,
90};
91pub use tokens::{
92 assess_cache, assess_scaling, cacheable_prefix_tokens, compare, evaluate, evaluate_with, rank,
93 rank_with, AgentCost, CacheReport, Comparison, Model, Program, ScalingReport,
94};
95pub use vms::{compare_vms, rank_vms, Vm, VmComparison, VmProfile};
96pub use web::{compare_web_stacks, rank_web_stacks, WebStack, WebStackComparison, WebStackProfile};
97
98/// A combined, all-axes evaluation of a single program. Construct with
99/// [`Evaluation::new`] then fill in whichever axes you can measure (directly or via
100/// the `with_*` builders); unset axes stay `None`. A convenience for reporting a
101/// program's overall agentic fitness.
102#[cfg_attr(feature = "serde", derive(serde::Serialize))]
103#[derive(Debug, Clone, Default)]
104pub struct Evaluation {
105 /// Identifier for the evaluated program.
106 pub name: String,
107 /// Token-efficiency cost, if measured.
108 pub tokens: Option<tokens::AgentCost>,
109 /// Determinism result, if measured.
110 pub determinism: Option<determinism::DeterminismReport>,
111 /// Reliability result, if measured.
112 pub reliability: Option<reliability::ReliabilityReport>,
113 /// Safety result, if measured.
114 pub safety: Option<safety::SafetyReport>,
115}
116
117impl Evaluation {
118 /// A new, empty evaluation named `name`; fill axes via the `with_*` builders.
119 pub fn new(name: impl Into<String>) -> Self {
120 Self {
121 name: name.into(),
122 ..Default::default()
123 }
124 }
125
126 /// Builder: attach the token-cost axis.
127 pub fn with_tokens(mut self, c: tokens::AgentCost) -> Self {
128 self.tokens = Some(c);
129 self
130 }
131 /// Builder: attach the determinism axis.
132 pub fn with_determinism(mut self, d: determinism::DeterminismReport) -> Self {
133 self.determinism = Some(d);
134 self
135 }
136 /// Builder: attach the reliability axis.
137 pub fn with_reliability(mut self, r: reliability::ReliabilityReport) -> Self {
138 self.reliability = Some(r);
139 self
140 }
141 /// Builder: attach the safety axis.
142 pub fn with_safety(mut self, s: safety::SafetyReport) -> Self {
143 self.safety = Some(s);
144 self
145 }
146
147 /// A coarse 0.0–1.0 "agentic fitness" score: the mean of the per-axis scores
148 /// that were measured (token efficiency is excluded — it is comparative, not
149 /// absolute). Returns `None` if no scorable axis was measured.
150 pub fn fitness(&self) -> Option<f64> {
151 let mut sum = 0.0;
152 let mut n = 0.0;
153 if let Some(d) = &self.determinism {
154 sum += if d.deterministic { 1.0 } else { 0.0 };
155 n += 1.0;
156 }
157 if let Some(r) = &self.reliability {
158 sum += (r.pass_rate + r.actionable_rate) / 2.0;
159 n += 1.0;
160 }
161 if let Some(s) = &self.safety {
162 sum += s.score;
163 n += 1.0;
164 }
165 if n == 0.0 {
166 None
167 } else {
168 Some(sum / n)
169 }
170 }
171}
172
173impl std::fmt::Display for Evaluation {
174 /// A compact multi-line report of every measured axis plus the fitness score.
175 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
176 writeln!(f, "evaluation: {}", self.name)?;
177 if let Some(t) = &self.tokens {
178 writeln!(f, " tokens: {}", t)?;
179 }
180 if let Some(d) = &self.determinism {
181 writeln!(f, " determinism: {}", d)?;
182 }
183 if let Some(r) = &self.reliability {
184 writeln!(f, " reliability: {}", r)?;
185 }
186 if let Some(s) = &self.safety {
187 writeln!(f, " safety: {}", s)?;
188 }
189 match self.fitness() {
190 Some(score) => write!(f, " fitness: {:.2}", score),
191 None => write!(f, " fitness: n/a (no scorable axis measured)"),
192 }
193 }
194}
195
#[cfg(all(test, feature = "serde"))]
mod serde_tests {
    use super::*;

    /// Type-level assertion: instantiating this for `T` only compiles when `T`
    /// implements `serde::Serialize`. The body is intentionally empty — the
    /// trait bound is the whole check.
    fn assert_serialize<T>()
    where
        T: serde::Serialize,
    {
    }

    /// Expands to one `assert_serialize` instantiation per listed type.
    macro_rules! assert_all_serialize {
        ($($ty:ty),+ $(,)?) => {
            $( assert_serialize::<$ty>(); )+
        };
    }

    /// With the `serde` feature enabled, each report/config type listed here
    /// must implement `Serialize` so machine-readable output is available.
    #[test]
    fn report_types_implement_serialize() {
        assert_all_serialize!(
            Evaluation,
            AgentCost,
            Program,
            Comparison,
            Model,
            DeterminismReport,
            Outcome,
            ReliabilityReport,
            Effect,
            Mode,
            Decision,
            SafetyReport,
        );
    }
}