Skip to main content

agentic_eval/
reliability.rs

//! Reliability: does a program parse/run without ambiguity, and when it fails, is
//! the failure *actionable*?
//!
//! Two things drive an agent's retry-token blowup. First, ambiguity: a form the
//! model mis-emits and has to redo. Second, dead-end errors: prose like
//! `"error near line 3"` tells the model nothing to branch on, so it guesses.
//! This module aggregates the outcomes of running a program over a set of
//! representative invocations into a pass rate and an *actionable-failure* rate
//! (failures that carry a structured, machine-branchable code + hint).
10
/// The outcome of one invocation, as classified by the caller.
///
/// Three states are meaningful: success, structured failure and opaque failure.
/// Prefer the constructors below over spelling out the flag combinations by hand.
///
/// Equality is field-wise: two successful outcomes that differ only in the
/// (ignored) `structured_error` flag do **not** compare equal.
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Outcome {
    /// Did the invocation succeed (parse + run without error)?
    pub ok: bool,
    /// If it failed, was the error *structured/actionable* (stable code + hint an
    /// agent can branch on) rather than opaque prose? Ignored when `ok`.
    pub structured_error: bool,
}

impl Outcome {
    /// A successful invocation (`ok = true`, `structured_error = false`).
    pub const fn ok() -> Self {
        Self {
            ok: true,
            structured_error: false,
        }
    }

    /// A failure carrying a structured, actionable error.
    pub const fn structured_failure() -> Self {
        Self {
            ok: false,
            structured_error: true,
        }
    }

    /// A failure with only opaque prose (a dead end for self-correction).
    pub const fn opaque_failure() -> Self {
        Self {
            ok: false,
            structured_error: false,
        }
    }
}
45
/// Aggregate reliability over a set of invocations.
///
/// Derives `PartialEq` so whole reports can be compared directly (e.g. with
/// `assert_eq!`); the two rate fields are floats, so the comparison is exact.
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[derive(Debug, Clone, PartialEq)]
pub struct ReliabilityReport {
    /// Number of invocations assessed.
    pub total: usize,
    /// Number that succeeded.
    pub passed: usize,
    /// Number that failed (`total - passed`).
    pub failed: usize,
    /// Of the failures, how many carried a structured/actionable error.
    pub structured_failures: usize,
    /// `passed / total` (1.0 = never fails). 1.0 for an empty set.
    pub pass_rate: f64,
    /// Fraction of *all* cases that either passed or failed actionably — i.e. the
    /// agent was never left at a dead end. 1.0 for an empty set.
    pub actionable_rate: f64,
}
64
65/// Assess reliability by running `run` over each case and aggregating outcomes.
66pub fn assess_reliability<I>(cases: &[I], run: impl Fn(&I) -> Outcome) -> ReliabilityReport {
67    let total = cases.len();
68    let mut passed = 0usize;
69    let mut structured_failures = 0usize;
70    for case in cases {
71        let o = run(case);
72        if o.ok {
73            passed += 1;
74        } else if o.structured_error {
75            structured_failures += 1;
76        }
77    }
78    let failed = total - passed;
79    let (pass_rate, actionable_rate) = if total == 0 {
80        (1.0, 1.0)
81    } else {
82        (
83            passed as f64 / total as f64,
84            (passed + structured_failures) as f64 / total as f64,
85        )
86    };
87    ReliabilityReport {
88        total,
89        passed,
90        failed,
91        structured_failures,
92        pass_rate,
93        actionable_rate,
94    }
95}
96
97impl std::fmt::Display for ReliabilityReport {
98    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
99        write!(
100            f,
101            "pass {:.0}% actionable {:.0}% ({}/{} ok, {} structured failures)",
102            self.pass_rate * 100.0,
103            self.actionable_rate * 100.0,
104            self.passed,
105            self.total,
106            self.structured_failures
107        )
108    }
109}
110
/// A **graded** assessment of how *actionable* a failure is — refining the binary
/// `structured_error` of [`Outcome`]. Each component an agent can use to
/// self-correct contributes equally to the score.
///
/// `Default` yields an error with no components present (score 0.0).
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct ErrorQuality {
    /// A stable, machine-branchable error *code* (e.g. `E_BAD_ARG`).
    pub has_code: bool,
    /// A human-readable *message*.
    pub has_message: bool,
    /// *Where* it failed (line, argument index, field path).
    pub has_location: bool,
    /// A remediation *hint/suggestion* the agent can act on.
    pub has_fix: bool,
}

impl ErrorQuality {
    /// 0.0–1.0 actionability: the fraction of the four components present.
    ///
    /// Every component carries the same weight, so the result is always a
    /// multiple of 0.25.
    pub fn score(&self) -> f64 {
        let components = [
            self.has_code,
            self.has_message,
            self.has_location,
            self.has_fix,
        ];
        let present = components.iter().filter(|&&flag| flag).count();
        // Denominator comes from the array itself so it cannot drift from the
        // component list above.
        present as f64 / components.len() as f64
    }
}
142
/// Aggregate [`ErrorQuality`] over a set of failures.
///
/// Derives `PartialEq` so whole reports can be compared directly (e.g. with
/// `assert_eq!`); `mean_score` is a float, so the comparison is exact.
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[derive(Debug, Clone, PartialEq)]
pub struct ErrorQualityReport {
    /// Number of failures graded.
    pub failures: usize,
    /// How many carried a stable code.
    pub with_code: usize,
    /// How many pinpointed a location.
    pub with_location: usize,
    /// How many carried a remediation hint.
    pub with_fix: usize,
    /// Mean actionability score over the failures (1.0 for an empty set — no dead ends).
    pub mean_score: f64,
}
158
159/// Grade each failure with `grade` and aggregate. Pass only the *failing* cases (the
160/// successes carry no error to grade); an empty set scores 1.0 vacuously.
161pub fn assess_error_quality<I>(
162    failures: &[I],
163    grade: impl Fn(&I) -> ErrorQuality,
164) -> ErrorQualityReport {
165    let (mut with_code, mut with_location, mut with_fix, mut total) = (0, 0, 0, 0.0);
166    for case in failures {
167        let q = grade(case);
168        if q.has_code {
169            with_code += 1;
170        }
171        if q.has_location {
172            with_location += 1;
173        }
174        if q.has_fix {
175            with_fix += 1;
176        }
177        total += q.score();
178    }
179    let n = failures.len();
180    ErrorQualityReport {
181        failures: n,
182        with_code,
183        with_location,
184        with_fix,
185        mean_score: if n == 0 { 1.0 } else { total / n as f64 },
186    }
187}
188
189impl std::fmt::Display for ErrorQualityReport {
190    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
191        write!(
192            f,
193            "error quality {:.0}% over {} failures (code={} location={} fix={})",
194            self.mean_score * 100.0,
195            self.failures,
196            self.with_code,
197            self.with_location,
198            self.with_fix
199        )
200    }
201}
202
#[cfg(test)]
mod tests {
    //! Unit tests for the reliability and error-quality aggregations, including
    //! the vacuous (empty-input) cases and the rendered summary lines.
    use super::*;

    #[test]
    fn all_pass_is_perfect() {
        let cases = ["a", "b", "c"];
        let r = assess_reliability(&cases, |_| Outcome::ok());
        assert_eq!(r.total, 3);
        assert_eq!(r.passed, 3);
        assert_eq!(r.failed, 0);
        assert_eq!(r.structured_failures, 0);
        assert_eq!(r.pass_rate, 1.0);
        assert_eq!(r.actionable_rate, 1.0);
    }

    #[test]
    fn structured_failures_are_actionable_even_when_not_passing() {
        // 2 pass, 1 structured failure, 1 opaque failure of 4.
        let cases = [0, 1, 2, 3];
        let r = assess_reliability(&cases, |&i| match i {
            0 | 1 => Outcome::ok(),
            2 => Outcome::structured_failure(),
            _ => Outcome::opaque_failure(),
        });
        assert_eq!(r.total, 4);
        assert_eq!(r.passed, 2);
        assert_eq!(r.failed, 2);
        assert_eq!(r.structured_failures, 1);
        assert_eq!(r.pass_rate, 0.5);
        // passed (2) + structured failure (1) = 3/4 were not dead ends.
        assert_eq!(r.actionable_rate, 0.75);
    }

    #[test]
    fn structured_flag_is_ignored_on_success() {
        // A success that (wrongly) sets `structured_error` must count once, as a pass.
        let cases = [()];
        let r = assess_reliability(&cases, |_| Outcome {
            ok: true,
            structured_error: true,
        });
        assert_eq!(r.passed, 1);
        assert_eq!(r.failed, 0);
        assert_eq!(r.structured_failures, 0);
        assert_eq!(r.actionable_rate, 1.0);
    }

    #[test]
    fn empty_set_is_vacuously_reliable() {
        let cases: [&str; 0] = [];
        let r = assess_reliability(&cases, |_| Outcome::ok());
        assert_eq!(r.total, 0);
        assert_eq!(r.passed, 0);
        assert_eq!(r.failed, 0);
        assert_eq!(r.structured_failures, 0);
        assert_eq!(r.pass_rate, 1.0);
        assert_eq!(r.actionable_rate, 1.0);
    }

    #[test]
    fn reliability_report_renders_summary_line() {
        let cases = [0, 1, 2, 3];
        let r = assess_reliability(&cases, |&i| match i {
            0 | 1 => Outcome::ok(),
            2 => Outcome::structured_failure(),
            _ => Outcome::opaque_failure(),
        });
        assert_eq!(
            r.to_string(),
            "pass 50% actionable 75% (2/4 ok, 1 structured failures)"
        );
    }

    #[test]
    fn error_quality_grades_actionability_components() {
        // Full E_*-code-plus-hint-plus-location error scores 1.0; bare prose 0.25.
        let rich = ErrorQuality {
            has_code: true,
            has_message: true,
            has_location: true,
            has_fix: true,
        };
        assert_eq!(rich.score(), 1.0);
        let prose = ErrorQuality {
            has_message: true,
            ..Default::default()
        };
        assert_eq!(prose.score(), 0.25);
        // Nothing present at all is a complete dead end.
        assert_eq!(ErrorQuality::default().score(), 0.0);

        // Aggregate over two failures: mean of 1.0 and 0.25 = 0.625.
        let failures = [rich, prose];
        let r = assess_error_quality(&failures, |q| *q);
        assert_eq!(r.failures, 2);
        assert_eq!(r.with_code, 1);
        assert_eq!(r.with_location, 1);
        assert_eq!(r.with_fix, 1);
        assert!((r.mean_score - 0.625).abs() < 1e-9);

        // No failures → vacuously perfect, with all counts at zero.
        let empty: [ErrorQuality; 0] = [];
        let e = assess_error_quality(&empty, |q| *q);
        assert_eq!(e.failures, 0);
        assert_eq!(e.with_code, 0);
        assert_eq!(e.with_location, 0);
        assert_eq!(e.with_fix, 0);
        assert_eq!(e.mean_score, 1.0);
    }

    #[test]
    fn error_quality_report_renders_summary_line() {
        let rich = ErrorQuality {
            has_code: true,
            has_message: true,
            has_location: true,
            has_fix: true,
        };
        // Mean of 1.0 and 0.0 = 0.5, which renders without any rounding ambiguity.
        let failures = [rich, ErrorQuality::default()];
        let r = assess_error_quality(&failures, |q| *q);
        assert_eq!(
            r.to_string(),
            "error quality 50% over 2 failures (code=1 location=1 fix=1)"
        );
    }
}