Skip to main content

roder_evals/runner/
reliability.rs

1use std::collections::BTreeMap;
2
3use roder_api::events::{RoderEvent, ThreadId, TurnFailed, TurnId};
4use roder_api::inference::InferenceEvent;
5use roder_api::reliability::{
6    ReliabilityContext, ReliabilityDetails, ReliabilityErrorClass, ReliabilityFailureRecorded,
7    ReliabilityLimitDecision, ReliabilityLimitKind, ReliabilityLimitRecorded,
8    ReliabilityRetryDecision, ReliabilityRetryRecorded,
9};
10use serde::{Deserialize, Serialize};
11use time::OffsetDateTime;
12
13use crate::{EvalFailureClass, EvalFixture, EvalMetric, EvalMetricKind, EvalOutcome};
14
15use super::report::{EvalFixtureResult, EvalSuiteReport};
16
17#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
18#[serde(rename_all = "camelCase")]
19pub struct ReliabilityReportSummary {
20    #[serde(default)]
21    pub error_class_counts: BTreeMap<String, u64>,
22    pub retry_attempts: u64,
23    pub retry_recoveries: u64,
24    pub failure_limit_stops: u64,
25    pub unknown_errors: u64,
26}
27
28pub(super) struct FixtureReliabilityInjection {
29    pub events: Vec<RoderEvent>,
30    pub outcome: Option<EvalOutcome>,
31    pub failure_class: Option<EvalFailureClass>,
32    pub failure_message: Option<String>,
33}
34
35pub(super) fn fixture_reliability_injection(
36    fixture: &EvalFixture,
37    thread_id: &ThreadId,
38    turn_id: &TurnId,
39) -> Option<FixtureReliabilityInjection> {
40    let tag = fixture
41        .tags
42        .iter()
43        .find_map(|tag| tag.strip_prefix("reliability:"))?;
44    let context = context(thread_id, turn_id);
45    let timestamp = OffsetDateTime::now_utc();
46    match tag {
47        "invalid_arguments" => Some(FixtureReliabilityInjection {
48            events: vec![RoderEvent::ReliabilityFailureRecorded(
49                ReliabilityFailureRecorded {
50                    context,
51                    error_class: ReliabilityErrorClass::InvalidArguments,
52                    details: ReliabilityDetails::redacted("missing required tool field path"),
53                    timestamp,
54                },
55            )],
56            outcome: Some(EvalOutcome::Fail),
57            failure_class: Some(EvalFailureClass::ToolSchema),
58            failure_message: Some("invalid tool arguments were classified".to_string()),
59        }),
60        "missing_file" => Some(FixtureReliabilityInjection {
61            events: vec![RoderEvent::ReliabilityFailureRecorded(
62                ReliabilityFailureRecorded {
63                    context,
64                    error_class: ReliabilityErrorClass::UnexpectedEnvironment,
65                    details: ReliabilityDetails::redacted("missing file src/missing.rs"),
66                    timestamp,
67                },
68            )],
69            outcome: Some(EvalOutcome::Fail),
70            failure_class: Some(EvalFailureClass::Environment),
71            failure_message: Some("missing file was classified as environment failure".to_string()),
72        }),
73        "provider_empty_body" => Some(FixtureReliabilityInjection {
74            events: vec![retry_event(
75                context,
76                1,
77                2,
78                Some(0),
79                "empty provider body",
80                timestamp,
81            )],
82            outcome: None,
83            failure_class: None,
84            failure_message: None,
85        }),
86        "provider_429" => Some(FixtureReliabilityInjection {
87            events: vec![retry_event(context, 1, 3, Some(0), "status_429", timestamp)],
88            outcome: None,
89            failure_class: None,
90            failure_message: None,
91        }),
92        "repeated_timeout" => Some(FixtureReliabilityInjection {
93            events: vec![
94                RoderEvent::ReliabilityLimitRecorded(ReliabilityLimitRecorded {
95                    context: context.clone(),
96                    error_class: ReliabilityErrorClass::Timeout,
97                    limit_kind: ReliabilityLimitKind::ModelCallsPerTurn,
98                    decision: ReliabilityLimitDecision::StopTurn,
99                    current: 3,
100                    limit: 3,
101                    details: ReliabilityDetails::redacted("repeated timeout limit reached"),
102                    timestamp,
103                }),
104                RoderEvent::TurnFailed(TurnFailed {
105                    thread_id: thread_id.clone(),
106                    turn_id: turn_id.clone(),
107                    error: "repeated timeout limit reached".to_string(),
108                    error_kind: Some("reliability_limit".to_string()),
109                    usage: None,
110                    timestamp,
111                }),
112            ],
113            outcome: Some(EvalOutcome::Timeout),
114            failure_class: Some(EvalFailureClass::Runtime),
115            failure_message: Some("repeated timeout limit reached".to_string()),
116        }),
117        "unknown_panic" => Some(FixtureReliabilityInjection {
118            events: vec![RoderEvent::ReliabilityFailureRecorded(
119                ReliabilityFailureRecorded {
120                    context,
121                    error_class: ReliabilityErrorClass::Unknown,
122                    details: ReliabilityDetails::redacted("panic converted to unknown failure"),
123                    timestamp,
124                },
125            )],
126            outcome: Some(EvalOutcome::HarnessError),
127            failure_class: Some(EvalFailureClass::Unknown),
128            failure_message: Some("unknown panic conversion was classified".to_string()),
129        }),
130        _ => None,
131    }
132}
133
134pub(super) fn reliability_metrics(events: &[RoderEvent], outcome: &EvalOutcome) -> Vec<EvalMetric> {
135    let summary = summarize_events(events, outcome);
136    let mut metrics = vec![
137        count_metric("reliability_retry_attempts", summary.retry_attempts),
138        count_metric("reliability_retry_recoveries", summary.retry_recoveries),
139        count_metric(
140            "reliability_failure_limit_stops",
141            summary.failure_limit_stops,
142        ),
143        count_metric("reliability_unknown_errors", summary.unknown_errors),
144    ];
145    for (class, count) in summary.error_class_counts {
146        metrics.push(count_metric(
147            &format!("reliability_error_class_{class}"),
148            count,
149        ));
150    }
151    metrics
152}
153
154pub(super) fn reliability_summary(report: &EvalSuiteReport) -> ReliabilityReportSummary {
155    report.results.iter().fold(
156        ReliabilityReportSummary::default(),
157        |mut summary, result| {
158            let current = summarize_result(result);
159            merge_summary(&mut summary, current);
160            summary
161        },
162    )
163}
164
165pub(super) fn reliability_markdown(report: &EvalSuiteReport) -> String {
166    let summary = reliability_summary(report);
167    let mut text = String::from("\n## Reliability Metrics\n\n| Metric | Value |\n| --- | ---: |\n");
168    text.push_str(&format!(
169        "| Retry attempts | {} |\n| Retry recoveries | {} |\n| Failure-limit stops | {} |\n| Unknown errors | {} |\n",
170        summary.retry_attempts,
171        summary.retry_recoveries,
172        summary.failure_limit_stops,
173        summary.unknown_errors
174    ));
175    text.push_str("\n| Error class | Count |\n| --- | ---: |\n");
176    for (class, count) in &summary.error_class_counts {
177        text.push_str(&format!("| `{class}` | {count} |\n"));
178    }
179    text.push_str(
180        "\n| Fixture | Outcome | Retry attempts | Limit stops | Unknown errors |\n| --- | --- | ---: | ---: | ---: |\n",
181    );
182    for result in &report.results {
183        let current = summarize_result(result);
184        text.push_str(&format!(
185            "| `{}` | `{:?}` | {} | {} | {} |\n",
186            result.fixture_id,
187            result.report.outcome,
188            current.retry_attempts,
189            current.failure_limit_stops,
190            current.unknown_errors
191        ));
192    }
193    text
194}
195
196fn summarize_result(result: &EvalFixtureResult) -> ReliabilityReportSummary {
197    let mut summary = ReliabilityReportSummary::default();
198    for metric in &result.report.metrics {
199        let value = metric.value.max(0.0) as u64;
200        match metric.name.as_str() {
201            "reliability_retry_attempts" => summary.retry_attempts = value,
202            "reliability_retry_recoveries" => summary.retry_recoveries = value,
203            "reliability_failure_limit_stops" => summary.failure_limit_stops = value,
204            "reliability_unknown_errors" => summary.unknown_errors = value,
205            name => {
206                if let Some(class) = name.strip_prefix("reliability_error_class_") {
207                    summary.error_class_counts.insert(class.to_string(), value);
208                }
209            }
210        }
211    }
212    summary
213}
214
215fn summarize_events(events: &[RoderEvent], outcome: &EvalOutcome) -> ReliabilityReportSummary {
216    let mut summary = ReliabilityReportSummary::default();
217    for event in events {
218        match event {
219            RoderEvent::ReliabilityRetryRecorded(retry) => {
220                summary.retry_attempts += 1;
221                add_class(&mut summary, retry.error_class);
222            }
223            RoderEvent::ReliabilityFailureRecorded(failure) => {
224                add_class(&mut summary, failure.error_class);
225                if failure.error_class == ReliabilityErrorClass::Unknown {
226                    summary.unknown_errors += 1;
227                }
228            }
229            RoderEvent::ReliabilityLimitRecorded(limit) => {
230                add_class(&mut summary, limit.error_class);
231                if limit.decision != ReliabilityLimitDecision::Continue {
232                    summary.failure_limit_stops += 1;
233                }
234                if limit.error_class == ReliabilityErrorClass::Unknown {
235                    summary.unknown_errors += 1;
236                }
237            }
238            RoderEvent::InferenceEventReceived(received)
239                if provider_metadata_is_retry(&received.event) =>
240            {
241                summary.retry_attempts += 1;
242                *summary
243                    .error_class_counts
244                    .entry(error_class_key(ReliabilityErrorClass::ProviderError))
245                    .or_insert(0) += 1;
246            }
247            _ => {}
248        }
249    }
250    if *outcome == EvalOutcome::Pass && summary.retry_attempts > 0 {
251        summary.retry_recoveries = 1;
252    }
253    summary
254}
255
256fn provider_metadata_is_retry(event: &InferenceEvent) -> bool {
257    matches!(
258        event,
259        InferenceEvent::ProviderMetadata(metadata)
260            if metadata.get("kind").and_then(serde_json::Value::as_str)
261                == Some("reliability_retry_attempt")
262    )
263}
264
265fn merge_summary(target: &mut ReliabilityReportSummary, source: ReliabilityReportSummary) {
266    target.retry_attempts += source.retry_attempts;
267    target.retry_recoveries += source.retry_recoveries;
268    target.failure_limit_stops += source.failure_limit_stops;
269    target.unknown_errors += source.unknown_errors;
270    for (class, count) in source.error_class_counts {
271        *target.error_class_counts.entry(class).or_insert(0) += count;
272    }
273}
274
275fn retry_event(
276    context: ReliabilityContext,
277    attempt: u32,
278    max_attempts: u32,
279    delay_ms: Option<u64>,
280    details: &str,
281    timestamp: OffsetDateTime,
282) -> RoderEvent {
283    RoderEvent::ReliabilityRetryRecorded(ReliabilityRetryRecorded {
284        context,
285        error_class: ReliabilityErrorClass::ProviderError,
286        decision: ReliabilityRetryDecision::Retry,
287        attempt,
288        max_attempts,
289        delay_ms,
290        details: ReliabilityDetails::redacted(details),
291        timestamp,
292    })
293}
294
295fn context(thread_id: &ThreadId, turn_id: &TurnId) -> ReliabilityContext {
296    ReliabilityContext {
297        thread_id: thread_id.clone(),
298        turn_id: turn_id.clone(),
299        tool_id: None,
300        tool_name: None,
301        provider: Some("mock".to_string()),
302        model: Some("mock".to_string()),
303    }
304}
305
306fn add_class(summary: &mut ReliabilityReportSummary, class: ReliabilityErrorClass) {
307    *summary
308        .error_class_counts
309        .entry(error_class_key(class))
310        .or_insert(0) += 1;
311}
312
313fn error_class_key(class: ReliabilityErrorClass) -> String {
314    serde_json::to_value(class)
315        .ok()
316        .and_then(|value| value.as_str().map(str::to_string))
317        .unwrap_or_else(|| format!("{class:?}"))
318}
319
320fn count_metric(name: &str, value: u64) -> EvalMetric {
321    EvalMetric {
322        name: name.to_string(),
323        kind: EvalMetricKind::Count,
324        value: value as f64,
325        unit: None,
326    }
327}
328
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{EvalReport, EvalRun, EvalTrajectory};

    /// A fixture tagged `reliability:unknown_panic` must produce a harness error
    /// classified as unknown, backed by a recorded failure event.
    #[test]
    fn reliability_fixture_injection_classifies_unknown_panics() {
        let thread_id = "thread".to_string();
        let turn_id = "turn".to_string();
        let fixture = EvalFixture {
            id: "unknown".to_string(),
            title: "Unknown".to_string(),
            prompt: "Classify unknown panic".to_string(),
            tags: vec!["reliability:unknown_panic".to_string()],
            workspace: Default::default(),
            timeout_ms: None,
            expected: Default::default(),
            constraints: Vec::new(),
            lazy_discovery: None,
        };

        let injection = fixture_reliability_injection(&fixture, &thread_id, &turn_id).unwrap();

        assert_eq!(injection.outcome, Some(EvalOutcome::HarnessError));
        assert_eq!(injection.failure_class, Some(EvalFailureClass::Unknown));
        let first_event = &injection.events[0];
        assert!(matches!(
            first_event,
            RoderEvent::ReliabilityFailureRecorded(_)
        ));
    }

    /// Metrics derived from events must round-trip through a suite report into
    /// the expected retry, limit, and unknown-error totals.
    #[test]
    fn reliability_summary_counts_retries_limits_and_unknowns() {
        let epoch = OffsetDateTime::UNIX_EPOCH;
        let thread_id = "thread".to_string();
        let turn_id = "turn".to_string();

        // One provider retry followed by a turn-stopping limit of unknown class.
        let retry = retry_event(
            context(&thread_id, &turn_id),
            1,
            3,
            Some(0),
            "status_429",
            epoch,
        );
        let limit = RoderEvent::ReliabilityLimitRecorded(ReliabilityLimitRecorded {
            context: context(&thread_id, &turn_id),
            error_class: ReliabilityErrorClass::Unknown,
            limit_kind: ReliabilityLimitKind::ModelCallsPerTurn,
            decision: ReliabilityLimitDecision::StopTurn,
            current: 1,
            limit: 1,
            details: ReliabilityDetails::redacted("unknown"),
            timestamp: epoch,
        });
        let events = vec![retry, limit];

        let trajectory = EvalTrajectory::from_events(&thread_id, &turn_id, &events);
        let metrics = reliability_metrics(&events, &EvalOutcome::Pass);
        let run = EvalRun {
            suite_id: "reliability".to_string(),
            run_id: "run".to_string(),
            provider: "mock".to_string(),
            model: "mock".to_string(),
            started_at: epoch,
            tags: vec!["reliability".to_string()],
        };
        let fixture_result = EvalFixtureResult {
            fixture_id: "provider-429".to_string(),
            title: "Provider 429".to_string(),
            workspace: std::path::PathBuf::from("/tmp/workspace"),
            final_answer: String::new(),
            report: EvalReport {
                run,
                outcome: EvalOutcome::Pass,
                failure_class: None,
                trajectory,
                metrics,
            },
            trace_excerpt: Vec::new(),
            failure_message: None,
        };
        let report = EvalSuiteReport {
            suite_id: "reliability".to_string(),
            fixture_dir: std::path::PathBuf::from("evals/fixtures/reliability"),
            output_dir: std::path::PathBuf::from("/tmp/reports"),
            offline: true,
            generated_at: epoch,
            results: vec![fixture_result],
        };

        let summary = reliability_summary(&report);

        assert_eq!(summary.retry_attempts, 1);
        assert_eq!(summary.retry_recoveries, 1);
        assert_eq!(summary.failure_limit_stops, 1);
        assert_eq!(summary.unknown_errors, 1);
        assert_eq!(summary.error_class_counts["provider_error"], 1);
        assert_eq!(summary.error_class_counts["unknown"], 1);
    }
}