1use std::collections::BTreeMap;
2
3use roder_api::events::{RoderEvent, ThreadId, TurnFailed, TurnId};
4use roder_api::inference::InferenceEvent;
5use roder_api::reliability::{
6 ReliabilityContext, ReliabilityDetails, ReliabilityErrorClass, ReliabilityFailureRecorded,
7 ReliabilityLimitDecision, ReliabilityLimitKind, ReliabilityLimitRecorded,
8 ReliabilityRetryDecision, ReliabilityRetryRecorded,
9};
10use serde::{Deserialize, Serialize};
11use time::OffsetDateTime;
12
13use crate::{EvalFailureClass, EvalFixture, EvalMetric, EvalMetricKind, EvalOutcome};
14
15use super::report::{EvalFixtureResult, EvalSuiteReport};
16
17#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
18#[serde(rename_all = "camelCase")]
19pub struct ReliabilityReportSummary {
20 #[serde(default)]
21 pub error_class_counts: BTreeMap<String, u64>,
22 pub retry_attempts: u64,
23 pub retry_recoveries: u64,
24 pub failure_limit_stops: u64,
25 pub unknown_errors: u64,
26}
27
28pub(super) struct FixtureReliabilityInjection {
29 pub events: Vec<RoderEvent>,
30 pub outcome: Option<EvalOutcome>,
31 pub failure_class: Option<EvalFailureClass>,
32 pub failure_message: Option<String>,
33}
34
35pub(super) fn fixture_reliability_injection(
36 fixture: &EvalFixture,
37 thread_id: &ThreadId,
38 turn_id: &TurnId,
39) -> Option<FixtureReliabilityInjection> {
40 let tag = fixture
41 .tags
42 .iter()
43 .find_map(|tag| tag.strip_prefix("reliability:"))?;
44 let context = context(thread_id, turn_id);
45 let timestamp = OffsetDateTime::now_utc();
46 match tag {
47 "invalid_arguments" => Some(FixtureReliabilityInjection {
48 events: vec![RoderEvent::ReliabilityFailureRecorded(
49 ReliabilityFailureRecorded {
50 context,
51 error_class: ReliabilityErrorClass::InvalidArguments,
52 details: ReliabilityDetails::redacted("missing required tool field path"),
53 timestamp,
54 },
55 )],
56 outcome: Some(EvalOutcome::Fail),
57 failure_class: Some(EvalFailureClass::ToolSchema),
58 failure_message: Some("invalid tool arguments were classified".to_string()),
59 }),
60 "missing_file" => Some(FixtureReliabilityInjection {
61 events: vec![RoderEvent::ReliabilityFailureRecorded(
62 ReliabilityFailureRecorded {
63 context,
64 error_class: ReliabilityErrorClass::UnexpectedEnvironment,
65 details: ReliabilityDetails::redacted("missing file src/missing.rs"),
66 timestamp,
67 },
68 )],
69 outcome: Some(EvalOutcome::Fail),
70 failure_class: Some(EvalFailureClass::Environment),
71 failure_message: Some("missing file was classified as environment failure".to_string()),
72 }),
73 "provider_empty_body" => Some(FixtureReliabilityInjection {
74 events: vec![retry_event(
75 context,
76 1,
77 2,
78 Some(0),
79 "empty provider body",
80 timestamp,
81 )],
82 outcome: None,
83 failure_class: None,
84 failure_message: None,
85 }),
86 "provider_429" => Some(FixtureReliabilityInjection {
87 events: vec![retry_event(context, 1, 3, Some(0), "status_429", timestamp)],
88 outcome: None,
89 failure_class: None,
90 failure_message: None,
91 }),
92 "repeated_timeout" => Some(FixtureReliabilityInjection {
93 events: vec![
94 RoderEvent::ReliabilityLimitRecorded(ReliabilityLimitRecorded {
95 context: context.clone(),
96 error_class: ReliabilityErrorClass::Timeout,
97 limit_kind: ReliabilityLimitKind::ModelCallsPerTurn,
98 decision: ReliabilityLimitDecision::StopTurn,
99 current: 3,
100 limit: 3,
101 details: ReliabilityDetails::redacted("repeated timeout limit reached"),
102 timestamp,
103 }),
104 RoderEvent::TurnFailed(TurnFailed {
105 thread_id: thread_id.clone(),
106 turn_id: turn_id.clone(),
107 error: "repeated timeout limit reached".to_string(),
108 error_kind: Some("reliability_limit".to_string()),
109 usage: None,
110 timestamp,
111 }),
112 ],
113 outcome: Some(EvalOutcome::Timeout),
114 failure_class: Some(EvalFailureClass::Runtime),
115 failure_message: Some("repeated timeout limit reached".to_string()),
116 }),
117 "unknown_panic" => Some(FixtureReliabilityInjection {
118 events: vec![RoderEvent::ReliabilityFailureRecorded(
119 ReliabilityFailureRecorded {
120 context,
121 error_class: ReliabilityErrorClass::Unknown,
122 details: ReliabilityDetails::redacted("panic converted to unknown failure"),
123 timestamp,
124 },
125 )],
126 outcome: Some(EvalOutcome::HarnessError),
127 failure_class: Some(EvalFailureClass::Unknown),
128 failure_message: Some("unknown panic conversion was classified".to_string()),
129 }),
130 _ => None,
131 }
132}
133
134pub(super) fn reliability_metrics(events: &[RoderEvent], outcome: &EvalOutcome) -> Vec<EvalMetric> {
135 let summary = summarize_events(events, outcome);
136 let mut metrics = vec![
137 count_metric("reliability_retry_attempts", summary.retry_attempts),
138 count_metric("reliability_retry_recoveries", summary.retry_recoveries),
139 count_metric(
140 "reliability_failure_limit_stops",
141 summary.failure_limit_stops,
142 ),
143 count_metric("reliability_unknown_errors", summary.unknown_errors),
144 ];
145 for (class, count) in summary.error_class_counts {
146 metrics.push(count_metric(
147 &format!("reliability_error_class_{class}"),
148 count,
149 ));
150 }
151 metrics
152}
153
154pub(super) fn reliability_summary(report: &EvalSuiteReport) -> ReliabilityReportSummary {
155 report.results.iter().fold(
156 ReliabilityReportSummary::default(),
157 |mut summary, result| {
158 let current = summarize_result(result);
159 merge_summary(&mut summary, current);
160 summary
161 },
162 )
163}
164
165pub(super) fn reliability_markdown(report: &EvalSuiteReport) -> String {
166 let summary = reliability_summary(report);
167 let mut text = String::from("\n## Reliability Metrics\n\n| Metric | Value |\n| --- | ---: |\n");
168 text.push_str(&format!(
169 "| Retry attempts | {} |\n| Retry recoveries | {} |\n| Failure-limit stops | {} |\n| Unknown errors | {} |\n",
170 summary.retry_attempts,
171 summary.retry_recoveries,
172 summary.failure_limit_stops,
173 summary.unknown_errors
174 ));
175 text.push_str("\n| Error class | Count |\n| --- | ---: |\n");
176 for (class, count) in &summary.error_class_counts {
177 text.push_str(&format!("| `{class}` | {count} |\n"));
178 }
179 text.push_str(
180 "\n| Fixture | Outcome | Retry attempts | Limit stops | Unknown errors |\n| --- | --- | ---: | ---: | ---: |\n",
181 );
182 for result in &report.results {
183 let current = summarize_result(result);
184 text.push_str(&format!(
185 "| `{}` | `{:?}` | {} | {} | {} |\n",
186 result.fixture_id,
187 result.report.outcome,
188 current.retry_attempts,
189 current.failure_limit_stops,
190 current.unknown_errors
191 ));
192 }
193 text
194}
195
196fn summarize_result(result: &EvalFixtureResult) -> ReliabilityReportSummary {
197 let mut summary = ReliabilityReportSummary::default();
198 for metric in &result.report.metrics {
199 let value = metric.value.max(0.0) as u64;
200 match metric.name.as_str() {
201 "reliability_retry_attempts" => summary.retry_attempts = value,
202 "reliability_retry_recoveries" => summary.retry_recoveries = value,
203 "reliability_failure_limit_stops" => summary.failure_limit_stops = value,
204 "reliability_unknown_errors" => summary.unknown_errors = value,
205 name => {
206 if let Some(class) = name.strip_prefix("reliability_error_class_") {
207 summary.error_class_counts.insert(class.to_string(), value);
208 }
209 }
210 }
211 }
212 summary
213}
214
215fn summarize_events(events: &[RoderEvent], outcome: &EvalOutcome) -> ReliabilityReportSummary {
216 let mut summary = ReliabilityReportSummary::default();
217 for event in events {
218 match event {
219 RoderEvent::ReliabilityRetryRecorded(retry) => {
220 summary.retry_attempts += 1;
221 add_class(&mut summary, retry.error_class);
222 }
223 RoderEvent::ReliabilityFailureRecorded(failure) => {
224 add_class(&mut summary, failure.error_class);
225 if failure.error_class == ReliabilityErrorClass::Unknown {
226 summary.unknown_errors += 1;
227 }
228 }
229 RoderEvent::ReliabilityLimitRecorded(limit) => {
230 add_class(&mut summary, limit.error_class);
231 if limit.decision != ReliabilityLimitDecision::Continue {
232 summary.failure_limit_stops += 1;
233 }
234 if limit.error_class == ReliabilityErrorClass::Unknown {
235 summary.unknown_errors += 1;
236 }
237 }
238 RoderEvent::InferenceEventReceived(received)
239 if provider_metadata_is_retry(&received.event) =>
240 {
241 summary.retry_attempts += 1;
242 *summary
243 .error_class_counts
244 .entry(error_class_key(ReliabilityErrorClass::ProviderError))
245 .or_insert(0) += 1;
246 }
247 _ => {}
248 }
249 }
250 if *outcome == EvalOutcome::Pass && summary.retry_attempts > 0 {
251 summary.retry_recoveries = 1;
252 }
253 summary
254}
255
256fn provider_metadata_is_retry(event: &InferenceEvent) -> bool {
257 matches!(
258 event,
259 InferenceEvent::ProviderMetadata(metadata)
260 if metadata.get("kind").and_then(serde_json::Value::as_str)
261 == Some("reliability_retry_attempt")
262 )
263}
264
265fn merge_summary(target: &mut ReliabilityReportSummary, source: ReliabilityReportSummary) {
266 target.retry_attempts += source.retry_attempts;
267 target.retry_recoveries += source.retry_recoveries;
268 target.failure_limit_stops += source.failure_limit_stops;
269 target.unknown_errors += source.unknown_errors;
270 for (class, count) in source.error_class_counts {
271 *target.error_class_counts.entry(class).or_insert(0) += count;
272 }
273}
274
275fn retry_event(
276 context: ReliabilityContext,
277 attempt: u32,
278 max_attempts: u32,
279 delay_ms: Option<u64>,
280 details: &str,
281 timestamp: OffsetDateTime,
282) -> RoderEvent {
283 RoderEvent::ReliabilityRetryRecorded(ReliabilityRetryRecorded {
284 context,
285 error_class: ReliabilityErrorClass::ProviderError,
286 decision: ReliabilityRetryDecision::Retry,
287 attempt,
288 max_attempts,
289 delay_ms,
290 details: ReliabilityDetails::redacted(details),
291 timestamp,
292 })
293}
294
295fn context(thread_id: &ThreadId, turn_id: &TurnId) -> ReliabilityContext {
296 ReliabilityContext {
297 thread_id: thread_id.clone(),
298 turn_id: turn_id.clone(),
299 tool_id: None,
300 tool_name: None,
301 provider: Some("mock".to_string()),
302 model: Some("mock".to_string()),
303 }
304}
305
306fn add_class(summary: &mut ReliabilityReportSummary, class: ReliabilityErrorClass) {
307 *summary
308 .error_class_counts
309 .entry(error_class_key(class))
310 .or_insert(0) += 1;
311}
312
313fn error_class_key(class: ReliabilityErrorClass) -> String {
314 serde_json::to_value(class)
315 .ok()
316 .and_then(|value| value.as_str().map(str::to_string))
317 .unwrap_or_else(|| format!("{class:?}"))
318}
319
320fn count_metric(name: &str, value: u64) -> EvalMetric {
321 EvalMetric {
322 name: name.to_string(),
323 kind: EvalMetricKind::Count,
324 value: value as f64,
325 unit: None,
326 }
327}
328
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{EvalReport, EvalRun, EvalTrajectory};

    /// Fixture whose only tag requests the `unknown_panic` reliability scenario.
    fn unknown_panic_fixture() -> EvalFixture {
        EvalFixture {
            id: String::from("unknown"),
            title: String::from("Unknown"),
            prompt: String::from("Classify unknown panic"),
            tags: vec![String::from("reliability:unknown_panic")],
            workspace: Default::default(),
            timeout_ms: None,
            expected: Default::default(),
            constraints: Vec::new(),
            lazy_discovery: None,
        }
    }

    /// One provider retry followed by a turn-stopping limit event of unknown class.
    fn retry_then_unknown_limit(thread_id: &ThreadId, turn_id: &TurnId) -> Vec<RoderEvent> {
        let retry = retry_event(
            context(thread_id, turn_id),
            1,
            3,
            Some(0),
            "status_429",
            OffsetDateTime::UNIX_EPOCH,
        );
        let limit = RoderEvent::ReliabilityLimitRecorded(ReliabilityLimitRecorded {
            context: context(thread_id, turn_id),
            error_class: ReliabilityErrorClass::Unknown,
            limit_kind: ReliabilityLimitKind::ModelCallsPerTurn,
            decision: ReliabilityLimitDecision::StopTurn,
            current: 1,
            limit: 1,
            details: ReliabilityDetails::redacted("unknown"),
            timestamp: OffsetDateTime::UNIX_EPOCH,
        });
        vec![retry, limit]
    }

    #[test]
    fn reliability_fixture_injection_classifies_unknown_panics() {
        let thread_id = String::from("thread");
        let turn_id = String::from("turn");

        let injection =
            fixture_reliability_injection(&unknown_panic_fixture(), &thread_id, &turn_id).unwrap();

        assert_eq!(injection.outcome, Some(EvalOutcome::HarnessError));
        assert_eq!(injection.failure_class, Some(EvalFailureClass::Unknown));
        assert!(matches!(
            injection.events.first(),
            Some(RoderEvent::ReliabilityFailureRecorded(_))
        ));
    }

    #[test]
    fn reliability_summary_counts_retries_limits_and_unknowns() {
        let thread_id = String::from("thread");
        let turn_id = String::from("turn");
        let events = retry_then_unknown_limit(&thread_id, &turn_id);

        let run = EvalRun {
            suite_id: String::from("reliability"),
            run_id: String::from("run"),
            provider: String::from("mock"),
            model: String::from("mock"),
            started_at: OffsetDateTime::UNIX_EPOCH,
            tags: vec![String::from("reliability")],
        };
        let fixture_report = EvalReport {
            run,
            outcome: EvalOutcome::Pass,
            failure_class: None,
            trajectory: EvalTrajectory::from_events(&thread_id, &turn_id, &events),
            metrics: reliability_metrics(&events, &EvalOutcome::Pass),
        };
        let fixture_result = EvalFixtureResult {
            fixture_id: String::from("provider-429"),
            title: String::from("Provider 429"),
            workspace: std::path::PathBuf::from("/tmp/workspace"),
            final_answer: String::new(),
            report: fixture_report,
            trace_excerpt: Vec::new(),
            failure_message: None,
        };
        let suite = EvalSuiteReport {
            suite_id: String::from("reliability"),
            fixture_dir: std::path::PathBuf::from("evals/fixtures/reliability"),
            output_dir: std::path::PathBuf::from("/tmp/reports"),
            offline: true,
            generated_at: OffsetDateTime::UNIX_EPOCH,
            results: vec![fixture_result],
        };

        let summary = reliability_summary(&suite);

        assert_eq!(summary.retry_attempts, 1);
        assert_eq!(summary.retry_recoveries, 1);
        assert_eq!(summary.failure_limit_stops, 1);
        assert_eq!(summary.unknown_errors, 1);
        assert_eq!(summary.error_class_counts.get("provider_error"), Some(&1));
        assert_eq!(summary.error_class_counts.get("unknown"), Some(&1));
    }
}