1use std::collections::BTreeMap;
2use std::path::Path;
3
4use serde::{Deserialize, Serialize};
5
6use super::report::{EvalFixtureResult, EvalSuiteReport, list_eval_reports};
7
8#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
9#[serde(rename_all = "camelCase")]
10pub struct ReliabilityBaseline {
11 pub version: u32,
12 #[serde(default)]
13 pub unknown_error_blocker_threshold: u64,
14 #[serde(default)]
15 pub expectations: Vec<ReliabilityBaselineExpectation>,
16}
17
18#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
19#[serde(rename_all = "camelCase")]
20pub struct ReliabilityBaselineExpectation {
21 pub scope: String,
22 pub metric: String,
23 #[serde(default)]
24 pub max_count: u64,
25 #[serde(default)]
26 pub max_increase: u64,
27}
28
29#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
30#[serde(rename_all = "camelCase")]
31pub struct ReliabilityBaselineComparison {
32 pub status: ReliabilityBaselineStatus,
33 pub rows: Vec<ReliabilityBaselineRow>,
34 pub unknown_errors: u64,
35}
36
37#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
38#[serde(rename_all = "snake_case")]
39pub enum ReliabilityBaselineStatus {
40 Ok,
41 Attention,
42 Blocked,
43}
44
45#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
46#[serde(rename_all = "camelCase")]
47pub struct ReliabilityBaselineRow {
48 pub scope: String,
49 pub metric: String,
50 pub current: u64,
51 pub allowed: u64,
52 pub status: ReliabilityBaselineStatus,
53}
54
55pub fn compare_eval_report_to_baseline(
56 output_dir: &Path,
57 report_id: &str,
58 baseline_path: &Path,
59) -> anyhow::Result<String> {
60 let report = read_suite_report(output_dir, report_id)?;
61 let baseline: ReliabilityBaseline =
62 serde_json::from_str(&std::fs::read_to_string(baseline_path)?)?;
63 let comparison = compare_reliability_baseline(&report, &baseline);
64 Ok(reliability_baseline_markdown(&comparison))
65}
66
67pub fn compare_reliability_baseline(
68 report: &EvalSuiteReport,
69 baseline: &ReliabilityBaseline,
70) -> ReliabilityBaselineComparison {
71 let scoped = scoped_reliability_metrics(report);
72 let unknown_errors = metric_value(&scoped, "suite", "reliability_unknown_errors");
73 let mut rows = Vec::new();
74 let mut status = if unknown_errors > baseline.unknown_error_blocker_threshold {
75 ReliabilityBaselineStatus::Blocked
76 } else {
77 ReliabilityBaselineStatus::Ok
78 };
79 for expectation in &baseline.expectations {
80 let current = metric_value(&scoped, &expectation.scope, &expectation.metric);
81 let allowed = expectation
82 .max_count
83 .saturating_add(expectation.max_increase);
84 let row_status = if expectation.metric == "reliability_unknown_errors"
85 && current > expectation.max_count
86 {
87 ReliabilityBaselineStatus::Blocked
88 } else if current > allowed {
89 ReliabilityBaselineStatus::Attention
90 } else {
91 ReliabilityBaselineStatus::Ok
92 };
93 status = merge_status(status, &row_status);
94 rows.push(ReliabilityBaselineRow {
95 scope: expectation.scope.clone(),
96 metric: expectation.metric.clone(),
97 current,
98 allowed,
99 status: row_status,
100 });
101 }
102 ReliabilityBaselineComparison {
103 status,
104 rows,
105 unknown_errors,
106 }
107}
108
109fn read_suite_report(output_dir: &Path, report_id: &str) -> anyhow::Result<EvalSuiteReport> {
110 let reports = list_eval_reports(output_dir)?;
111 let summary = reports
112 .into_iter()
113 .find(|report| report.id == report_id)
114 .ok_or_else(|| anyhow::anyhow!("eval report not found: {report_id}"))?;
115 let path = summary.path.join("eval-run.json");
116 Ok(serde_json::from_str(&std::fs::read_to_string(path)?)?)
117}
118
119fn reliability_baseline_markdown(comparison: &ReliabilityBaselineComparison) -> String {
120 let mut text = format!(
121 "\n## Reliability Baseline Comparison\n\nStatus: `{:?}`\n\nUnknown errors: `{}`\n\n| Scope | Metric | Current | Allowed | Status |\n| --- | --- | ---: | ---: | --- |\n",
122 comparison.status, comparison.unknown_errors
123 );
124 for row in &comparison.rows {
125 text.push_str(&format!(
126 "| `{}` | `{}` | {} | {} | `{:?}` |\n",
127 row.scope, row.metric, row.current, row.allowed, row.status
128 ));
129 }
130 text
131}
132
133fn scoped_reliability_metrics(report: &EvalSuiteReport) -> BTreeMap<(String, String), u64> {
134 let mut scoped = BTreeMap::new();
135 for result in &report.results {
136 add_result_metrics(&mut scoped, "suite", result);
137 let model = format!(
138 "model:{}/{}",
139 result.report.run.provider, result.report.run.model
140 );
141 add_result_metrics(&mut scoped, &model, result);
142 for tag in result
143 .report
144 .run
145 .tags
146 .iter()
147 .filter(|tag| tag.starts_with("tool:"))
148 {
149 add_result_metrics(&mut scoped, tag, result);
150 }
151 }
152 scoped
153}
154
155fn add_result_metrics(
156 scoped: &mut BTreeMap<(String, String), u64>,
157 scope: &str,
158 result: &EvalFixtureResult,
159) {
160 for metric in &result.report.metrics {
161 if !metric.name.starts_with("reliability_") {
162 continue;
163 }
164 *scoped
165 .entry((scope.to_string(), metric.name.clone()))
166 .or_insert(0) += metric.value.max(0.0) as u64;
167 }
168}
169
/// Returns the total stored for `(scope, metric)`, or `0` when the pair has no entry.
fn metric_value(scoped: &BTreeMap<(String, String), u64>, scope: &str, metric: &str) -> u64 {
    let key = (scope.to_owned(), metric.to_owned());
    match scoped.get(&key) {
        Some(&total) => total,
        None => 0,
    }
}
176
177fn merge_status(
178 current: ReliabilityBaselineStatus,
179 next: &ReliabilityBaselineStatus,
180) -> ReliabilityBaselineStatus {
181 match (current, next) {
182 (ReliabilityBaselineStatus::Blocked, _) | (_, ReliabilityBaselineStatus::Blocked) => {
183 ReliabilityBaselineStatus::Blocked
184 }
185 (ReliabilityBaselineStatus::Attention, _) | (_, ReliabilityBaselineStatus::Attention) => {
186 ReliabilityBaselineStatus::Attention
187 }
188 _ => ReliabilityBaselineStatus::Ok,
189 }
190}
191
#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use roder_api::events::{RoderEvent, ThreadId, TurnId};
    use roder_api::reliability::{
        ReliabilityContext, ReliabilityDetails, ReliabilityErrorClass, ReliabilityFailureRecorded,
    };
    use time::OffsetDateTime;

    use crate::{EvalMetric, EvalMetricKind, EvalOutcome, EvalReport, EvalRun, EvalTrajectory};

    use super::*;

    /// A single unknown error against a zero baseline blocks both the overall
    /// comparison and the matching row.
    #[test]
    fn reliability_baseline_flags_unknown_errors_as_blockers() {
        let metrics = vec![
            metric("reliability_unknown_errors", 1.0),
            metric("reliability_error_class_unknown", 1.0),
        ];
        let report = suite_report(vec![fixture_result(
            "unknown-panic",
            metrics,
            EvalOutcome::HarnessError,
        )]);
        let baseline = single_expectation_baseline("suite", "reliability_unknown_errors", 0, 0);

        let comparison = compare_reliability_baseline(&report, &baseline);

        assert_eq!(comparison.status, ReliabilityBaselineStatus::Blocked);
        assert_eq!(comparison.unknown_errors, 1);
        assert_eq!(
            comparison.rows[0].status,
            ReliabilityBaselineStatus::Blocked
        );
    }

    /// A model-scoped metric above `max_count + max_increase` raises `Attention`.
    #[test]
    fn reliability_baseline_flags_per_model_spikes() {
        let metrics = vec![metric("reliability_error_class_provider_error", 3.0)];
        let report = suite_report(vec![fixture_result(
            "provider-429",
            metrics,
            EvalOutcome::Pass,
        )]);
        let baseline = single_expectation_baseline(
            "model:mock/mock",
            "reliability_error_class_provider_error",
            1,
            1,
        );

        let comparison = compare_reliability_baseline(&report, &baseline);

        assert_eq!(comparison.status, ReliabilityBaselineStatus::Attention);
        assert_eq!(comparison.rows[0].current, 3);
        assert_eq!(comparison.rows[0].allowed, 2);
    }

    /// Builds a version-1 baseline with a zero unknown-error threshold and
    /// exactly one expectation.
    fn single_expectation_baseline(
        scope: &str,
        metric: &str,
        max_count: u64,
        max_increase: u64,
    ) -> ReliabilityBaseline {
        let expectation = ReliabilityBaselineExpectation {
            scope: scope.to_string(),
            metric: metric.to_string(),
            max_count,
            max_increase,
        };
        ReliabilityBaseline {
            version: 1,
            unknown_error_blocker_threshold: 0,
            expectations: vec![expectation],
        }
    }

    /// Wraps fixture results in a suite report with fixed metadata.
    fn suite_report(results: Vec<EvalFixtureResult>) -> EvalSuiteReport {
        let fixture_dir = PathBuf::from("evals/fixtures/reliability");
        let output_dir = PathBuf::from("/tmp/roder-evals");
        EvalSuiteReport {
            suite_id: "reliability".to_string(),
            fixture_dir,
            output_dir,
            offline: true,
            generated_at: OffsetDateTime::UNIX_EPOCH,
            results,
        }
    }

    /// Builds a fixture result for the `mock/mock` model, tagged `reliability`
    /// and `tool:read_file`, whose trajectory holds one recorded failure event
    /// of class `Unknown`.
    fn fixture_result(
        fixture_id: &str,
        metrics: Vec<EvalMetric>,
        outcome: EvalOutcome,
    ) -> EvalFixtureResult {
        let run = EvalRun {
            suite_id: "reliability".to_string(),
            run_id: "run".to_string(),
            provider: "mock".to_string(),
            model: "mock".to_string(),
            started_at: OffsetDateTime::UNIX_EPOCH,
            tags: vec!["reliability".to_string(), "tool:read_file".to_string()],
        };

        let failure_event = RoderEvent::ReliabilityFailureRecorded(ReliabilityFailureRecorded {
            context: ReliabilityContext {
                thread_id: "thread".to_string(),
                turn_id: "turn".to_string(),
                tool_id: None,
                tool_name: None,
                provider: Some("mock".to_string()),
                model: Some("mock".to_string()),
            },
            error_class: ReliabilityErrorClass::Unknown,
            details: ReliabilityDetails::redacted("test"),
            timestamp: OffsetDateTime::UNIX_EPOCH,
        });
        let trajectory = EvalTrajectory::from_events(
            ThreadId::from("thread"),
            TurnId::from("turn"),
            &[failure_event],
        );

        EvalFixtureResult {
            fixture_id: fixture_id.to_string(),
            title: fixture_id.to_string(),
            workspace: PathBuf::from("/tmp/workspace"),
            final_answer: String::new(),
            report: EvalReport {
                run,
                outcome,
                failure_class: None,
                trajectory,
                metrics,
            },
            trace_excerpt: Vec::new(),
            failure_message: None,
        }
    }

    /// Builds a unitless count metric.
    fn metric(name: &str, value: f64) -> EvalMetric {
        EvalMetric {
            name: name.to_string(),
            kind: EvalMetricKind::Count,
            value,
            unit: None,
        }
    }
}