1use std::cmp::Reverse;
2use std::collections::BTreeMap;
3use std::path::{Path, PathBuf};
4
5use roder_api::events::RoderEvent;
6use roder_api::inference::InferenceEvent;
7use serde::{Deserialize, Serialize};
8use time::OffsetDateTime;
9
10use crate::retrieval_router::retrieval_router_markdown;
11use crate::{EvalMetric, EvalMetricKind, EvalOutcome, EvalTrajectory, EvalTrajectoryEvent};
12
13use super::lazy_discovery::lazy_discovery_markdown;
14use super::reliability::{
15 ReliabilityReportSummary, reliability_markdown, reliability_metrics, reliability_summary,
16};
17
18#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
19#[serde(rename_all = "camelCase")]
20pub struct EvalSuiteReport {
21 pub suite_id: String,
22 pub fixture_dir: PathBuf,
23 pub output_dir: PathBuf,
24 pub offline: bool,
25 #[serde(with = "time::serde::rfc3339")]
26 pub generated_at: OffsetDateTime,
27 pub results: Vec<EvalFixtureResult>,
28}
29
30#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
31#[serde(rename_all = "camelCase")]
32pub struct EvalFixtureResult {
33 pub fixture_id: String,
34 pub title: String,
35 pub workspace: PathBuf,
36 pub final_answer: String,
37 pub report: crate::EvalReport,
38 #[serde(default)]
39 pub trace_excerpt: Vec<EvalTrajectoryEvent>,
40 #[serde(default, skip_serializing_if = "Option::is_none")]
41 pub failure_message: Option<String>,
42}
43
44#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
45#[serde(rename_all = "camelCase")]
46pub struct EvalReportSummary {
47 pub id: String,
48 pub path: PathBuf,
49 pub suite_id: String,
50 pub fixture_count: usize,
51 pub passed: usize,
52 pub failed: usize,
53 #[serde(default)]
54 pub reliability: ReliabilityReportSummary,
55 #[serde(with = "time::serde::rfc3339")]
56 pub generated_at: OffsetDateTime,
57}
58
59#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
60#[serde(rename_all = "camelCase")]
61pub struct EvalReportDocument {
62 pub summary: EvalReportSummary,
63 pub markdown: String,
64 pub truncated: bool,
65}
66
67pub fn write_eval_report_files(report: &EvalSuiteReport, output_dir: &Path) -> anyhow::Result<()> {
68 std::fs::create_dir_all(output_dir)?;
69 std::fs::write(
70 output_dir.join("eval-run.json"),
71 serde_json::to_string_pretty(report)?,
72 )?;
73 std::fs::write(
74 output_dir.join("eval-report.md"),
75 eval_report_markdown(report),
76 )?;
77 Ok(())
78}
79
80pub fn list_eval_reports(output_dir: &Path) -> anyhow::Result<Vec<EvalReportSummary>> {
81 let mut reports = Vec::new();
82 collect_eval_reports(output_dir, output_dir, &mut reports)?;
83 reports.sort_by_key(|report| Reverse(report.generated_at));
84 Ok(reports)
85}
86
87pub fn read_eval_report(
88 output_dir: &Path,
89 report_id: &str,
90 max_bytes: usize,
91) -> anyhow::Result<EvalReportDocument> {
92 let reports = list_eval_reports(output_dir)?;
93 let summary = reports
94 .into_iter()
95 .find(|report| report.id == report_id)
96 .ok_or_else(|| anyhow::anyhow!("eval report not found: {report_id}"))?;
97 let markdown_path = summary.path.join("eval-report.md");
98 let markdown = std::fs::read_to_string(&markdown_path)?;
99 let truncated = markdown.len() > max_bytes;
100 let markdown = if truncated {
101 markdown.chars().take(max_bytes).collect()
102 } else {
103 markdown
104 };
105 Ok(EvalReportDocument {
106 summary,
107 markdown,
108 truncated,
109 })
110}
111
112pub(super) fn eval_metrics(
113 events: &[RoderEvent],
114 wall_time_ms: u128,
115 outcome: &EvalOutcome,
116) -> Vec<EvalMetric> {
117 let search = search_metrics(events);
118 let model_calls = events
119 .iter()
120 .filter(|event| matches!(event, RoderEvent::InferenceStarted(_)))
121 .count();
122 let tool_calls = events
123 .iter()
124 .filter(|event| matches!(event, RoderEvent::ToolCallRequested(_)))
125 .count();
126 let tool_errors = events
127 .iter()
128 .filter(|event| {
129 matches!(
130 event,
131 RoderEvent::ToolCallCompleted(completed) if completed.is_error
132 )
133 })
134 .count();
135 let child_tasks = events
136 .iter()
137 .filter(|event| {
138 matches!(
139 event,
140 RoderEvent::TaskStarted(_)
141 | RoderEvent::SubagentStarted(_)
142 | RoderEvent::TeamMemberStarted(_)
143 )
144 })
145 .count();
146 let deadline_remaining_seconds = events
147 .iter()
148 .filter_map(|event| match event {
149 RoderEvent::InferenceStarted(started) => started.deadline_remaining_seconds,
150 _ => None,
151 })
152 .next_back()
153 .unwrap_or(0);
154 let total_tokens = events
155 .iter()
156 .filter_map(|event| match event {
157 RoderEvent::InferenceEventReceived(received) => match &received.event {
158 InferenceEvent::Usage(usage) => Some(u64::from(usage.total_tokens)),
159 _ => None,
160 },
161 _ => None,
162 })
163 .sum::<u64>();
164 let context_tokens = events
165 .iter()
166 .filter_map(|event| match event {
167 RoderEvent::ContextAssemblyCompleted(completed) => {
168 Some(u64::from(completed.estimated_tokens))
169 }
170 _ => None,
171 })
172 .max()
173 .unwrap_or(0);
174 let context_bytes = events
175 .iter()
176 .filter_map(|event| match event {
177 RoderEvent::ContextAssemblyCompleted(completed) => Some(completed.total_byte_count),
178 _ => None,
179 })
180 .max()
181 .unwrap_or(0);
182 let entrypoint_candidates = events
183 .iter()
184 .filter_map(|event| match event {
185 RoderEvent::ContextEntrypointCandidatesInjected(injected) => {
186 Some(injected.candidate_count)
187 }
188 _ => None,
189 })
190 .sum::<u64>();
191 let entrypoint_injection_event = events
192 .iter()
193 .position(|event| matches!(event, RoderEvent::ContextEntrypointCandidatesInjected(_)))
194 .map(|index| index as u64 + 1)
195 .unwrap_or(0);
196 let first_relevant_file_read = events
197 .iter()
198 .position(is_relevant_file_read)
199 .map(|index| index as u64 + 1)
200 .unwrap_or(0);
201 let irrelevant_file_reads = events
202 .iter()
203 .filter(|event| is_file_read(event) && !is_relevant_file_read(event))
204 .count() as u64;
205 let truncation_follow_ups = count_truncation_follow_ups(events);
206 let tool_output_truncations = events
207 .iter()
208 .filter(|event| matches!(event, RoderEvent::ToolOutputTruncated(_)))
209 .count() as u64;
210 let task_ledger_updates = events
211 .iter()
212 .filter(|event| matches!(event, RoderEvent::TaskLedgerUpdated(_)))
213 .count() as u64;
214 let task_ledger_tasks = events
215 .iter()
216 .filter_map(|event| match event {
217 RoderEvent::TaskLedgerUpdated(updated) => Some(updated.tasks.len() as u64),
218 _ => None,
219 })
220 .next_back()
221 .unwrap_or(0);
222 let task_ledger_completed = events
223 .iter()
224 .filter_map(|event| match event {
225 RoderEvent::TaskLedgerUpdated(updated) => Some(updated.completed_count),
226 _ => None,
227 })
228 .next_back()
229 .unwrap_or(0);
230 let verification_required = events
231 .iter()
232 .filter(|event| matches!(event, RoderEvent::VerificationRequired(_)))
233 .count() as u64;
234 let verification_completed = events
235 .iter()
236 .filter(|event| {
237 matches!(
238 event,
239 RoderEvent::VerificationCompleted(completed) if completed.passed
240 )
241 })
242 .count() as u64;
243 let verification_failed = events
244 .iter()
245 .filter(|event| {
246 matches!(
247 event,
248 RoderEvent::VerificationCompleted(completed) if !completed.passed
249 )
250 })
251 .count() as u64;
252 let verification_skipped = events
253 .iter()
254 .filter(|event| matches!(event, RoderEvent::VerificationSkipped(_)))
255 .count() as u64;
256 let verification_open_gaps = events
257 .iter()
258 .filter_map(|event| match event {
259 RoderEvent::VerificationCompleted(completed) => Some(completed.open_gaps.len() as u64),
260 RoderEvent::VerificationRequired(required) => Some(required.open_gaps.len() as u64),
261 _ => None,
262 })
263 .next_back()
264 .unwrap_or(0);
265 let mut metrics = vec![
266 EvalMetric {
267 name: "outcome_pass".to_string(),
268 kind: EvalMetricKind::Outcome,
269 value: if outcome == &EvalOutcome::Pass {
270 1.0
271 } else {
272 0.0
273 },
274 unit: None,
275 },
276 EvalMetric {
277 name: "wall_time_ms".to_string(),
278 kind: EvalMetricKind::Duration,
279 value: wall_time_ms as f64,
280 unit: Some("ms".to_string()),
281 },
282 EvalMetric {
283 name: "model_calls".to_string(),
284 kind: EvalMetricKind::Count,
285 value: model_calls as f64,
286 unit: None,
287 },
288 EvalMetric {
289 name: "tool_calls".to_string(),
290 kind: EvalMetricKind::Count,
291 value: tool_calls as f64,
292 unit: None,
293 },
294 EvalMetric {
295 name: "child_task_count".to_string(),
296 kind: EvalMetricKind::Count,
297 value: child_tasks as f64,
298 unit: None,
299 },
300 EvalMetric {
301 name: "deadline_remaining_seconds".to_string(),
302 kind: EvalMetricKind::Duration,
303 value: deadline_remaining_seconds as f64,
304 unit: Some("s".to_string()),
305 },
306 EvalMetric {
307 name: "tool_errors".to_string(),
308 kind: EvalMetricKind::Count,
309 value: tool_errors as f64,
310 unit: None,
311 },
312 EvalMetric {
313 name: "total_tokens".to_string(),
314 kind: EvalMetricKind::Tokens,
315 value: total_tokens as f64,
316 unit: Some("tokens".to_string()),
317 },
318 EvalMetric {
319 name: "context_estimated_tokens".to_string(),
320 kind: EvalMetricKind::Tokens,
321 value: context_tokens as f64,
322 unit: Some("tokens".to_string()),
323 },
324 EvalMetric {
325 name: "context_bytes".to_string(),
326 kind: EvalMetricKind::Bytes,
327 value: context_bytes as f64,
328 unit: Some("bytes".to_string()),
329 },
330 EvalMetric {
331 name: "entrypoint_candidates".to_string(),
332 kind: EvalMetricKind::Count,
333 value: entrypoint_candidates as f64,
334 unit: None,
335 },
336 EvalMetric {
337 name: "entrypoint_injection_event".to_string(),
338 kind: EvalMetricKind::Count,
339 value: entrypoint_injection_event as f64,
340 unit: None,
341 },
342 EvalMetric {
343 name: "first_relevant_file_read_event".to_string(),
344 kind: EvalMetricKind::Count,
345 value: first_relevant_file_read as f64,
346 unit: None,
347 },
348 EvalMetric {
349 name: "irrelevant_file_reads".to_string(),
350 kind: EvalMetricKind::Count,
351 value: irrelevant_file_reads as f64,
352 unit: None,
353 },
354 EvalMetric {
355 name: "truncation_follow_ups".to_string(),
356 kind: EvalMetricKind::Count,
357 value: truncation_follow_ups as f64,
358 unit: None,
359 },
360 EvalMetric {
361 name: "tool_output_truncations".to_string(),
362 kind: EvalMetricKind::Count,
363 value: tool_output_truncations as f64,
364 unit: None,
365 },
366 EvalMetric {
367 name: "grep_calls".to_string(),
368 kind: EvalMetricKind::Count,
369 value: search.calls as f64,
370 unit: None,
371 },
372 EvalMetric {
373 name: "grep_indexed_calls".to_string(),
374 kind: EvalMetricKind::Count,
375 value: search.indexed_calls as f64,
376 unit: None,
377 },
378 EvalMetric {
379 name: "grep_scan_calls".to_string(),
380 kind: EvalMetricKind::Count,
381 value: search.scan_calls as f64,
382 unit: None,
383 },
384 EvalMetric {
385 name: "grep_fallback_calls".to_string(),
386 kind: EvalMetricKind::Count,
387 value: search.fallback_calls as f64,
388 unit: None,
389 },
390 EvalMetric {
391 name: "grep_candidate_files".to_string(),
392 kind: EvalMetricKind::Count,
393 value: search.candidate_files as f64,
394 unit: None,
395 },
396 EvalMetric {
397 name: "grep_verified_files".to_string(),
398 kind: EvalMetricKind::Count,
399 value: search.verified_files as f64,
400 unit: None,
401 },
402 EvalMetric {
403 name: "grep_elapsed_ms".to_string(),
404 kind: EvalMetricKind::Duration,
405 value: search.elapsed_ms as f64,
406 unit: Some("ms".to_string()),
407 },
408 EvalMetric {
409 name: "grep_index_bytes".to_string(),
410 kind: EvalMetricKind::Bytes,
411 value: search.index_bytes as f64,
412 unit: Some("bytes".to_string()),
413 },
414 EvalMetric {
415 name: "grep_index_build_time_ms".to_string(),
416 kind: EvalMetricKind::Duration,
417 value: search.index_build_time_ms as f64,
418 unit: Some("ms".to_string()),
419 },
420 EvalMetric {
421 name: "task_ledger_updates".to_string(),
422 kind: EvalMetricKind::Count,
423 value: task_ledger_updates as f64,
424 unit: None,
425 },
426 EvalMetric {
427 name: "task_ledger_tasks".to_string(),
428 kind: EvalMetricKind::Count,
429 value: task_ledger_tasks as f64,
430 unit: None,
431 },
432 EvalMetric {
433 name: "task_ledger_completed".to_string(),
434 kind: EvalMetricKind::Count,
435 value: task_ledger_completed as f64,
436 unit: None,
437 },
438 EvalMetric {
439 name: "verification_required".to_string(),
440 kind: EvalMetricKind::Count,
441 value: verification_required as f64,
442 unit: None,
443 },
444 EvalMetric {
445 name: "verification_completed".to_string(),
446 kind: EvalMetricKind::Count,
447 value: verification_completed as f64,
448 unit: None,
449 },
450 EvalMetric {
451 name: "verification_failed".to_string(),
452 kind: EvalMetricKind::Count,
453 value: verification_failed as f64,
454 unit: None,
455 },
456 EvalMetric {
457 name: "verification_skipped".to_string(),
458 kind: EvalMetricKind::Count,
459 value: verification_skipped as f64,
460 unit: None,
461 },
462 EvalMetric {
463 name: "verification_open_gaps".to_string(),
464 kind: EvalMetricKind::Count,
465 value: verification_open_gaps as f64,
466 unit: None,
467 },
468 ];
469 metrics.extend(reliability_metrics(events, outcome));
470 metrics
471}
472
/// Aggregated statistics over completed `grep` tool calls in one run.
#[derive(Default)]
struct SearchEvalMetrics {
    /// Number of completed `grep` calls, with or without a display payload.
    calls: u64,
    /// Calls whose payload reported `engine == "indexed"`.
    indexed_calls: u64,
    /// Calls whose payload reported `engine == "scan"`.
    scan_calls: u64,
    /// Calls whose payload reported `engine == "fallback"`.
    fallback_calls: u64,
    /// Sum of the payloads' `candidate_files` values.
    candidate_files: u64,
    /// Sum of the payloads' `verified_files` values.
    verified_files: u64,
    /// Sum of the payloads' `elapsed_ms` values.
    elapsed_ms: u64,
    /// Largest `index_bytes` value seen in any payload.
    index_bytes: u64,
    /// Largest `index_build_time_ms` value seen in any payload.
    index_build_time_ms: u64,
}
485
486fn search_metrics(events: &[RoderEvent]) -> SearchEvalMetrics {
487 let mut metrics = SearchEvalMetrics::default();
488 for event in events {
489 let RoderEvent::ToolCallCompleted(completed) = event else {
490 continue;
491 };
492 if completed.tool_name.as_deref() != Some("grep") {
493 continue;
494 }
495 metrics.calls += 1;
496 let Some(payload) = completed.display_payload.as_ref() else {
497 continue;
498 };
499 match payload.get("engine").and_then(serde_json::Value::as_str) {
500 Some("indexed") => metrics.indexed_calls += 1,
501 Some("scan") => metrics.scan_calls += 1,
502 Some("fallback") => metrics.fallback_calls += 1,
503 _ => {}
504 }
505 metrics.candidate_files += u64_payload(payload, "candidate_files");
506 metrics.verified_files += u64_payload(payload, "verified_files");
507 metrics.elapsed_ms += u64_payload(payload, "elapsed_ms");
508 metrics.index_bytes = metrics.index_bytes.max(u64_payload(payload, "index_bytes"));
509 metrics.index_build_time_ms = metrics
510 .index_build_time_ms
511 .max(u64_payload(payload, "index_build_time_ms"));
512 }
513 metrics
514}
515
516fn u64_payload(payload: &serde_json::Value, key: &str) -> u64 {
517 payload
518 .get(key)
519 .and_then(serde_json::Value::as_u64)
520 .unwrap_or_default()
521}
522
523fn is_file_read(event: &RoderEvent) -> bool {
524 matches!(
525 event,
526 RoderEvent::ToolCallCompleted(completed)
527 if completed.tool_name.as_deref() == Some("read_file")
528 )
529}
530
531fn is_relevant_file_read(event: &RoderEvent) -> bool {
532 matches!(
533 event,
534 RoderEvent::ToolCallCompleted(completed)
535 if completed.tool_name.as_deref() == Some("read_file")
536 && completed
537 .display_payload
538 .as_ref()
539 .is_some_and(|payload| payload.to_string().contains("relevant"))
540 )
541}
542
543fn count_truncation_follow_ups(events: &[RoderEvent]) -> u64 {
544 let mut saw_truncation = false;
545 let mut follow_ups = 0u64;
546 for event in events {
547 match event {
548 RoderEvent::ToolOutputTruncated(_) => saw_truncation = true,
549 RoderEvent::ToolCallRequested(requested)
550 if saw_truncation
551 && matches!(requested.tool_name.as_str(), "read_file" | "grep" | "glob") =>
552 {
553 follow_ups += 1;
554 saw_truncation = false;
555 }
556 _ => {}
557 }
558 }
559 follow_ups
560}
561
562fn collect_eval_reports(
563 root: &Path,
564 dir: &Path,
565 reports: &mut Vec<EvalReportSummary>,
566) -> anyhow::Result<()> {
567 if !dir.exists() {
568 return Ok(());
569 }
570 let run_path = dir.join("eval-run.json");
571 if run_path.exists() {
572 let report: EvalSuiteReport = serde_json::from_str(&std::fs::read_to_string(&run_path)?)?;
573 let id = if dir == root {
574 "eval-run".to_string()
575 } else {
576 dir.strip_prefix(root)
577 .unwrap_or(dir)
578 .to_string_lossy()
579 .replace(std::path::MAIN_SEPARATOR, "/")
580 };
581 reports.push(summary_from_report(id, dir.to_path_buf(), &report));
582 }
583 for entry in std::fs::read_dir(dir)? {
584 let path = entry?.path();
585 if path.is_dir() {
586 collect_eval_reports(root, &path, reports)?;
587 }
588 }
589 Ok(())
590}
591
592fn summary_from_report(id: String, path: PathBuf, report: &EvalSuiteReport) -> EvalReportSummary {
593 let passed = report
594 .results
595 .iter()
596 .filter(|result| result.report.outcome == EvalOutcome::Pass)
597 .count();
598 EvalReportSummary {
599 id,
600 path,
601 suite_id: report.suite_id.clone(),
602 fixture_count: report.results.len(),
603 passed,
604 failed: report.results.len().saturating_sub(passed),
605 reliability: reliability_summary(report),
606 generated_at: report.generated_at,
607 }
608}
609
610pub(super) fn trajectory_excerpt(trajectory: &EvalTrajectory) -> Vec<EvalTrajectoryEvent> {
611 let start = trajectory.events.len().saturating_sub(8);
612 trajectory.events[start..].to_vec()
613}
614
615fn eval_report_markdown(report: &EvalSuiteReport) -> String {
616 let passed = report
617 .results
618 .iter()
619 .filter(|result| result.report.outcome == EvalOutcome::Pass)
620 .count();
621 let mut text = format!(
622 "# Roder Eval Report\n\n- Suite: `{}`\n- Fixtures: {}\n- Passed: {}\n- Failed: {}\n",
623 report.suite_id,
624 report.results.len(),
625 passed,
626 report.results.len().saturating_sub(passed)
627 );
628 text.push_str(
629 "\n## Pass Rates\n\n| Scope | Passed | Total | Pass rate |\n| --- | ---: | ---: | ---: |\n",
630 );
631 for (scope, passed, total) in pass_rate_rows(report) {
632 let rate = if total == 0 {
633 0.0
634 } else {
635 (passed as f64 / total as f64) * 100.0
636 };
637 text.push_str(&format!(
638 "| `{scope}` | {passed} | {total} | {rate:.1}% |\n"
639 ));
640 }
641 text.push_str(
642 "\n## Fixtures\n\n| Fixture | Outcome | Failure class | Trace excerpt |\n| --- | --- | --- | --- |\n",
643 );
644 for result in &report.results {
645 let class = result
646 .report
647 .failure_class
648 .as_ref()
649 .map(|class| format!("{class:?}"))
650 .unwrap_or_else(|| "-".to_string());
651 let excerpt = result
652 .trace_excerpt
653 .iter()
654 .map(|event| event.event_type.as_str())
655 .collect::<Vec<_>>()
656 .join(" -> ");
657 text.push_str(&format!(
658 "| `{}` | `{:?}` | `{}` | {} |\n",
659 result.fixture_id, result.report.outcome, class, excerpt
660 ));
661 if let Some(message) = &result.failure_message {
662 text.push_str(&format!(
663 "\nFailure `{}`: {}\n\n",
664 result.fixture_id,
665 message.replace('\n', " ")
666 ));
667 }
668 }
669 text.push_str(
670 "\n## Speed Metrics\n\n| Fixture | Policy | Wall ms | Model calls | Tool calls | Child tasks | Deadline remaining s | Outcome |\n| --- | --- | ---: | ---: | ---: | ---: | ---: | --- |\n",
671 );
672 for result in &report.results {
673 text.push_str(&format!(
674 "| `{}` | `{}` | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | `{:?}` |\n",
675 result.fixture_id,
676 speed_policy_label(result),
677 metric_value(result, "wall_time_ms"),
678 metric_value(result, "model_calls"),
679 metric_value(result, "tool_calls"),
680 metric_value(result, "child_task_count"),
681 metric_value(result, "deadline_remaining_seconds"),
682 result.report.outcome,
683 ));
684 }
685 let comparisons = speed_policy_comparisons(report);
686 if !comparisons.is_empty() {
687 text.push_str(
688 "\n## Speed Policy Comparison\n\n| Fixture | Baseline wall ms | Speed wall ms | Delta ms | Baseline model calls | Speed model calls | Baseline tool calls | Speed tool calls | Baseline child tasks | Speed child tasks | Quality |\n| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- |\n",
689 );
690 for comparison in comparisons {
691 text.push_str(&format!(
692 "| `{}` | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {} |\n",
693 comparison.fixture_id,
694 comparison.baseline_wall_ms,
695 comparison.speed_wall_ms,
696 comparison.speed_wall_ms - comparison.baseline_wall_ms,
697 comparison.baseline_model_calls,
698 comparison.speed_model_calls,
699 comparison.baseline_tool_calls,
700 comparison.speed_tool_calls,
701 comparison.baseline_child_tasks,
702 comparison.speed_child_tasks,
703 comparison.quality,
704 ));
705 }
706 }
707 text.push_str(
708 "\n## Search Metrics\n\n| Fixture | Grep calls | Indexed | Scan | Fallback | Candidate files | Verified files | Grep elapsed ms | Index bytes | Index build ms |\n| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |\n",
709 );
710 for result in &report.results {
711 text.push_str(&format!(
712 "| `{}` | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} |\n",
713 result.fixture_id,
714 metric_value(result, "grep_calls"),
715 metric_value(result, "grep_indexed_calls"),
716 metric_value(result, "grep_scan_calls"),
717 metric_value(result, "grep_fallback_calls"),
718 metric_value(result, "grep_candidate_files"),
719 metric_value(result, "grep_verified_files"),
720 metric_value(result, "grep_elapsed_ms"),
721 metric_value(result, "grep_index_bytes"),
722 metric_value(result, "grep_index_build_time_ms"),
723 ));
724 }
725 let profile_comparisons = model_profile_comparisons(report);
726 if !profile_comparisons.is_empty() {
727 text.push_str(
728 "\n## Model Profile Deltas\n\n| Fixture | Profile | Outcome | Failure class | Wall ms | Model calls | Tool calls |\n| --- | --- | --- | --- | ---: | ---: | ---: |\n",
729 );
730 for comparison in profile_comparisons {
731 text.push_str(&format!(
732 "| `{}` | `{}` | `{:?}` | `{}` | {:.0} | {:.0} | {:.0} |\n",
733 comparison.fixture_id,
734 comparison.profile,
735 comparison.outcome,
736 comparison.failure_class,
737 comparison.wall_ms,
738 comparison.model_calls,
739 comparison.tool_calls,
740 ));
741 }
742 text.push_str("\nRecommended profile changes should be made only when this table shows an improved failure class or equivalent quality with lower wall/model/tool cost.\n");
743 }
744 text.push_str(
745 "\n## Context Metrics\n\n| Fixture | Context tokens | Context bytes | Entrypoint candidates | Entrypoint injection event | First relevant read event | Irrelevant reads | Truncation follow-ups | Tool output truncations |\n| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |\n",
746 );
747 for result in &report.results {
748 text.push_str(&format!(
749 "| `{}` | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} |\n",
750 result.fixture_id,
751 metric_value(result, "context_estimated_tokens"),
752 metric_value(result, "context_bytes"),
753 metric_value(result, "entrypoint_candidates"),
754 metric_value(result, "entrypoint_injection_event"),
755 metric_value(result, "first_relevant_file_read_event"),
756 metric_value(result, "irrelevant_file_reads"),
757 metric_value(result, "truncation_follow_ups"),
758 metric_value(result, "tool_output_truncations"),
759 ));
760 }
761 text.push_str(
762 "\n## Task Ledger Metrics\n\n| Fixture | Updates | Tasks | Completed |\n| --- | ---: | ---: | ---: |\n",
763 );
764 for result in &report.results {
765 text.push_str(&format!(
766 "| `{}` | {:.0} | {:.0} | {:.0} |\n",
767 result.fixture_id,
768 metric_value(result, "task_ledger_updates"),
769 metric_value(result, "task_ledger_tasks"),
770 metric_value(result, "task_ledger_completed"),
771 ));
772 }
773 text.push_str(
774 "\n## Verification Metrics\n\n| Fixture | Required | Completed | Failed | Skipped | Open gaps | Remaining gaps |\n| --- | ---: | ---: | ---: | ---: | ---: | --- |\n",
775 );
776 for result in &report.results {
777 text.push_str(&format!(
778 "| `{}` | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {} |\n",
779 result.fixture_id,
780 metric_value(result, "verification_required"),
781 metric_value(result, "verification_completed"),
782 metric_value(result, "verification_failed"),
783 metric_value(result, "verification_skipped"),
784 metric_value(result, "verification_open_gaps"),
785 markdown_cell(&verification_remaining_gaps(
786 &result.report.trajectory.events
787 )),
788 ));
789 }
790 text.push_str(&retrieval_router_markdown(report));
791 text.push_str(&lazy_discovery_markdown(report));
792 text.push_str(&reliability_markdown(report));
793 let groups = failure_groups(report);
794 if !groups.is_empty() {
795 text.push_str("\n## Failure Groups\n\n| Tool | Model | Failure class | Count |\n| --- | --- | --- | --- |\n");
796 for ((tool, model, class), count) in groups {
797 text.push_str(&format!("| `{tool}` | `{model}` | `{class}` | {count} |\n"));
798 }
799 }
800 text
801}
802
803fn metric_value(result: &EvalFixtureResult, name: &str) -> f64 {
804 result
805 .report
806 .metrics
807 .iter()
808 .find(|metric| metric.name == name)
809 .map(|metric| metric.value)
810 .unwrap_or(0.0)
811}
812
813fn speed_policy_label(result: &EvalFixtureResult) -> &'static str {
814 if result
815 .report
816 .run
817 .tags
818 .iter()
819 .any(|tag| tag == "speed_policy:on")
820 {
821 "on"
822 } else {
823 "off"
824 }
825}
826
827fn profile_label(result: &EvalFixtureResult) -> Option<String> {
828 result
829 .report
830 .run
831 .tags
832 .iter()
833 .find_map(|tag| tag.strip_prefix("profile:").map(str::to_string))
834}
835
836struct ModelProfileComparison {
837 fixture_id: String,
838 profile: String,
839 outcome: EvalOutcome,
840 failure_class: String,
841 wall_ms: f64,
842 model_calls: f64,
843 tool_calls: f64,
844}
845
846fn model_profile_comparisons(report: &EvalSuiteReport) -> Vec<ModelProfileComparison> {
847 let mut rows = report
848 .results
849 .iter()
850 .filter_map(|result| {
851 let profile = profile_label(result)?;
852 Some(ModelProfileComparison {
853 fixture_id: result.fixture_id.clone(),
854 profile,
855 outcome: result.report.outcome.clone(),
856 failure_class: result
857 .report
858 .failure_class
859 .as_ref()
860 .map(|class| format!("{class:?}"))
861 .unwrap_or_else(|| "-".to_string()),
862 wall_ms: metric_value(result, "wall_time_ms"),
863 model_calls: metric_value(result, "model_calls"),
864 tool_calls: metric_value(result, "tool_calls"),
865 })
866 })
867 .collect::<Vec<_>>();
868 rows.sort_by(|left, right| {
869 left.fixture_id
870 .cmp(&right.fixture_id)
871 .then_with(|| left.profile.cmp(&right.profile))
872 });
873 rows
874}
875
/// One row of the "Speed Policy Comparison" table: the same fixture run with
/// the speed policy off (baseline) and on (speed).
struct SpeedPolicyComparison {
    /// Fixture both runs belong to.
    fixture_id: String,
    /// `wall_time_ms` of the baseline run.
    baseline_wall_ms: f64,
    /// `wall_time_ms` of the speed-policy run.
    speed_wall_ms: f64,
    /// `model_calls` of the baseline run.
    baseline_model_calls: f64,
    /// `model_calls` of the speed-policy run.
    speed_model_calls: f64,
    /// `tool_calls` of the baseline run.
    baseline_tool_calls: f64,
    /// `tool_calls` of the speed-policy run.
    speed_tool_calls: f64,
    /// `child_task_count` of the baseline run.
    baseline_child_tasks: f64,
    /// `child_task_count` of the speed-policy run.
    speed_child_tasks: f64,
    /// Text stating whether the two outcomes matched or how they changed.
    quality: String,
}
888
889fn speed_policy_comparisons(report: &EvalSuiteReport) -> Vec<SpeedPolicyComparison> {
890 let mut by_fixture =
891 BTreeMap::<String, (Option<&EvalFixtureResult>, Option<&EvalFixtureResult>)>::new();
892 for result in &report.results {
893 let entry = by_fixture
894 .entry(result.fixture_id.clone())
895 .or_insert((None, None));
896 match speed_policy_label(result) {
897 "on" => entry.1 = Some(result),
898 _ => entry.0 = Some(result),
899 }
900 }
901 by_fixture
902 .into_iter()
903 .filter_map(|(fixture_id, (baseline, speed))| {
904 let baseline = baseline?;
905 let speed = speed?;
906 Some(SpeedPolicyComparison {
907 fixture_id,
908 baseline_wall_ms: metric_value(baseline, "wall_time_ms"),
909 speed_wall_ms: metric_value(speed, "wall_time_ms"),
910 baseline_model_calls: metric_value(baseline, "model_calls"),
911 speed_model_calls: metric_value(speed, "model_calls"),
912 baseline_tool_calls: metric_value(baseline, "tool_calls"),
913 speed_tool_calls: metric_value(speed, "tool_calls"),
914 baseline_child_tasks: metric_value(baseline, "child_task_count"),
915 speed_child_tasks: metric_value(speed, "child_task_count"),
916 quality: if baseline.report.outcome == speed.report.outcome {
917 format!("matched `{:?}`", speed.report.outcome)
918 } else {
919 format!(
920 "changed `{:?}` -> `{:?}`",
921 baseline.report.outcome, speed.report.outcome
922 )
923 },
924 })
925 })
926 .collect()
927}
928
929fn verification_remaining_gaps(events: &[crate::EvalTrajectoryEvent]) -> String {
930 events
931 .iter()
932 .rev()
933 .find(|event| event.event_type == "verification_completed" && event.is_error)
934 .map(|_| "see failure message or verification trace".to_string())
935 .unwrap_or_else(|| "-".to_string())
936}
937
/// Makes `value` safe to place inside a Markdown table cell.
///
/// Pipes are backslash-escaped and newlines become single spaces; every
/// other character is copied unchanged.
fn markdown_cell(value: &str) -> String {
    let mut cell = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '|' => cell.push_str("\\|"),
            '\n' => cell.push(' '),
            other => cell.push(other),
        }
    }
    cell
}
941
942fn pass_rate_rows(report: &EvalSuiteReport) -> Vec<(String, usize, usize)> {
943 let mut rows = BTreeMap::<String, (usize, usize)>::new();
944 for result in &report.results {
945 let passed = usize::from(result.report.outcome == EvalOutcome::Pass);
946 let model_scope = format!("{}/{}", result.report.run.provider, result.report.run.model);
947 let entry = rows.entry(format!("model:{model_scope}")).or_insert((0, 0));
948 entry.0 += passed;
949 entry.1 += 1;
950 for tag in result
951 .report
952 .run
953 .tags
954 .iter()
955 .filter(|tag| tag.starts_with("tool:"))
956 {
957 let entry = rows.entry(tag.clone()).or_insert((0, 0));
958 entry.0 += passed;
959 entry.1 += 1;
960 }
961 }
962 rows.into_iter()
963 .map(|(scope, (passed, total))| (scope, passed, total))
964 .collect()
965}
966
967fn failure_groups(report: &EvalSuiteReport) -> BTreeMap<(String, String, String), usize> {
968 let mut groups = BTreeMap::new();
969 for result in &report.results {
970 if result.report.outcome == EvalOutcome::Pass {
971 continue;
972 }
973 let tool = result
974 .report
975 .run
976 .tags
977 .iter()
978 .find_map(|tag| tag.strip_prefix("tool:"))
979 .unwrap_or("unknown")
980 .to_string();
981 let model = format!("{}/{}", result.report.run.provider, result.report.run.model);
982 let class = result
983 .report
984 .failure_class
985 .as_ref()
986 .map(|class| format!("{class:?}"))
987 .unwrap_or_else(|| "Unknown".to_string());
988 *groups.entry((tool, model, class)).or_insert(0) += 1;
989 }
990 groups
991}
992
#[cfg(test)]
mod tests {
    //! Unit tests for `eval_metrics`: each test builds a small synthetic event
    //! stream, runs `eval_metrics` over it, and checks individual metric values
    //! by name.

    use super::*;
    use roder_api::events::{
        ContextAssemblyCompleted, ContextEntrypointCandidatesInjected, InferenceStarted,
        RoderEvent, ToolCallCompleted, ToolCallRequested, ToolOutputTruncated,
        VerificationCompleted, VerificationRequired,
    };
    use roder_api::tasks::TaskStarted;

    /// Context metrics: one assembly event, one entrypoint injection, one
    /// truncated `grep` output, and a later `grep` request on the same turn.
    #[test]
    fn context_eval_metrics_track_budget_entrypoints_and_truncation_follow_up() {
        let events = vec![
            RoderEvent::ContextAssemblyCompleted(ContextAssemblyCompleted {
                thread_id: "thread-a".to_string(),
                turn_id: "turn-a".to_string(),
                block_count: 1,
                total_byte_count: 800,
                estimated_tokens: 200,
                prompt_estimated_tokens: 200,
                token_budget: Some(1_000),
                timestamp: OffsetDateTime::UNIX_EPOCH,
            }),
            RoderEvent::ContextEntrypointCandidatesInjected(ContextEntrypointCandidatesInjected {
                thread_id: "thread-a".to_string(),
                turn_id: "turn-a".to_string(),
                candidate_count: 3,
                block_byte_count: 120,
                estimated_tokens: 30,
                timestamp: OffsetDateTime::UNIX_EPOCH,
            }),
            RoderEvent::ToolOutputTruncated(ToolOutputTruncated {
                thread_id: "thread-a".to_string(),
                turn_id: "turn-a".to_string(),
                tool_id: "tool-a".to_string(),
                tool_name: Some("grep".to_string()),
                original_line_count: 1_000,
                original_char_count: 40_000,
                inline_char_count: 2_000,
                artifact_backed: false,
                timestamp: OffsetDateTime::UNIX_EPOCH,
            }),
            // Same tool name requested again after the truncation above.
            RoderEvent::ToolCallRequested(ToolCallRequested {
                thread_id: "thread-a".to_string(),
                turn_id: "turn-a".to_string(),
                tool_id: "tool-b".to_string(),
                tool_name: "grep".to_string(),
                display_payload: None,
                timestamp: OffsetDateTime::UNIX_EPOCH,
            }),
        ];

        let metrics = eval_metrics(&events, 42, &EvalOutcome::Pass);
        // Name the missing metric on failure instead of a bare `unwrap` panic.
        let value = |name: &str| {
            metrics
                .iter()
                .find(|metric| metric.name == name)
                .map(|metric| metric.value)
                .unwrap_or_else(|| panic!("eval_metrics emitted no metric named `{name}`"))
        };

        assert_eq!(value("context_estimated_tokens"), 200.0);
        assert_eq!(value("context_bytes"), 800.0);
        assert_eq!(value("entrypoint_candidates"), 3.0);
        // NOTE(review): the fixture holds a single
        // `ContextEntrypointCandidatesInjected` event — confirm that 2.0 (and the
        // singular metric name) is what `eval_metrics` is meant to report here.
        assert_eq!(value("entrypoint_injection_event"), 2.0);
        assert_eq!(value("truncation_follow_ups"), 1.0);
        assert_eq!(value("tool_output_truncations"), 1.0);
    }

    /// Verification metrics: one `VerificationRequired` event followed by one
    /// `VerificationCompleted` event that did not pass and lists one open gap.
    #[test]
    fn verification_eval_metrics_track_required_completed_and_gaps() {
        let events = vec![
            RoderEvent::VerificationRequired(VerificationRequired {
                thread_id: "thread-a".to_string(),
                turn_id: "turn-a".to_string(),
                reason: "code_changes_without_verification".to_string(),
                changed_files: vec!["src/lib.rs".to_string()],
                tool_evidence: vec!["write_file: wrote src/lib.rs".to_string()],
                tests_run: Vec::new(),
                open_gaps: Vec::new(),
                timestamp: OffsetDateTime::UNIX_EPOCH,
            }),
            RoderEvent::VerificationCompleted(VerificationCompleted {
                thread_id: "thread-a".to_string(),
                turn_id: "turn-a".to_string(),
                passed: false,
                changed_files: vec!["src/lib.rs".to_string()],
                tool_evidence: vec!["write_file: wrote src/lib.rs".to_string()],
                tests_run: Vec::new(),
                open_gaps: vec!["tests not run".to_string()],
                timestamp: OffsetDateTime::UNIX_EPOCH,
            }),
        ];

        let metrics = eval_metrics(&events, 42, &EvalOutcome::Fail);
        // Name the missing metric on failure instead of a bare `unwrap` panic.
        let value = |name: &str| {
            metrics
                .iter()
                .find(|metric| metric.name == name)
                .map(|metric| metric.value)
                .unwrap_or_else(|| panic!("eval_metrics emitted no metric named `{name}`"))
        };

        assert_eq!(value("verification_required"), 1.0);
        // NOTE(review): one `VerificationCompleted` event (with `passed: false`)
        // is present; presumably this metric counts only passing completions —
        // confirm against `eval_metrics`.
        assert_eq!(value("verification_completed"), 0.0);
        assert_eq!(value("verification_failed"), 1.0);
        assert_eq!(value("verification_open_gaps"), 1.0);
    }

    /// Search metrics: a single completed `grep` call whose display payload
    /// carries engine, candidate/verified file counts, and timing/index fields.
    #[test]
    fn search_eval_metrics_track_grep_engine_and_latency_metadata() {
        let events = vec![RoderEvent::ToolCallCompleted(ToolCallCompleted {
            thread_id: "thread-a".to_string(),
            turn_id: "turn-a".to_string(),
            tool_id: "grep-a".to_string(),
            tool_name: Some("grep".to_string()),
            display_payload: Some(serde_json::json!({
                "query": "BUG_ROOT_CAUSE_TOKEN",
                "engine": "indexed",
                "candidate_files": 4,
                "verified_files": 2,
                "elapsed_ms": 7,
                "index_bytes": 4096,
                "index_build_time_ms": 3
            })),
            is_error: false,
            output: None,
            timestamp: OffsetDateTime::UNIX_EPOCH,
        })];

        let metrics = eval_metrics(&events, 42, &EvalOutcome::Pass);
        // Name the missing metric on failure instead of a bare `unwrap` panic.
        let value = |name: &str| {
            metrics
                .iter()
                .find(|metric| metric.name == name)
                .map(|metric| metric.value)
                .unwrap_or_else(|| panic!("eval_metrics emitted no metric named `{name}`"))
        };

        assert_eq!(value("grep_calls"), 1.0);
        assert_eq!(value("grep_indexed_calls"), 1.0);
        assert_eq!(value("grep_candidate_files"), 4.0);
        assert_eq!(value("grep_verified_files"), 2.0);
        assert_eq!(value("grep_elapsed_ms"), 7.0);
        assert_eq!(value("grep_index_bytes"), 4096.0);
        assert_eq!(value("grep_index_build_time_ms"), 3.0);
    }

    /// Speed metrics: one inference start carrying a remaining-deadline value
    /// and one started task of kind `subagent`.
    #[test]
    fn speed_eval_metrics_track_child_tasks_and_deadline_remaining() {
        let events = vec![
            RoderEvent::InferenceStarted(InferenceStarted {
                thread_id: "thread-a".to_string(),
                turn_id: "turn-a".to_string(),
                engine_id: "mock".to_string(),
                model: roder_api::inference::ModelSelection {
                    provider: "mock".to_string(),
                    model: "mock".to_string(),
                },
                reasoning: roder_api::inference::ReasoningConfig::default(),
                speed_policy: None,
                deadline_remaining_seconds: Some(27),
                timestamp: OffsetDateTime::UNIX_EPOCH,
            }),
            RoderEvent::TaskStarted(TaskStarted {
                task_id: "task-a".to_string(),
                executor_id: "subagent".to_string(),
                task_kind: "subagent".to_string(),
                queue_depth: 0,
                thread_id: Some("thread-a".to_string()),
                turn_id: Some("turn-a".to_string()),
                timestamp: OffsetDateTime::UNIX_EPOCH,
            }),
        ];

        let metrics = eval_metrics(&events, 42, &EvalOutcome::Pass);
        // Name the missing metric on failure instead of a bare `unwrap` panic.
        let value = |name: &str| {
            metrics
                .iter()
                .find(|metric| metric.name == name)
                .map(|metric| metric.value)
                .unwrap_or_else(|| panic!("eval_metrics emitted no metric named `{name}`"))
        };

        assert_eq!(value("model_calls"), 1.0);
        assert_eq!(value("child_task_count"), 1.0);
        assert_eq!(value("deadline_remaining_seconds"), 27.0);
    }
}