roder_evals/runner/
report.rs

1use std::cmp::Reverse;
2use std::collections::BTreeMap;
3use std::path::{Path, PathBuf};
4
5use roder_api::events::RoderEvent;
6use roder_api::inference::InferenceEvent;
7use serde::{Deserialize, Serialize};
8use time::OffsetDateTime;
9
10use crate::retrieval_router::retrieval_router_markdown;
11use crate::{EvalMetric, EvalMetricKind, EvalOutcome, EvalTrajectory, EvalTrajectoryEvent};
12
13use super::lazy_discovery::lazy_discovery_markdown;
14use super::reliability::{
15    ReliabilityReportSummary, reliability_markdown, reliability_metrics, reliability_summary,
16};
17
18#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
19#[serde(rename_all = "camelCase")]
20pub struct EvalSuiteReport {
21    pub suite_id: String,
22    pub fixture_dir: PathBuf,
23    pub output_dir: PathBuf,
24    pub offline: bool,
25    #[serde(with = "time::serde::rfc3339")]
26    pub generated_at: OffsetDateTime,
27    pub results: Vec<EvalFixtureResult>,
28}
29
30#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
31#[serde(rename_all = "camelCase")]
32pub struct EvalFixtureResult {
33    pub fixture_id: String,
34    pub title: String,
35    pub workspace: PathBuf,
36    pub final_answer: String,
37    pub report: crate::EvalReport,
38    #[serde(default)]
39    pub trace_excerpt: Vec<EvalTrajectoryEvent>,
40    #[serde(default, skip_serializing_if = "Option::is_none")]
41    pub failure_message: Option<String>,
42}
43
44#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
45#[serde(rename_all = "camelCase")]
46pub struct EvalReportSummary {
47    pub id: String,
48    pub path: PathBuf,
49    pub suite_id: String,
50    pub fixture_count: usize,
51    pub passed: usize,
52    pub failed: usize,
53    #[serde(default)]
54    pub reliability: ReliabilityReportSummary,
55    #[serde(with = "time::serde::rfc3339")]
56    pub generated_at: OffsetDateTime,
57}
58
59#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
60#[serde(rename_all = "camelCase")]
61pub struct EvalReportDocument {
62    pub summary: EvalReportSummary,
63    pub markdown: String,
64    pub truncated: bool,
65}
66
67pub fn write_eval_report_files(report: &EvalSuiteReport, output_dir: &Path) -> anyhow::Result<()> {
68    std::fs::create_dir_all(output_dir)?;
69    std::fs::write(
70        output_dir.join("eval-run.json"),
71        serde_json::to_string_pretty(report)?,
72    )?;
73    std::fs::write(
74        output_dir.join("eval-report.md"),
75        eval_report_markdown(report),
76    )?;
77    Ok(())
78}
79
80pub fn list_eval_reports(output_dir: &Path) -> anyhow::Result<Vec<EvalReportSummary>> {
81    let mut reports = Vec::new();
82    collect_eval_reports(output_dir, output_dir, &mut reports)?;
83    reports.sort_by_key(|report| Reverse(report.generated_at));
84    Ok(reports)
85}
86
87pub fn read_eval_report(
88    output_dir: &Path,
89    report_id: &str,
90    max_bytes: usize,
91) -> anyhow::Result<EvalReportDocument> {
92    let reports = list_eval_reports(output_dir)?;
93    let summary = reports
94        .into_iter()
95        .find(|report| report.id == report_id)
96        .ok_or_else(|| anyhow::anyhow!("eval report not found: {report_id}"))?;
97    let markdown_path = summary.path.join("eval-report.md");
98    let markdown = std::fs::read_to_string(&markdown_path)?;
99    let truncated = markdown.len() > max_bytes;
100    let markdown = if truncated {
101        markdown.chars().take(max_bytes).collect()
102    } else {
103        markdown
104    };
105    Ok(EvalReportDocument {
106        summary,
107        markdown,
108        truncated,
109    })
110}
111
112pub(super) fn eval_metrics(
113    events: &[RoderEvent],
114    wall_time_ms: u128,
115    outcome: &EvalOutcome,
116) -> Vec<EvalMetric> {
117    let search = search_metrics(events);
118    let model_calls = events
119        .iter()
120        .filter(|event| matches!(event, RoderEvent::InferenceStarted(_)))
121        .count();
122    let tool_calls = events
123        .iter()
124        .filter(|event| matches!(event, RoderEvent::ToolCallRequested(_)))
125        .count();
126    let tool_errors = events
127        .iter()
128        .filter(|event| {
129            matches!(
130                event,
131                RoderEvent::ToolCallCompleted(completed) if completed.is_error
132            )
133        })
134        .count();
135    let child_tasks = events
136        .iter()
137        .filter(|event| {
138            matches!(
139                event,
140                RoderEvent::TaskStarted(_)
141                    | RoderEvent::SubagentStarted(_)
142                    | RoderEvent::TeamMemberStarted(_)
143            )
144        })
145        .count();
146    let deadline_remaining_seconds = events
147        .iter()
148        .filter_map(|event| match event {
149            RoderEvent::InferenceStarted(started) => started.deadline_remaining_seconds,
150            _ => None,
151        })
152        .next_back()
153        .unwrap_or(0);
154    let total_tokens = events
155        .iter()
156        .filter_map(|event| match event {
157            RoderEvent::InferenceEventReceived(received) => match &received.event {
158                InferenceEvent::Usage(usage) => Some(u64::from(usage.total_tokens)),
159                _ => None,
160            },
161            _ => None,
162        })
163        .sum::<u64>();
164    let context_tokens = events
165        .iter()
166        .filter_map(|event| match event {
167            RoderEvent::ContextAssemblyCompleted(completed) => {
168                Some(u64::from(completed.estimated_tokens))
169            }
170            _ => None,
171        })
172        .max()
173        .unwrap_or(0);
174    let context_bytes = events
175        .iter()
176        .filter_map(|event| match event {
177            RoderEvent::ContextAssemblyCompleted(completed) => Some(completed.total_byte_count),
178            _ => None,
179        })
180        .max()
181        .unwrap_or(0);
182    let entrypoint_candidates = events
183        .iter()
184        .filter_map(|event| match event {
185            RoderEvent::ContextEntrypointCandidatesInjected(injected) => {
186                Some(injected.candidate_count)
187            }
188            _ => None,
189        })
190        .sum::<u64>();
191    let entrypoint_injection_event = events
192        .iter()
193        .position(|event| matches!(event, RoderEvent::ContextEntrypointCandidatesInjected(_)))
194        .map(|index| index as u64 + 1)
195        .unwrap_or(0);
196    let first_relevant_file_read = events
197        .iter()
198        .position(is_relevant_file_read)
199        .map(|index| index as u64 + 1)
200        .unwrap_or(0);
201    let irrelevant_file_reads = events
202        .iter()
203        .filter(|event| is_file_read(event) && !is_relevant_file_read(event))
204        .count() as u64;
205    let truncation_follow_ups = count_truncation_follow_ups(events);
206    let tool_output_truncations = events
207        .iter()
208        .filter(|event| matches!(event, RoderEvent::ToolOutputTruncated(_)))
209        .count() as u64;
210    let task_ledger_updates = events
211        .iter()
212        .filter(|event| matches!(event, RoderEvent::TaskLedgerUpdated(_)))
213        .count() as u64;
214    let task_ledger_tasks = events
215        .iter()
216        .filter_map(|event| match event {
217            RoderEvent::TaskLedgerUpdated(updated) => Some(updated.tasks.len() as u64),
218            _ => None,
219        })
220        .next_back()
221        .unwrap_or(0);
222    let task_ledger_completed = events
223        .iter()
224        .filter_map(|event| match event {
225            RoderEvent::TaskLedgerUpdated(updated) => Some(updated.completed_count),
226            _ => None,
227        })
228        .next_back()
229        .unwrap_or(0);
230    let verification_required = events
231        .iter()
232        .filter(|event| matches!(event, RoderEvent::VerificationRequired(_)))
233        .count() as u64;
234    let verification_completed = events
235        .iter()
236        .filter(|event| {
237            matches!(
238                event,
239                RoderEvent::VerificationCompleted(completed) if completed.passed
240            )
241        })
242        .count() as u64;
243    let verification_failed = events
244        .iter()
245        .filter(|event| {
246            matches!(
247                event,
248                RoderEvent::VerificationCompleted(completed) if !completed.passed
249            )
250        })
251        .count() as u64;
252    let verification_skipped = events
253        .iter()
254        .filter(|event| matches!(event, RoderEvent::VerificationSkipped(_)))
255        .count() as u64;
256    let verification_open_gaps = events
257        .iter()
258        .filter_map(|event| match event {
259            RoderEvent::VerificationCompleted(completed) => Some(completed.open_gaps.len() as u64),
260            RoderEvent::VerificationRequired(required) => Some(required.open_gaps.len() as u64),
261            _ => None,
262        })
263        .next_back()
264        .unwrap_or(0);
265    let mut metrics = vec![
266        EvalMetric {
267            name: "outcome_pass".to_string(),
268            kind: EvalMetricKind::Outcome,
269            value: if outcome == &EvalOutcome::Pass {
270                1.0
271            } else {
272                0.0
273            },
274            unit: None,
275        },
276        EvalMetric {
277            name: "wall_time_ms".to_string(),
278            kind: EvalMetricKind::Duration,
279            value: wall_time_ms as f64,
280            unit: Some("ms".to_string()),
281        },
282        EvalMetric {
283            name: "model_calls".to_string(),
284            kind: EvalMetricKind::Count,
285            value: model_calls as f64,
286            unit: None,
287        },
288        EvalMetric {
289            name: "tool_calls".to_string(),
290            kind: EvalMetricKind::Count,
291            value: tool_calls as f64,
292            unit: None,
293        },
294        EvalMetric {
295            name: "child_task_count".to_string(),
296            kind: EvalMetricKind::Count,
297            value: child_tasks as f64,
298            unit: None,
299        },
300        EvalMetric {
301            name: "deadline_remaining_seconds".to_string(),
302            kind: EvalMetricKind::Duration,
303            value: deadline_remaining_seconds as f64,
304            unit: Some("s".to_string()),
305        },
306        EvalMetric {
307            name: "tool_errors".to_string(),
308            kind: EvalMetricKind::Count,
309            value: tool_errors as f64,
310            unit: None,
311        },
312        EvalMetric {
313            name: "total_tokens".to_string(),
314            kind: EvalMetricKind::Tokens,
315            value: total_tokens as f64,
316            unit: Some("tokens".to_string()),
317        },
318        EvalMetric {
319            name: "context_estimated_tokens".to_string(),
320            kind: EvalMetricKind::Tokens,
321            value: context_tokens as f64,
322            unit: Some("tokens".to_string()),
323        },
324        EvalMetric {
325            name: "context_bytes".to_string(),
326            kind: EvalMetricKind::Bytes,
327            value: context_bytes as f64,
328            unit: Some("bytes".to_string()),
329        },
330        EvalMetric {
331            name: "entrypoint_candidates".to_string(),
332            kind: EvalMetricKind::Count,
333            value: entrypoint_candidates as f64,
334            unit: None,
335        },
336        EvalMetric {
337            name: "entrypoint_injection_event".to_string(),
338            kind: EvalMetricKind::Count,
339            value: entrypoint_injection_event as f64,
340            unit: None,
341        },
342        EvalMetric {
343            name: "first_relevant_file_read_event".to_string(),
344            kind: EvalMetricKind::Count,
345            value: first_relevant_file_read as f64,
346            unit: None,
347        },
348        EvalMetric {
349            name: "irrelevant_file_reads".to_string(),
350            kind: EvalMetricKind::Count,
351            value: irrelevant_file_reads as f64,
352            unit: None,
353        },
354        EvalMetric {
355            name: "truncation_follow_ups".to_string(),
356            kind: EvalMetricKind::Count,
357            value: truncation_follow_ups as f64,
358            unit: None,
359        },
360        EvalMetric {
361            name: "tool_output_truncations".to_string(),
362            kind: EvalMetricKind::Count,
363            value: tool_output_truncations as f64,
364            unit: None,
365        },
366        EvalMetric {
367            name: "grep_calls".to_string(),
368            kind: EvalMetricKind::Count,
369            value: search.calls as f64,
370            unit: None,
371        },
372        EvalMetric {
373            name: "grep_indexed_calls".to_string(),
374            kind: EvalMetricKind::Count,
375            value: search.indexed_calls as f64,
376            unit: None,
377        },
378        EvalMetric {
379            name: "grep_scan_calls".to_string(),
380            kind: EvalMetricKind::Count,
381            value: search.scan_calls as f64,
382            unit: None,
383        },
384        EvalMetric {
385            name: "grep_fallback_calls".to_string(),
386            kind: EvalMetricKind::Count,
387            value: search.fallback_calls as f64,
388            unit: None,
389        },
390        EvalMetric {
391            name: "grep_candidate_files".to_string(),
392            kind: EvalMetricKind::Count,
393            value: search.candidate_files as f64,
394            unit: None,
395        },
396        EvalMetric {
397            name: "grep_verified_files".to_string(),
398            kind: EvalMetricKind::Count,
399            value: search.verified_files as f64,
400            unit: None,
401        },
402        EvalMetric {
403            name: "grep_elapsed_ms".to_string(),
404            kind: EvalMetricKind::Duration,
405            value: search.elapsed_ms as f64,
406            unit: Some("ms".to_string()),
407        },
408        EvalMetric {
409            name: "grep_index_bytes".to_string(),
410            kind: EvalMetricKind::Bytes,
411            value: search.index_bytes as f64,
412            unit: Some("bytes".to_string()),
413        },
414        EvalMetric {
415            name: "grep_index_build_time_ms".to_string(),
416            kind: EvalMetricKind::Duration,
417            value: search.index_build_time_ms as f64,
418            unit: Some("ms".to_string()),
419        },
420        EvalMetric {
421            name: "task_ledger_updates".to_string(),
422            kind: EvalMetricKind::Count,
423            value: task_ledger_updates as f64,
424            unit: None,
425        },
426        EvalMetric {
427            name: "task_ledger_tasks".to_string(),
428            kind: EvalMetricKind::Count,
429            value: task_ledger_tasks as f64,
430            unit: None,
431        },
432        EvalMetric {
433            name: "task_ledger_completed".to_string(),
434            kind: EvalMetricKind::Count,
435            value: task_ledger_completed as f64,
436            unit: None,
437        },
438        EvalMetric {
439            name: "verification_required".to_string(),
440            kind: EvalMetricKind::Count,
441            value: verification_required as f64,
442            unit: None,
443        },
444        EvalMetric {
445            name: "verification_completed".to_string(),
446            kind: EvalMetricKind::Count,
447            value: verification_completed as f64,
448            unit: None,
449        },
450        EvalMetric {
451            name: "verification_failed".to_string(),
452            kind: EvalMetricKind::Count,
453            value: verification_failed as f64,
454            unit: None,
455        },
456        EvalMetric {
457            name: "verification_skipped".to_string(),
458            kind: EvalMetricKind::Count,
459            value: verification_skipped as f64,
460            unit: None,
461        },
462        EvalMetric {
463            name: "verification_open_gaps".to_string(),
464            kind: EvalMetricKind::Count,
465            value: verification_open_gaps as f64,
466            unit: None,
467        },
468    ];
469    metrics.extend(reliability_metrics(events, outcome));
470    metrics
471}
472
/// Aggregated statistics about completed `grep` tool calls in one run.
///
/// All fields start at zero (`Default`) and are filled in by
/// `search_metrics`.
#[derive(Default)]
struct SearchEvalMetrics {
    /// Number of completed `grep` calls, with or without a display payload.
    calls: u64,
    /// Calls whose payload reports the `"indexed"` engine.
    indexed_calls: u64,
    /// Calls whose payload reports the `"scan"` engine.
    scan_calls: u64,
    /// Calls whose payload reports the `"fallback"` engine.
    fallback_calls: u64,
    /// Sum of the payloads' `candidate_files` values.
    candidate_files: u64,
    /// Sum of the payloads' `verified_files` values.
    verified_files: u64,
    /// Sum of the payloads' `elapsed_ms` values.
    elapsed_ms: u64,
    /// Largest `index_bytes` value seen in any payload.
    index_bytes: u64,
    /// Largest `index_build_time_ms` value seen in any payload.
    index_build_time_ms: u64,
}
485
486fn search_metrics(events: &[RoderEvent]) -> SearchEvalMetrics {
487    let mut metrics = SearchEvalMetrics::default();
488    for event in events {
489        let RoderEvent::ToolCallCompleted(completed) = event else {
490            continue;
491        };
492        if completed.tool_name.as_deref() != Some("grep") {
493            continue;
494        }
495        metrics.calls += 1;
496        let Some(payload) = completed.display_payload.as_ref() else {
497            continue;
498        };
499        match payload.get("engine").and_then(serde_json::Value::as_str) {
500            Some("indexed") => metrics.indexed_calls += 1,
501            Some("scan") => metrics.scan_calls += 1,
502            Some("fallback") => metrics.fallback_calls += 1,
503            _ => {}
504        }
505        metrics.candidate_files += u64_payload(payload, "candidate_files");
506        metrics.verified_files += u64_payload(payload, "verified_files");
507        metrics.elapsed_ms += u64_payload(payload, "elapsed_ms");
508        metrics.index_bytes = metrics.index_bytes.max(u64_payload(payload, "index_bytes"));
509        metrics.index_build_time_ms = metrics
510            .index_build_time_ms
511            .max(u64_payload(payload, "index_build_time_ms"));
512    }
513    metrics
514}
515
516fn u64_payload(payload: &serde_json::Value, key: &str) -> u64 {
517    payload
518        .get(key)
519        .and_then(serde_json::Value::as_u64)
520        .unwrap_or_default()
521}
522
523fn is_file_read(event: &RoderEvent) -> bool {
524    matches!(
525        event,
526        RoderEvent::ToolCallCompleted(completed)
527            if completed.tool_name.as_deref() == Some("read_file")
528    )
529}
530
531fn is_relevant_file_read(event: &RoderEvent) -> bool {
532    matches!(
533        event,
534        RoderEvent::ToolCallCompleted(completed)
535            if completed.tool_name.as_deref() == Some("read_file")
536                && completed
537                    .display_payload
538                    .as_ref()
539                    .is_some_and(|payload| payload.to_string().contains("relevant"))
540    )
541}
542
543fn count_truncation_follow_ups(events: &[RoderEvent]) -> u64 {
544    let mut saw_truncation = false;
545    let mut follow_ups = 0u64;
546    for event in events {
547        match event {
548            RoderEvent::ToolOutputTruncated(_) => saw_truncation = true,
549            RoderEvent::ToolCallRequested(requested)
550                if saw_truncation
551                    && matches!(requested.tool_name.as_str(), "read_file" | "grep" | "glob") =>
552            {
553                follow_ups += 1;
554                saw_truncation = false;
555            }
556            _ => {}
557        }
558    }
559    follow_ups
560}
561
562fn collect_eval_reports(
563    root: &Path,
564    dir: &Path,
565    reports: &mut Vec<EvalReportSummary>,
566) -> anyhow::Result<()> {
567    if !dir.exists() {
568        return Ok(());
569    }
570    let run_path = dir.join("eval-run.json");
571    if run_path.exists() {
572        let report: EvalSuiteReport = serde_json::from_str(&std::fs::read_to_string(&run_path)?)?;
573        let id = if dir == root {
574            "eval-run".to_string()
575        } else {
576            dir.strip_prefix(root)
577                .unwrap_or(dir)
578                .to_string_lossy()
579                .replace(std::path::MAIN_SEPARATOR, "/")
580        };
581        reports.push(summary_from_report(id, dir.to_path_buf(), &report));
582    }
583    for entry in std::fs::read_dir(dir)? {
584        let path = entry?.path();
585        if path.is_dir() {
586            collect_eval_reports(root, &path, reports)?;
587        }
588    }
589    Ok(())
590}
591
592fn summary_from_report(id: String, path: PathBuf, report: &EvalSuiteReport) -> EvalReportSummary {
593    let passed = report
594        .results
595        .iter()
596        .filter(|result| result.report.outcome == EvalOutcome::Pass)
597        .count();
598    EvalReportSummary {
599        id,
600        path,
601        suite_id: report.suite_id.clone(),
602        fixture_count: report.results.len(),
603        passed,
604        failed: report.results.len().saturating_sub(passed),
605        reliability: reliability_summary(report),
606        generated_at: report.generated_at,
607    }
608}
609
610pub(super) fn trajectory_excerpt(trajectory: &EvalTrajectory) -> Vec<EvalTrajectoryEvent> {
611    let start = trajectory.events.len().saturating_sub(8);
612    trajectory.events[start..].to_vec()
613}
614
615fn eval_report_markdown(report: &EvalSuiteReport) -> String {
616    let passed = report
617        .results
618        .iter()
619        .filter(|result| result.report.outcome == EvalOutcome::Pass)
620        .count();
621    let mut text = format!(
622        "# Roder Eval Report\n\n- Suite: `{}`\n- Fixtures: {}\n- Passed: {}\n- Failed: {}\n",
623        report.suite_id,
624        report.results.len(),
625        passed,
626        report.results.len().saturating_sub(passed)
627    );
628    text.push_str(
629        "\n## Pass Rates\n\n| Scope | Passed | Total | Pass rate |\n| --- | ---: | ---: | ---: |\n",
630    );
631    for (scope, passed, total) in pass_rate_rows(report) {
632        let rate = if total == 0 {
633            0.0
634        } else {
635            (passed as f64 / total as f64) * 100.0
636        };
637        text.push_str(&format!(
638            "| `{scope}` | {passed} | {total} | {rate:.1}% |\n"
639        ));
640    }
641    text.push_str(
642        "\n## Fixtures\n\n| Fixture | Outcome | Failure class | Trace excerpt |\n| --- | --- | --- | --- |\n",
643    );
644    for result in &report.results {
645        let class = result
646            .report
647            .failure_class
648            .as_ref()
649            .map(|class| format!("{class:?}"))
650            .unwrap_or_else(|| "-".to_string());
651        let excerpt = result
652            .trace_excerpt
653            .iter()
654            .map(|event| event.event_type.as_str())
655            .collect::<Vec<_>>()
656            .join(" -> ");
657        text.push_str(&format!(
658            "| `{}` | `{:?}` | `{}` | {} |\n",
659            result.fixture_id, result.report.outcome, class, excerpt
660        ));
661        if let Some(message) = &result.failure_message {
662            text.push_str(&format!(
663                "\nFailure `{}`: {}\n\n",
664                result.fixture_id,
665                message.replace('\n', " ")
666            ));
667        }
668    }
669    text.push_str(
670        "\n## Speed Metrics\n\n| Fixture | Policy | Wall ms | Model calls | Tool calls | Child tasks | Deadline remaining s | Outcome |\n| --- | --- | ---: | ---: | ---: | ---: | ---: | --- |\n",
671    );
672    for result in &report.results {
673        text.push_str(&format!(
674            "| `{}` | `{}` | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | `{:?}` |\n",
675            result.fixture_id,
676            speed_policy_label(result),
677            metric_value(result, "wall_time_ms"),
678            metric_value(result, "model_calls"),
679            metric_value(result, "tool_calls"),
680            metric_value(result, "child_task_count"),
681            metric_value(result, "deadline_remaining_seconds"),
682            result.report.outcome,
683        ));
684    }
685    let comparisons = speed_policy_comparisons(report);
686    if !comparisons.is_empty() {
687        text.push_str(
688            "\n## Speed Policy Comparison\n\n| Fixture | Baseline wall ms | Speed wall ms | Delta ms | Baseline model calls | Speed model calls | Baseline tool calls | Speed tool calls | Baseline child tasks | Speed child tasks | Quality |\n| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- |\n",
689        );
690        for comparison in comparisons {
691            text.push_str(&format!(
692                "| `{}` | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {} |\n",
693                comparison.fixture_id,
694                comparison.baseline_wall_ms,
695                comparison.speed_wall_ms,
696                comparison.speed_wall_ms - comparison.baseline_wall_ms,
697                comparison.baseline_model_calls,
698                comparison.speed_model_calls,
699                comparison.baseline_tool_calls,
700                comparison.speed_tool_calls,
701                comparison.baseline_child_tasks,
702                comparison.speed_child_tasks,
703                comparison.quality,
704            ));
705        }
706    }
707    text.push_str(
708        "\n## Search Metrics\n\n| Fixture | Grep calls | Indexed | Scan | Fallback | Candidate files | Verified files | Grep elapsed ms | Index bytes | Index build ms |\n| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |\n",
709    );
710    for result in &report.results {
711        text.push_str(&format!(
712            "| `{}` | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} |\n",
713            result.fixture_id,
714            metric_value(result, "grep_calls"),
715            metric_value(result, "grep_indexed_calls"),
716            metric_value(result, "grep_scan_calls"),
717            metric_value(result, "grep_fallback_calls"),
718            metric_value(result, "grep_candidate_files"),
719            metric_value(result, "grep_verified_files"),
720            metric_value(result, "grep_elapsed_ms"),
721            metric_value(result, "grep_index_bytes"),
722            metric_value(result, "grep_index_build_time_ms"),
723        ));
724    }
725    let profile_comparisons = model_profile_comparisons(report);
726    if !profile_comparisons.is_empty() {
727        text.push_str(
728            "\n## Model Profile Deltas\n\n| Fixture | Profile | Outcome | Failure class | Wall ms | Model calls | Tool calls |\n| --- | --- | --- | --- | ---: | ---: | ---: |\n",
729        );
730        for comparison in profile_comparisons {
731            text.push_str(&format!(
732                "| `{}` | `{}` | `{:?}` | `{}` | {:.0} | {:.0} | {:.0} |\n",
733                comparison.fixture_id,
734                comparison.profile,
735                comparison.outcome,
736                comparison.failure_class,
737                comparison.wall_ms,
738                comparison.model_calls,
739                comparison.tool_calls,
740            ));
741        }
742        text.push_str("\nRecommended profile changes should be made only when this table shows an improved failure class or equivalent quality with lower wall/model/tool cost.\n");
743    }
744    text.push_str(
745        "\n## Context Metrics\n\n| Fixture | Context tokens | Context bytes | Entrypoint candidates | Entrypoint injection event | First relevant read event | Irrelevant reads | Truncation follow-ups | Tool output truncations |\n| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |\n",
746    );
747    for result in &report.results {
748        text.push_str(&format!(
749            "| `{}` | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} |\n",
750            result.fixture_id,
751            metric_value(result, "context_estimated_tokens"),
752            metric_value(result, "context_bytes"),
753            metric_value(result, "entrypoint_candidates"),
754            metric_value(result, "entrypoint_injection_event"),
755            metric_value(result, "first_relevant_file_read_event"),
756            metric_value(result, "irrelevant_file_reads"),
757            metric_value(result, "truncation_follow_ups"),
758            metric_value(result, "tool_output_truncations"),
759        ));
760    }
761    text.push_str(
762        "\n## Task Ledger Metrics\n\n| Fixture | Updates | Tasks | Completed |\n| --- | ---: | ---: | ---: |\n",
763    );
764    for result in &report.results {
765        text.push_str(&format!(
766            "| `{}` | {:.0} | {:.0} | {:.0} |\n",
767            result.fixture_id,
768            metric_value(result, "task_ledger_updates"),
769            metric_value(result, "task_ledger_tasks"),
770            metric_value(result, "task_ledger_completed"),
771        ));
772    }
773    text.push_str(
774        "\n## Verification Metrics\n\n| Fixture | Required | Completed | Failed | Skipped | Open gaps | Remaining gaps |\n| --- | ---: | ---: | ---: | ---: | ---: | --- |\n",
775    );
776    for result in &report.results {
777        text.push_str(&format!(
778            "| `{}` | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {} |\n",
779            result.fixture_id,
780            metric_value(result, "verification_required"),
781            metric_value(result, "verification_completed"),
782            metric_value(result, "verification_failed"),
783            metric_value(result, "verification_skipped"),
784            metric_value(result, "verification_open_gaps"),
785            markdown_cell(&verification_remaining_gaps(
786                &result.report.trajectory.events
787            )),
788        ));
789    }
790    text.push_str(&retrieval_router_markdown(report));
791    text.push_str(&lazy_discovery_markdown(report));
792    text.push_str(&reliability_markdown(report));
793    let groups = failure_groups(report);
794    if !groups.is_empty() {
795        text.push_str("\n## Failure Groups\n\n| Tool | Model | Failure class | Count |\n| --- | --- | --- | --- |\n");
796        for ((tool, model, class), count) in groups {
797            text.push_str(&format!("| `{tool}` | `{model}` | `{class}` | {count} |\n"));
798        }
799    }
800    text
801}
802
803fn metric_value(result: &EvalFixtureResult, name: &str) -> f64 {
804    result
805        .report
806        .metrics
807        .iter()
808        .find(|metric| metric.name == name)
809        .map(|metric| metric.value)
810        .unwrap_or(0.0)
811}
812
813fn speed_policy_label(result: &EvalFixtureResult) -> &'static str {
814    if result
815        .report
816        .run
817        .tags
818        .iter()
819        .any(|tag| tag == "speed_policy:on")
820    {
821        "on"
822    } else {
823        "off"
824    }
825}
826
827fn profile_label(result: &EvalFixtureResult) -> Option<String> {
828    result
829        .report
830        .run
831        .tags
832        .iter()
833        .find_map(|tag| tag.strip_prefix("profile:").map(str::to_string))
834}
835
836struct ModelProfileComparison {
837    fixture_id: String,
838    profile: String,
839    outcome: EvalOutcome,
840    failure_class: String,
841    wall_ms: f64,
842    model_calls: f64,
843    tool_calls: f64,
844}
845
846fn model_profile_comparisons(report: &EvalSuiteReport) -> Vec<ModelProfileComparison> {
847    let mut rows = report
848        .results
849        .iter()
850        .filter_map(|result| {
851            let profile = profile_label(result)?;
852            Some(ModelProfileComparison {
853                fixture_id: result.fixture_id.clone(),
854                profile,
855                outcome: result.report.outcome.clone(),
856                failure_class: result
857                    .report
858                    .failure_class
859                    .as_ref()
860                    .map(|class| format!("{class:?}"))
861                    .unwrap_or_else(|| "-".to_string()),
862                wall_ms: metric_value(result, "wall_time_ms"),
863                model_calls: metric_value(result, "model_calls"),
864                tool_calls: metric_value(result, "tool_calls"),
865            })
866        })
867        .collect::<Vec<_>>();
868    rows.sort_by(|left, right| {
869        left.fixture_id
870            .cmp(&right.fixture_id)
871            .then_with(|| left.profile.cmp(&right.profile))
872    });
873    rows
874}
875
/// Side-by-side metrics for one fixture that has both a baseline run and a speed-policy run.
///
/// `baseline_*` fields come from the run labelled `"off"`, `speed_*` fields from the run
/// labelled `"on"`. A metric that is absent from a run is recorded as `0.0`.
struct SpeedPolicyComparison {
    /// Id of the fixture both runs belong to.
    fixture_id: String,
    /// `wall_time_ms` metric of the baseline run.
    baseline_wall_ms: f64,
    /// `wall_time_ms` metric of the speed-policy run.
    speed_wall_ms: f64,
    /// `model_calls` metric of the baseline run.
    baseline_model_calls: f64,
    /// `model_calls` metric of the speed-policy run.
    speed_model_calls: f64,
    /// `tool_calls` metric of the baseline run.
    baseline_tool_calls: f64,
    /// `tool_calls` metric of the speed-policy run.
    speed_tool_calls: f64,
    /// `child_task_count` metric of the baseline run.
    baseline_child_tasks: f64,
    /// `child_task_count` metric of the speed-policy run.
    speed_child_tasks: f64,
    /// Human-readable note saying whether the two runs ended with the same outcome.
    quality: String,
}
888
889fn speed_policy_comparisons(report: &EvalSuiteReport) -> Vec<SpeedPolicyComparison> {
890    let mut by_fixture =
891        BTreeMap::<String, (Option<&EvalFixtureResult>, Option<&EvalFixtureResult>)>::new();
892    for result in &report.results {
893        let entry = by_fixture
894            .entry(result.fixture_id.clone())
895            .or_insert((None, None));
896        match speed_policy_label(result) {
897            "on" => entry.1 = Some(result),
898            _ => entry.0 = Some(result),
899        }
900    }
901    by_fixture
902        .into_iter()
903        .filter_map(|(fixture_id, (baseline, speed))| {
904            let baseline = baseline?;
905            let speed = speed?;
906            Some(SpeedPolicyComparison {
907                fixture_id,
908                baseline_wall_ms: metric_value(baseline, "wall_time_ms"),
909                speed_wall_ms: metric_value(speed, "wall_time_ms"),
910                baseline_model_calls: metric_value(baseline, "model_calls"),
911                speed_model_calls: metric_value(speed, "model_calls"),
912                baseline_tool_calls: metric_value(baseline, "tool_calls"),
913                speed_tool_calls: metric_value(speed, "tool_calls"),
914                baseline_child_tasks: metric_value(baseline, "child_task_count"),
915                speed_child_tasks: metric_value(speed, "child_task_count"),
916                quality: if baseline.report.outcome == speed.report.outcome {
917                    format!("matched `{:?}`", speed.report.outcome)
918                } else {
919                    format!(
920                        "changed `{:?}` -> `{:?}`",
921                        baseline.report.outcome, speed.report.outcome
922                    )
923                },
924            })
925        })
926        .collect()
927}
928
929fn verification_remaining_gaps(events: &[crate::EvalTrajectoryEvent]) -> String {
930    events
931        .iter()
932        .rev()
933        .find(|event| event.event_type == "verification_completed" && event.is_error)
934        .map(|_| "see failure message or verification trace".to_string())
935        .unwrap_or_else(|| "-".to_string())
936}
937
/// Makes `value` safe to embed in a single markdown table cell.
///
/// Pipes are escaped as `\|` so they are not read as column separators, and every line
/// ending is flattened to one space so the row stays on one line. CRLF pairs and lone
/// carriage returns are handled as well as `\n`: CommonMark treats a bare `\r` as a line
/// ending too, so leaving it in place would still break the table row.
fn markdown_cell(value: &str) -> String {
    value
        // Collapse CRLF first so a Windows line ending becomes one space, not two.
        .replace("\r\n", " ")
        .replace(['\r', '\n'], " ")
        .replace('|', "\\|")
}
941
942fn pass_rate_rows(report: &EvalSuiteReport) -> Vec<(String, usize, usize)> {
943    let mut rows = BTreeMap::<String, (usize, usize)>::new();
944    for result in &report.results {
945        let passed = usize::from(result.report.outcome == EvalOutcome::Pass);
946        let model_scope = format!("{}/{}", result.report.run.provider, result.report.run.model);
947        let entry = rows.entry(format!("model:{model_scope}")).or_insert((0, 0));
948        entry.0 += passed;
949        entry.1 += 1;
950        for tag in result
951            .report
952            .run
953            .tags
954            .iter()
955            .filter(|tag| tag.starts_with("tool:"))
956        {
957            let entry = rows.entry(tag.clone()).or_insert((0, 0));
958            entry.0 += passed;
959            entry.1 += 1;
960        }
961    }
962    rows.into_iter()
963        .map(|(scope, (passed, total))| (scope, passed, total))
964        .collect()
965}
966
967fn failure_groups(report: &EvalSuiteReport) -> BTreeMap<(String, String, String), usize> {
968    let mut groups = BTreeMap::new();
969    for result in &report.results {
970        if result.report.outcome == EvalOutcome::Pass {
971            continue;
972        }
973        let tool = result
974            .report
975            .run
976            .tags
977            .iter()
978            .find_map(|tag| tag.strip_prefix("tool:"))
979            .unwrap_or("unknown")
980            .to_string();
981        let model = format!("{}/{}", result.report.run.provider, result.report.run.model);
982        let class = result
983            .report
984            .failure_class
985            .as_ref()
986            .map(|class| format!("{class:?}"))
987            .unwrap_or_else(|| "Unknown".to_string());
988        *groups.entry((tool, model, class)).or_insert(0) += 1;
989    }
990    groups
991}
992
#[cfg(test)]
mod tests {
    use super::*;
    use roder_api::events::{
        ContextAssemblyCompleted, ContextEntrypointCandidatesInjected, InferenceStarted,
        RoderEvent, ToolCallCompleted, ToolCallRequested, ToolOutputTruncated,
        VerificationCompleted, VerificationRequired,
    };
    use roder_api::tasks::TaskStarted;

    /// Timestamp stamped on every synthetic event.
    const EPOCH: OffsetDateTime = OffsetDateTime::UNIX_EPOCH;

    /// Thread id shared by every synthetic event.
    fn thread_id() -> String {
        "thread-a".to_string()
    }

    /// Turn id shared by every synthetic event.
    fn turn_id() -> String {
        "turn-a".to_string()
    }

    /// Expands to the value of the metric called `$name`; panics when no such metric exists.
    macro_rules! metric {
        ($metrics:expr, $name:expr) => {
            $metrics
                .iter()
                .find(|metric| metric.name == $name)
                .map(|metric| metric.value)
                .unwrap()
        };
    }

    #[test]
    fn context_eval_metrics_track_budget_entrypoints_and_truncation_follow_up() {
        let assembly = RoderEvent::ContextAssemblyCompleted(ContextAssemblyCompleted {
            thread_id: thread_id(),
            turn_id: turn_id(),
            block_count: 1,
            total_byte_count: 800,
            estimated_tokens: 200,
            prompt_estimated_tokens: 200,
            token_budget: Some(1_000),
            timestamp: EPOCH,
        });
        let entrypoints =
            RoderEvent::ContextEntrypointCandidatesInjected(ContextEntrypointCandidatesInjected {
                thread_id: thread_id(),
                turn_id: turn_id(),
                candidate_count: 3,
                block_byte_count: 120,
                estimated_tokens: 30,
                timestamp: EPOCH,
            });
        let truncation = RoderEvent::ToolOutputTruncated(ToolOutputTruncated {
            thread_id: thread_id(),
            turn_id: turn_id(),
            tool_id: "tool-a".to_string(),
            tool_name: Some("grep".to_string()),
            original_line_count: 1_000,
            original_char_count: 40_000,
            inline_char_count: 2_000,
            artifact_backed: false,
            timestamp: EPOCH,
        });
        // A second call to the same tool after the truncated output.
        let follow_up = RoderEvent::ToolCallRequested(ToolCallRequested {
            thread_id: thread_id(),
            turn_id: turn_id(),
            tool_id: "tool-b".to_string(),
            tool_name: "grep".to_string(),
            display_payload: None,
            timestamp: EPOCH,
        });
        let events = vec![assembly, entrypoints, truncation, follow_up];

        let metrics = eval_metrics(&events, 42, &EvalOutcome::Pass);

        assert_eq!(metric!(metrics, "context_estimated_tokens"), 200.0);
        assert_eq!(metric!(metrics, "context_bytes"), 800.0);
        assert_eq!(metric!(metrics, "entrypoint_candidates"), 3.0);
        // NOTE(review): only one injection event is built above, yet 2.0 is expected under a
        // singular metric name — confirm against `eval_metrics`.
        assert_eq!(metric!(metrics, "entrypoint_injection_event"), 2.0);
        assert_eq!(metric!(metrics, "truncation_follow_ups"), 1.0);
        assert_eq!(metric!(metrics, "tool_output_truncations"), 1.0);
    }

    #[test]
    fn verification_eval_metrics_track_required_completed_and_gaps() {
        let required = RoderEvent::VerificationRequired(VerificationRequired {
            thread_id: thread_id(),
            turn_id: turn_id(),
            reason: "code_changes_without_verification".to_string(),
            changed_files: vec!["src/lib.rs".to_string()],
            tool_evidence: vec!["write_file: wrote src/lib.rs".to_string()],
            tests_run: Vec::new(),
            open_gaps: Vec::new(),
            timestamp: EPOCH,
        });
        let completed = RoderEvent::VerificationCompleted(VerificationCompleted {
            thread_id: thread_id(),
            turn_id: turn_id(),
            passed: false,
            changed_files: vec!["src/lib.rs".to_string()],
            tool_evidence: vec!["write_file: wrote src/lib.rs".to_string()],
            tests_run: Vec::new(),
            open_gaps: vec!["tests not run".to_string()],
            timestamp: EPOCH,
        });
        let events = vec![required, completed];

        let metrics = eval_metrics(&events, 42, &EvalOutcome::Fail);

        assert_eq!(metric!(metrics, "verification_required"), 1.0);
        // NOTE(review): one completion event (with `passed: false`) is built above;
        // presumably this metric counts only passing completions — confirm against
        // `eval_metrics`.
        assert_eq!(metric!(metrics, "verification_completed"), 0.0);
        assert_eq!(metric!(metrics, "verification_failed"), 1.0);
        assert_eq!(metric!(metrics, "verification_open_gaps"), 1.0);
    }

    #[test]
    fn search_eval_metrics_track_grep_engine_and_latency_metadata() {
        let grep_payload = serde_json::json!({
            "query": "BUG_ROOT_CAUSE_TOKEN",
            "engine": "indexed",
            "candidate_files": 4,
            "verified_files": 2,
            "elapsed_ms": 7,
            "index_bytes": 4096,
            "index_build_time_ms": 3
        });
        let grep_completed = RoderEvent::ToolCallCompleted(ToolCallCompleted {
            thread_id: thread_id(),
            turn_id: turn_id(),
            tool_id: "grep-a".to_string(),
            tool_name: Some("grep".to_string()),
            display_payload: Some(grep_payload),
            is_error: false,
            output: None,
            timestamp: EPOCH,
        });
        let events = vec![grep_completed];

        let metrics = eval_metrics(&events, 42, &EvalOutcome::Pass);

        assert_eq!(metric!(metrics, "grep_calls"), 1.0);
        assert_eq!(metric!(metrics, "grep_indexed_calls"), 1.0);
        assert_eq!(metric!(metrics, "grep_candidate_files"), 4.0);
        assert_eq!(metric!(metrics, "grep_verified_files"), 2.0);
        assert_eq!(metric!(metrics, "grep_elapsed_ms"), 7.0);
        assert_eq!(metric!(metrics, "grep_index_bytes"), 4096.0);
        assert_eq!(metric!(metrics, "grep_index_build_time_ms"), 3.0);
    }

    #[test]
    fn speed_eval_metrics_track_child_tasks_and_deadline_remaining() {
        let inference = RoderEvent::InferenceStarted(InferenceStarted {
            thread_id: thread_id(),
            turn_id: turn_id(),
            engine_id: "mock".to_string(),
            model: roder_api::inference::ModelSelection {
                provider: "mock".to_string(),
                model: "mock".to_string(),
            },
            reasoning: roder_api::inference::ReasoningConfig::default(),
            speed_policy: None,
            deadline_remaining_seconds: Some(27),
            timestamp: EPOCH,
        });
        let child_task = RoderEvent::TaskStarted(TaskStarted {
            task_id: "task-a".to_string(),
            executor_id: "subagent".to_string(),
            task_kind: "subagent".to_string(),
            queue_depth: 0,
            thread_id: Some(thread_id()),
            turn_id: Some(turn_id()),
            timestamp: EPOCH,
        });
        let events = vec![inference, child_task];

        let metrics = eval_metrics(&events, 42, &EvalOutcome::Pass);

        assert_eq!(metric!(metrics, "model_calls"), 1.0);
        assert_eq!(metric!(metrics, "child_task_count"), 1.0);
        assert_eq!(metric!(metrics, "deadline_remaining_seconds"), 27.0);
    }
}
roder_evals/runner/report.rs

roder_evals/runner/
report.rs