use std::cmp::Reverse;
use std::collections::BTreeMap;
use std::path::{Path, PathBuf};
use roder_api::events::RoderEvent;
use roder_api::inference::InferenceEvent;
use serde::{Deserialize, Serialize};
use time::OffsetDateTime;
use crate::retrieval_router::retrieval_router_markdown;
use crate::{EvalMetric, EvalMetricKind, EvalOutcome, EvalTrajectory, EvalTrajectoryEvent};
use super::lazy_discovery::lazy_discovery_markdown;
use super::reliability::{
ReliabilityReportSummary, reliability_markdown, reliability_metrics, reliability_summary,
};
/// Result of one eval suite run: suite identity, the directories involved,
/// and one [`EvalFixtureResult`] per fixture.
///
/// Serialized with camelCase field names; `write_eval_report_files` stores
/// this value as `eval-run.json` and `collect_eval_reports` reads it back.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "camelCase")]
pub struct EvalSuiteReport {
/// Identifier of the suite; shown as `Suite` in the Markdown report.
pub suite_id: String,
/// Fixture directory recorded for the run (presumably where fixtures were
/// loaded from — confirm against the runner).
pub fixture_dir: PathBuf,
/// Output directory recorded for the run.
pub output_dir: PathBuf,
/// Offline flag recorded for the run.
/// NOTE(review): exact semantics are defined by the runner, not visible here.
pub offline: bool,
/// Generation timestamp, (de)serialized as an RFC 3339 string.
#[serde(with = "time::serde::rfc3339")]
pub generated_at: OffsetDateTime,
/// Per-fixture results, in the order they are rendered in the report.
pub results: Vec<EvalFixtureResult>,
}
/// Outcome of a single fixture within an [`EvalSuiteReport`].
///
/// Serialized with camelCase field names.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "camelCase")]
pub struct EvalFixtureResult {
/// Fixture identifier; used as the row label in every report table.
pub fixture_id: String,
/// Fixture title.
pub title: String,
/// Workspace path recorded for this fixture.
pub workspace: PathBuf,
/// Final answer text recorded for this fixture.
pub final_answer: String,
/// Full eval report (outcome, failure class, metrics, run tags, trajectory).
pub report: crate::EvalReport,
/// Short slice of trajectory events shown in the report; defaults to empty
/// when the field is missing from the input.
#[serde(default)]
pub trace_excerpt: Vec<EvalTrajectoryEvent>,
/// Optional failure description; omitted from serialized output when `None`
/// and defaults to `None` when missing from the input.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub failure_message: Option<String>,
}
/// Compact description of one stored eval report, as produced by
/// `list_eval_reports`.
///
/// Serialized with camelCase field names.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "camelCase")]
pub struct EvalReportSummary {
/// Report identifier: `"eval-run"` for a report stored directly in the
/// listing root, otherwise the report directory relative to that root with
/// `/` as separator.
pub id: String,
/// Directory that contains the report's `eval-run.json`.
pub path: PathBuf,
/// Suite identifier copied from the stored report.
pub suite_id: String,
/// Number of fixture results in the report.
pub fixture_count: usize,
/// Number of results whose outcome is `EvalOutcome::Pass`.
pub passed: usize,
/// `fixture_count - passed` (saturating).
pub failed: usize,
/// Reliability summary computed by `reliability_summary`; defaults when the
/// field is missing from the input.
#[serde(default)]
pub reliability: ReliabilityReportSummary,
/// Generation timestamp copied from the report, (de)serialized as RFC 3339.
#[serde(with = "time::serde::rfc3339")]
pub generated_at: OffsetDateTime,
}
/// A stored report's summary together with its rendered Markdown, as returned
/// by `read_eval_report`.
///
/// Serialized with camelCase field names.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "camelCase")]
pub struct EvalReportDocument {
/// Summary of the report the Markdown belongs to.
pub summary: EvalReportSummary,
/// Contents of `eval-report.md`, possibly shortened (see `truncated`).
pub markdown: String,
/// `true` when the Markdown file was longer than the caller's size limit and
/// `markdown` was therefore shortened.
pub truncated: bool,
}
/// Persists `report` under `output_dir`, creating the directory if needed.
///
/// Two files are written: `eval-run.json` (pretty-printed JSON of the report)
/// and `eval-report.md` (the rendered Markdown report).
///
/// # Errors
/// Fails if the directory cannot be created, the report cannot be serialized,
/// or either file cannot be written.
pub fn write_eval_report_files(report: &EvalSuiteReport, output_dir: &Path) -> anyhow::Result<()> {
    std::fs::create_dir_all(output_dir)?;

    let json_path = output_dir.join("eval-run.json");
    let json = serde_json::to_string_pretty(report)?;
    std::fs::write(json_path, json)?;

    let markdown_path = output_dir.join("eval-report.md");
    let markdown = eval_report_markdown(report);
    std::fs::write(markdown_path, markdown)?;

    Ok(())
}
/// Lists every eval report stored at or below `output_dir`, newest first.
///
/// A missing `output_dir` yields an empty list. Reports with equal timestamps
/// keep their discovery order because the sort is stable.
///
/// # Errors
/// Fails if a directory cannot be read or an `eval-run.json` cannot be read or
/// parsed.
pub fn list_eval_reports(output_dir: &Path) -> anyhow::Result<Vec<EvalReportSummary>> {
    let mut summaries = Vec::new();
    collect_eval_reports(output_dir, output_dir, &mut summaries)?;
    // `Reverse` flips the ordering so the most recent report sorts first.
    summaries.sort_by_key(|summary| Reverse(summary.generated_at));
    Ok(summaries)
}
/// Loads the Markdown of the report identified by `report_id`.
///
/// The report is looked up among everything `list_eval_reports` finds under
/// `output_dir`. If `eval-report.md` is longer than `max_bytes` bytes, the
/// returned Markdown is cut to at most `max_bytes` bytes and `truncated` is
/// set. The cut is moved back to the nearest UTF-8 character boundary, so the
/// result may be a few bytes shorter than `max_bytes` but never longer.
///
/// # Errors
/// Fails if listing the reports fails, no report has the given id, or the
/// Markdown file cannot be read.
pub fn read_eval_report(
    output_dir: &Path,
    report_id: &str,
    max_bytes: usize,
) -> anyhow::Result<EvalReportDocument> {
    let summary = list_eval_reports(output_dir)?
        .into_iter()
        .find(|report| report.id == report_id)
        .ok_or_else(|| anyhow::anyhow!("eval report not found: {report_id}"))?;
    let mut markdown = std::fs::read_to_string(summary.path.join("eval-report.md"))?;
    let truncated = markdown.len() > max_bytes;
    if truncated {
        // `max_bytes` is a byte budget, so cut on bytes rather than counting
        // characters. Index 0 is always a boundary, so the loop terminates.
        let mut cut = max_bytes;
        while !markdown.is_char_boundary(cut) {
            cut -= 1;
        }
        markdown.truncate(cut);
    }
    Ok(EvalReportDocument {
        summary,
        markdown,
        truncated,
    })
}
pub(super) fn eval_metrics(
events: &[RoderEvent],
wall_time_ms: u128,
outcome: &EvalOutcome,
) -> Vec<EvalMetric> {
let search = search_metrics(events);
let model_calls = events
.iter()
.filter(|event| matches!(event, RoderEvent::InferenceStarted(_)))
.count();
let tool_calls = events
.iter()
.filter(|event| matches!(event, RoderEvent::ToolCallRequested(_)))
.count();
let tool_errors = events
.iter()
.filter(|event| {
matches!(
event,
RoderEvent::ToolCallCompleted(completed) if completed.is_error
)
})
.count();
let child_tasks = events
.iter()
.filter(|event| {
matches!(
event,
RoderEvent::TaskStarted(_)
| RoderEvent::SubagentStarted(_)
| RoderEvent::TeamMemberStarted(_)
)
})
.count();
let deadline_remaining_seconds = events
.iter()
.filter_map(|event| match event {
RoderEvent::InferenceStarted(started) => started.deadline_remaining_seconds,
_ => None,
})
.next_back()
.unwrap_or(0);
let total_tokens = events
.iter()
.filter_map(|event| match event {
RoderEvent::InferenceEventReceived(received) => match &received.event {
InferenceEvent::Usage(usage) => Some(u64::from(usage.total_tokens)),
_ => None,
},
_ => None,
})
.sum::<u64>();
let context_tokens = events
.iter()
.filter_map(|event| match event {
RoderEvent::ContextAssemblyCompleted(completed) => {
Some(u64::from(completed.estimated_tokens))
}
_ => None,
})
.max()
.unwrap_or(0);
let context_bytes = events
.iter()
.filter_map(|event| match event {
RoderEvent::ContextAssemblyCompleted(completed) => Some(completed.total_byte_count),
_ => None,
})
.max()
.unwrap_or(0);
let entrypoint_candidates = events
.iter()
.filter_map(|event| match event {
RoderEvent::ContextEntrypointCandidatesInjected(injected) => {
Some(injected.candidate_count)
}
_ => None,
})
.sum::<u64>();
let entrypoint_injection_event = events
.iter()
.position(|event| matches!(event, RoderEvent::ContextEntrypointCandidatesInjected(_)))
.map(|index| index as u64 + 1)
.unwrap_or(0);
let first_relevant_file_read = events
.iter()
.position(is_relevant_file_read)
.map(|index| index as u64 + 1)
.unwrap_or(0);
let irrelevant_file_reads = events
.iter()
.filter(|event| is_file_read(event) && !is_relevant_file_read(event))
.count() as u64;
let truncation_follow_ups = count_truncation_follow_ups(events);
let tool_output_truncations = events
.iter()
.filter(|event| matches!(event, RoderEvent::ToolOutputTruncated(_)))
.count() as u64;
let task_ledger_updates = events
.iter()
.filter(|event| matches!(event, RoderEvent::TaskLedgerUpdated(_)))
.count() as u64;
let task_ledger_tasks = events
.iter()
.filter_map(|event| match event {
RoderEvent::TaskLedgerUpdated(updated) => Some(updated.tasks.len() as u64),
_ => None,
})
.next_back()
.unwrap_or(0);
let task_ledger_completed = events
.iter()
.filter_map(|event| match event {
RoderEvent::TaskLedgerUpdated(updated) => Some(updated.completed_count),
_ => None,
})
.next_back()
.unwrap_or(0);
let verification_required = events
.iter()
.filter(|event| matches!(event, RoderEvent::VerificationRequired(_)))
.count() as u64;
let verification_completed = events
.iter()
.filter(|event| {
matches!(
event,
RoderEvent::VerificationCompleted(completed) if completed.passed
)
})
.count() as u64;
let verification_failed = events
.iter()
.filter(|event| {
matches!(
event,
RoderEvent::VerificationCompleted(completed) if !completed.passed
)
})
.count() as u64;
let verification_skipped = events
.iter()
.filter(|event| matches!(event, RoderEvent::VerificationSkipped(_)))
.count() as u64;
let verification_open_gaps = events
.iter()
.filter_map(|event| match event {
RoderEvent::VerificationCompleted(completed) => Some(completed.open_gaps.len() as u64),
RoderEvent::VerificationRequired(required) => Some(required.open_gaps.len() as u64),
_ => None,
})
.next_back()
.unwrap_or(0);
let mut metrics = vec![
EvalMetric {
name: "outcome_pass".to_string(),
kind: EvalMetricKind::Outcome,
value: if outcome == &EvalOutcome::Pass {
1.0
} else {
0.0
},
unit: None,
},
EvalMetric {
name: "wall_time_ms".to_string(),
kind: EvalMetricKind::Duration,
value: wall_time_ms as f64,
unit: Some("ms".to_string()),
},
EvalMetric {
name: "model_calls".to_string(),
kind: EvalMetricKind::Count,
value: model_calls as f64,
unit: None,
},
EvalMetric {
name: "tool_calls".to_string(),
kind: EvalMetricKind::Count,
value: tool_calls as f64,
unit: None,
},
EvalMetric {
name: "child_task_count".to_string(),
kind: EvalMetricKind::Count,
value: child_tasks as f64,
unit: None,
},
EvalMetric {
name: "deadline_remaining_seconds".to_string(),
kind: EvalMetricKind::Duration,
value: deadline_remaining_seconds as f64,
unit: Some("s".to_string()),
},
EvalMetric {
name: "tool_errors".to_string(),
kind: EvalMetricKind::Count,
value: tool_errors as f64,
unit: None,
},
EvalMetric {
name: "total_tokens".to_string(),
kind: EvalMetricKind::Tokens,
value: total_tokens as f64,
unit: Some("tokens".to_string()),
},
EvalMetric {
name: "context_estimated_tokens".to_string(),
kind: EvalMetricKind::Tokens,
value: context_tokens as f64,
unit: Some("tokens".to_string()),
},
EvalMetric {
name: "context_bytes".to_string(),
kind: EvalMetricKind::Bytes,
value: context_bytes as f64,
unit: Some("bytes".to_string()),
},
EvalMetric {
name: "entrypoint_candidates".to_string(),
kind: EvalMetricKind::Count,
value: entrypoint_candidates as f64,
unit: None,
},
EvalMetric {
name: "entrypoint_injection_event".to_string(),
kind: EvalMetricKind::Count,
value: entrypoint_injection_event as f64,
unit: None,
},
EvalMetric {
name: "first_relevant_file_read_event".to_string(),
kind: EvalMetricKind::Count,
value: first_relevant_file_read as f64,
unit: None,
},
EvalMetric {
name: "irrelevant_file_reads".to_string(),
kind: EvalMetricKind::Count,
value: irrelevant_file_reads as f64,
unit: None,
},
EvalMetric {
name: "truncation_follow_ups".to_string(),
kind: EvalMetricKind::Count,
value: truncation_follow_ups as f64,
unit: None,
},
EvalMetric {
name: "tool_output_truncations".to_string(),
kind: EvalMetricKind::Count,
value: tool_output_truncations as f64,
unit: None,
},
EvalMetric {
name: "grep_calls".to_string(),
kind: EvalMetricKind::Count,
value: search.calls as f64,
unit: None,
},
EvalMetric {
name: "grep_indexed_calls".to_string(),
kind: EvalMetricKind::Count,
value: search.indexed_calls as f64,
unit: None,
},
EvalMetric {
name: "grep_scan_calls".to_string(),
kind: EvalMetricKind::Count,
value: search.scan_calls as f64,
unit: None,
},
EvalMetric {
name: "grep_fallback_calls".to_string(),
kind: EvalMetricKind::Count,
value: search.fallback_calls as f64,
unit: None,
},
EvalMetric {
name: "grep_candidate_files".to_string(),
kind: EvalMetricKind::Count,
value: search.candidate_files as f64,
unit: None,
},
EvalMetric {
name: "grep_verified_files".to_string(),
kind: EvalMetricKind::Count,
value: search.verified_files as f64,
unit: None,
},
EvalMetric {
name: "grep_elapsed_ms".to_string(),
kind: EvalMetricKind::Duration,
value: search.elapsed_ms as f64,
unit: Some("ms".to_string()),
},
EvalMetric {
name: "grep_index_bytes".to_string(),
kind: EvalMetricKind::Bytes,
value: search.index_bytes as f64,
unit: Some("bytes".to_string()),
},
EvalMetric {
name: "grep_index_build_time_ms".to_string(),
kind: EvalMetricKind::Duration,
value: search.index_build_time_ms as f64,
unit: Some("ms".to_string()),
},
EvalMetric {
name: "task_ledger_updates".to_string(),
kind: EvalMetricKind::Count,
value: task_ledger_updates as f64,
unit: None,
},
EvalMetric {
name: "task_ledger_tasks".to_string(),
kind: EvalMetricKind::Count,
value: task_ledger_tasks as f64,
unit: None,
},
EvalMetric {
name: "task_ledger_completed".to_string(),
kind: EvalMetricKind::Count,
value: task_ledger_completed as f64,
unit: None,
},
EvalMetric {
name: "verification_required".to_string(),
kind: EvalMetricKind::Count,
value: verification_required as f64,
unit: None,
},
EvalMetric {
name: "verification_completed".to_string(),
kind: EvalMetricKind::Count,
value: verification_completed as f64,
unit: None,
},
EvalMetric {
name: "verification_failed".to_string(),
kind: EvalMetricKind::Count,
value: verification_failed as f64,
unit: None,
},
EvalMetric {
name: "verification_skipped".to_string(),
kind: EvalMetricKind::Count,
value: verification_skipped as f64,
unit: None,
},
EvalMetric {
name: "verification_open_gaps".to_string(),
kind: EvalMetricKind::Count,
value: verification_open_gaps as f64,
unit: None,
},
];
metrics.extend(reliability_metrics(events, outcome));
metrics
}
/// Aggregated statistics over the completed `grep` tool calls of one run, as
/// filled in by `search_metrics`. All fields start at zero.
#[derive(Default)]
struct SearchEvalMetrics {
/// Number of completed `grep` tool calls (with or without a payload).
calls: u64,
/// Calls whose payload reports `"engine": "indexed"`.
indexed_calls: u64,
/// Calls whose payload reports `"engine": "scan"`.
scan_calls: u64,
/// Calls whose payload reports `"engine": "fallback"`.
fallback_calls: u64,
/// Sum of the payloads' `candidate_files` values.
candidate_files: u64,
/// Sum of the payloads' `verified_files` values.
verified_files: u64,
/// Sum of the payloads' `elapsed_ms` values.
elapsed_ms: u64,
/// Largest `index_bytes` value seen in any payload.
index_bytes: u64,
/// Largest `index_build_time_ms` value seen in any payload.
index_build_time_ms: u64,
}
/// Aggregates grep statistics from the completed `grep` tool calls in `events`.
///
/// Every completed `grep` call counts towards `calls`; the remaining fields
/// are only updated for calls that carry a display payload. Counters are
/// summed across calls, while the index size and index build time keep the
/// largest value seen.
fn search_metrics(events: &[RoderEvent]) -> SearchEvalMetrics {
    let grep_completions = events.iter().filter_map(|event| match event {
        RoderEvent::ToolCallCompleted(completed)
            if completed.tool_name.as_deref() == Some("grep") =>
        {
            Some(completed)
        }
        _ => None,
    });

    let mut totals = SearchEvalMetrics::default();
    for completed in grep_completions {
        totals.calls += 1;

        if let Some(payload) = completed.display_payload.as_ref() {
            let engine = payload.get("engine").and_then(serde_json::Value::as_str);
            if engine == Some("indexed") {
                totals.indexed_calls += 1;
            } else if engine == Some("scan") {
                totals.scan_calls += 1;
            } else if engine == Some("fallback") {
                totals.fallback_calls += 1;
            }

            totals.candidate_files += u64_payload(payload, "candidate_files");
            totals.verified_files += u64_payload(payload, "verified_files");
            totals.elapsed_ms += u64_payload(payload, "elapsed_ms");

            let index_bytes = u64_payload(payload, "index_bytes");
            totals.index_bytes = totals.index_bytes.max(index_bytes);
            let build_time_ms = u64_payload(payload, "index_build_time_ms");
            totals.index_build_time_ms = totals.index_build_time_ms.max(build_time_ms);
        }
    }
    totals
}
fn u64_payload(payload: &serde_json::Value, key: &str) -> u64 {
payload
.get(key)
.and_then(serde_json::Value::as_u64)
.unwrap_or_default()
}
/// Returns `true` when `event` is a completed call of the `read_file` tool.
fn is_file_read(event: &RoderEvent) -> bool {
    let RoderEvent::ToolCallCompleted(completed) = event else {
        return false;
    };
    completed.tool_name.as_deref() == Some("read_file")
}
/// Returns `true` when `event` is a completed `read_file` call whose display
/// payload, rendered as JSON text, contains the substring `relevant`.
///
/// NOTE(review): this is a plain substring match, so a payload containing
/// e.g. `irrelevant` also qualifies — confirm that is intended.
fn is_relevant_file_read(event: &RoderEvent) -> bool {
    let RoderEvent::ToolCallCompleted(completed) = event else {
        return false;
    };
    if completed.tool_name.as_deref() != Some("read_file") {
        return false;
    }
    match completed.display_payload.as_ref() {
        Some(payload) => payload.to_string().contains("relevant"),
        None => false,
    }
}
/// Counts how often a truncated tool output was followed by a `read_file`,
/// `grep` or `glob` request.
///
/// A truncation stays pending until the next such request, which consumes it;
/// several truncations before one request therefore count once.
fn count_truncation_follow_ups(events: &[RoderEvent]) -> u64 {
    let mut pending_truncation = false;
    let mut total = 0u64;
    for event in events {
        if let RoderEvent::ToolOutputTruncated(_) = event {
            pending_truncation = true;
        } else if let RoderEvent::ToolCallRequested(requested) = event {
            let is_follow_up_tool =
                matches!(requested.tool_name.as_str(), "read_file" | "grep" | "glob");
            if pending_truncation && is_follow_up_tool {
                total += 1;
                pending_truncation = false;
            }
        }
    }
    total
}
/// Recursively gathers a summary for every `eval-run.json` at or below `dir`.
///
/// `root` is the directory the listing started from and is used to derive
/// report ids: a report directly in `root` gets the id `eval-run`, any other
/// report gets its directory path relative to `root` with `/` separators.
/// A `dir` that does not exist contributes nothing.
///
/// # Errors
/// Fails if a directory cannot be read or an `eval-run.json` cannot be read or
/// parsed.
fn collect_eval_reports(
    root: &Path,
    dir: &Path,
    reports: &mut Vec<EvalReportSummary>,
) -> anyhow::Result<()> {
    if !dir.exists() {
        return Ok(());
    }

    let run_path = dir.join("eval-run.json");
    if run_path.exists() {
        let raw = std::fs::read_to_string(&run_path)?;
        let report: EvalSuiteReport = serde_json::from_str(&raw)?;
        let id = if dir == root {
            String::from("eval-run")
        } else {
            let relative = dir.strip_prefix(root).unwrap_or(dir);
            relative
                .to_string_lossy()
                .replace(std::path::MAIN_SEPARATOR, "/")
        };
        reports.push(summary_from_report(id, dir.to_path_buf(), &report));
    }

    // Descend into every sub-directory, including those of a report directory.
    for entry in std::fs::read_dir(dir)? {
        let child = entry?.path();
        if !child.is_dir() {
            continue;
        }
        collect_eval_reports(root, &child, reports)?;
    }
    Ok(())
}
/// Builds the listing summary for `report`, stored in directory `path` under
/// the identifier `id`.
///
/// `passed` counts results whose outcome is `Pass`; `failed` is every other
/// result.
fn summary_from_report(id: String, path: PathBuf, report: &EvalSuiteReport) -> EvalReportSummary {
    let fixture_count = report.results.len();
    let mut passed = 0usize;
    for result in &report.results {
        if result.report.outcome == EvalOutcome::Pass {
            passed += 1;
        }
    }
    EvalReportSummary {
        id,
        path,
        suite_id: report.suite_id.clone(),
        fixture_count,
        passed,
        failed: fixture_count.saturating_sub(passed),
        reliability: reliability_summary(report),
        generated_at: report.generated_at,
    }
}
/// Returns a copy of the last eight events of `trajectory`, or of all events
/// when there are fewer than eight.
pub(super) fn trajectory_excerpt(trajectory: &EvalTrajectory) -> Vec<EvalTrajectoryEvent> {
    let events = &trajectory.events;
    let skipped = events.len().saturating_sub(8);
    events.iter().skip(skipped).cloned().collect()
}
/// Renders `report` as the Markdown document stored in `eval-report.md`.
///
/// Sections, in order: header with pass/fail totals, pass rates per scope,
/// the fixture table, failure notes, speed metrics, the optional speed-policy
/// comparison, search metrics, the optional model-profile table, context
/// metrics, task-ledger metrics, verification metrics, the sections produced
/// by `retrieval_router_markdown`, `lazy_discovery_markdown` and
/// `reliability_markdown`, and finally the optional failure groups.
///
/// Failure notes are written after the fixture table has been completed. A
/// note contains blank lines, and a blank line ends a Markdown table, so
/// emitting a note between two rows would turn all later rows into plain text.
fn eval_report_markdown(report: &EvalSuiteReport) -> String {
    let passed = report
        .results
        .iter()
        .filter(|result| result.report.outcome == EvalOutcome::Pass)
        .count();
    let mut text = format!(
        "# Roder Eval Report\n\n- Suite: `{}`\n- Fixtures: {}\n- Passed: {}\n- Failed: {}\n",
        report.suite_id,
        report.results.len(),
        passed,
        report.results.len().saturating_sub(passed)
    );
    text.push_str(
        "\n## Pass Rates\n\n| Scope | Passed | Total | Pass rate |\n| --- | ---: | ---: | ---: |\n",
    );
    for (scope, passed, total) in pass_rate_rows(report) {
        // An empty scope is reported as 0% rather than dividing by zero.
        let rate = if total == 0 {
            0.0
        } else {
            (passed as f64 / total as f64) * 100.0
        };
        text.push_str(&format!(
            "| `{scope}` | {passed} | {total} | {rate:.1}% |\n"
        ));
    }
    text.push_str(
        "\n## Fixtures\n\n| Fixture | Outcome | Failure class | Trace excerpt |\n| --- | --- | --- | --- |\n",
    );
    // Collected while the table is written and appended once it is complete.
    let mut failure_notes = String::new();
    for result in &report.results {
        let class = result
            .report
            .failure_class
            .as_ref()
            .map(|class| format!("{class:?}"))
            .unwrap_or_else(|| "-".to_string());
        let excerpt = result
            .trace_excerpt
            .iter()
            .map(|event| event.event_type.as_str())
            .collect::<Vec<_>>()
            .join(" -> ");
        text.push_str(&format!(
            "| `{}` | `{:?}` | `{}` | {} |\n",
            result.fixture_id, result.report.outcome, class, excerpt
        ));
        if let Some(message) = &result.failure_message {
            failure_notes.push_str(&format!(
                "\nFailure `{}`: {}\n\n",
                result.fixture_id,
                message.replace('\n', " ")
            ));
        }
    }
    text.push_str(&failure_notes);
    text.push_str(
        "\n## Speed Metrics\n\n| Fixture | Policy | Wall ms | Model calls | Tool calls | Child tasks | Deadline remaining s | Outcome |\n| --- | --- | ---: | ---: | ---: | ---: | ---: | --- |\n",
    );
    for result in &report.results {
        text.push_str(&format!(
            "| `{}` | `{}` | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | `{:?}` |\n",
            result.fixture_id,
            speed_policy_label(result),
            metric_value(result, "wall_time_ms"),
            metric_value(result, "model_calls"),
            metric_value(result, "tool_calls"),
            metric_value(result, "child_task_count"),
            metric_value(result, "deadline_remaining_seconds"),
            result.report.outcome,
        ));
    }
    // Only shown when at least one fixture ran both with and without the
    // speed policy.
    let comparisons = speed_policy_comparisons(report);
    if !comparisons.is_empty() {
        text.push_str(
            "\n## Speed Policy Comparison\n\n| Fixture | Baseline wall ms | Speed wall ms | Delta ms | Baseline model calls | Speed model calls | Baseline tool calls | Speed tool calls | Baseline child tasks | Speed child tasks | Quality |\n| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- |\n",
        );
        for comparison in comparisons {
            text.push_str(&format!(
                "| `{}` | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {} |\n",
                comparison.fixture_id,
                comparison.baseline_wall_ms,
                comparison.speed_wall_ms,
                comparison.speed_wall_ms - comparison.baseline_wall_ms,
                comparison.baseline_model_calls,
                comparison.speed_model_calls,
                comparison.baseline_tool_calls,
                comparison.speed_tool_calls,
                comparison.baseline_child_tasks,
                comparison.speed_child_tasks,
                comparison.quality,
            ));
        }
    }
    text.push_str(
        "\n## Search Metrics\n\n| Fixture | Grep calls | Indexed | Scan | Fallback | Candidate files | Verified files | Grep elapsed ms | Index bytes | Index build ms |\n| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |\n",
    );
    for result in &report.results {
        text.push_str(&format!(
            "| `{}` | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} |\n",
            result.fixture_id,
            metric_value(result, "grep_calls"),
            metric_value(result, "grep_indexed_calls"),
            metric_value(result, "grep_scan_calls"),
            metric_value(result, "grep_fallback_calls"),
            metric_value(result, "grep_candidate_files"),
            metric_value(result, "grep_verified_files"),
            metric_value(result, "grep_elapsed_ms"),
            metric_value(result, "grep_index_bytes"),
            metric_value(result, "grep_index_build_time_ms"),
        ));
    }
    // Only shown when at least one result carries a `profile:` tag.
    let profile_comparisons = model_profile_comparisons(report);
    if !profile_comparisons.is_empty() {
        text.push_str(
            "\n## Model Profile Deltas\n\n| Fixture | Profile | Outcome | Failure class | Wall ms | Model calls | Tool calls |\n| --- | --- | --- | --- | ---: | ---: | ---: |\n",
        );
        for comparison in profile_comparisons {
            text.push_str(&format!(
                "| `{}` | `{}` | `{:?}` | `{}` | {:.0} | {:.0} | {:.0} |\n",
                comparison.fixture_id,
                comparison.profile,
                comparison.outcome,
                comparison.failure_class,
                comparison.wall_ms,
                comparison.model_calls,
                comparison.tool_calls,
            ));
        }
        text.push_str("\nRecommended profile changes should be made only when this table shows an improved failure class or equivalent quality with lower wall/model/tool cost.\n");
    }
    text.push_str(
        "\n## Context Metrics\n\n| Fixture | Context tokens | Context bytes | Entrypoint candidates | Entrypoint injection event | First relevant read event | Irrelevant reads | Truncation follow-ups | Tool output truncations |\n| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |\n",
    );
    for result in &report.results {
        text.push_str(&format!(
            "| `{}` | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} |\n",
            result.fixture_id,
            metric_value(result, "context_estimated_tokens"),
            metric_value(result, "context_bytes"),
            metric_value(result, "entrypoint_candidates"),
            metric_value(result, "entrypoint_injection_event"),
            metric_value(result, "first_relevant_file_read_event"),
            metric_value(result, "irrelevant_file_reads"),
            metric_value(result, "truncation_follow_ups"),
            metric_value(result, "tool_output_truncations"),
        ));
    }
    text.push_str(
        "\n## Task Ledger Metrics\n\n| Fixture | Updates | Tasks | Completed |\n| --- | ---: | ---: | ---: |\n",
    );
    for result in &report.results {
        text.push_str(&format!(
            "| `{}` | {:.0} | {:.0} | {:.0} |\n",
            result.fixture_id,
            metric_value(result, "task_ledger_updates"),
            metric_value(result, "task_ledger_tasks"),
            metric_value(result, "task_ledger_completed"),
        ));
    }
    text.push_str(
        "\n## Verification Metrics\n\n| Fixture | Required | Completed | Failed | Skipped | Open gaps | Remaining gaps |\n| --- | ---: | ---: | ---: | ---: | ---: | --- |\n",
    );
    for result in &report.results {
        text.push_str(&format!(
            "| `{}` | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {} |\n",
            result.fixture_id,
            metric_value(result, "verification_required"),
            metric_value(result, "verification_completed"),
            metric_value(result, "verification_failed"),
            metric_value(result, "verification_skipped"),
            metric_value(result, "verification_open_gaps"),
            markdown_cell(&verification_remaining_gaps(
                &result.report.trajectory.events
            )),
        ));
    }
    text.push_str(&retrieval_router_markdown(report));
    text.push_str(&lazy_discovery_markdown(report));
    text.push_str(&reliability_markdown(report));
    let groups = failure_groups(report);
    if !groups.is_empty() {
        text.push_str("\n## Failure Groups\n\n| Tool | Model | Failure class | Count |\n| --- | --- | --- | --- |\n");
        for ((tool, model, class), count) in groups {
            text.push_str(&format!("| `{tool}` | `{model}` | `{class}` | {count} |\n"));
        }
    }
    text
}
/// Looks up the metric called `name` on `result` and returns its value, or
/// `0.0` when the fixture has no such metric. The first match wins.
fn metric_value(result: &EvalFixtureResult, name: &str) -> f64 {
    for metric in result.report.metrics.iter() {
        if metric.name == name {
            return metric.value;
        }
    }
    0.0
}
/// Returns `"on"` when the run is tagged `speed_policy:on`, otherwise `"off"`.
fn speed_policy_label(result: &EvalFixtureResult) -> &'static str {
    for tag in result.report.run.tags.iter() {
        if tag == "speed_policy:on" {
            return "on";
        }
    }
    "off"
}
/// Extracts the profile name from the first run tag of the form
/// `profile:<name>`, or returns `None` when no such tag exists.
fn profile_label(result: &EvalFixtureResult) -> Option<String> {
    for tag in result.report.run.tags.iter() {
        if let Some(profile) = tag.strip_prefix("profile:") {
            return Some(profile.to_string());
        }
    }
    None
}
/// One row of the "Model Profile Deltas" report table: a fixture result that
/// carries a `profile:` tag, reduced to the values shown in the table.
struct ModelProfileComparison {
/// Fixture the row belongs to.
fixture_id: String,
/// Profile name taken from the `profile:` run tag.
profile: String,
/// Outcome of the fixture run.
outcome: EvalOutcome,
/// Debug rendering of the failure class, or `-` when there is none.
failure_class: String,
/// Value of the `wall_time_ms` metric.
wall_ms: f64,
/// Value of the `model_calls` metric.
model_calls: f64,
/// Value of the `tool_calls` metric.
tool_calls: f64,
}
/// Builds the rows of the model-profile table: one row per result that has a
/// `profile:` tag, ordered by fixture id and then by profile name.
fn model_profile_comparisons(report: &EvalSuiteReport) -> Vec<ModelProfileComparison> {
    let mut rows = Vec::new();
    for result in &report.results {
        // Results without a profile tag do not take part in the comparison.
        let Some(profile) = profile_label(result) else {
            continue;
        };
        let failure_class = match result.report.failure_class.as_ref() {
            Some(class) => format!("{class:?}"),
            None => "-".to_string(),
        };
        rows.push(ModelProfileComparison {
            fixture_id: result.fixture_id.clone(),
            profile,
            outcome: result.report.outcome.clone(),
            failure_class,
            wall_ms: metric_value(result, "wall_time_ms"),
            model_calls: metric_value(result, "model_calls"),
            tool_calls: metric_value(result, "tool_calls"),
        });
    }
    rows.sort_by(|left, right| {
        let left_key = (left.fixture_id.as_str(), left.profile.as_str());
        let right_key = (right.fixture_id.as_str(), right.profile.as_str());
        left_key.cmp(&right_key)
    });
    rows
}
/// One row of the "Speed Policy Comparison" report table: the metrics of the
/// same fixture run without (`baseline_*`) and with (`speed_*`) the
/// `speed_policy:on` tag.
struct SpeedPolicyComparison {
/// Fixture both runs belong to.
fixture_id: String,
/// `wall_time_ms` metric of the baseline run.
baseline_wall_ms: f64,
/// `wall_time_ms` metric of the speed-policy run.
speed_wall_ms: f64,
/// `model_calls` metric of the baseline run.
baseline_model_calls: f64,
/// `model_calls` metric of the speed-policy run.
speed_model_calls: f64,
/// `tool_calls` metric of the baseline run.
baseline_tool_calls: f64,
/// `tool_calls` metric of the speed-policy run.
speed_tool_calls: f64,
/// `child_task_count` metric of the baseline run.
baseline_child_tasks: f64,
/// `child_task_count` metric of the speed-policy run.
speed_child_tasks: f64,
/// Text stating whether both runs had the same outcome (`matched ...`) or
/// how it changed (`changed ... -> ...`).
quality: String,
}
/// Pairs baseline and speed-policy runs of the same fixture.
///
/// Results are grouped by fixture id; within a group the last result tagged
/// `speed_policy:on` is the speed run and the last other result is the
/// baseline. Only fixtures that have both produce a row. Rows are ordered by
/// fixture id.
fn speed_policy_comparisons(report: &EvalSuiteReport) -> Vec<SpeedPolicyComparison> {
    // Slot 0 holds the baseline run, slot 1 the speed-policy run.
    let mut pairs: BTreeMap<String, (Option<&EvalFixtureResult>, Option<&EvalFixtureResult>)> =
        BTreeMap::new();
    for result in &report.results {
        let slots = pairs.entry(result.fixture_id.clone()).or_default();
        if speed_policy_label(result) == "on" {
            slots.1 = Some(result);
        } else {
            slots.0 = Some(result);
        }
    }

    let mut comparisons = Vec::new();
    for (fixture_id, slots) in pairs {
        let (Some(baseline), Some(speed)) = slots else {
            continue;
        };
        let quality = if baseline.report.outcome == speed.report.outcome {
            format!("matched `{:?}`", speed.report.outcome)
        } else {
            format!(
                "changed `{:?}` -> `{:?}`",
                baseline.report.outcome, speed.report.outcome
            )
        };
        comparisons.push(SpeedPolicyComparison {
            fixture_id,
            baseline_wall_ms: metric_value(baseline, "wall_time_ms"),
            speed_wall_ms: metric_value(speed, "wall_time_ms"),
            baseline_model_calls: metric_value(baseline, "model_calls"),
            speed_model_calls: metric_value(speed, "model_calls"),
            baseline_tool_calls: metric_value(baseline, "tool_calls"),
            speed_tool_calls: metric_value(speed, "tool_calls"),
            baseline_child_tasks: metric_value(baseline, "child_task_count"),
            speed_child_tasks: metric_value(speed, "child_task_count"),
            quality,
        });
    }
    comparisons
}
/// Produces the "Remaining gaps" cell for a fixture: a pointer to the failure
/// details when the trajectory contains an erroring `verification_completed`
/// event, otherwise `-`.
fn verification_remaining_gaps(events: &[crate::EvalTrajectoryEvent]) -> String {
    let has_failed_verification = events
        .iter()
        .any(|event| event.event_type == "verification_completed" && event.is_error);
    if has_failed_verification {
        "see failure message or verification trace".to_string()
    } else {
        "-".to_string()
    }
}
/// Makes `value` safe to place inside a single Markdown table cell.
///
/// Pipes are escaped as `\|` so they are not read as column separators. Each
/// line ending is replaced by one space: `\n`, and also `\r\n` and a bare
/// `\r`, which Markdown likewise treats as line endings and which would
/// otherwise end the table row.
fn markdown_cell(value: &str) -> String {
    value
        .replace('|', "\\|")
        // Handle CRLF first so it collapses to one space rather than two.
        .replace("\r\n", " ")
        .replace(['\r', '\n'], " ")
}
/// Computes `(scope, passed, total)` rows for the pass-rate table.
///
/// Every result counts towards its `model:<provider>/<model>` scope and
/// towards each of its run tags that starts with `tool:`. Rows are ordered by
/// scope name.
fn pass_rate_rows(report: &EvalSuiteReport) -> Vec<(String, usize, usize)> {
    let mut tallies: BTreeMap<String, (usize, usize)> = BTreeMap::new();
    for result in &report.results {
        let run = &result.report.run;
        let pass_increment = if result.report.outcome == EvalOutcome::Pass {
            1
        } else {
            0
        };

        let mut scopes = vec![format!("model:{}/{}", run.provider, run.model)];
        scopes.extend(
            run.tags
                .iter()
                .filter(|tag| tag.starts_with("tool:"))
                .cloned(),
        );

        for scope in scopes {
            let (passed, total) = tallies.entry(scope).or_default();
            *passed += pass_increment;
            *total += 1;
        }
    }

    let mut rows = Vec::with_capacity(tallies.len());
    for (scope, (passed, total)) in tallies {
        rows.push((scope, passed, total));
    }
    rows
}
/// Counts non-passing results per `(tool, model, failure class)` triple.
///
/// The tool comes from the first `tool:` run tag (`unknown` when absent), the
/// model is `<provider>/<model>`, and the class is the debug rendering of the
/// failure class (`Unknown` when absent).
fn failure_groups(report: &EvalSuiteReport) -> BTreeMap<(String, String, String), usize> {
    let mut counts = BTreeMap::new();
    let failures = report
        .results
        .iter()
        .filter(|result| result.report.outcome != EvalOutcome::Pass);
    for result in failures {
        let run = &result.report.run;
        let tool = run
            .tags
            .iter()
            .find_map(|tag| tag.strip_prefix("tool:"))
            .unwrap_or("unknown")
            .to_string();
        let model = format!("{}/{}", run.provider, run.model);
        let class = match result.report.failure_class.as_ref() {
            Some(class) => format!("{class:?}"),
            None => "Unknown".to_string(),
        };
        *counts.entry((tool, model, class)).or_default() += 1;
    }
    counts
}
// Unit tests for `eval_metrics`: each test feeds a small hand-built event
// stream and checks the metrics derived from it by name.
#[cfg(test)]
mod tests {
use super::*;
use roder_api::events::{
ContextAssemblyCompleted, ContextEntrypointCandidatesInjected, InferenceStarted,
RoderEvent, ToolCallCompleted, ToolCallRequested, ToolOutputTruncated,
VerificationCompleted, VerificationRequired,
};
use roder_api::tasks::TaskStarted;
// Context size, entrypoint injection (count and 1-based position) and a
// truncation that is followed up by a `grep` request.
#[test]
fn context_eval_metrics_track_budget_entrypoints_and_truncation_follow_up() {
let events = vec![
RoderEvent::ContextAssemblyCompleted(ContextAssemblyCompleted {
thread_id: "thread-a".to_string(),
turn_id: "turn-a".to_string(),
block_count: 1,
total_byte_count: 800,
estimated_tokens: 200,
prompt_estimated_tokens: 200,
token_budget: Some(1_000),
timestamp: OffsetDateTime::UNIX_EPOCH,
}),
RoderEvent::ContextEntrypointCandidatesInjected(ContextEntrypointCandidatesInjected {
thread_id: "thread-a".to_string(),
turn_id: "turn-a".to_string(),
candidate_count: 3,
block_byte_count: 120,
estimated_tokens: 30,
timestamp: OffsetDateTime::UNIX_EPOCH,
}),
RoderEvent::ToolOutputTruncated(ToolOutputTruncated {
thread_id: "thread-a".to_string(),
turn_id: "turn-a".to_string(),
tool_id: "tool-a".to_string(),
tool_name: Some("grep".to_string()),
original_line_count: 1_000,
original_char_count: 40_000,
inline_char_count: 2_000,
artifact_backed: false,
timestamp: OffsetDateTime::UNIX_EPOCH,
}),
RoderEvent::ToolCallRequested(ToolCallRequested {
thread_id: "thread-a".to_string(),
turn_id: "turn-a".to_string(),
tool_id: "tool-b".to_string(),
tool_name: "grep".to_string(),
display_payload: None,
timestamp: OffsetDateTime::UNIX_EPOCH,
}),
];
let metrics = eval_metrics(&events, 42, &EvalOutcome::Pass);
// Looks a metric up by name; panics if the metric is missing.
let value = |name: &str| {
metrics
.iter()
.find(|metric| metric.name == name)
.map(|metric| metric.value)
.unwrap()
};
assert_eq!(value("context_estimated_tokens"), 200.0);
assert_eq!(value("context_bytes"), 800.0);
assert_eq!(value("entrypoint_candidates"), 3.0);
// The injection event is the second event in the stream (1-based).
assert_eq!(value("entrypoint_injection_event"), 2.0);
assert_eq!(value("truncation_follow_ups"), 1.0);
assert_eq!(value("tool_output_truncations"), 1.0);
}
// A required verification followed by a failed completion with one open gap.
#[test]
fn verification_eval_metrics_track_required_completed_and_gaps() {
let events = vec![
RoderEvent::VerificationRequired(VerificationRequired {
thread_id: "thread-a".to_string(),
turn_id: "turn-a".to_string(),
reason: "code_changes_without_verification".to_string(),
changed_files: vec!["src/lib.rs".to_string()],
tool_evidence: vec!["write_file: wrote src/lib.rs".to_string()],
tests_run: Vec::new(),
open_gaps: Vec::new(),
timestamp: OffsetDateTime::UNIX_EPOCH,
}),
RoderEvent::VerificationCompleted(VerificationCompleted {
thread_id: "thread-a".to_string(),
turn_id: "turn-a".to_string(),
passed: false,
changed_files: vec!["src/lib.rs".to_string()],
tool_evidence: vec!["write_file: wrote src/lib.rs".to_string()],
tests_run: Vec::new(),
open_gaps: vec!["tests not run".to_string()],
timestamp: OffsetDateTime::UNIX_EPOCH,
}),
];
let metrics = eval_metrics(&events, 42, &EvalOutcome::Fail);
// Looks a metric up by name; panics if the metric is missing.
let value = |name: &str| {
metrics
.iter()
.find(|metric| metric.name == name)
.map(|metric| metric.value)
.unwrap()
};
assert_eq!(value("verification_required"), 1.0);
// `verification_completed` only counts completions that passed.
assert_eq!(value("verification_completed"), 0.0);
assert_eq!(value("verification_failed"), 1.0);
// Open gaps come from the latest verification event (the completion).
assert_eq!(value("verification_open_gaps"), 1.0);
}
// A single completed `grep` call whose payload carries engine and timing data.
#[test]
fn search_eval_metrics_track_grep_engine_and_latency_metadata() {
let events = vec![RoderEvent::ToolCallCompleted(ToolCallCompleted {
thread_id: "thread-a".to_string(),
turn_id: "turn-a".to_string(),
tool_id: "grep-a".to_string(),
tool_name: Some("grep".to_string()),
display_payload: Some(serde_json::json!({
"query": "BUG_ROOT_CAUSE_TOKEN",
"engine": "indexed",
"candidate_files": 4,
"verified_files": 2,
"elapsed_ms": 7,
"index_bytes": 4096,
"index_build_time_ms": 3
})),
is_error: false,
output: None,
timestamp: OffsetDateTime::UNIX_EPOCH,
})];
let metrics = eval_metrics(&events, 42, &EvalOutcome::Pass);
// Looks a metric up by name; panics if the metric is missing.
let value = |name: &str| {
metrics
.iter()
.find(|metric| metric.name == name)
.map(|metric| metric.value)
.unwrap()
};
assert_eq!(value("grep_calls"), 1.0);
assert_eq!(value("grep_indexed_calls"), 1.0);
assert_eq!(value("grep_candidate_files"), 4.0);
assert_eq!(value("grep_verified_files"), 2.0);
assert_eq!(value("grep_elapsed_ms"), 7.0);
assert_eq!(value("grep_index_bytes"), 4096.0);
assert_eq!(value("grep_index_build_time_ms"), 3.0);
}
// One inference start (with a deadline) and one started child task.
#[test]
fn speed_eval_metrics_track_child_tasks_and_deadline_remaining() {
let events = vec![
RoderEvent::InferenceStarted(InferenceStarted {
thread_id: "thread-a".to_string(),
turn_id: "turn-a".to_string(),
engine_id: "mock".to_string(),
model: roder_api::inference::ModelSelection {
provider: "mock".to_string(),
model: "mock".to_string(),
},
reasoning: roder_api::inference::ReasoningConfig::default(),
speed_policy: None,
deadline_remaining_seconds: Some(27),
timestamp: OffsetDateTime::UNIX_EPOCH,
}),
RoderEvent::TaskStarted(TaskStarted {
task_id: "task-a".to_string(),
executor_id: "subagent".to_string(),
task_kind: "subagent".to_string(),
queue_depth: 0,
thread_id: Some("thread-a".to_string()),
turn_id: Some("turn-a".to_string()),
timestamp: OffsetDateTime::UNIX_EPOCH,
}),
];
let metrics = eval_metrics(&events, 42, &EvalOutcome::Pass);
// Looks a metric up by name; panics if the metric is missing.
let value = |name: &str| {
metrics
.iter()
.find(|metric| metric.name == name)
.map(|metric| metric.value)
.unwrap()
};
assert_eq!(value("model_calls"), 1.0);
assert_eq!(value("child_task_count"), 1.0);
assert_eq!(value("deadline_remaining_seconds"), 27.0);
}
}