use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::Instant;
use anyhow::Context;
use roder_api::catalog::PROVIDER_MOCK;
use roder_api::events::RoderEvent;
use roder_api::inference::{InstructionBundle, RuntimeProfile};
use roder_core::StartTurnRequest;
use serde::{Deserialize, Serialize};
use time::OffsetDateTime;
use tokio::time::Duration;
mod baseline;
mod fixture_metrics;
mod lazy_discovery;
mod reliability;
mod report;
mod runtime_harness;
#[cfg(test)]
mod tests;
mod workspace;
pub use baseline::{
ReliabilityBaseline, ReliabilityBaselineComparison, ReliabilityBaselineExpectation,
ReliabilityBaselineStatus, compare_eval_report_to_baseline, compare_reliability_baseline,
};
pub use reliability::ReliabilityReportSummary;
pub use report::{
EvalFixtureResult, EvalReportDocument, EvalReportSummary, EvalSuiteReport, list_eval_reports,
read_eval_report, write_eval_report_files,
};
use fixture_metrics::fixture_command_check_metrics;
use lazy_discovery::lazy_discovery_metrics;
use reliability::fixture_reliability_injection;
use report::{eval_metrics, trajectory_excerpt};
use runtime_harness::{
TurnCollectionError, build_fake_runtime, collect_turn_events, deadline_seconds_from_timeout_ms,
};
use workspace::{
create_workspace, failure_class_for_fixture, grade_expected_evidence, run_workspace_setup,
};
use crate::retrieval_router::grade_retrieval_router_fixture;
use crate::{EvalFailureClass, EvalFixture, EvalOutcome, EvalReport, EvalRun, EvalTrajectory};
/// Turn-collection timeout, in milliseconds, used when a fixture does not
/// specify its own `timeout_ms`.
const DEFAULT_TIMEOUT_MS: u64 = 30_000;
/// Options accepted by [`run_offline_eval_suite`].
///
/// Serialized with camelCase field names. Every field except `offline` and
/// `output_dir` has a serde default, so it may be omitted from serialized input.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "camelCase")]
pub struct OfflineEvalRunnerOptions {
/// Must be `true`; `run_offline_eval_suite` bails when it is `false`.
pub offline: bool,
/// Directory handed to `write_eval_report_files` and recorded in the suite report.
pub output_dir: PathBuf,
/// Provider name passed to the fake runtime and used as the turn's provider override.
#[serde(default = "default_provider")]
pub provider: String,
/// Model used for the single run when `profiles` is `EvalProfileMode::Off`.
#[serde(default = "default_model")]
pub model: String,
/// Runtime profile used when `speed_policy` is `EvalSpeedPolicyMode::Off`;
/// the other speed-policy modes run under `RuntimeProfile::Eval` instead.
#[serde(default)]
pub runtime_profile: RuntimeProfile,
/// Selects which speed-policy variants each fixture is run with.
#[serde(default)]
pub speed_policy: EvalSpeedPolicyMode,
/// Selects whether each fixture is run once or once per built-in model profile.
#[serde(default)]
pub profiles: EvalProfileMode,
}
impl Default for OfflineEvalRunnerOptions {
    /// Offline options that write reports under `evals/reports`, use the
    /// default provider and model, and leave speed-policy and profile fan-out
    /// switched off.
    fn default() -> Self {
        let output_dir = Path::new("evals").join("reports");
        let provider = default_provider();
        let model = default_model();
        Self {
            offline: true,
            output_dir,
            provider,
            model,
            runtime_profile: RuntimeProfile::Interactive,
            speed_policy: EvalSpeedPolicyMode::Off,
            profiles: EvalProfileMode::Off,
        }
    }
}
/// Which speed-policy variants each fixture is executed with.
///
/// Serialized in snake_case (`off`, `on`, `both`); `Off` is the default.
#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum EvalSpeedPolicyMode {
/// One run with the speed policy disabled, under the caller's runtime profile.
#[default]
Off,
/// One run with the speed policy enabled, under `RuntimeProfile::Eval`.
On,
/// Two runs under `RuntimeProfile::Eval`: disabled first, then enabled.
Both,
}
impl EvalSpeedPolicyMode {
    /// Expands the mode into the concrete runs to execute, in order.
    ///
    /// `runtime_profile` is only honoured by `Off`; `On` and `Both` always
    /// run under `RuntimeProfile::Eval`.
    fn runs(self, runtime_profile: RuntimeProfile) -> Vec<EvalSpeedPolicyRun> {
        // Builder for the "policy disabled" variant under a given profile.
        let disabled = |profile: RuntimeProfile| EvalSpeedPolicyRun {
            label: "speed_policy:off",
            runtime_profile: profile,
            enabled: false,
        };
        // The "policy enabled" variant is identical wherever it appears.
        let enabled = EvalSpeedPolicyRun {
            label: "speed_policy:on",
            runtime_profile: RuntimeProfile::Eval,
            enabled: true,
        };

        match self {
            Self::Off => vec![disabled(runtime_profile)],
            Self::On => vec![enabled],
            Self::Both => vec![disabled(RuntimeProfile::Eval), enabled],
        }
    }
}
impl std::str::FromStr for EvalSpeedPolicyMode {
    type Err = anyhow::Error;

    /// Parses the `--speed-policy` value: exactly `off`, `on`, or `both`.
    fn from_str(value: &str) -> Result<Self, Self::Err> {
        let mode = match value {
            "off" => Self::Off,
            "on" => Self::On,
            "both" => Self::Both,
            other => {
                anyhow::bail!("invalid --speed-policy {other:?}; expected off, on, or both")
            }
        };
        Ok(mode)
    }
}
/// One concrete speed-policy variant to execute a fixture with.
#[derive(Debug, Clone, Copy)]
struct EvalSpeedPolicyRun {
/// Tag appended to the report's tags to identify this variant.
label: &'static str,
/// Runtime profile forwarded to `build_fake_runtime`.
runtime_profile: RuntimeProfile,
/// Flag forwarded to `build_fake_runtime`; presumably toggles the speed
/// policy in the runtime (confirm against `runtime_harness`).
enabled: bool,
}
/// Whether fixtures run once with the configured model or once per built-in
/// model profile.
///
/// Serialized in snake_case (`off`, `all`); `Off` is the default.
#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum EvalProfileMode {
/// A single, unlabeled run using the caller-supplied model.
#[default]
Off,
/// One labeled run per built-in profile model (see `EvalProfileMode::runs`).
All,
}
impl EvalProfileMode {
    /// Expands the mode into the model runs to execute, in order.
    ///
    /// `default_model` is used only by `Off`; `All` ignores it and yields one
    /// labeled run per built-in profile.
    fn runs(self, default_model: &str) -> Vec<EvalProfileRun> {
        // (report tag, model name) for every built-in profile.
        const BUILT_IN_PROFILES: [(&str, &str); 2] = [
            ("profile:gpt-5.5", "gpt-5.5"),
            (
                "profile:claude-haiku-4-5-20251001",
                "claude-haiku-4-5-20251001",
            ),
        ];

        match self {
            Self::Off => {
                let only = EvalProfileRun {
                    label: None,
                    model: default_model.to_string(),
                };
                vec![only]
            }
            Self::All => BUILT_IN_PROFILES
                .iter()
                .map(|&(label, model)| EvalProfileRun {
                    label: Some(label),
                    model: model.to_string(),
                })
                .collect(),
        }
    }
}
impl std::str::FromStr for EvalProfileMode {
    type Err = anyhow::Error;

    /// Parses the `--profiles` value: exactly `off` or `all`.
    fn from_str(value: &str) -> Result<Self, Self::Err> {
        let mode = match value {
            "off" => Self::Off,
            "all" => Self::All,
            other => anyhow::bail!("invalid --profiles {other:?}; expected off or all"),
        };
        Ok(mode)
    }
}
/// One concrete model selection to execute a fixture with.
#[derive(Debug, Clone)]
struct EvalProfileRun {
/// Optional tag appended to the report's tags; `None` when profiles are off.
label: Option<&'static str>,
/// Model name passed to the fake runtime and used as the turn's model override.
model: String,
}
/// Recursively loads every eval fixture under `dir`, ordered by fixture id.
///
/// # Errors
///
/// Returns the underlying failure wrapped with the directory that was being
/// loaded.
pub fn load_eval_fixtures(dir: &Path) -> anyhow::Result<Vec<EvalFixture>> {
    let mut loaded = Vec::new();
    load_eval_fixtures_from_dir(dir, &mut loaded).map_err(|err| {
        err.context(format!("failed to load eval fixtures from {}", dir.display()))
    })?;
    // Directory iteration order is unspecified, so order the result by id.
    loaded.sort_by(|a, b| Ord::cmp(&a.id, &b.id));
    Ok(loaded)
}
/// Runs every fixture under `fixture_dir` offline and writes the suite report.
///
/// Each fixture is executed once per (profile run, speed-policy run)
/// combination derived from `options`. The results are gathered into an
/// [`EvalSuiteReport`], written to `options.output_dir`, and returned.
///
/// # Errors
///
/// Fails when `options.offline` is `false`, when no fixtures are found, when
/// any fixture run returns an error, or when the report cannot be written.
pub async fn run_offline_eval_suite(
    fixture_dir: &Path,
    options: OfflineEvalRunnerOptions,
) -> anyhow::Result<EvalSuiteReport> {
    anyhow::ensure!(options.offline, "offline eval runner requires --offline");

    let fixtures = load_eval_fixtures(fixture_dir)?;
    anyhow::ensure!(
        !fixtures.is_empty(),
        "no canonical eval fixtures found in {}",
        fixture_dir.display()
    );

    let generated_at = OffsetDateTime::now_utc();
    let run_id = format!("eval-{}", uuid::Uuid::new_v4());

    // The suite is named after the fixture directory, with a fixed fallback
    // when the path has no usable final component.
    let suite_id = match fixture_dir.file_name().and_then(|name| name.to_str()) {
        Some(name) if !name.is_empty() => name.to_string(),
        _ => "fixtures".to_string(),
    };

    let speed_runs = options.speed_policy.runs(options.runtime_profile);
    let profile_runs = options.profiles.runs(&options.model);

    let expected_runs = fixtures.len() * speed_runs.len() * profile_runs.len();
    let mut results = Vec::with_capacity(expected_runs);
    for fixture in &fixtures {
        for profile_run in &profile_runs {
            for &speed_run in &speed_runs {
                let result = run_offline_fixture(
                    &suite_id,
                    &run_id,
                    fixture,
                    &options.provider,
                    profile_run,
                    speed_run,
                )
                .await?;
                results.push(result);
            }
        }
    }

    let report = EvalSuiteReport {
        suite_id,
        fixture_dir: fixture_dir.to_path_buf(),
        output_dir: options.output_dir,
        offline: options.offline,
        generated_at,
        results,
    };
    write_eval_report_files(&report, &report.output_dir)?;
    Ok(report)
}
fn load_eval_fixtures_from_dir(dir: &Path, fixtures: &mut Vec<EvalFixture>) -> anyhow::Result<()> {
for entry in std::fs::read_dir(dir)? {
let path = entry?.path();
if path.is_dir() {
load_eval_fixtures_from_dir(&path, fixtures)?;
continue;
}
if path.extension().and_then(|ext| ext.to_str()) != Some("json") {
continue;
}
let text = std::fs::read_to_string(&path)?;
let value: serde_json::Value = serde_json::from_str(&text)?;
if !value
.get("expected")
.is_some_and(serde_json::Value::is_object)
{
continue;
}
if let Ok(fixture) = serde_json::from_value::<EvalFixture>(value) {
fixtures.push(fixture);
}
}
Ok(())
}
async fn run_offline_fixture(
suite_id: &str,
run_id: &str,
fixture: &EvalFixture,
provider: &str,
profile_run: &EvalProfileRun,
speed_run: EvalSpeedPolicyRun,
) -> anyhow::Result<EvalFixtureResult> {
let start = Instant::now();
let workspace = create_workspace(fixture)?;
let thread_id = format!("eval-{}", fixture.id);
let mut events = Vec::new();
let mut final_answer = String::new();
let mut failure_message = None;
let mut outcome = EvalOutcome::Pass;
let mut failure_class = None;
if let Err(err) = run_workspace_setup(fixture, &workspace.path) {
outcome = EvalOutcome::HarnessError;
failure_class = Some(EvalFailureClass::Environment);
failure_message = Some(err.to_string());
}
let mut turn_id = "setup-failed".to_string();
if outcome == EvalOutcome::Pass {
let runtime = Arc::new(build_fake_runtime(
fixture,
&workspace.path,
provider,
&profile_run.model,
speed_run.runtime_profile,
speed_run.enabled,
fixture.timeout_ms.map(deadline_seconds_from_timeout_ms),
)?);
let mut rx = runtime.subscribe_events();
turn_id = runtime
.start_turn(StartTurnRequest {
thread_id: thread_id.clone(),
message: fixture.prompt.clone(),
images: Vec::new(),
provider_override: Some(provider.to_string()),
model_override: Some(profile_run.model.clone()),
reasoning_override: None,
workspace: workspace.path.display().to_string(),
instructions: InstructionBundle::default(),
developer_context: None,
task_ledger_required: fixture.expected.task_ledger_required,
})
.await?;
let timeout_ms = fixture.timeout_ms.unwrap_or(DEFAULT_TIMEOUT_MS);
match collect_turn_events(
&mut rx,
&thread_id,
&turn_id,
Duration::from_millis(timeout_ms),
&mut final_answer,
)
.await
{
Ok(collected) => events = collected,
Err(TurnCollectionError::Timeout { collected }) => {
events = collected;
outcome = EvalOutcome::Timeout;
failure_class = Some(EvalFailureClass::Runtime);
failure_message = Some(format!("fixture timed out after {timeout_ms}ms"));
}
Err(TurnCollectionError::Failed { error, collected }) => {
events = collected;
outcome = EvalOutcome::Fail;
failure_class = Some(if error.contains("verification gaps remain") {
EvalFailureClass::Verifier
} else {
EvalFailureClass::Runtime
});
failure_message = Some(error);
}
}
}
if let Some(injection) = fixture_reliability_injection(fixture, &thread_id, &turn_id) {
events.extend(injection.events);
if let Some(next) = injection.outcome {
outcome = next;
}
if let Some(next) = injection.failure_class {
failure_class = Some(next);
}
if let Some(next) = injection.failure_message {
failure_message = Some(next);
}
}
if outcome == EvalOutcome::Pass
&& let Err(err) = grade_expected_evidence(fixture, &workspace.path, &final_answer)
{
outcome = EvalOutcome::Fail;
failure_class = Some(failure_class_for_fixture(fixture));
failure_message = Some(err.to_string());
}
if outcome == EvalOutcome::Pass
&& let Err(err) = grade_task_ledger_requirement(fixture, &events)
{
outcome = EvalOutcome::Fail;
failure_class = Some(EvalFailureClass::Verifier);
failure_message = Some(err.to_string());
}
let trajectory = EvalTrajectory::from_events(thread_id.clone(), turn_id.clone(), &events);
let trace_excerpt = trajectory_excerpt(&trajectory);
let mut metrics = eval_metrics(&events, start.elapsed().as_millis(), &outcome);
metrics.extend(fixture_command_check_metrics(fixture, &outcome));
metrics.extend(lazy_discovery_metrics(fixture, &events, &outcome));
metrics.extend(grade_retrieval_router_fixture(fixture, &events, &outcome));
let report = EvalReport {
run: EvalRun {
suite_id: suite_id.to_string(),
run_id: run_id.to_string(),
provider: provider.to_string(),
model: profile_run.model.clone(),
started_at: OffsetDateTime::now_utc(),
tags: {
let mut tags = fixture.tags.clone();
tags.push(speed_run.label.to_string());
if let Some(label) = profile_run.label {
tags.push(label.to_string());
}
tags
},
},
outcome: outcome.clone(),
failure_class: failure_class.clone(),
trajectory,
metrics,
};
Ok(EvalFixtureResult {
fixture_id: fixture.id.clone(),
title: fixture.title.clone(),
workspace: workspace.path.clone(),
final_answer,
report,
trace_excerpt,
failure_message,
})
}
fn grade_task_ledger_requirement(
fixture: &EvalFixture,
events: &[RoderEvent],
) -> anyhow::Result<()> {
if !fixture.expected.task_ledger_required {
return Ok(());
}
let Some(snapshot) = events.iter().rev().find_map(|event| match event {
RoderEvent::TaskLedgerUpdated(updated) => Some(updated),
_ => None,
}) else {
anyhow::bail!("task ledger was required but was not created");
};
if snapshot.tasks.is_empty() {
anyhow::bail!("task ledger was required but contained no tasks");
}
let incomplete = snapshot
.tasks
.iter()
.filter(|task| {
!matches!(
task.status,
roder_api::task_ledger::TaskLedgerStatus::Completed
)
})
.map(|task| task.id.as_str())
.collect::<Vec<_>>();
if !incomplete.is_empty() {
anyhow::bail!("task ledger incomplete: {}", incomplete.join(", "));
}
Ok(())
}
/// Serde/`Default` fallback for `OfflineEvalRunnerOptions::provider`: the mock
/// provider name from the API catalog.
fn default_provider() -> String {
    PROVIDER_MOCK.to_owned()
}
/// Serde/`Default` fallback for `OfflineEvalRunnerOptions::model`.
fn default_model() -> String {
    String::from("mock")
}