Skip to main content

roder_evals/
runner.rs

1use std::path::{Path, PathBuf};
2use std::sync::Arc;
3use std::time::Instant;
4
5use anyhow::Context;
6use roder_api::catalog::PROVIDER_MOCK;
7use roder_api::events::RoderEvent;
8use roder_api::inference::{InstructionBundle, RuntimeProfile};
9use roder_core::StartTurnRequest;
10use serde::{Deserialize, Serialize};
11use time::OffsetDateTime;
12use tokio::time::Duration;
13
14mod baseline;
15mod fixture_metrics;
16mod lazy_discovery;
17mod reliability;
18mod report;
19mod runtime_harness;
20#[cfg(test)]
21mod tests;
22mod workspace;
23
24pub use baseline::{
25    ReliabilityBaseline, ReliabilityBaselineComparison, ReliabilityBaselineExpectation,
26    ReliabilityBaselineStatus, compare_eval_report_to_baseline, compare_reliability_baseline,
27};
28pub use reliability::ReliabilityReportSummary;
29pub use report::{
30    EvalFixtureResult, EvalReportDocument, EvalReportSummary, EvalSuiteReport, list_eval_reports,
31    read_eval_report, write_eval_report_files,
32};
33
34use fixture_metrics::fixture_command_check_metrics;
35use lazy_discovery::lazy_discovery_metrics;
36use reliability::fixture_reliability_injection;
37use report::{eval_metrics, trajectory_excerpt};
38use runtime_harness::{
39    TurnCollectionError, build_fake_runtime, collect_turn_events, deadline_seconds_from_timeout_ms,
40};
41use workspace::{
42    create_workspace, failure_class_for_fixture, grade_expected_evidence, run_workspace_setup,
43};
44
45use crate::retrieval_router::grade_retrieval_router_fixture;
46use crate::{EvalFailureClass, EvalFixture, EvalOutcome, EvalReport, EvalRun, EvalTrajectory};
47
/// Turn-collection timeout, in milliseconds, used when a fixture does not
/// specify its own `timeout_ms`.
const DEFAULT_TIMEOUT_MS: u64 = 30 * 1_000;
49
50#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
51#[serde(rename_all = "camelCase")]
52pub struct OfflineEvalRunnerOptions {
53    pub offline: bool,
54    pub output_dir: PathBuf,
55    #[serde(default = "default_provider")]
56    pub provider: String,
57    #[serde(default = "default_model")]
58    pub model: String,
59    #[serde(default)]
60    pub runtime_profile: RuntimeProfile,
61    #[serde(default)]
62    pub speed_policy: EvalSpeedPolicyMode,
63    #[serde(default)]
64    pub profiles: EvalProfileMode,
65}
66
67impl Default for OfflineEvalRunnerOptions {
68    fn default() -> Self {
69        Self {
70            offline: true,
71            output_dir: PathBuf::from("evals").join("reports"),
72            provider: default_provider(),
73            model: default_model(),
74            runtime_profile: RuntimeProfile::Interactive,
75            speed_policy: EvalSpeedPolicyMode::Off,
76            profiles: EvalProfileMode::Off,
77        }
78    }
79}
80
81#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq)]
82#[serde(rename_all = "snake_case")]
83pub enum EvalSpeedPolicyMode {
84    #[default]
85    Off,
86    On,
87    Both,
88}
89
90impl EvalSpeedPolicyMode {
91    fn runs(self, runtime_profile: RuntimeProfile) -> Vec<EvalSpeedPolicyRun> {
92        match self {
93            Self::Off => vec![EvalSpeedPolicyRun {
94                label: "speed_policy:off",
95                runtime_profile,
96                enabled: false,
97            }],
98            Self::On => vec![EvalSpeedPolicyRun {
99                label: "speed_policy:on",
100                runtime_profile: RuntimeProfile::Eval,
101                enabled: true,
102            }],
103            Self::Both => vec![
104                EvalSpeedPolicyRun {
105                    label: "speed_policy:off",
106                    runtime_profile: RuntimeProfile::Eval,
107                    enabled: false,
108                },
109                EvalSpeedPolicyRun {
110                    label: "speed_policy:on",
111                    runtime_profile: RuntimeProfile::Eval,
112                    enabled: true,
113                },
114            ],
115        }
116    }
117}
118
119impl std::str::FromStr for EvalSpeedPolicyMode {
120    type Err = anyhow::Error;
121
122    fn from_str(value: &str) -> Result<Self, Self::Err> {
123        match value {
124            "off" => Ok(Self::Off),
125            "on" => Ok(Self::On),
126            "both" => Ok(Self::Both),
127            other => anyhow::bail!("invalid --speed-policy {other:?}; expected off, on, or both"),
128        }
129    }
130}
131
132#[derive(Debug, Clone, Copy)]
133struct EvalSpeedPolicyRun {
134    label: &'static str,
135    runtime_profile: RuntimeProfile,
136    enabled: bool,
137}
138
139#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq)]
140#[serde(rename_all = "snake_case")]
141pub enum EvalProfileMode {
142    #[default]
143    Off,
144    All,
145}
146
147impl EvalProfileMode {
148    fn runs(self, default_model: &str) -> Vec<EvalProfileRun> {
149        match self {
150            Self::Off => vec![EvalProfileRun {
151                label: None,
152                model: default_model.to_string(),
153            }],
154            Self::All => vec![
155                EvalProfileRun {
156                    label: Some("profile:gpt-5.5"),
157                    model: "gpt-5.5".to_string(),
158                },
159                EvalProfileRun {
160                    label: Some("profile:claude-haiku-4-5-20251001"),
161                    model: "claude-haiku-4-5-20251001".to_string(),
162                },
163            ],
164        }
165    }
166}
167
168impl std::str::FromStr for EvalProfileMode {
169    type Err = anyhow::Error;
170
171    fn from_str(value: &str) -> Result<Self, Self::Err> {
172        match value {
173            "off" => Ok(Self::Off),
174            "all" => Ok(Self::All),
175            other => anyhow::bail!("invalid --profiles {other:?}; expected off or all"),
176        }
177    }
178}
179
/// One concrete model selection for a fixture run.
#[derive(Clone, Debug)]
struct EvalProfileRun {
    /// Tag appended to the run's tag list; `None` when no profile fan-out is active.
    label: Option<&'static str>,
    /// Model name passed to the runtime for this run.
    model: String,
}
185
186pub fn load_eval_fixtures(dir: &Path) -> anyhow::Result<Vec<EvalFixture>> {
187    let mut fixtures = Vec::new();
188    load_eval_fixtures_from_dir(dir, &mut fixtures)
189        .with_context(|| format!("failed to load eval fixtures from {}", dir.display()))?;
190    fixtures.sort_by(|left, right| left.id.cmp(&right.id));
191    Ok(fixtures)
192}
193
194pub async fn run_offline_eval_suite(
195    fixture_dir: &Path,
196    options: OfflineEvalRunnerOptions,
197) -> anyhow::Result<EvalSuiteReport> {
198    if !options.offline {
199        anyhow::bail!("offline eval runner requires --offline");
200    }
201    let fixtures = load_eval_fixtures(fixture_dir)?;
202    if fixtures.is_empty() {
203        anyhow::bail!(
204            "no canonical eval fixtures found in {}",
205            fixture_dir.display()
206        );
207    }
208    let generated_at = OffsetDateTime::now_utc();
209    let run_id = format!("eval-{}", uuid::Uuid::new_v4());
210    let suite_id = fixture_dir
211        .file_name()
212        .and_then(|name| name.to_str())
213        .filter(|name| !name.is_empty())
214        .unwrap_or("fixtures")
215        .to_string();
216    let speed_runs = options.speed_policy.runs(options.runtime_profile);
217    let profile_runs = options.profiles.runs(&options.model);
218    let mut results = Vec::with_capacity(fixtures.len() * speed_runs.len() * profile_runs.len());
219    for fixture in fixtures {
220        for profile_run in &profile_runs {
221            for speed_run in &speed_runs {
222                results.push(
223                    run_offline_fixture(
224                        &suite_id,
225                        &run_id,
226                        &fixture,
227                        &options.provider,
228                        profile_run,
229                        *speed_run,
230                    )
231                    .await?,
232                );
233            }
234        }
235    }
236    let report = EvalSuiteReport {
237        suite_id,
238        fixture_dir: fixture_dir.to_path_buf(),
239        output_dir: options.output_dir.clone(),
240        offline: options.offline,
241        generated_at,
242        results,
243    };
244    write_eval_report_files(&report, &options.output_dir)?;
245    Ok(report)
246}
247
248fn load_eval_fixtures_from_dir(dir: &Path, fixtures: &mut Vec<EvalFixture>) -> anyhow::Result<()> {
249    for entry in std::fs::read_dir(dir)? {
250        let path = entry?.path();
251        if path.is_dir() {
252            load_eval_fixtures_from_dir(&path, fixtures)?;
253            continue;
254        }
255        if path.extension().and_then(|ext| ext.to_str()) != Some("json") {
256            continue;
257        }
258        let text = std::fs::read_to_string(&path)?;
259        let value: serde_json::Value = serde_json::from_str(&text)?;
260        if !value
261            .get("expected")
262            .is_some_and(serde_json::Value::is_object)
263        {
264            continue;
265        }
266        if let Ok(fixture) = serde_json::from_value::<EvalFixture>(value) {
267            fixtures.push(fixture);
268        }
269    }
270    Ok(())
271}
272
273async fn run_offline_fixture(
274    suite_id: &str,
275    run_id: &str,
276    fixture: &EvalFixture,
277    provider: &str,
278    profile_run: &EvalProfileRun,
279    speed_run: EvalSpeedPolicyRun,
280) -> anyhow::Result<EvalFixtureResult> {
281    let start = Instant::now();
282    let workspace = create_workspace(fixture)?;
283    let thread_id = format!("eval-{}", fixture.id);
284    let mut events = Vec::new();
285    let mut final_answer = String::new();
286    let mut failure_message = None;
287    let mut outcome = EvalOutcome::Pass;
288    let mut failure_class = None;
289    if let Err(err) = run_workspace_setup(fixture, &workspace.path) {
290        outcome = EvalOutcome::HarnessError;
291        failure_class = Some(EvalFailureClass::Environment);
292        failure_message = Some(err.to_string());
293    }
294    let mut turn_id = "setup-failed".to_string();
295    if outcome == EvalOutcome::Pass {
296        let runtime = Arc::new(build_fake_runtime(
297            fixture,
298            &workspace.path,
299            provider,
300            &profile_run.model,
301            speed_run.runtime_profile,
302            speed_run.enabled,
303            fixture.timeout_ms.map(deadline_seconds_from_timeout_ms),
304        )?);
305        let mut rx = runtime.subscribe_events();
306        turn_id = runtime
307            .start_turn(StartTurnRequest {
308                thread_id: thread_id.clone(),
309                message: fixture.prompt.clone(),
310                images: Vec::new(),
311                provider_override: Some(provider.to_string()),
312                model_override: Some(profile_run.model.clone()),
313                reasoning_override: None,
314                workspace: workspace.path.display().to_string(),
315                instructions: InstructionBundle::default(),
316                developer_context: None,
317                task_ledger_required: fixture.expected.task_ledger_required,
318            })
319            .await?;
320        let timeout_ms = fixture.timeout_ms.unwrap_or(DEFAULT_TIMEOUT_MS);
321        match collect_turn_events(
322            &mut rx,
323            &thread_id,
324            &turn_id,
325            Duration::from_millis(timeout_ms),
326            &mut final_answer,
327        )
328        .await
329        {
330            Ok(collected) => events = collected,
331            Err(TurnCollectionError::Timeout { collected }) => {
332                events = collected;
333                outcome = EvalOutcome::Timeout;
334                failure_class = Some(EvalFailureClass::Runtime);
335                failure_message = Some(format!("fixture timed out after {timeout_ms}ms"));
336            }
337            Err(TurnCollectionError::Failed { error, collected }) => {
338                events = collected;
339                outcome = EvalOutcome::Fail;
340                failure_class = Some(if error.contains("verification gaps remain") {
341                    EvalFailureClass::Verifier
342                } else {
343                    EvalFailureClass::Runtime
344                });
345                failure_message = Some(error);
346            }
347        }
348    }
349    if let Some(injection) = fixture_reliability_injection(fixture, &thread_id, &turn_id) {
350        events.extend(injection.events);
351        if let Some(next) = injection.outcome {
352            outcome = next;
353        }
354        if let Some(next) = injection.failure_class {
355            failure_class = Some(next);
356        }
357        if let Some(next) = injection.failure_message {
358            failure_message = Some(next);
359        }
360    }
361    if outcome == EvalOutcome::Pass
362        && let Err(err) = grade_expected_evidence(fixture, &workspace.path, &final_answer)
363    {
364        outcome = EvalOutcome::Fail;
365        failure_class = Some(failure_class_for_fixture(fixture));
366        failure_message = Some(err.to_string());
367    }
368    if outcome == EvalOutcome::Pass
369        && let Err(err) = grade_task_ledger_requirement(fixture, &events)
370    {
371        outcome = EvalOutcome::Fail;
372        failure_class = Some(EvalFailureClass::Verifier);
373        failure_message = Some(err.to_string());
374    }
375    let trajectory = EvalTrajectory::from_events(thread_id.clone(), turn_id.clone(), &events);
376    let trace_excerpt = trajectory_excerpt(&trajectory);
377    let mut metrics = eval_metrics(&events, start.elapsed().as_millis(), &outcome);
378    metrics.extend(fixture_command_check_metrics(fixture, &outcome));
379    metrics.extend(lazy_discovery_metrics(fixture, &events, &outcome));
380    metrics.extend(grade_retrieval_router_fixture(fixture, &events, &outcome));
381    let report = EvalReport {
382        run: EvalRun {
383            suite_id: suite_id.to_string(),
384            run_id: run_id.to_string(),
385            provider: provider.to_string(),
386            model: profile_run.model.clone(),
387            started_at: OffsetDateTime::now_utc(),
388            tags: {
389                let mut tags = fixture.tags.clone();
390                tags.push(speed_run.label.to_string());
391                if let Some(label) = profile_run.label {
392                    tags.push(label.to_string());
393                }
394                tags
395            },
396        },
397        outcome: outcome.clone(),
398        failure_class: failure_class.clone(),
399        trajectory,
400        metrics,
401    };
402    Ok(EvalFixtureResult {
403        fixture_id: fixture.id.clone(),
404        title: fixture.title.clone(),
405        workspace: workspace.path.clone(),
406        final_answer,
407        report,
408        trace_excerpt,
409        failure_message,
410    })
411}
412
413fn grade_task_ledger_requirement(
414    fixture: &EvalFixture,
415    events: &[RoderEvent],
416) -> anyhow::Result<()> {
417    if !fixture.expected.task_ledger_required {
418        return Ok(());
419    }
420    let Some(snapshot) = events.iter().rev().find_map(|event| match event {
421        RoderEvent::TaskLedgerUpdated(updated) => Some(updated),
422        _ => None,
423    }) else {
424        anyhow::bail!("task ledger was required but was not created");
425    };
426    if snapshot.tasks.is_empty() {
427        anyhow::bail!("task ledger was required but contained no tasks");
428    }
429    let incomplete = snapshot
430        .tasks
431        .iter()
432        .filter(|task| {
433            !matches!(
434                task.status,
435                roder_api::task_ledger::TaskLedgerStatus::Completed
436            )
437        })
438        .map(|task| task.id.as_str())
439        .collect::<Vec<_>>();
440    if !incomplete.is_empty() {
441        anyhow::bail!("task ledger incomplete: {}", incomplete.join(", "));
442    }
443    Ok(())
444}
445
446fn default_provider() -> String {
447    PROVIDER_MOCK.to_string()
448}
449
/// Serde/`Default` fallback for the model name.
fn default_model() -> String {
    String::from("mock")
}