1use std::path::{Path, PathBuf};
2use std::sync::Arc;
3use std::time::Instant;
4
5use anyhow::Context;
6use roder_api::catalog::PROVIDER_MOCK;
7use roder_api::events::RoderEvent;
8use roder_api::inference::{InstructionBundle, RuntimeProfile};
9use roder_core::StartTurnRequest;
10use serde::{Deserialize, Serialize};
11use time::OffsetDateTime;
12use tokio::time::Duration;
13
14mod baseline;
15mod fixture_metrics;
16mod lazy_discovery;
17mod reliability;
18mod report;
19mod runtime_harness;
20#[cfg(test)]
21mod tests;
22mod workspace;
23
24pub use baseline::{
25 ReliabilityBaseline, ReliabilityBaselineComparison, ReliabilityBaselineExpectation,
26 ReliabilityBaselineStatus, compare_eval_report_to_baseline, compare_reliability_baseline,
27};
28pub use reliability::ReliabilityReportSummary;
29pub use report::{
30 EvalFixtureResult, EvalReportDocument, EvalReportSummary, EvalSuiteReport, list_eval_reports,
31 read_eval_report, write_eval_report_files,
32};
33
34use fixture_metrics::fixture_command_check_metrics;
35use lazy_discovery::lazy_discovery_metrics;
36use reliability::fixture_reliability_injection;
37use report::{eval_metrics, trajectory_excerpt};
38use runtime_harness::{
39 TurnCollectionError, build_fake_runtime, collect_turn_events, deadline_seconds_from_timeout_ms,
40};
41use workspace::{
42 create_workspace, failure_class_for_fixture, grade_expected_evidence, run_workspace_setup,
43};
44
45use crate::retrieval_router::grade_retrieval_router_fixture;
46use crate::{EvalFailureClass, EvalFixture, EvalOutcome, EvalReport, EvalRun, EvalTrajectory};
47
/// Fallback turn timeout in milliseconds (30 seconds), used when a fixture
/// does not specify its own `timeout_ms`.
const DEFAULT_TIMEOUT_MS: u64 = 30 * 1_000;
49
50#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
51#[serde(rename_all = "camelCase")]
52pub struct OfflineEvalRunnerOptions {
53 pub offline: bool,
54 pub output_dir: PathBuf,
55 #[serde(default = "default_provider")]
56 pub provider: String,
57 #[serde(default = "default_model")]
58 pub model: String,
59 #[serde(default)]
60 pub runtime_profile: RuntimeProfile,
61 #[serde(default)]
62 pub speed_policy: EvalSpeedPolicyMode,
63 #[serde(default)]
64 pub profiles: EvalProfileMode,
65}
66
67impl Default for OfflineEvalRunnerOptions {
68 fn default() -> Self {
69 Self {
70 offline: true,
71 output_dir: PathBuf::from("evals").join("reports"),
72 provider: default_provider(),
73 model: default_model(),
74 runtime_profile: RuntimeProfile::Interactive,
75 speed_policy: EvalSpeedPolicyMode::Off,
76 profiles: EvalProfileMode::Off,
77 }
78 }
79}
80
81#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq)]
82#[serde(rename_all = "snake_case")]
83pub enum EvalSpeedPolicyMode {
84 #[default]
85 Off,
86 On,
87 Both,
88}
89
90impl EvalSpeedPolicyMode {
91 fn runs(self, runtime_profile: RuntimeProfile) -> Vec<EvalSpeedPolicyRun> {
92 match self {
93 Self::Off => vec![EvalSpeedPolicyRun {
94 label: "speed_policy:off",
95 runtime_profile,
96 enabled: false,
97 }],
98 Self::On => vec![EvalSpeedPolicyRun {
99 label: "speed_policy:on",
100 runtime_profile: RuntimeProfile::Eval,
101 enabled: true,
102 }],
103 Self::Both => vec![
104 EvalSpeedPolicyRun {
105 label: "speed_policy:off",
106 runtime_profile: RuntimeProfile::Eval,
107 enabled: false,
108 },
109 EvalSpeedPolicyRun {
110 label: "speed_policy:on",
111 runtime_profile: RuntimeProfile::Eval,
112 enabled: true,
113 },
114 ],
115 }
116 }
117}
118
119impl std::str::FromStr for EvalSpeedPolicyMode {
120 type Err = anyhow::Error;
121
122 fn from_str(value: &str) -> Result<Self, Self::Err> {
123 match value {
124 "off" => Ok(Self::Off),
125 "on" => Ok(Self::On),
126 "both" => Ok(Self::Both),
127 other => anyhow::bail!("invalid --speed-policy {other:?}; expected off, on, or both"),
128 }
129 }
130}
131
132#[derive(Debug, Clone, Copy)]
133struct EvalSpeedPolicyRun {
134 label: &'static str,
135 runtime_profile: RuntimeProfile,
136 enabled: bool,
137}
138
139#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq)]
140#[serde(rename_all = "snake_case")]
141pub enum EvalProfileMode {
142 #[default]
143 Off,
144 All,
145}
146
147impl EvalProfileMode {
148 fn runs(self, default_model: &str) -> Vec<EvalProfileRun> {
149 match self {
150 Self::Off => vec![EvalProfileRun {
151 label: None,
152 model: default_model.to_string(),
153 }],
154 Self::All => vec![
155 EvalProfileRun {
156 label: Some("profile:gpt-5.5"),
157 model: "gpt-5.5".to_string(),
158 },
159 EvalProfileRun {
160 label: Some("profile:claude-haiku-4-5-20251001"),
161 model: "claude-haiku-4-5-20251001".to_string(),
162 },
163 ],
164 }
165 }
166}
167
168impl std::str::FromStr for EvalProfileMode {
169 type Err = anyhow::Error;
170
171 fn from_str(value: &str) -> Result<Self, Self::Err> {
172 match value {
173 "off" => Ok(Self::Off),
174 "all" => Ok(Self::All),
175 other => anyhow::bail!("invalid --profiles {other:?}; expected off or all"),
176 }
177 }
178}
179
/// One concrete model configuration a fixture is executed with.
#[derive(Debug, Clone)]
struct EvalProfileRun {
    /// Tag appended to the run's tags; `None` when profiles are off.
    label: Option<&'static str>,
    /// Model name used for the run and recorded in the report.
    model: String,
}
185
186pub fn load_eval_fixtures(dir: &Path) -> anyhow::Result<Vec<EvalFixture>> {
187 let mut fixtures = Vec::new();
188 load_eval_fixtures_from_dir(dir, &mut fixtures)
189 .with_context(|| format!("failed to load eval fixtures from {}", dir.display()))?;
190 fixtures.sort_by(|left, right| left.id.cmp(&right.id));
191 Ok(fixtures)
192}
193
194pub async fn run_offline_eval_suite(
195 fixture_dir: &Path,
196 options: OfflineEvalRunnerOptions,
197) -> anyhow::Result<EvalSuiteReport> {
198 if !options.offline {
199 anyhow::bail!("offline eval runner requires --offline");
200 }
201 let fixtures = load_eval_fixtures(fixture_dir)?;
202 if fixtures.is_empty() {
203 anyhow::bail!(
204 "no canonical eval fixtures found in {}",
205 fixture_dir.display()
206 );
207 }
208 let generated_at = OffsetDateTime::now_utc();
209 let run_id = format!("eval-{}", uuid::Uuid::new_v4());
210 let suite_id = fixture_dir
211 .file_name()
212 .and_then(|name| name.to_str())
213 .filter(|name| !name.is_empty())
214 .unwrap_or("fixtures")
215 .to_string();
216 let speed_runs = options.speed_policy.runs(options.runtime_profile);
217 let profile_runs = options.profiles.runs(&options.model);
218 let mut results = Vec::with_capacity(fixtures.len() * speed_runs.len() * profile_runs.len());
219 for fixture in fixtures {
220 for profile_run in &profile_runs {
221 for speed_run in &speed_runs {
222 results.push(
223 run_offline_fixture(
224 &suite_id,
225 &run_id,
226 &fixture,
227 &options.provider,
228 profile_run,
229 *speed_run,
230 )
231 .await?,
232 );
233 }
234 }
235 }
236 let report = EvalSuiteReport {
237 suite_id,
238 fixture_dir: fixture_dir.to_path_buf(),
239 output_dir: options.output_dir.clone(),
240 offline: options.offline,
241 generated_at,
242 results,
243 };
244 write_eval_report_files(&report, &options.output_dir)?;
245 Ok(report)
246}
247
248fn load_eval_fixtures_from_dir(dir: &Path, fixtures: &mut Vec<EvalFixture>) -> anyhow::Result<()> {
249 for entry in std::fs::read_dir(dir)? {
250 let path = entry?.path();
251 if path.is_dir() {
252 load_eval_fixtures_from_dir(&path, fixtures)?;
253 continue;
254 }
255 if path.extension().and_then(|ext| ext.to_str()) != Some("json") {
256 continue;
257 }
258 let text = std::fs::read_to_string(&path)?;
259 let value: serde_json::Value = serde_json::from_str(&text)?;
260 if !value
261 .get("expected")
262 .is_some_and(serde_json::Value::is_object)
263 {
264 continue;
265 }
266 if let Ok(fixture) = serde_json::from_value::<EvalFixture>(value) {
267 fixtures.push(fixture);
268 }
269 }
270 Ok(())
271}
272
273async fn run_offline_fixture(
274 suite_id: &str,
275 run_id: &str,
276 fixture: &EvalFixture,
277 provider: &str,
278 profile_run: &EvalProfileRun,
279 speed_run: EvalSpeedPolicyRun,
280) -> anyhow::Result<EvalFixtureResult> {
281 let start = Instant::now();
282 let workspace = create_workspace(fixture)?;
283 let thread_id = format!("eval-{}", fixture.id);
284 let mut events = Vec::new();
285 let mut final_answer = String::new();
286 let mut failure_message = None;
287 let mut outcome = EvalOutcome::Pass;
288 let mut failure_class = None;
289 if let Err(err) = run_workspace_setup(fixture, &workspace.path) {
290 outcome = EvalOutcome::HarnessError;
291 failure_class = Some(EvalFailureClass::Environment);
292 failure_message = Some(err.to_string());
293 }
294 let mut turn_id = "setup-failed".to_string();
295 if outcome == EvalOutcome::Pass {
296 let runtime = Arc::new(build_fake_runtime(
297 fixture,
298 &workspace.path,
299 provider,
300 &profile_run.model,
301 speed_run.runtime_profile,
302 speed_run.enabled,
303 fixture.timeout_ms.map(deadline_seconds_from_timeout_ms),
304 )?);
305 let mut rx = runtime.subscribe_events();
306 turn_id = runtime
307 .start_turn(StartTurnRequest {
308 thread_id: thread_id.clone(),
309 message: fixture.prompt.clone(),
310 images: Vec::new(),
311 provider_override: Some(provider.to_string()),
312 model_override: Some(profile_run.model.clone()),
313 reasoning_override: None,
314 workspace: workspace.path.display().to_string(),
315 instructions: InstructionBundle::default(),
316 developer_context: None,
317 task_ledger_required: fixture.expected.task_ledger_required,
318 })
319 .await?;
320 let timeout_ms = fixture.timeout_ms.unwrap_or(DEFAULT_TIMEOUT_MS);
321 match collect_turn_events(
322 &mut rx,
323 &thread_id,
324 &turn_id,
325 Duration::from_millis(timeout_ms),
326 &mut final_answer,
327 )
328 .await
329 {
330 Ok(collected) => events = collected,
331 Err(TurnCollectionError::Timeout { collected }) => {
332 events = collected;
333 outcome = EvalOutcome::Timeout;
334 failure_class = Some(EvalFailureClass::Runtime);
335 failure_message = Some(format!("fixture timed out after {timeout_ms}ms"));
336 }
337 Err(TurnCollectionError::Failed { error, collected }) => {
338 events = collected;
339 outcome = EvalOutcome::Fail;
340 failure_class = Some(if error.contains("verification gaps remain") {
341 EvalFailureClass::Verifier
342 } else {
343 EvalFailureClass::Runtime
344 });
345 failure_message = Some(error);
346 }
347 }
348 }
349 if let Some(injection) = fixture_reliability_injection(fixture, &thread_id, &turn_id) {
350 events.extend(injection.events);
351 if let Some(next) = injection.outcome {
352 outcome = next;
353 }
354 if let Some(next) = injection.failure_class {
355 failure_class = Some(next);
356 }
357 if let Some(next) = injection.failure_message {
358 failure_message = Some(next);
359 }
360 }
361 if outcome == EvalOutcome::Pass
362 && let Err(err) = grade_expected_evidence(fixture, &workspace.path, &final_answer)
363 {
364 outcome = EvalOutcome::Fail;
365 failure_class = Some(failure_class_for_fixture(fixture));
366 failure_message = Some(err.to_string());
367 }
368 if outcome == EvalOutcome::Pass
369 && let Err(err) = grade_task_ledger_requirement(fixture, &events)
370 {
371 outcome = EvalOutcome::Fail;
372 failure_class = Some(EvalFailureClass::Verifier);
373 failure_message = Some(err.to_string());
374 }
375 let trajectory = EvalTrajectory::from_events(thread_id.clone(), turn_id.clone(), &events);
376 let trace_excerpt = trajectory_excerpt(&trajectory);
377 let mut metrics = eval_metrics(&events, start.elapsed().as_millis(), &outcome);
378 metrics.extend(fixture_command_check_metrics(fixture, &outcome));
379 metrics.extend(lazy_discovery_metrics(fixture, &events, &outcome));
380 metrics.extend(grade_retrieval_router_fixture(fixture, &events, &outcome));
381 let report = EvalReport {
382 run: EvalRun {
383 suite_id: suite_id.to_string(),
384 run_id: run_id.to_string(),
385 provider: provider.to_string(),
386 model: profile_run.model.clone(),
387 started_at: OffsetDateTime::now_utc(),
388 tags: {
389 let mut tags = fixture.tags.clone();
390 tags.push(speed_run.label.to_string());
391 if let Some(label) = profile_run.label {
392 tags.push(label.to_string());
393 }
394 tags
395 },
396 },
397 outcome: outcome.clone(),
398 failure_class: failure_class.clone(),
399 trajectory,
400 metrics,
401 };
402 Ok(EvalFixtureResult {
403 fixture_id: fixture.id.clone(),
404 title: fixture.title.clone(),
405 workspace: workspace.path.clone(),
406 final_answer,
407 report,
408 trace_excerpt,
409 failure_message,
410 })
411}
412
413fn grade_task_ledger_requirement(
414 fixture: &EvalFixture,
415 events: &[RoderEvent],
416) -> anyhow::Result<()> {
417 if !fixture.expected.task_ledger_required {
418 return Ok(());
419 }
420 let Some(snapshot) = events.iter().rev().find_map(|event| match event {
421 RoderEvent::TaskLedgerUpdated(updated) => Some(updated),
422 _ => None,
423 }) else {
424 anyhow::bail!("task ledger was required but was not created");
425 };
426 if snapshot.tasks.is_empty() {
427 anyhow::bail!("task ledger was required but contained no tasks");
428 }
429 let incomplete = snapshot
430 .tasks
431 .iter()
432 .filter(|task| {
433 !matches!(
434 task.status,
435 roder_api::task_ledger::TaskLedgerStatus::Completed
436 )
437 })
438 .map(|task| task.id.as_str())
439 .collect::<Vec<_>>();
440 if !incomplete.is_empty() {
441 anyhow::bail!("task ledger incomplete: {}", incomplete.join(", "));
442 }
443 Ok(())
444}
445
/// Serde/`Default` fallback for `OfflineEvalRunnerOptions::provider`: the
/// string form of `PROVIDER_MOCK`.
fn default_provider() -> String {
    PROVIDER_MOCK.to_string()
}
449
/// Serde/`Default` fallback for `OfflineEvalRunnerOptions::model`.
fn default_model() -> String {
    String::from("mock")
}