1use std::collections::{BTreeMap, BTreeSet};
4use std::io::{Read, Write};
5use std::path::{Path, PathBuf};
6use std::process::{Command, Stdio};
7use std::sync::Arc;
8use std::time::Duration;
9
10use crate::event_log::EventLog;
11use sha2::{Digest, Sha256};
12use wait_timeout::ChildExt;
13
14use super::super::{
15 evaluate_context_pack_suggestion_expectations, generate_context_pack_suggestions, new_id,
16 normalize_friction_events_json, now_rfc3339, parse_json_value, run_persona_eval_ladder,
17 ContextPackSuggestionExpectation, ContextPackSuggestionOptions, FrictionEvent,
18};
19use super::diff::diff_run_records;
20use super::json::{clarifying_max_questions, clarifying_min_questions, normalize_question_text};
21use super::persistence::load_run_record;
22use super::types::{
23 EvalLedgerAppendReport, EvalLedgerFingerprintMismatch, EvalLedgerPriorCommitReport,
24 EvalLedgerProvenance, EvalLedgerReadReport, EvalLedgerResumeCell, EvalLedgerResumePlan,
25 EvalLedgerRow, EvalPackAssertion, EvalPackCase, EvalPackCaseReport, EvalPackCommandObject,
26 EvalPackCommandSpec, EvalPackFixtureRef, EvalPackManifest, EvalPackReliabilityBreakdown,
27 EvalPackReliabilityReport, EvalPackReport, EvalPackRubric, EvalPackRunState,
28 EvalPackSplitValidationReport, EvalPackStatsReport, EvalPackStatsRow, EvalPackTrialReport,
29 EvalSuiteManifest, ReplayEvalCaseReport, ReplayEvalReport, ReplayEvalSuiteReport,
30 ReplayFixture, ReplayStageAssertion, RunDiffReport, RunRecord, RunStageRecord,
31};
32use crate::value::{VmError, VmValue};
33
/// Schema tag written into ledger rows that lack one; rows read back with a different tag
/// are ignored.
const EVAL_LEDGER_ROW_SCHEMA: &str = "harn.eval.ledger.row.v1";
/// Schema tag for run-state records (not referenced in this part of the file).
const EVAL_LEDGER_RUN_STATE_SCHEMA: &str = "harn.eval.run-state.v1";
/// Schema tag for resume plans (not referenced in this part of the file).
const EVAL_LEDGER_RESUME_PLAN_SCHEMA: &str = "harn.eval.resume-plan.v1";
/// Event kind used for ledger row events; events of any other kind are skipped when reading.
const EVAL_LEDGER_ROW_KIND: &str = "eval.ledger.row";
/// Event kind for run-state events (not referenced in this part of the file).
const EVAL_LEDGER_RUN_STATE_KIND: &str = "eval.ledger.run_state";
/// Prefix of every ledger topic; the sanitized namespace is appended after a dot.
const EVAL_LEDGER_TOPIC_PREFIX: &str = "eval.ledger";
/// Header carrying a row's identity hash, used as the idempotency key on append.
const EVAL_LEDGER_IDENTITY_HEADER: &str = "eval_ledger_identity";
/// Queue depth for the in-memory event log installed as a last-resort fallback.
const EVAL_LEDGER_QUEUE_DEPTH: usize =
    crate::runtime_limits::RuntimeLimits::DEFAULT.default_event_log_queue_depth;
/// Maximum number of events requested per `read_range` call while scanning a ledger topic.
const EVAL_LEDGER_READ_BATCH_LIMIT: usize = 1024;
/// Schema tag for live-verify executor requests (not referenced in this part of the file).
const LIVE_EXECUTOR_REQUEST_SCHEMA: &str = "harn.eval.live_verify.executor_request.v1";
/// Timeout, in seconds, handed to the shell live executor's command run.
const DEFAULT_LIVE_EXECUTOR_TIMEOUT_SECONDS: f64 = 600.0;
/// Default live-verify timeout in seconds (not referenced in this part of the file).
const DEFAULT_LIVE_VERIFY_TIMEOUT_SECONDS: f64 = 120.0;
47
/// Caller-supplied options for ledger reads and appends.
///
/// All fields are optional on input (`#[serde(default)]`). The string fields act as
/// exact-match filters when reading rows and as fallbacks for empty row fields when
/// normalizing rows for append.
#[derive(Clone, Debug, Default, serde::Deserialize)]
#[serde(default)]
struct EvalLedgerOptions {
    /// Ledger namespace; selects the event-log topic.
    namespace: Option<String>,
    /// Suite name; also the namespace fallback when `namespace` is absent.
    suite: Option<String>,
    /// Model name.
    model: Option<String>,
    /// Dataset split.
    split: Option<String>,
    /// Commit identifier.
    commit: Option<String>,
    /// Branch name, used when building provenance.
    branch: Option<String>,
    /// Case name; also accepted under the key `case`.
    #[serde(alias = "case")]
    case_name: Option<String>,
    /// Expected case fingerprint.
    case_fingerprint: Option<String>,
    /// Expected harness-config fingerprint.
    harness_config_fingerprint: Option<String>,
    /// Stop reading once this many matching rows have been collected.
    limit: Option<usize>,
}
63
/// Coarse classification of an eval pack case, derived from its normalized kind string.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum EvalPackCaseKind {
    /// The default: any kind string that is neither `"friction"` nor `"live-verify"`.
    Replay,
    /// Normalized kind `"friction"`.
    Friction,
    /// Normalized kind `"live-verify"`.
    LiveVerify,
}
70
/// Outcome of a live-verify execution, as returned by an [`EvalPackLiveExecutor`].
///
/// Every field falls back to its default when missing from the input (`#[serde(default)]`),
/// and the camelCase aliases listed on the fields are accepted alongside the snake_case names.
#[derive(Clone, Debug, Default, serde::Deserialize, serde::Serialize)]
#[serde(default)]
pub struct EvalPackLiveVerifyOutcome {
    /// Verification label reported by the executor, if any.
    pub verification: Option<String>,
    /// Exit code of the verification step, if reported.
    #[serde(alias = "verificationExitCode")]
    pub verification_exit_code: Option<i64>,
    /// Explicit pass/fail flag; also accepted as `pass` or `success`.
    #[serde(alias = "pass", alias = "success")]
    pub passed: Option<bool>,
    /// Whether the run timed out.
    #[serde(alias = "timedOut")]
    pub timed_out: bool,
    /// Wall-clock duration in seconds.
    #[serde(alias = "wallTimeSeconds")]
    pub wall_time_seconds: f64,
    /// Reported cost in USD.
    #[serde(alias = "costUsd")]
    pub cost_usd: f64,
    /// Paths the run reported as produced.
    #[serde(default, alias = "producedPaths")]
    pub produced_paths: Vec<String>,
    /// Free-form tool-call summary; also accepted as `tool_summary`.
    #[serde(default, alias = "toolCallSummary", alias = "tool_summary")]
    pub tool_call_summary: serde_json::Value,
    /// Failure messages.
    pub failures: Vec<String>,
    /// Warning messages.
    pub warnings: Vec<String>,
    /// Informational messages.
    pub informational: Vec<String>,
    /// Run identifier, if reported.
    #[serde(alias = "runId")]
    pub run_id: Option<String>,
    /// Workflow identifier, if reported.
    #[serde(alias = "workflowId")]
    pub workflow_id: Option<String>,
    /// Source path, if reported.
    #[serde(alias = "sourcePath")]
    pub source_path: Option<String>,
    /// Stage count, if reported.
    #[serde(alias = "stageCount")]
    pub stage_count: Option<usize>,
}
101
/// Everything handed to an [`EvalPackLiveExecutor`] for one live-verify trial.
#[derive(Clone, Debug)]
pub struct EvalPackLiveExecutorRequest {
    /// Command specification of the executor to run.
    pub executor: EvalPackCommandSpec,
    /// JSON payload passed to the executor command.
    pub payload: serde_json::Value,
    /// Identifier of the manifest the case belongs to.
    pub manifest_id: String,
    /// The case being executed.
    pub case: EvalPackCase,
    /// Identifier of the case.
    pub case_id: String,
    /// Trial number of this execution (presumably 1-based; confirm against the caller).
    pub trial: usize,
    /// Total number of trials for the case.
    pub trials: usize,
    /// Workspace directory the executor command is run against.
    pub workspace: PathBuf,
    /// Base directory of the manifest, if known.
    pub base_dir: Option<PathBuf>,
}
114
/// Strategy for running a live-verify case and reporting its outcome.
pub trait EvalPackLiveExecutor {
    /// Runs the executor described by `request` and returns the outcome it reports.
    ///
    /// # Errors
    ///
    /// Returns a [`VmError`] when the execution itself cannot be carried out.
    fn execute(
        &mut self,
        request: EvalPackLiveExecutorRequest,
    ) -> Result<EvalPackLiveVerifyOutcome, VmError>;
}
121
122struct EvalPackShellLiveExecutor;
123
124impl EvalPackLiveExecutor for EvalPackShellLiveExecutor {
125 fn execute(
126 &mut self,
127 request: EvalPackLiveExecutorRequest,
128 ) -> Result<EvalPackLiveVerifyOutcome, VmError> {
129 let output = run_eval_pack_command(
130 &request.executor,
131 &request.workspace,
132 Some(&request.payload),
133 DEFAULT_LIVE_EXECUTOR_TIMEOUT_SECONDS,
134 )?;
135 let mut failures = Vec::new();
136 let mut outcome = live_outcome_from_executor_output(output, &mut failures);
137 outcome.failures.extend(failures);
138 Ok(outcome)
139 }
140}
141
/// Captured result of running an eval pack command.
#[derive(Clone, Debug)]
struct EvalPackCommandOutput {
    /// Exit code of the command.
    exit_code: i64,
    /// Captured standard output.
    stdout: String,
    /// Captured standard error.
    stderr: String,
    /// Whether the command timed out.
    timed_out: bool,
    /// Wall-clock duration in seconds.
    wall_time_seconds: f64,
}
150
/// State bundle for a ledger-backed eval pack run.
///
/// NOTE(review): the code that builds and consumes this struct is outside this part of the
/// file; the field notes below are read off the names and types only.
struct EvalPackLedgerRun {
    /// Event log handle.
    log: Arc<crate::event_log::AnyEventLog>,
    /// Ledger topic within the log.
    topic: crate::event_log::Topic,
    /// Ledger rows held for this run.
    rows: Vec<EvalLedgerRow>,
    /// Suite name.
    suite: String,
    /// Model name.
    model: String,
    /// Commit identifier.
    commit: String,
    /// Branch name, if known.
    branch: Option<String>,
    /// Provenance for this run.
    provenance: EvalLedgerProvenance,
    /// Count of inserted rows.
    inserted: usize,
    /// Count of duplicate rows.
    duplicates: usize,
    /// Fingerprint mismatches recorded as refusals.
    fingerprint_refusals: Vec<EvalLedgerFingerprintMismatch>,
}
164
165pub fn normalize_eval_suite_manifest(value: &VmValue) -> Result<EvalSuiteManifest, VmError> {
166 let mut manifest: EvalSuiteManifest = parse_json_value(value)?;
167 if manifest.type_name.is_empty() {
168 manifest.type_name = "eval_suite_manifest".to_string();
169 }
170 if manifest.id.is_empty() {
171 manifest.id = new_id("eval_suite");
172 }
173 Ok(manifest)
174}
175
176pub fn load_eval_suite_manifest(path: &Path) -> Result<EvalSuiteManifest, VmError> {
177 let content = std::fs::read_to_string(path)
178 .map_err(|e| VmError::Runtime(format!("failed to read eval suite manifest: {e}")))?;
179 let mut manifest: EvalSuiteManifest = serde_json::from_str(&content)
180 .map_err(|e| VmError::Runtime(format!("failed to parse eval suite manifest: {e}")))?;
181 if manifest.base_dir.is_none() {
182 manifest.base_dir = path.parent().map(|parent| parent.display().to_string());
183 }
184 Ok(manifest)
185}
186
187pub fn load_eval_pack_manifest(path: &Path) -> Result<EvalPackManifest, VmError> {
188 let content = std::fs::read_to_string(path)
189 .map_err(|e| VmError::Runtime(format!("failed to read eval pack manifest: {e}")))?;
190 let mut manifest: EvalPackManifest =
191 if path.extension().and_then(|ext| ext.to_str()) == Some("json") {
192 serde_json::from_str(&content)
193 .map_err(|e| VmError::Runtime(format!("failed to parse eval pack JSON: {e}")))?
194 } else {
195 toml::from_str(&content)
196 .map_err(|e| VmError::Runtime(format!("failed to parse eval pack TOML: {e}")))?
197 };
198 normalize_eval_pack_manifest(&mut manifest)?;
199 if manifest.base_dir.is_none() {
200 manifest.base_dir = path.parent().map(|parent| parent.display().to_string());
201 }
202 Ok(manifest)
203}
204
205pub fn normalize_eval_pack_manifest_value(value: &VmValue) -> Result<EvalPackManifest, VmError> {
206 let mut manifest: EvalPackManifest = parse_json_value(value)?;
207 normalize_eval_pack_manifest(&mut manifest)?;
208 Ok(manifest)
209}
210
211fn normalize_eval_pack_manifest(manifest: &mut EvalPackManifest) -> Result<(), VmError> {
212 if manifest.version == 0 {
213 manifest.version = 1;
214 }
215 if manifest.trials == 0 {
216 manifest.trials = 1;
217 }
218 if manifest.id.is_empty() {
219 manifest.id = manifest
220 .name
221 .clone()
222 .filter(|name| !name.trim().is_empty())
223 .unwrap_or_else(|| new_id("eval_pack"));
224 }
225 let rubrics_by_id = manifest
226 .rubrics
227 .iter()
228 .filter(|rubric| !rubric.id.is_empty())
229 .map(|rubric| (rubric.id.as_str(), rubric))
230 .collect::<BTreeMap<_, _>>();
231 let fixtures_by_id = manifest
232 .fixtures
233 .iter()
234 .filter(|fixture| !fixture.id.is_empty())
235 .map(|fixture| (fixture.id.as_str(), fixture))
236 .collect::<BTreeMap<_, _>>();
237 for case in &mut manifest.cases {
238 if case.trials == Some(0) {
239 return Err(VmError::Runtime(format!(
240 "eval pack case '{}' has trials = 0",
241 case.id.as_deref().unwrap_or("<unnamed>")
242 )));
243 }
244 case.case_fingerprint =
245 eval_pack_case_fingerprint_with_refs(case, &rubrics_by_id, &fixtures_by_id)?;
246 }
247 for ladder in &mut manifest.ladders {
248 super::super::normalize_persona_eval_ladder_manifest(ladder);
249 }
250 Ok(())
251}
252
253pub fn eval_pack_case_fingerprint(case: &EvalPackCase) -> Result<String, VmError> {
254 eval_pack_case_fingerprint_with_refs(case, &BTreeMap::new(), &BTreeMap::new())
255}
256
/// Computes the fingerprint of a case, resolving rubric and fixture ids through the supplied
/// lookup tables.
///
/// The fingerprint input is a JSON object with four sections — `task`, `expected_outputs`,
/// `verify` and `flags` — each built from the case fields listed below. Rubric ids that are
/// not present in `rubrics_by_id` contribute only through `rubric_ids`; a fixture reference
/// that is not present in `fixtures_by_id` adds no `fixture_ref` entry.
///
/// # Errors
///
/// Returns [`VmError::Runtime`] when any field fails to encode as JSON.
fn eval_pack_case_fingerprint_with_refs(
    case: &EvalPackCase,
    rubrics_by_id: &BTreeMap<&str, &EvalPackRubric>,
    fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
) -> Result<String, VmError> {
    // What the case asks to be done. The kind is the normalized one, so spelling variants
    // of the same kind fingerprint identically.
    let mut task = BTreeMap::new();
    insert_json_field(&mut task, "kind", &normalized_eval_pack_case_kind(case))?;
    insert_json_field(&mut task, "run", &case.run)?;
    insert_json_field(&mut task, "run_path", &case.run_path)?;
    insert_json_field(&mut task, "friction_events", &case.friction_events)?;
    insert_json_field(&mut task, "task", &case.task)?;
    insert_json_field(&mut task, "workspace", &case.workspace)?;
    insert_json_field(&mut task, "project", &case.project)?;

    // What the case expects to come out.
    let mut expected_outputs = BTreeMap::new();
    insert_json_field(&mut expected_outputs, "fixture", &case.fixture)?;
    insert_json_field(&mut expected_outputs, "fixture_path", &case.fixture_path)?;
    insert_json_field(
        &mut expected_outputs,
        "expected_output_paths",
        &case.expected_output_paths,
    )?;
    insert_json_field(
        &mut expected_outputs,
        "required_output_snippets",
        &case.required_output_snippets,
    )?;
    // `fixture` wins over `fixture_path` as the lookup key; the resolved fixture definition is
    // folded in so edits to it change the fingerprint.
    if let Some(fixture_ref) = case.fixture.as_deref().or(case.fixture_path.as_deref()) {
        if let Some(fixture) = fixtures_by_id.get(fixture_ref) {
            insert_json_field(&mut expected_outputs, "fixture_ref", *fixture)?;
        }
    }

    // Resolve the referenced rubrics; unknown ids are silently skipped here.
    let resolved_rubrics = case
        .rubrics
        .iter()
        .filter_map(|rubric_id| rubrics_by_id.get(rubric_id.as_str()))
        .map(|rubric| {
            serde_json::to_value(rubric)
                .map_err(|e| VmError::Runtime(format!("failed to encode eval pack rubric: {e}")))
        })
        .collect::<Result<Vec<_>, _>>()?;
    // How the case is verified.
    let mut verify = BTreeMap::new();
    insert_json_field(&mut verify, "compare_to", &case.compare_to)?;
    insert_json_field(&mut verify, "verify_command", &case.verify_command)?;
    insert_json_field(&mut verify, "tool_budgets", &case.tool_budgets)?;
    insert_json_field(&mut verify, "rubric_ids", &case.rubrics)?;
    verify.insert(
        "rubrics".to_string(),
        serde_json::Value::Array(resolved_rubrics),
    );

    // Remaining per-case settings.
    let mut flags = BTreeMap::new();
    insert_json_field(&mut flags, "severity", &case.severity)?;
    insert_json_field(&mut flags, "thresholds", &case.thresholds)?;
    insert_json_field(&mut flags, "metadata", &case.metadata)?;
    insert_json_field(&mut flags, "executor", &case.executor)?;

    // Assemble the four sections and hash them.
    let mut payload = BTreeMap::new();
    payload.insert("task".to_string(), encode_json(&task)?);
    payload.insert(
        "expected_outputs".to_string(),
        encode_json(&expected_outputs)?,
    );
    payload.insert("verify".to_string(), encode_json(&verify)?);
    payload.insert("flags".to_string(), encode_json(&flags)?);
    fingerprint_json(&payload)
}
325
326pub fn eval_pack_harness_config_fingerprint(
327 manifest: &EvalPackManifest,
328) -> Result<String, VmError> {
329 let rubric_harness = manifest
330 .rubrics
331 .iter()
332 .map(|rubric| {
333 let mut item = BTreeMap::new();
334 insert_json_field(&mut item, "id", &rubric.id)?;
335 insert_json_field(&mut item, "kind", &rubric.kind)?;
336 insert_json_field(&mut item, "prompt", &rubric.prompt)?;
337 insert_json_field(&mut item, "judge", &rubric.judge)?;
338 encode_json(&item)
339 })
340 .collect::<Result<Vec<_>, VmError>>()?;
341 let mut harness_metadata = BTreeMap::new();
342 for key in [
343 "model",
344 "provider",
345 "route",
346 "prompt",
347 "promptVersion",
348 "prompt_version",
349 "toolFormat",
350 "tool_format",
351 "pipelineRev",
352 "pipeline_rev",
353 "pipelineRevision",
354 "pipeline_revision",
355 "harnVersion",
356 "harn_version",
357 "harness",
358 "harnessConfig",
359 "harness_config",
360 ] {
361 if let Some(value) = manifest.metadata.get(key) {
362 harness_metadata.insert(key.to_string(), value.clone());
363 }
364 }
365
366 let mut payload = BTreeMap::new();
367 insert_json_field(&mut payload, "executor", &manifest.executor)?;
368 insert_json_field(&mut payload, "manifest_judge", &manifest.judge)?;
369 insert_json_field(&mut payload, "default_judge", &manifest.defaults.judge)?;
370 insert_json_field(&mut payload, "package", &manifest.package)?;
371 payload.insert(
372 "harness_metadata".to_string(),
373 encode_json(&harness_metadata)?,
374 );
375 payload.insert(
376 "rubric_harness".to_string(),
377 serde_json::Value::Array(rubric_harness),
378 );
379 fingerprint_json(&payload)
380}
381
382fn insert_json_field<T: serde::Serialize>(
383 map: &mut BTreeMap<String, serde_json::Value>,
384 key: &str,
385 value: &T,
386) -> Result<(), VmError> {
387 map.insert(key.to_string(), encode_json(value)?);
388 Ok(())
389}
390
391fn encode_json<T: serde::Serialize>(value: &T) -> Result<serde_json::Value, VmError> {
392 serde_json::to_value(value)
393 .map_err(|e| VmError::Runtime(format!("failed to encode eval pack fingerprint: {e}")))
394}
395
396fn fingerprint_json<T: serde::Serialize>(value: &T) -> Result<String, VmError> {
397 let bytes = serde_json::to_vec(value)
398 .map_err(|e| VmError::Runtime(format!("failed to encode eval pack fingerprint: {e}")))?;
399 let digest = hex::encode(Sha256::digest(bytes));
400 Ok(digest.chars().take(16).collect())
401}
402
403fn eval_pack_case_kind(case: &EvalPackCase) -> EvalPackCaseKind {
404 match normalized_eval_pack_case_kind(case).as_str() {
405 "live-verify" => EvalPackCaseKind::LiveVerify,
406 "friction" => EvalPackCaseKind::Friction,
407 _ => EvalPackCaseKind::Replay,
408 }
409}
410
411fn normalized_eval_pack_case_kind(case: &EvalPackCase) -> String {
412 match case
413 .kind
414 .as_deref()
415 .map(|kind| kind.trim().to_ascii_lowercase().replace('_', "-"))
416 .as_deref()
417 {
418 Some("live") | Some("live-verify") | Some("verify-live") => "live-verify".to_string(),
419 Some("friction") | Some("context-pack-friction") => "friction".to_string(),
420 Some("replay") | Some("fixture") | Some("run-record") => "replay".to_string(),
421 Some(other) if !other.is_empty() => other.to_string(),
422 _ if case.task.is_some()
423 || case.workspace.is_some()
424 || case.project.is_some()
425 || case.verify_command.is_some()
426 || !case.expected_output_paths.is_empty()
427 || !case.required_output_snippets.is_empty() =>
428 {
429 "live-verify".to_string()
430 }
431 _ if case.friction_events.is_some() => "friction".to_string(),
432 _ => "replay".to_string(),
433 }
434}
435
436pub fn eval_ledger_read_report(
437 options: Option<serde_json::Value>,
438) -> Result<EvalLedgerReadReport, VmError> {
439 let options = eval_ledger_options(options)?;
440 let namespace = eval_ledger_namespace(&options);
441 let topic = eval_ledger_topic(&namespace)?;
442 let log = ensure_eval_ledger_event_log(None);
443 let rows = futures::executor::block_on(read_eval_ledger_rows(&log, &topic, &options))?;
444 Ok(EvalLedgerReadReport { rows })
445}
446
447pub fn eval_ledger_append_rows_report(
448 rows: serde_json::Value,
449 options: Option<serde_json::Value>,
450) -> Result<EvalLedgerAppendReport, VmError> {
451 let options = eval_ledger_options(options)?;
452 let namespace = eval_ledger_namespace(&options);
453 let topic = eval_ledger_topic(&namespace)?;
454 let provenance = eval_ledger_provenance(None, &options, None);
455 let rows = parse_eval_ledger_rows(rows)?
456 .into_iter()
457 .map(|mut row| {
458 normalize_eval_ledger_row(&mut row, &options, &provenance);
459 row
460 })
461 .collect::<Vec<_>>();
462 let log = ensure_eval_ledger_event_log(None);
463 futures::executor::block_on(append_eval_ledger_rows(&log, &topic, rows))
464}
465
466pub fn eval_ledger_prior_commit_rows_report(
467 options: serde_json::Value,
468) -> Result<EvalLedgerPriorCommitReport, VmError> {
469 let options = eval_ledger_options(Some(options))?;
470 let namespace = eval_ledger_namespace(&options);
471 let topic = eval_ledger_topic(&namespace)?;
472 let log = ensure_eval_ledger_event_log(None);
473 let mut read_options = options.clone();
474 read_options.commit = None;
475 read_options.case_fingerprint = None;
476 read_options.harness_config_fingerprint = None;
477 let rows = futures::executor::block_on(read_eval_ledger_rows(&log, &topic, &read_options))?;
478 Ok(prior_commit_report(rows, &options))
479}
480
481pub fn eval_ledger_resume_plan_report(
482 manifest: &EvalPackManifest,
483 options: Option<serde_json::Value>,
484) -> Result<EvalLedgerResumePlan, VmError> {
485 let split_report = validate_eval_pack_split(manifest)?;
486 let harness_config_fingerprint = eval_pack_harness_config_fingerprint(manifest)?;
487 let options = eval_ledger_options(options)?;
488 let base_dir = manifest.base_dir.as_deref().map(Path::new);
489 let suite = options.suite.clone().unwrap_or_else(|| manifest.id.clone());
490 let model = options
491 .model
492 .clone()
493 .or_else(|| eval_pack_manifest_model(manifest))
494 .unwrap_or_else(|| "unknown".to_string());
495 let provenance = eval_ledger_provenance(base_dir, &options, Some(&manifest.metadata));
496 let commit = options
497 .commit
498 .clone()
499 .unwrap_or_else(|| provenance.commit.clone());
500 let namespace = eval_pack_ledger_namespace(manifest, &options);
501 let topic = eval_ledger_topic(&namespace)?;
502 let log = ensure_eval_ledger_event_log(base_dir);
503 let read_options = eval_pack_ledger_read_options(&suite, &model, &commit);
504 let rows = futures::executor::block_on(read_eval_ledger_rows(&log, &topic, &read_options))?;
505 Ok(build_eval_ledger_resume_plan(
506 manifest,
507 &split_report,
508 &rows,
509 &suite,
510 &model,
511 &commit,
512 &harness_config_fingerprint,
513 ))
514}
515
516fn eval_ledger_options(value: Option<serde_json::Value>) -> Result<EvalLedgerOptions, VmError> {
517 let mut options = match value {
518 None | Some(serde_json::Value::Null) => EvalLedgerOptions::default(),
519 Some(value) => serde_json::from_value(value)
520 .map_err(|e| VmError::Runtime(format!("eval ledger options parse error: {e}")))?,
521 };
522 normalize_optional_string(&mut options.namespace);
523 normalize_optional_string(&mut options.suite);
524 normalize_optional_string(&mut options.model);
525 normalize_optional_string(&mut options.split);
526 normalize_optional_string(&mut options.commit);
527 normalize_optional_string(&mut options.branch);
528 normalize_optional_string(&mut options.case_name);
529 normalize_optional_string(&mut options.case_fingerprint);
530 normalize_optional_string(&mut options.harness_config_fingerprint);
531 Ok(options)
532}
533
/// Clears the option when it holds an empty or whitespace-only string.
///
/// Non-blank values are left exactly as they are (they are not trimmed).
fn normalize_optional_string(value: &mut Option<String>) {
    let is_blank = matches!(value.as_deref(), Some(text) if text.trim().is_empty());
    if is_blank {
        value.take();
    }
}
539
540fn parse_eval_ledger_rows(value: serde_json::Value) -> Result<Vec<EvalLedgerRow>, VmError> {
541 match value {
542 serde_json::Value::Array(_) => serde_json::from_value(value)
543 .map_err(|e| VmError::Runtime(format!("eval ledger rows parse error: {e}"))),
544 serde_json::Value::Object(_) => serde_json::from_value(value)
545 .map(|row| vec![row])
546 .map_err(|e| VmError::Runtime(format!("eval ledger row parse error: {e}"))),
547 _ => Err(VmError::Runtime(
548 "eval ledger rows must be a row dict or list of row dicts".to_string(),
549 )),
550 }
551}
552
553fn eval_ledger_namespace(options: &EvalLedgerOptions) -> String {
554 options
555 .namespace
556 .clone()
557 .or_else(|| options.suite.clone())
558 .unwrap_or_else(|| "default".to_string())
559}
560
561fn eval_pack_ledger_namespace(manifest: &EvalPackManifest, options: &EvalLedgerOptions) -> String {
562 options
563 .namespace
564 .clone()
565 .or_else(|| metadata_string(&manifest.metadata, &["ledger_namespace", "ledgerNamespace"]))
566 .or_else(|| options.suite.clone())
567 .unwrap_or_else(|| manifest.id.clone())
568}
569
570fn eval_pack_ledger_read_options(suite: &str, model: &str, commit: &str) -> EvalLedgerOptions {
571 EvalLedgerOptions {
572 suite: Some(suite.to_string()),
573 model: Some(model.to_string()),
574 commit: Some(commit.to_string()),
575 ..EvalLedgerOptions::default()
576 }
577}
578
579fn eval_ledger_topic(namespace: &str) -> Result<crate::event_log::Topic, VmError> {
580 let safe_namespace = crate::event_log::sanitize_topic_component(namespace);
581 crate::event_log::Topic::new(format!("{EVAL_LEDGER_TOPIC_PREFIX}.{safe_namespace}"))
582 .map_err(eval_ledger_log_error)
583}
584
585fn ensure_eval_ledger_event_log(base_dir: Option<&Path>) -> Arc<crate::event_log::AnyEventLog> {
586 if let Some(log) = crate::event_log::active_event_log() {
587 return log;
588 }
589 if let Some(base_dir) = base_dir {
590 if crate::event_log::install_lazy_default_for_base_dir(base_dir).is_ok() {
591 if let Some(log) = crate::event_log::active_event_log() {
592 return log;
593 }
594 }
595 } else if let Ok(cwd) = std::env::current_dir() {
596 if crate::event_log::install_lazy_default_for_base_dir(&cwd).is_ok() {
597 if let Some(log) = crate::event_log::active_event_log() {
598 return log;
599 }
600 }
601 }
602 crate::event_log::install_memory_for_current_thread(EVAL_LEDGER_QUEUE_DEPTH)
603}
604
605async fn read_eval_ledger_rows(
606 log: &Arc<crate::event_log::AnyEventLog>,
607 topic: &crate::event_log::Topic,
608 options: &EvalLedgerOptions,
609) -> Result<Vec<EvalLedgerRow>, VmError> {
610 let mut rows = Vec::new();
611 let mut cursor = None;
612 loop {
613 let batch = log
614 .read_range(topic, cursor, EVAL_LEDGER_READ_BATCH_LIMIT)
615 .await
616 .map_err(eval_ledger_log_error)?;
617 if batch.is_empty() {
618 break;
619 }
620 for (event_id, event) in batch {
621 cursor = Some(event_id);
622 if let Some(row) = parse_eval_ledger_row(event_id, event) {
623 if eval_ledger_row_matches(&row, options) {
624 rows.push(row);
625 if options.limit.is_some_and(|limit| rows.len() >= limit) {
626 return Ok(rows);
627 }
628 }
629 }
630 }
631 }
632 Ok(rows)
633}
634
635async fn append_eval_ledger_rows(
636 log: &Arc<crate::event_log::AnyEventLog>,
637 topic: &crate::event_log::Topic,
638 rows: Vec<EvalLedgerRow>,
639) -> Result<EvalLedgerAppendReport, VmError> {
640 let mut report = EvalLedgerAppendReport {
641 appended: rows.len(),
642 all_skipped: !rows.is_empty() && rows.iter().all(eval_ledger_row_is_skip),
643 ..EvalLedgerAppendReport::default()
644 };
645 for row in rows {
646 let identity = eval_ledger_row_identity(&row)?;
647 let mut headers = BTreeMap::new();
648 headers.insert(EVAL_LEDGER_IDENTITY_HEADER.to_string(), identity.clone());
649 headers.insert("suite".to_string(), row.suite.clone());
650 headers.insert("model".to_string(), row.model.clone());
651 headers.insert("commit".to_string(), row.commit.clone());
652 headers.insert("case_name".to_string(), row.case_name.clone());
653 headers.insert("trial".to_string(), row.trial.to_string());
654 let payload = serde_json::to_value(&row)
655 .map_err(|e| VmError::Runtime(format!("eval ledger row encode error: {e}")))?;
656 let outcome = log
657 .append_idempotent_by_header(
658 topic,
659 EVAL_LEDGER_IDENTITY_HEADER,
660 &identity,
661 crate::event_log::LogEvent::new(EVAL_LEDGER_ROW_KIND, payload)
662 .with_headers(headers),
663 )
664 .await
665 .map_err(eval_ledger_log_error)?;
666 if outcome.inserted {
667 report.inserted += 1;
668 } else {
669 report.duplicates += 1;
670 }
671 report.event_ids.push(outcome.event_id);
672 if let Some(stored) = parse_eval_ledger_row(outcome.event_id, outcome.event) {
673 report.rows.push(stored);
674 }
675 }
676 log.flush().await.map_err(eval_ledger_log_error)?;
677 Ok(report)
678}
679
680fn parse_eval_ledger_row(
681 event_id: crate::event_log::EventId,
682 event: crate::event_log::LogEvent,
683) -> Option<EvalLedgerRow> {
684 if event.kind != EVAL_LEDGER_ROW_KIND {
685 return None;
686 }
687 let mut row: EvalLedgerRow = serde_json::from_value(event.payload).ok()?;
688 if row.schema != EVAL_LEDGER_ROW_SCHEMA {
689 return None;
690 }
691 row.event_id = Some(event_id);
692 Some(row)
693}
694
695fn eval_ledger_row_matches(row: &EvalLedgerRow, options: &EvalLedgerOptions) -> bool {
696 option_matches(options.suite.as_deref(), &row.suite)
697 && option_matches(options.model.as_deref(), &row.model)
698 && option_matches(options.commit.as_deref(), &row.commit)
699 && option_matches(options.case_name.as_deref(), &row.case_name)
700 && option_matches(options.case_fingerprint.as_deref(), &row.case_fingerprint)
701 && option_matches(
702 options.harness_config_fingerprint.as_deref(),
703 &row.harness_config_fingerprint,
704 )
705 && match options.split.as_deref() {
706 Some(expected) => row.split.as_deref() == Some(expected),
707 None => true,
708 }
709}
710
/// Returns `true` when no expectation is set or when it equals `actual` exactly.
fn option_matches(expected: Option<&str>, actual: &str) -> bool {
    match expected {
        None => true,
        Some(wanted) => wanted == actual,
    }
}
714
/// Fills in every empty or zero field of a ledger row before it is appended.
///
/// Values already present on the row are never overwritten. Missing values come from
/// `options` first and then from fixed defaults or `provenance`. The steps run in order and
/// later steps read fields set by earlier ones (for example `verification` copies the
/// defaulted `status`).
fn normalize_eval_ledger_row(
    row: &mut EvalLedgerRow,
    options: &EvalLedgerOptions,
    provenance: &EvalLedgerProvenance,
) {
    if row.schema.is_empty() {
        row.schema = EVAL_LEDGER_ROW_SCHEMA.to_string();
    }
    // Suite falls back to the suite option, then to the resolved namespace.
    if row.suite.is_empty() {
        row.suite = options
            .suite
            .clone()
            .unwrap_or_else(|| eval_ledger_namespace(options));
    }
    if row.model.is_empty() {
        row.model = options
            .model
            .clone()
            .unwrap_or_else(|| "unknown".to_string());
    }
    if row.split.is_none() {
        row.split = options.split.clone();
    }
    if row.commit.is_empty() {
        row.commit = options
            .commit
            .clone()
            .unwrap_or_else(|| provenance.commit.clone());
    }
    // `case_name` and `name` back each other up: the case name falls back to the option and
    // then to `name`, after which an empty `name` takes the case name.
    if row.case_name.is_empty() {
        row.case_name = options
            .case_name
            .clone()
            .filter(|name| !name.is_empty())
            .unwrap_or_else(|| row.name.clone());
    }
    if row.name.is_empty() {
        row.name = row.case_name.clone();
    }
    if row.case_fingerprint.is_empty() {
        row.case_fingerprint = options.case_fingerprint.clone().unwrap_or_default();
    }
    if row.harness_config_fingerprint.is_empty() {
        row.harness_config_fingerprint = options
            .harness_config_fingerprint
            .clone()
            .unwrap_or_default();
    }
    // Trial numbering and trial count both default to 1.
    if row.trial == 0 {
        row.trial = 1;
    }
    if row.trials == 0 {
        row.trials = 1;
    }
    // Derive a status from the counters: any pass wins over any fail, otherwise skip.
    if row.status.is_empty() {
        row.status = if row.passes > 0 {
            "PASS"
        } else if row.fails > 0 {
            "FAIL"
        } else {
            "skip"
        }
        .to_string();
    }
    if row.verification.is_empty() {
        row.verification = row.status.clone();
    }
    // Conversely, derive a single counter from the status when all counters are zero.
    if row.passes + row.fails + row.skips == 0 {
        match row.status.to_ascii_uppercase().as_str() {
            "PASS" => row.passes = 1,
            "FAIL" => row.fails = 1,
            _ => row.skips = 1,
        }
    }
    // A zero pass rate alongside recorded passes is treated as "not computed yet".
    if row.pass_rate == 0.0 && row.passes > 0 {
        row.pass_rate = row.passes as f64 / row.trials.max(1) as f64;
    }
    // Provenance: the commit mirrors the row's commit, the rest comes from `provenance`.
    if row.provenance.commit.is_empty() {
        row.provenance.commit = row.commit.clone();
    }
    if row.provenance.branch.is_none() {
        row.provenance.branch = provenance.branch.clone();
    }
    if row.provenance.ts.is_empty() {
        row.provenance.ts = provenance.ts.clone();
    }
    if row.provenance.harn_version.is_empty() {
        row.provenance.harn_version = provenance.harn_version.clone();
    }
    if row.provenance.host.is_empty() {
        row.provenance.host = provenance.host.clone();
    }
}
808
809fn eval_ledger_row_identity(row: &EvalLedgerRow) -> Result<String, VmError> {
810 let material = serde_json::json!({
811 "schema": EVAL_LEDGER_ROW_SCHEMA,
812 "suite": row.suite,
813 "model": row.model,
814 "split": row.split,
815 "commit": row.commit,
816 "case_name": row.case_name,
817 "case_fingerprint": row.case_fingerprint,
818 "harness_config_fingerprint": row.harness_config_fingerprint,
819 "trial": row.trial,
820 });
821 let bytes = serde_json::to_vec(&material)
822 .map_err(|e| VmError::Runtime(format!("eval ledger identity encode error: {e}")))?;
823 Ok(format!("sha256:{}", hex::encode(Sha256::digest(bytes))))
824}
825
826fn eval_ledger_row_is_skip(row: &EvalLedgerRow) -> bool {
827 row.skipped || row.skips > 0 || row.status.eq_ignore_ascii_case("skip")
828}
829
830fn eval_ledger_provenance(
831 base_dir: Option<&Path>,
832 options: &EvalLedgerOptions,
833 metadata: Option<&BTreeMap<String, serde_json::Value>>,
834) -> EvalLedgerProvenance {
835 let commit = options
836 .commit
837 .clone()
838 .or_else(|| {
839 metadata.and_then(|metadata| {
840 metadata_string(metadata, &["commit", "git_commit", "source_commit"])
841 })
842 })
843 .or_else(|| env_string(&["HARN_EVAL_COMMIT", "HARN_GIT_COMMIT", "GITHUB_SHA"]))
844 .or_else(|| git_output(base_dir, &["rev-parse", "HEAD"]))
845 .unwrap_or_else(|| "unknown".to_string());
846 let branch = options
847 .branch
848 .clone()
849 .or_else(|| {
850 metadata.and_then(|metadata| {
851 metadata_string(metadata, &["branch", "git_branch", "source_branch"])
852 })
853 })
854 .or_else(|| env_string(&["HARN_EVAL_BRANCH", "HARN_GIT_BRANCH", "GITHUB_REF_NAME"]))
855 .or_else(|| git_output(base_dir, &["rev-parse", "--abbrev-ref", "HEAD"]));
856 EvalLedgerProvenance {
857 commit,
858 branch,
859 ts: now_rfc3339(),
860 harn_version: crate::bytecode_cache::HARN_VERSION.to_string(),
861 host: env_string(&["HOSTNAME", "COMPUTERNAME"]).unwrap_or_else(|| "unknown".to_string()),
862 }
863}
864
/// Returns the trimmed value of the first listed environment variable that is set, valid
/// Unicode, and not blank.
fn env_string(keys: &[&str]) -> Option<String> {
    for key in keys {
        if let Ok(raw) = std::env::var(key) {
            let trimmed = raw.trim();
            if !trimmed.is_empty() {
                return Some(trimmed.to_string());
            }
        }
    }
    None
}
873
/// Runs `git` with `args` (inside `base_dir` via `-C` when given) and returns its trimmed
/// standard output.
///
/// Returns `None` when git cannot be started, exits unsuccessfully, prints invalid UTF-8, or
/// prints nothing but whitespace.
fn git_output(base_dir: Option<&Path>, args: &[&str]) -> Option<String> {
    let mut git = std::process::Command::new("git");
    if let Some(dir) = base_dir {
        git.arg("-C").arg(dir);
    }
    git.args(args);
    let finished = match git.output() {
        Ok(result) if result.status.success() => result,
        _ => return None,
    };
    let text = String::from_utf8(finished.stdout).ok()?;
    let trimmed = text.trim();
    if trimmed.is_empty() {
        None
    } else {
        Some(trimmed.to_string())
    }
}
888
889fn metadata_string(
890 metadata: &BTreeMap<String, serde_json::Value>,
891 keys: &[&str],
892) -> Option<String> {
893 keys.iter()
894 .find_map(|key| json_value_string(metadata.get(*key)?))
895}
896
897fn json_value_string(value: &serde_json::Value) -> Option<String> {
898 match value {
899 serde_json::Value::String(value) => Some(value.trim().to_string()),
900 serde_json::Value::Number(value) => Some(value.to_string()),
901 serde_json::Value::Bool(value) => Some(value.to_string()),
902 _ => None,
903 }
904 .filter(|value| !value.is_empty())
905}
906
907fn eval_pack_manifest_model(manifest: &EvalPackManifest) -> Option<String> {
908 metadata_string(&manifest.metadata, &["model", "provider_model", "route"])
909 .or_else(|| {
910 manifest
911 .judge
912 .as_ref()
913 .and_then(|judge| judge.model.clone())
914 })
915 .or_else(|| {
916 manifest
917 .defaults
918 .judge
919 .as_ref()
920 .and_then(|judge| judge.model.clone())
921 })
922}
923
924fn prior_commit_report(
925 rows: Vec<EvalLedgerRow>,
926 options: &EvalLedgerOptions,
927) -> EvalLedgerPriorCommitReport {
928 let current_commit = options.commit.as_deref().unwrap_or_default();
929 let mut fingerprint_mismatches = Vec::new();
930 let mut candidates = Vec::new();
931 let mut latest_event_by_commit = BTreeMap::<String, u64>::new();
932 for row in rows {
933 if row.commit == current_commit {
934 continue;
935 }
936 if let Some(mismatch) = fingerprint_mismatch_for_row(&row, options) {
937 fingerprint_mismatches.push(mismatch);
938 continue;
939 }
940 let event_id = row.event_id.unwrap_or_default();
941 latest_event_by_commit
942 .entry(row.commit.clone())
943 .and_modify(|existing| *existing = (*existing).max(event_id))
944 .or_insert(event_id);
945 candidates.push(row);
946 }
947 let selected_commit = latest_event_by_commit
948 .iter()
949 .max_by_key(|(_, event_id)| *event_id)
950 .map(|(commit, _)| commit.clone());
951 let rows = selected_commit
952 .as_ref()
953 .map(|commit| {
954 candidates
955 .into_iter()
956 .filter(|row| &row.commit == commit)
957 .collect()
958 })
959 .unwrap_or_default();
960 EvalLedgerPriorCommitReport {
961 commit: selected_commit,
962 model: options.model.clone().unwrap_or_default(),
963 split: options.split.clone(),
964 rows,
965 fingerprint_mismatches,
966 }
967}
968
969fn fingerprint_mismatch_for_row(
970 row: &EvalLedgerRow,
971 options: &EvalLedgerOptions,
972) -> Option<EvalLedgerFingerprintMismatch> {
973 let expected_case = options.case_fingerprint.as_deref();
974 let expected_harness = options.harness_config_fingerprint.as_deref();
975 let case_mismatch = expected_case.is_some_and(|expected| expected != row.case_fingerprint);
976 let harness_mismatch =
977 expected_harness.is_some_and(|expected| expected != row.harness_config_fingerprint);
978 if !(case_mismatch || harness_mismatch) {
979 return None;
980 }
981 Some(EvalLedgerFingerprintMismatch {
982 case_name: row.case_name.clone(),
983 split: row.split.clone(),
984 commit: row.commit.clone(),
985 trial: row.trial,
986 case_fingerprint: row.case_fingerprint.clone(),
987 harness_config_fingerprint: row.harness_config_fingerprint.clone(),
988 expected_case_fingerprint: expected_case.unwrap_or_default().to_string(),
989 expected_harness_config_fingerprint: expected_harness.unwrap_or_default().to_string(),
990 })
991}
992
993fn build_eval_ledger_resume_plan(
994 manifest: &EvalPackManifest,
995 split_report: &EvalPackSplitValidationReport,
996 rows: &[EvalLedgerRow],
997 suite: &str,
998 model: &str,
999 commit: &str,
1000 harness_config_fingerprint: &str,
1001) -> EvalLedgerResumePlan {
1002 let split_by_case = split_by_case_id(split_report);
1003 let mut cells = Vec::new();
1004 let mut fingerprint_refusals = Vec::new();
1005 let mut skipped_cells = 0usize;
1006 for (index, case) in manifest.cases.iter().enumerate() {
1007 let case_id = eval_pack_case_id(case, index);
1008 let split = split_by_case.get(&case_id).cloned();
1009 let trial_count = case.trials.unwrap_or(manifest.trials);
1010 for trial in 1..=trial_count {
1011 let matching = ledger_rows_for_cell(
1012 rows,
1013 suite,
1014 model,
1015 split.as_deref(),
1016 commit,
1017 &case_id,
1018 trial,
1019 );
1020 let exact = matching
1021 .iter()
1022 .copied()
1023 .filter(|row| {
1024 row.case_fingerprint == case.case_fingerprint
1025 && row.harness_config_fingerprint == harness_config_fingerprint
1026 })
1027 .max_by_key(|row| row.event_id.unwrap_or_default());
1028 if let Some(row) = exact {
1029 skipped_cells += 1;
1030 cells.push(EvalLedgerResumeCell {
1031 case_name: case_id.clone(),
1032 split: split.clone(),
1033 trial,
1034 status: "skip".to_string(),
1035 reason: "matching ledger row".to_string(),
1036 event_id: row.event_id,
1037 });
1038 continue;
1039 }
1040 let mut refused = false;
1041 for row in matching {
1042 if row.case_fingerprint != case.case_fingerprint
1043 || row.harness_config_fingerprint != harness_config_fingerprint
1044 {
1045 fingerprint_refusals.push(EvalLedgerFingerprintMismatch {
1046 case_name: case_id.clone(),
1047 split: split.clone(),
1048 commit: row.commit.clone(),
1049 trial,
1050 case_fingerprint: row.case_fingerprint.clone(),
1051 harness_config_fingerprint: row.harness_config_fingerprint.clone(),
1052 expected_case_fingerprint: case.case_fingerprint.clone(),
1053 expected_harness_config_fingerprint: harness_config_fingerprint.to_string(),
1054 });
1055 refused = true;
1056 }
1057 }
1058 cells.push(EvalLedgerResumeCell {
1059 case_name: case_id.clone(),
1060 split: split.clone(),
1061 trial,
1062 status: "run".to_string(),
1063 reason: if refused {
1064 "fingerprint mismatch".to_string()
1065 } else {
1066 "missing ledger row".to_string()
1067 },
1068 event_id: None,
1069 });
1070 }
1071 }
1072 let requested_cells = cells.len();
1073 let remaining_cells = requested_cells.saturating_sub(skipped_cells);
1074 EvalLedgerResumePlan {
1075 schema: EVAL_LEDGER_RESUME_PLAN_SCHEMA.to_string(),
1076 suite: suite.to_string(),
1077 model: model.to_string(),
1078 commit: commit.to_string(),
1079 harness_config_fingerprint: harness_config_fingerprint.to_string(),
1080 requested_cells,
1081 completed_cells: skipped_cells,
1082 skipped_cells,
1083 remaining_cells,
1084 all_skipped: requested_cells > 0 && remaining_cells == 0,
1085 fingerprint_refusals,
1086 cells,
1087 }
1088}
1089
1090fn ledger_rows_for_cell<'a>(
1091 rows: &'a [EvalLedgerRow],
1092 suite: &str,
1093 model: &str,
1094 split: Option<&str>,
1095 commit: &str,
1096 case_name: &str,
1097 trial: usize,
1098) -> Vec<&'a EvalLedgerRow> {
1099 rows.iter()
1100 .filter(|row| {
1101 row.suite == suite
1102 && row.model == model
1103 && row.split.as_deref() == split
1104 && row.commit == commit
1105 && row.case_name == case_name
1106 && row.trial == trial
1107 })
1108 .collect()
1109}
1110
1111fn eval_ledger_log_error(error: crate::event_log::LogError) -> VmError {
1112 VmError::Runtime(format!("eval ledger: event log: {error}"))
1113}
1114
1115impl EvalPackLedgerRun {
1116 fn start(
1117 manifest: &EvalPackManifest,
1118 base_dir: Option<&Path>,
1119 options: Option<serde_json::Value>,
1120 ) -> Result<Self, VmError> {
1121 let options = eval_ledger_options(options)?;
1122 let suite = options.suite.clone().unwrap_or_else(|| manifest.id.clone());
1123 let model = options
1124 .model
1125 .clone()
1126 .or_else(|| eval_pack_manifest_model(manifest))
1127 .unwrap_or_else(|| "unknown".to_string());
1128 let provenance = eval_ledger_provenance(base_dir, &options, Some(&manifest.metadata));
1129 let commit = options
1130 .commit
1131 .clone()
1132 .unwrap_or_else(|| provenance.commit.clone());
1133 let namespace = eval_pack_ledger_namespace(manifest, &options);
1134 let topic = eval_ledger_topic(&namespace)?;
1135 let log = ensure_eval_ledger_event_log(base_dir);
1136 let read_options = eval_pack_ledger_read_options(&suite, &model, &commit);
1137 let rows = futures::executor::block_on(read_eval_ledger_rows(&log, &topic, &read_options))?;
1138 Ok(Self {
1139 log,
1140 topic,
1141 rows,
1142 suite,
1143 model,
1144 commit,
1145 branch: provenance.branch.clone(),
1146 provenance,
1147 inserted: 0,
1148 duplicates: 0,
1149 fingerprint_refusals: Vec::new(),
1150 })
1151 }
1152
1153 fn replay_row_for_cell(
1154 &mut self,
1155 case_id: &str,
1156 split: Option<&str>,
1157 trial: usize,
1158 case_fingerprint: &str,
1159 harness_config_fingerprint: &str,
1160 ) -> Option<EvalLedgerRow> {
1161 let matching = ledger_rows_for_cell(
1162 &self.rows,
1163 &self.suite,
1164 &self.model,
1165 split,
1166 &self.commit,
1167 case_id,
1168 trial,
1169 );
1170 let exact = matching
1171 .iter()
1172 .copied()
1173 .filter(|row| {
1174 row.case_fingerprint == case_fingerprint
1175 && row.harness_config_fingerprint == harness_config_fingerprint
1176 })
1177 .max_by_key(|row| row.event_id.unwrap_or_default())
1178 .cloned();
1179 if exact.is_some() {
1180 return exact;
1181 }
1182 for row in matching {
1183 if row.case_fingerprint != case_fingerprint
1184 || row.harness_config_fingerprint != harness_config_fingerprint
1185 {
1186 self.fingerprint_refusals
1187 .push(EvalLedgerFingerprintMismatch {
1188 case_name: case_id.to_string(),
1189 split: split.map(str::to_string),
1190 commit: row.commit.clone(),
1191 trial,
1192 case_fingerprint: row.case_fingerprint.clone(),
1193 harness_config_fingerprint: row.harness_config_fingerprint.clone(),
1194 expected_case_fingerprint: case_fingerprint.to_string(),
1195 expected_harness_config_fingerprint: harness_config_fingerprint.to_string(),
1196 });
1197 }
1198 }
1199 None
1200 }
1201
1202 fn append_trial_row(&mut self, row: EvalLedgerRow) -> Result<(), VmError> {
1203 let report = futures::executor::block_on(append_eval_ledger_rows(
1204 &self.log,
1205 &self.topic,
1206 vec![row],
1207 ))?;
1208 self.inserted += report.inserted;
1209 self.duplicates += report.duplicates;
1210 self.rows.extend(report.rows);
1211 Ok(())
1212 }
1213
1214 fn finish(
1215 &self,
1216 requested_cells: usize,
1217 skipped_cells: usize,
1218 executed_cells: usize,
1219 ) -> Result<EvalPackRunState, VmError> {
1220 let remaining_cells = requested_cells.saturating_sub(skipped_cells + executed_cells);
1221 let mut state = EvalPackRunState {
1222 schema: EVAL_LEDGER_RUN_STATE_SCHEMA.to_string(),
1223 suite: self.suite.clone(),
1224 model: self.model.clone(),
1225 commit: self.commit.clone(),
1226 branch: self.branch.clone(),
1227 requested_cells,
1228 completed_cells: skipped_cells + executed_cells,
1229 skipped_cells,
1230 executed_cells,
1231 remaining_cells,
1232 ledger_rows_inserted: self.inserted,
1233 ledger_rows_duplicate: self.duplicates,
1234 fingerprint_refusals: self.fingerprint_refusals.len(),
1235 all_skipped: requested_cells > 0 && skipped_cells == requested_cells,
1236 heartbeat_event_id: None,
1237 };
1238 let event_id = self.append_run_state(&state)?;
1239 state.heartbeat_event_id = Some(event_id);
1240 Ok(state)
1241 }
1242
1243 fn append_run_state(&self, state: &EvalPackRunState) -> Result<u64, VmError> {
1244 let payload = serde_json::to_value(state)
1245 .map_err(|e| VmError::Runtime(format!("eval run-state encode error: {e}")))?;
1246 let event_id = futures::executor::block_on(self.log.append(
1247 &self.topic,
1248 crate::event_log::LogEvent::new(EVAL_LEDGER_RUN_STATE_KIND, payload),
1249 ))
1250 .map_err(eval_ledger_log_error)?;
1251 futures::executor::block_on(self.log.flush()).map_err(eval_ledger_log_error)?;
1252 Ok(event_id)
1253 }
1254}
1255
1256pub fn validate_eval_pack_split(
1257 manifest: &EvalPackManifest,
1258) -> Result<EvalPackSplitValidationReport, VmError> {
1259 let report = eval_pack_split_validation_report(manifest);
1260 if !report.valid {
1261 return Err(VmError::Runtime(format!(
1262 "eval pack split invalid: {}",
1263 render_split_validation_errors(&report).join("; ")
1264 )));
1265 }
1266 Ok(report)
1267}
1268
1269fn eval_pack_split_validation_report(manifest: &EvalPackManifest) -> EvalPackSplitValidationReport {
1270 let case_ids = eval_pack_case_ids(manifest);
1271 let mut duplicate_case_ids = duplicates(&case_ids);
1272 duplicate_case_ids.sort();
1273
1274 let case_set = case_ids.iter().cloned().collect::<BTreeSet<_>>();
1275 let Some(split) = &manifest.split else {
1276 return EvalPackSplitValidationReport {
1277 valid: duplicate_case_ids.is_empty(),
1278 case_count: case_ids.len(),
1279 covered_count: 0,
1280 duplicate_case_ids,
1281 ..EvalPackSplitValidationReport::default()
1282 };
1283 };
1284
1285 let mut duplicate_partition_cases = Vec::new();
1286 let mut unknown_cases = Vec::new();
1287 let mut seen_by_case: BTreeMap<String, Vec<String>> = BTreeMap::new();
1288 for (partition, cases) in &split.partitions {
1289 let mut local_seen = BTreeSet::new();
1290 for case_id in cases {
1291 if !local_seen.insert(case_id.clone()) {
1292 duplicate_partition_cases.push(format!("{partition}:{case_id}"));
1293 }
1294 if !case_set.contains(case_id) {
1295 unknown_cases.push(format!("{partition}:{case_id}"));
1296 }
1297 let partitions = seen_by_case.entry(case_id.clone()).or_default();
1298 if !partitions.contains(partition) {
1299 partitions.push(partition.clone());
1300 }
1301 }
1302 }
1303
1304 let mut overlap_cases = seen_by_case
1305 .iter()
1306 .filter(|(case_id, partitions)| case_set.contains(*case_id) && partitions.len() > 1)
1307 .map(|(case_id, partitions)| format!("{case_id}:{}", partitions.join(",")))
1308 .collect::<Vec<_>>();
1309 let mut missing_cases = case_set
1310 .iter()
1311 .filter(|case_id| !seen_by_case.contains_key(*case_id))
1312 .cloned()
1313 .collect::<Vec<_>>();
1314 duplicate_partition_cases.sort();
1315 unknown_cases.sort();
1316 overlap_cases.sort();
1317 missing_cases.sort();
1318
1319 let covered_count = case_set
1320 .iter()
1321 .filter(|case_id| seen_by_case.contains_key(*case_id))
1322 .count();
1323 let valid = duplicate_case_ids.is_empty()
1324 && duplicate_partition_cases.is_empty()
1325 && unknown_cases.is_empty()
1326 && overlap_cases.is_empty()
1327 && missing_cases.is_empty();
1328 EvalPackSplitValidationReport {
1329 valid,
1330 partitions: split.partitions.clone(),
1331 case_count: case_ids.len(),
1332 covered_count,
1333 duplicate_case_ids,
1334 duplicate_partition_cases,
1335 overlap_cases,
1336 unknown_cases,
1337 missing_cases,
1338 }
1339}
1340
1341fn eval_pack_case_ids(manifest: &EvalPackManifest) -> Vec<String> {
1342 manifest
1343 .cases
1344 .iter()
1345 .enumerate()
1346 .map(|(index, case)| eval_pack_case_id(case, index))
1347 .collect()
1348}
1349
1350fn eval_pack_case_id(case: &EvalPackCase, index: usize) -> String {
1351 case.id
1352 .clone()
1353 .filter(|id| !id.trim().is_empty())
1354 .unwrap_or_else(|| format!("case_{}", index + 1))
1355}
1356
/// Returns every value that occurs more than once in `values`.
///
/// Each repeated value appears once in the result, in ascending order.
fn duplicates(values: &[String]) -> Vec<String> {
    let mut seen: BTreeSet<&str> = BTreeSet::new();
    let mut repeated: BTreeSet<&str> = BTreeSet::new();
    for value in values {
        let first_time = seen.insert(value.as_str());
        if !first_time {
            repeated.insert(value.as_str());
        }
    }
    repeated.into_iter().map(str::to_string).collect()
}
1367
1368fn render_split_validation_errors(report: &EvalPackSplitValidationReport) -> Vec<String> {
1369 let mut errors = Vec::new();
1370 if !report.duplicate_case_ids.is_empty() {
1371 errors.push(format!(
1372 "duplicate case ids: {}",
1373 report.duplicate_case_ids.join(", ")
1374 ));
1375 }
1376 if !report.duplicate_partition_cases.is_empty() {
1377 errors.push(format!(
1378 "duplicate partition entries: {}",
1379 report.duplicate_partition_cases.join(", ")
1380 ));
1381 }
1382 if !report.overlap_cases.is_empty() {
1383 errors.push(format!(
1384 "overlapping cases: {}",
1385 report.overlap_cases.join(", ")
1386 ));
1387 }
1388 if !report.unknown_cases.is_empty() {
1389 errors.push(format!(
1390 "unknown cases: {}",
1391 report.unknown_cases.join(", ")
1392 ));
1393 }
1394 if !report.missing_cases.is_empty() {
1395 errors.push(format!(
1396 "missing cases: {}",
1397 report.missing_cases.join(", ")
1398 ));
1399 }
1400 if errors.is_empty() {
1401 errors.push("unknown split validation error".to_string());
1402 }
1403 errors
1404}
1405
1406fn load_replay_fixture(path: &Path) -> Result<ReplayFixture, VmError> {
1407 let content = std::fs::read_to_string(path)
1408 .map_err(|e| VmError::Runtime(format!("failed to read replay fixture: {e}")))?;
1409 serde_json::from_str(&content)
1410 .map_err(|e| VmError::Runtime(format!("failed to parse replay fixture: {e}")))
1411}
1412
1413fn load_run_record_from_fixture_ref(
1414 fixture: &EvalPackFixtureRef,
1415 base_dir: Option<&Path>,
1416) -> Result<RunRecord, VmError> {
1417 if let Some(inline) = &fixture.inline {
1418 let run: RunRecord = serde_json::from_value(inline.clone())
1419 .map_err(|e| VmError::Runtime(format!("failed to parse inline run record: {e}")))?;
1420 return Ok(run);
1421 }
1422 let path = fixture.path.as_deref().ok_or_else(|| {
1423 VmError::Runtime(format!(
1424 "fixture '{}' is missing path or inline run",
1425 fixture.id
1426 ))
1427 })?;
1428 load_run_record(&resolve_manifest_path(base_dir, path))
1429}
1430
1431fn load_replay_fixture_from_ref(
1432 fixture: &EvalPackFixtureRef,
1433 base_dir: Option<&Path>,
1434) -> Result<ReplayFixture, VmError> {
1435 if let Some(inline) = &fixture.inline {
1436 return serde_json::from_value(inline.clone())
1437 .map_err(|e| VmError::Runtime(format!("failed to parse inline replay fixture: {e}")));
1438 }
1439 let path = fixture.path.as_deref().ok_or_else(|| {
1440 VmError::Runtime(format!(
1441 "fixture '{}' is missing path or inline replay fixture",
1442 fixture.id
1443 ))
1444 })?;
1445 load_replay_fixture(&resolve_manifest_path(base_dir, path))
1446}
1447
/// Resolves a path written in a manifest.
///
/// A relative `path` is joined onto `base_dir` when one is given. An absolute
/// path, or any path when there is no base directory, is returned unchanged.
fn resolve_manifest_path(base_dir: Option<&Path>, path: &str) -> PathBuf {
    let candidate = Path::new(path);
    match base_dir {
        Some(root) if !candidate.is_absolute() => root.join(candidate),
        _ => candidate.to_path_buf(),
    }
}
1458
1459fn run_eval_pack_command(
1460 spec: &EvalPackCommandSpec,
1461 default_cwd: &Path,
1462 stdin_payload: Option<&serde_json::Value>,
1463 default_timeout_seconds: f64,
1464) -> Result<EvalPackCommandOutput, VmError> {
1465 let timeout_seconds = command_timeout(spec).unwrap_or(default_timeout_seconds);
1466 let timeout = timeout_seconds
1467 .is_finite()
1468 .then_some(timeout_seconds)
1469 .filter(|seconds| *seconds > 0.0)
1470 .ok_or_else(|| {
1471 VmError::Runtime(
1472 "eval pack command timeout must be a positive finite number of seconds".to_string(),
1473 )
1474 })?;
1475 let mut command = eval_pack_command(spec)?;
1476 command
1477 .current_dir(command_cwd(spec, default_cwd))
1478 .stdin(if stdin_payload.is_some() {
1479 Stdio::piped()
1480 } else {
1481 Stdio::null()
1482 })
1483 .stdout(Stdio::piped())
1484 .stderr(Stdio::piped());
1485 apply_command_env(spec, &mut command);
1486
1487 let started = crate::clock_mock::leak_audit::instant_now("eval_pack.command.started");
1488 let mut child = command
1489 .spawn()
1490 .map_err(|e| VmError::Runtime(format!("eval pack command spawn failed: {e}")))?;
1491 let stdout_reader = child.stdout.take().map(|mut pipe| {
1492 std::thread::spawn(move || {
1493 let mut bytes = Vec::new();
1494 pipe.read_to_end(&mut bytes).map(|_| bytes)
1495 })
1496 });
1497 let stderr_reader = child.stderr.take().map(|mut pipe| {
1498 std::thread::spawn(move || {
1499 let mut bytes = Vec::new();
1500 pipe.read_to_end(&mut bytes).map(|_| bytes)
1501 })
1502 });
1503
1504 let mut stdin_error = None;
1505 if let Some(payload) = stdin_payload {
1506 match child.stdin.take() {
1507 Some(mut stdin) => {
1508 if let Err(error) = serde_json::to_writer(&mut stdin, payload) {
1509 stdin_error = Some(format!("eval pack command stdin encode failed: {error}"));
1510 } else if let Err(error) = stdin.write_all(b"\n") {
1511 stdin_error = Some(format!("eval pack command stdin write failed: {error}"));
1512 }
1513 }
1514 None => {
1515 stdin_error = Some("eval pack command stdin pipe was unavailable".to_string());
1516 }
1517 }
1518 }
1519
1520 let timeout = Duration::from_secs_f64(timeout);
1521 let status = if stdin_error.is_some() {
1522 let _ = child.kill();
1523 let _ = child.wait();
1524 None
1525 } else {
1526 match child
1527 .wait_timeout(timeout)
1528 .map_err(|e| VmError::Runtime(format!("eval pack command wait failed: {e}")))?
1529 {
1530 Some(status) => Some(status),
1531 None => {
1532 let _ = child.kill();
1533 let _ = child.wait();
1534 None
1535 }
1536 }
1537 };
1538 if let Some(error) = stdin_error {
1539 let _ = join_command_reader(stdout_reader, "stdout")?;
1540 let _ = join_command_reader(stderr_reader, "stderr")?;
1541 return Err(VmError::Runtime(error));
1542 }
1543 let wall_time_seconds = started.elapsed().as_secs_f64();
1544 let stdout = join_command_reader(stdout_reader, "stdout")?;
1545 let stderr = join_command_reader(stderr_reader, "stderr")?;
1546 let timed_out = status.is_none();
1547 let exit_code = status.and_then(|status| status.code()).unwrap_or(-1) as i64;
1548 Ok(EvalPackCommandOutput {
1549 exit_code,
1550 stdout,
1551 stderr,
1552 timed_out,
1553 wall_time_seconds,
1554 })
1555}
1556
1557fn eval_pack_command(spec: &EvalPackCommandSpec) -> Result<Command, VmError> {
1558 match spec {
1559 EvalPackCommandSpec::Shell(command) => shell_command(command),
1560 EvalPackCommandSpec::Argv(argv) => argv_command(argv),
1561 EvalPackCommandSpec::Object(object) => {
1562 if let Some(command) = object.command.as_deref() {
1563 shell_command(command)
1564 } else {
1565 argv_command(&object.argv)
1566 }
1567 }
1568 }
1569}
1570
1571fn shell_command(command: &str) -> Result<Command, VmError> {
1572 let command = command.trim();
1573 if command.is_empty() {
1574 return Err(VmError::Runtime(
1575 "eval pack shell command must not be empty".to_string(),
1576 ));
1577 }
1578 #[cfg(windows)]
1579 {
1580 let mut cmd = Command::new("cmd");
1581 cmd.args(["/C", command]);
1582 Ok(cmd)
1583 }
1584 #[cfg(not(windows))]
1585 {
1586 let mut cmd = Command::new("/bin/sh");
1587 cmd.args(["-c", command]);
1588 Ok(cmd)
1589 }
1590}
1591
1592fn argv_command(argv: &[String]) -> Result<Command, VmError> {
1593 let Some((program, args)) = argv.split_first() else {
1594 return Err(VmError::Runtime(
1595 "eval pack argv command must include a program".to_string(),
1596 ));
1597 };
1598 if program.trim().is_empty() {
1599 return Err(VmError::Runtime(
1600 "eval pack argv command program must not be empty".to_string(),
1601 ));
1602 }
1603 let mut command = Command::new(program);
1604 command.args(args);
1605 Ok(command)
1606}
1607
1608fn command_cwd(spec: &EvalPackCommandSpec, default_cwd: &Path) -> PathBuf {
1609 let cwd = match spec {
1610 EvalPackCommandSpec::Object(EvalPackCommandObject { cwd: Some(cwd), .. }) => cwd.as_str(),
1611 _ => return default_cwd.to_path_buf(),
1612 };
1613 let path = PathBuf::from(cwd);
1614 if path.is_absolute() {
1615 path
1616 } else {
1617 default_cwd.join(path)
1618 }
1619}
1620
1621fn apply_command_env(spec: &EvalPackCommandSpec, command: &mut Command) {
1622 if let EvalPackCommandSpec::Object(object) = spec {
1623 command.envs(&object.env);
1624 }
1625}
1626
1627fn command_timeout(spec: &EvalPackCommandSpec) -> Option<f64> {
1628 match spec {
1629 EvalPackCommandSpec::Object(object) => object.timeout_seconds,
1630 _ => None,
1631 }
1632}
1633
1634fn join_command_reader(
1635 reader: Option<std::thread::JoinHandle<std::io::Result<Vec<u8>>>>,
1636 stream: &str,
1637) -> Result<String, VmError> {
1638 let Some(reader) = reader else {
1639 return Ok(String::new());
1640 };
1641 let bytes = reader
1642 .join()
1643 .map_err(|_| VmError::Runtime(format!("eval pack command {stream} reader panicked")))?
1644 .map_err(|e| VmError::Runtime(format!("eval pack command {stream} read failed: {e}")))?;
1645 Ok(String::from_utf8_lossy(&bytes).to_string())
1646}
1647
1648fn eval_pack_live_workspace(
1649 case: &EvalPackCase,
1650 base_dir: Option<&Path>,
1651) -> Result<PathBuf, VmError> {
1652 let workspace = case
1653 .workspace
1654 .as_deref()
1655 .or(case.project.as_deref())
1656 .ok_or_else(|| {
1657 VmError::Runtime("eval pack live-verify case is missing workspace".to_string())
1658 })?;
1659 let workspace = resolve_manifest_path(base_dir, workspace);
1660 if !workspace.is_dir() {
1661 return Err(VmError::Runtime(format!(
1662 "eval pack live-verify workspace does not exist: {}",
1663 workspace.display()
1664 )));
1665 }
1666 Ok(workspace)
1667}
1668
1669fn eval_pack_live_executor_request(
1670 manifest: &EvalPackManifest,
1671 case: &EvalPackCase,
1672 case_id: &str,
1673 trial: usize,
1674 trial_count: usize,
1675 workspace: &Path,
1676 base_dir: Option<&Path>,
1677) -> Result<serde_json::Value, VmError> {
1678 Ok(serde_json::json!({
1679 "schema": LIVE_EXECUTOR_REQUEST_SCHEMA,
1680 "manifest": {
1681 "id": &manifest.id,
1682 "base_dir": base_dir.map(|path| path.display().to_string()),
1683 "metadata": &manifest.metadata,
1684 },
1685 "case": {
1686 "id": case_id,
1687 "name": &case.name,
1688 "task": &case.task,
1689 "workspace": workspace.display().to_string(),
1690 "project": &case.project,
1691 "verify_command": command_spec_json(case.verify_command.as_ref())?,
1692 "expected_output_paths": &case.expected_output_paths,
1693 "required_output_snippets": &case.required_output_snippets,
1694 "tool_budgets": &case.tool_budgets,
1695 "metadata": &case.metadata,
1696 "case_fingerprint": &case.case_fingerprint,
1697 },
1698 "trial": trial,
1699 "trials": trial_count,
1700 }))
1701}
1702
1703fn command_spec_json(spec: Option<&EvalPackCommandSpec>) -> Result<serde_json::Value, VmError> {
1704 match spec {
1705 Some(spec) => serde_json::to_value(spec)
1706 .map_err(|e| VmError::Runtime(format!("eval pack command encode failed: {e}"))),
1707 None => Ok(serde_json::Value::Null),
1708 }
1709}
1710
1711fn live_outcome_from_executor_output(
1712 output: EvalPackCommandOutput,
1713 failures: &mut Vec<String>,
1714) -> EvalPackLiveVerifyOutcome {
1715 let mut outcome = parse_live_outcome_stdout(&output.stdout).unwrap_or_else(|error| {
1716 if !output.stdout.trim().is_empty() {
1717 failures.push(error);
1718 }
1719 EvalPackLiveVerifyOutcome::default()
1720 });
1721 if output.timed_out {
1722 outcome.timed_out = true;
1723 }
1724 if output.exit_code != 0 {
1725 failures.push(format!(
1726 "live executor exited {}{}",
1727 output.exit_code,
1728 command_failure_excerpt(&output)
1729 ));
1730 }
1731 if outcome.wall_time_seconds == 0.0 {
1732 outcome.wall_time_seconds = output.wall_time_seconds;
1733 }
1734 outcome
1735}
1736
1737fn parse_live_outcome_stdout(stdout: &str) -> Result<EvalPackLiveVerifyOutcome, String> {
1738 let trimmed = stdout.trim();
1739 if trimmed.is_empty() {
1740 return Ok(EvalPackLiveVerifyOutcome::default());
1741 }
1742 serde_json::from_str(trimmed)
1743 .or_else(|_| {
1744 trimmed
1745 .lines()
1746 .rev()
1747 .find(|line| !line.trim().is_empty())
1748 .ok_or_else(|| serde_json::Error::io(std::io::ErrorKind::UnexpectedEof.into()))
1749 .and_then(|line| serde_json::from_str(line.trim()))
1750 })
1751 .map_err(|error| format!("live executor stdout did not contain a JSON outcome: {error}"))
1752}
1753
1754fn live_outcome_verification(outcome: &EvalPackLiveVerifyOutcome) -> String {
1755 if let Some(verification) = outcome.verification.as_deref() {
1756 return normalize_live_verification(verification);
1757 }
1758 if outcome.timed_out {
1759 return "FAIL".to_string();
1760 }
1761 if let Some(exit_code) = outcome.verification_exit_code {
1762 return if exit_code == 0 { "PASS" } else { "FAIL" }.to_string();
1763 }
1764 if let Some(passed) = outcome.passed {
1765 return if passed { "PASS" } else { "FAIL" }.to_string();
1766 }
1767 "PASS".to_string()
1768}
1769
/// Maps a free-form verification word onto a canonical label.
///
/// Matching ignores surrounding whitespace and ASCII case: `pass`, `passed`,
/// `success` and `ok` give `PASS`; `skip` and `skipped` give `skip`; anything
/// else gives `FAIL`.
fn normalize_live_verification(verification: &str) -> String {
    let word = verification.trim().to_ascii_lowercase();
    let label = if matches!(word.as_str(), "pass" | "passed" | "success" | "ok") {
        "PASS"
    } else if matches!(word.as_str(), "skip" | "skipped") {
        "skip"
    } else {
        "FAIL"
    };
    label.to_string()
}
1777
1778fn command_failure_excerpt(output: &EvalPackCommandOutput) -> String {
1779 let stderr = compact_output_excerpt(&output.stderr);
1780 if !stderr.is_empty() {
1781 return format!("; stderr: {stderr}");
1782 }
1783 let stdout = compact_output_excerpt(&output.stdout);
1784 if stdout.is_empty() {
1785 String::new()
1786 } else {
1787 format!("; stdout: {stdout}")
1788 }
1789}
1790
/// Collapses every run of whitespace in `output` to a single space and caps
/// the result at 240 characters, appending `...` when text was cut off.
fn compact_output_excerpt(output: &str) -> String {
    const MAX_CHARS: usize = 240;

    let mut compact = String::new();
    for word in output.split_whitespace() {
        if !compact.is_empty() {
            compact.push(' ');
        }
        compact.push_str(word);
    }

    // The byte offset of character number MAX_CHARS exists only when the text
    // is longer than the cap; cutting there keeps exactly MAX_CHARS characters.
    match compact.char_indices().nth(MAX_CHARS) {
        Some((cut, _)) => format!("{}...", &compact[..cut]),
        None => compact,
    }
}
1800
1801fn normalized_live_produced_paths(
1802 case: &EvalPackCase,
1803 outcome: &EvalPackLiveVerifyOutcome,
1804) -> Vec<String> {
1805 let mut seen = BTreeSet::new();
1806 let mut paths = Vec::new();
1807 for path in outcome
1808 .produced_paths
1809 .iter()
1810 .chain(case.expected_output_paths.iter())
1811 {
1812 if !path.trim().is_empty() && seen.insert(path.clone()) {
1813 paths.push(path.clone());
1814 }
1815 }
1816 paths
1817}
1818
1819fn eval_pack_live_expected_path_failures(workspace: &Path, paths: &[String]) -> Vec<String> {
1820 paths
1821 .iter()
1822 .filter_map(|path| {
1823 let resolved = resolve_manifest_path(Some(workspace), path);
1824 (!resolved.exists()).then(|| {
1825 format!(
1826 "expected output path does not exist: {}",
1827 resolved.display()
1828 )
1829 })
1830 })
1831 .collect()
1832}
1833
1834fn eval_pack_live_required_snippet_failures(
1835 workspace: &Path,
1836 paths: &[String],
1837 snippets: &[String],
1838) -> Vec<String> {
1839 let readable_outputs = paths
1840 .iter()
1841 .map(|path| resolve_manifest_path(Some(workspace), path))
1842 .filter(|path| path.is_file())
1843 .collect::<Vec<_>>();
1844 snippets
1845 .iter()
1846 .filter(|snippet| !snippet.is_empty())
1847 .filter_map(|snippet| {
1848 let found = readable_outputs.iter().any(|path| {
1849 std::fs::read_to_string(path)
1850 .map(|content| content.contains(snippet))
1851 .unwrap_or(false)
1852 });
1853 (!found).then(|| format!("required output snippet not found: {snippet:?}"))
1854 })
1855 .collect()
1856}
1857
1858fn eval_pack_live_tool_budget_failures(
1859 budgets: &BTreeMap<String, usize>,
1860 summary: &serde_json::Value,
1861) -> Vec<String> {
1862 budgets
1863 .iter()
1864 .filter_map(|(name, limit)| {
1865 let count = live_tool_summary_count(summary, name)?;
1866 (count > *limit)
1867 .then(|| format!("tool budget {name} exceeded: {count} calls > {limit}"))
1868 })
1869 .collect()
1870}
1871
1872fn live_tool_summary_count(summary: &serde_json::Value, name: &str) -> Option<usize> {
1873 let normalized = name.trim();
1874 if normalized.is_empty() {
1875 return None;
1876 }
1877 if normalized == "total" {
1878 return json_usize_from_keys(summary, &["total", "calls", "tool_calls", "toolCalls"]);
1879 }
1880 json_usize_from_keys(summary, &[normalized])
1881 .or_else(|| {
1882 summary
1883 .get("by_tool")
1884 .or_else(|| summary.get("byTool"))
1885 .and_then(|value| json_usize_from_keys(value, &[normalized]))
1886 })
1887 .or_else(|| {
1888 summary
1889 .get("tools")
1890 .and_then(|value| json_usize_from_keys(value, &[normalized]))
1891 })
1892 .or_else(|| {
1893 summary
1899 .get("sequence")
1900 .and_then(serde_json::Value::as_array)
1901 .map(|calls| {
1902 calls
1903 .iter()
1904 .filter(|call| call.as_str() == Some(normalized))
1905 .count()
1906 })
1907 })
1908}
1909
1910fn json_usize_from_keys(value: &serde_json::Value, keys: &[&str]) -> Option<usize> {
1911 keys.iter()
1912 .find_map(|key| value.get(*key))
1913 .and_then(json_value_usize)
1914}
1915
1916fn json_value_usize(value: &serde_json::Value) -> Option<usize> {
1917 value
1918 .as_u64()
1919 .and_then(|value| usize::try_from(value).ok())
1920 .or_else(|| value.as_i64().and_then(|value| usize::try_from(value).ok()))
1921}
1922
1923pub fn evaluate_run_suite_manifest(
1924 manifest: &EvalSuiteManifest,
1925) -> Result<ReplayEvalSuiteReport, VmError> {
1926 let base_dir = manifest.base_dir.as_deref().map(Path::new);
1927 let mut reports = Vec::new();
1928 for case in &manifest.cases {
1929 let run_path = resolve_manifest_path(base_dir, &case.run_path);
1930 let run = load_run_record(&run_path)?;
1931 let fixture = match &case.fixture_path {
1932 Some(path) => load_replay_fixture(&resolve_manifest_path(base_dir, path))?,
1933 None => run
1934 .replay_fixture
1935 .clone()
1936 .unwrap_or_else(|| replay_fixture_from_run(&run)),
1937 };
1938 let eval = evaluate_run_against_fixture(&run, &fixture);
1939 let mut pass = eval.pass;
1940 let mut failures = eval.failures;
1941 let comparison = match &case.compare_to {
1942 Some(path) => {
1943 let baseline_path = resolve_manifest_path(base_dir, path);
1944 let baseline = load_run_record(&baseline_path)?;
1945 let diff = diff_run_records(&baseline, &run);
1946 if !diff.identical {
1947 pass = false;
1948 failures.push(format!(
1949 "run differs from baseline {} with {} stage changes",
1950 baseline_path.display(),
1951 diff.stage_diffs.len()
1952 ));
1953 }
1954 Some(diff)
1955 }
1956 None => None,
1957 };
1958 reports.push(ReplayEvalCaseReport {
1959 run_id: run.id.clone(),
1960 workflow_id: run.workflow_id.clone(),
1961 label: case.label.clone(),
1962 pass,
1963 failures,
1964 stage_count: eval.stage_count,
1965 source_path: Some(run_path.display().to_string()),
1966 comparison,
1967 });
1968 }
1969 let total = reports.len();
1970 let passed = reports.iter().filter(|report| report.pass).count();
1971 let failed = total.saturating_sub(passed);
1972 Ok(ReplayEvalSuiteReport {
1973 pass: failed == 0,
1974 total,
1975 passed,
1976 failed,
1977 cases: reports,
1978 })
1979}
1980
1981pub fn evaluate_eval_pack_manifest(manifest: &EvalPackManifest) -> Result<EvalPackReport, VmError> {
1982 let mut live_executor = EvalPackShellLiveExecutor;
1983 evaluate_eval_pack_manifest_inner(manifest, false, None, &mut live_executor)
1984}
1985
1986pub fn evaluate_eval_pack_manifest_resumable(
1987 manifest: &EvalPackManifest,
1988 ledger_options: Option<serde_json::Value>,
1989) -> Result<EvalPackReport, VmError> {
1990 let mut live_executor = EvalPackShellLiveExecutor;
1991 evaluate_eval_pack_manifest_inner(manifest, true, ledger_options, &mut live_executor)
1992}
1993
1994pub fn evaluate_eval_pack_manifest_with_live_executor(
1995 manifest: &EvalPackManifest,
1996 live_executor: &mut dyn EvalPackLiveExecutor,
1997) -> Result<EvalPackReport, VmError> {
1998 evaluate_eval_pack_manifest_inner(manifest, false, None, live_executor)
1999}
2000
2001pub fn evaluate_eval_pack_manifest_resumable_with_live_executor(
2002 manifest: &EvalPackManifest,
2003 ledger_options: Option<serde_json::Value>,
2004 live_executor: &mut dyn EvalPackLiveExecutor,
2005) -> Result<EvalPackReport, VmError> {
2006 evaluate_eval_pack_manifest_inner(manifest, true, ledger_options, live_executor)
2007}
2008
/// Shared implementation behind the public `evaluate_eval_pack_manifest*` entry points.
///
/// Runs every trial of every case in `manifest`, then every persona ladder,
/// and aggregates the results into an `EvalPackReport`.
///
/// * `ledger_enabled` - when `true`, an `EvalPackLedgerRun` is started; cells
///   it can replay are reused instead of executed, and every executed trial is
///   appended to it as a row.
/// * `ledger_options` - forwarded to `EvalPackLedgerRun::start`; not read when
///   the ledger is disabled.
/// * `live_executor` - handed to live-verify trials.
///
/// # Errors
///
/// Propagates errors from split validation, harness fingerprinting, ledger
/// start/append/finish, the per-trial evaluators, and the ladder runner.
fn evaluate_eval_pack_manifest_inner(
    manifest: &EvalPackManifest,
    ledger_enabled: bool,
    ledger_options: Option<serde_json::Value>,
    live_executor: &mut dyn EvalPackLiveExecutor,
) -> Result<EvalPackReport, VmError> {
    let base_dir = manifest.base_dir.as_deref().map(Path::new);
    // Fixtures resolve against `defaults.fixture_root` when it is set,
    // otherwise against the manifest base directory.
    let fixture_base_dir_buf = manifest
        .defaults
        .fixture_root
        .as_deref()
        .map(|root| resolve_manifest_path(base_dir, root));
    let fixture_base_dir = fixture_base_dir_buf.as_deref().or(base_dir);
    // Index fixtures and rubrics by id; entries with an empty id are left out.
    let fixtures_by_id: BTreeMap<&str, &EvalPackFixtureRef> = manifest
        .fixtures
        .iter()
        .filter(|fixture| !fixture.id.is_empty())
        .map(|fixture| (fixture.id.as_str(), fixture))
        .collect();
    let rubrics_by_id: BTreeMap<&str, &EvalPackRubric> = manifest
        .rubrics
        .iter()
        .filter(|rubric| !rubric.id.is_empty())
        .map(|rubric| (rubric.id.as_str(), rubric))
        .collect();

    let split_report = validate_eval_pack_split(manifest)?;
    let split_by_case = split_by_case_id(&split_report);
    let harness_config_fingerprint = eval_pack_harness_config_fingerprint(manifest)?;
    let mut ledger = if ledger_enabled {
        Some(EvalPackLedgerRun::start(
            manifest,
            base_dir,
            ledger_options,
        )?)
    } else {
        None
    };
    // Cell accounting: one cell per (case, trial) pair.
    let mut requested_cells = 0usize;
    let mut skipped_cells = 0usize;
    let mut executed_cells = 0usize;
    let mut reports = Vec::new();
    for (index, case) in manifest.cases.iter().enumerate() {
        let case_id = eval_pack_case_id(case, index);
        // Display label: the case name, else the case's own id, else the derived case id.
        let label = case
            .name
            .clone()
            .or_else(|| case.id.clone())
            .unwrap_or_else(|| case_id.clone());
        let severity = eval_pack_case_severity(manifest, case);
        let blocking = severity == "blocking";
        // A per-case trial count overrides the manifest-wide one.
        let trial_count = case.trials.unwrap_or(manifest.trials);
        let split = split_by_case.get(&case_id).cloned();
        requested_cells += trial_count;
        let mut trials = Vec::with_capacity(trial_count);
        // Trials are numbered starting at 1.
        for trial in 1..=trial_count {
            if let Some(ledger) = ledger.as_mut() {
                // A cell the ledger can replay is counted as skipped and its
                // recorded row is turned back into a trial report.
                if let Some(row) = ledger.replay_row_for_cell(
                    &case_id,
                    split.as_deref(),
                    trial,
                    &case.case_fingerprint,
                    &harness_config_fingerprint,
                ) {
                    skipped_cells += 1;
                    trials.push(eval_pack_trial_report_from_ledger_row(&row, blocking));
                    continue;
                }
            }
            // Dispatch to the evaluator matching the case kind.
            let report = match eval_pack_case_kind(case) {
                EvalPackCaseKind::LiveVerify => evaluate_eval_pack_live_verify_trial(
                    manifest,
                    case,
                    &case_id,
                    trial,
                    trial_count,
                    &severity,
                    blocking,
                    base_dir,
                    live_executor,
                )?,
                EvalPackCaseKind::Friction => evaluate_eval_pack_friction_trial(
                    manifest,
                    case,
                    trial,
                    &severity,
                    blocking,
                    base_dir,
                    fixture_base_dir,
                    &fixtures_by_id,
                    &rubrics_by_id,
                )?,
                EvalPackCaseKind::Replay => evaluate_eval_pack_run_trial(
                    manifest,
                    case,
                    trial,
                    &severity,
                    blocking,
                    base_dir,
                    fixture_base_dir,
                    &fixtures_by_id,
                    &rubrics_by_id,
                )?,
            };
            // Record the freshly executed trial in the ledger before counting it.
            if let Some(ledger) = ledger.as_mut() {
                let row = eval_ledger_row_from_trial(
                    case,
                    &case_id,
                    split.clone(),
                    &ledger.suite,
                    &ledger.model,
                    &ledger.commit,
                    &ledger.provenance,
                    &harness_config_fingerprint,
                    &report,
                );
                ledger.append_trial_row(row)?;
            }
            executed_cells += 1;
            trials.push(report);
        }
        reports.push(eval_pack_case_report_from_trials(
            case,
            case_id,
            label,
            severity,
            split,
            blocking,
            harness_config_fingerprint.clone(),
            trials,
        ));
    }

    let mut ladder_reports = Vec::new();
    for ladder in &manifest.ladders {
        let mut ladder = ladder.clone();
        // Ladders without their own base_dir inherit the manifest's.
        if ladder.base_dir.is_none() {
            ladder.base_dir = manifest.base_dir.clone();
        }
        ladder_reports.push(run_persona_eval_ladder(&ladder)?);
    }

    let stats_rows = reports
        .iter()
        .map(|report| report.stats_row.clone())
        .collect::<Vec<_>>();
    let stats = eval_pack_stats_report(&stats_rows);
    let case_total = reports.len();
    let ladder_total = ladder_reports.len();
    let total = case_total + ladder_total;
    let trial_count = reports.iter().map(|report| report.trial_count).sum();
    // A blocking case counts as failed unless its reliability status is `all-pass`.
    let case_blocking_failed = reports
        .iter()
        .filter(|report| report.blocking && report.reliability.status != "all-pass")
        .count();
    let ladder_blocking_failed = ladder_reports
        .iter()
        .filter(|report| report.blocking && !report.pass)
        .count();
    let blocking_failed = case_blocking_failed + ladder_blocking_failed;
    // Cases count here whenever they carry any warning message; ladders only
    // when they failed with `warning` severity.
    let warning_failed = reports
        .iter()
        .filter(|report| !report.warnings.is_empty())
        .count()
        + ladder_reports
            .iter()
            .filter(|report| !report.pass && report.severity == "warning")
            .count();
    // Same scheme as above, for informational messages and severity.
    let informational_failed = reports
        .iter()
        .filter(|report| !report.informational.is_empty())
        .count()
        + ladder_reports
            .iter()
            .filter(|report| !report.pass && report.severity == "informational")
            .count();
    let passed = reports.iter().filter(|report| report.pass).count()
        + ladder_reports.iter().filter(|report| report.pass).count();
    let run_state = match ledger.as_ref() {
        Some(ledger) => ledger.finish(requested_cells, skipped_cells, executed_cells)?,
        // Without a ledger, every requested cell is reported as both completed and executed.
        None => EvalPackRunState {
            schema: EVAL_LEDGER_RUN_STATE_SCHEMA.to_string(),
            suite: manifest.id.clone(),
            model: eval_pack_manifest_model(manifest).unwrap_or_else(|| "unknown".to_string()),
            requested_cells,
            completed_cells: requested_cells,
            executed_cells: requested_cells,
            ..EvalPackRunState::default()
        },
    };
    Ok(EvalPackReport {
        pack_id: manifest.id.clone(),
        harness_config_fingerprint,
        // Only blocking failures fail the pack as a whole.
        pass: blocking_failed == 0,
        total,
        passed,
        failed: total.saturating_sub(passed),
        blocking_failed,
        warning_failed,
        informational_failed,
        trial_count,
        run_state,
        // The split report is attached only when the manifest declares a split.
        split: manifest.split.as_ref().map(|_| split_report),
        stats,
        stats_rows,
        cases: reports,
        ladders: ladder_reports,
    })
}
2218
/// Runs one live-verify trial for `case`.
///
/// The live executor is invoked first, then the case's `verify_command` is run
/// in the case workspace, and finally expected output paths, required output
/// snippets, and tool budgets are checked. Every problem found along the way
/// is collected as a failure message on the returned trial report.
///
/// # Errors
///
/// Returns an error when the case has no `verify_command`, when neither the
/// case nor the manifest supplies an executor, or when resolving the workspace
/// or building the executor request fails. A failing executor or verify
/// command is recorded as a trial failure, not returned as an error.
#[allow(clippy::too_many_arguments)]
fn evaluate_eval_pack_live_verify_trial(
    manifest: &EvalPackManifest,
    case: &EvalPackCase,
    case_id: &str,
    trial: usize,
    trial_count: usize,
    severity: &str,
    blocking: bool,
    base_dir: Option<&Path>,
    live_executor: &mut dyn EvalPackLiveExecutor,
) -> Result<EvalPackTrialReport, VmError> {
    let workspace = eval_pack_live_workspace(case, base_dir)?;
    // A case-level executor takes precedence over the manifest-level one.
    let executor = case.executor.as_ref().or(manifest.executor.as_ref());
    let verify_command = case.verify_command.as_ref().ok_or_else(|| {
        VmError::Runtime(format!(
            "eval pack live-verify case '{case_id}' is missing verify_command"
        ))
    })?;
    let Some(executor) = executor else {
        return Err(VmError::Runtime(format!(
            "eval pack live-verify case '{case_id}' is missing executor"
        )));
    };

    let mut failures = Vec::new();
    let mut warnings = Vec::new();
    let mut informational = Vec::new();
    let request_payload = eval_pack_live_executor_request(
        manifest,
        case,
        case_id,
        trial,
        trial_count,
        &workspace,
        base_dir,
    )?;
    let request = EvalPackLiveExecutorRequest {
        executor: executor.clone(),
        payload: request_payload,
        manifest_id: manifest.id.clone(),
        case: case.clone(),
        case_id: case_id.to_string(),
        trial,
        trials: trial_count,
        workspace: workspace.clone(),
        base_dir: base_dir.map(Path::to_path_buf),
    };
    // An executor error becomes a failure message and evaluation continues
    // with a default outcome, so the verify command below still runs.
    let mut outcome = match live_executor.execute(request) {
        Ok(outcome) => outcome,
        Err(error) => {
            failures.push(format!("live executor failed: {error}"));
            EvalPackLiveVerifyOutcome::default()
        }
    };
    // Fold the executor's own messages into this trial's lists.
    failures.append(&mut outcome.failures);
    warnings.append(&mut outcome.warnings);
    informational.append(&mut outcome.informational);
    if outcome.timed_out {
        failures.push("live executor timed out".to_string());
    }
    if live_outcome_verification(&outcome) == "FAIL" {
        failures.push("live executor reported verification FAIL".to_string());
    }

    let verify_output = run_eval_pack_command(
        verify_command,
        &workspace,
        None,
        DEFAULT_LIVE_VERIFY_TIMEOUT_SECONDS,
    );
    let verification_exit_code = match verify_output {
        Ok(output) => {
            let exit_code = output.exit_code;
            if output.timed_out {
                // A verify-command timeout also marks the trial as timed out.
                outcome.timed_out = true;
                failures.push("verify command timed out".to_string());
            }
            if exit_code != 0 {
                failures.push(format!(
                    "verify command exited {exit_code}{}",
                    command_failure_excerpt(&output)
                ));
            }
            // Fall back to the verify command's wall time when the outcome
            // carries none (0.0).
            if outcome.wall_time_seconds == 0.0 {
                outcome.wall_time_seconds = output.wall_time_seconds;
            }
            Some(exit_code)
        }
        Err(error) => {
            failures.push(format!("verify command failed: {error}"));
            None
        }
    };

    let produced_paths = normalized_live_produced_paths(case, &outcome);
    failures.extend(eval_pack_live_expected_path_failures(
        &workspace,
        &case.expected_output_paths,
    ));
    failures.extend(eval_pack_live_required_snippet_failures(
        &workspace,
        &produced_paths,
        &case.required_output_snippets,
    ));
    failures.extend(eval_pack_live_tool_budget_failures(
        &case.tool_budgets,
        &outcome.tool_call_summary,
    ));

    // Identity fields missing from the outcome get synthetic defaults.
    let mut report = eval_pack_trial_report(
        trial,
        severity,
        blocking,
        outcome
            .run_id
            .clone()
            .unwrap_or_else(|| format!("live:{case_id}:{trial}")),
        outcome
            .workflow_id
            .clone()
            .unwrap_or_else(|| "live-verify".to_string()),
        outcome
            .source_path
            .clone()
            .or_else(|| Some(workspace.display().to_string())),
        outcome.stage_count.unwrap_or_default(),
        outcome.timed_out,
        outcome.wall_time_seconds,
        outcome.cost_usd,
        failures,
        warnings,
        informational,
        None,
    );
    // Report `skip` only when the report has no failures, the executor
    // outcome says skip, and the verify command did not exit non-zero.
    let outcome_verification = live_outcome_verification(&outcome);
    if report.failures.is_empty()
        && outcome_verification.eq_ignore_ascii_case("skip")
        && verification_exit_code.unwrap_or_default() == 0
    {
        report.verification = "skip".to_string();
    }
    // Fill in the live-only fields on the report.
    report.verification_exit_code = verification_exit_code;
    report.produced_paths = produced_paths;
    report.tool_call_summary = outcome.tool_call_summary;
    Ok(report)
}
2366
2367#[allow(clippy::too_many_arguments)]
2368fn evaluate_eval_pack_run_trial(
2369 manifest: &EvalPackManifest,
2370 case: &EvalPackCase,
2371 trial: usize,
2372 severity: &str,
2373 blocking: bool,
2374 base_dir: Option<&Path>,
2375 fixture_base_dir: Option<&Path>,
2376 fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2377 rubrics_by_id: &BTreeMap<&str, &EvalPackRubric>,
2378) -> Result<EvalPackTrialReport, VmError> {
2379 let mut failures = Vec::new();
2380 let mut warnings = Vec::new();
2381 let informational = Vec::new();
2382 let run = load_eval_pack_case_run(case, base_dir, fixture_base_dir, fixtures_by_id)?;
2383 let fixture =
2384 load_eval_pack_case_fixture(case, base_dir, fixture_base_dir, fixtures_by_id, &run)?;
2385 let eval = evaluate_run_against_fixture(&run, &fixture);
2386 failures.extend(eval.failures);
2387 apply_eval_pack_thresholds(&run, &manifest.defaults.thresholds, &mut failures);
2388 apply_eval_pack_thresholds(&run, &case.thresholds, &mut failures);
2389
2390 let comparison = match case.compare_to.as_ref().or(manifest.baseline.as_ref()) {
2391 Some(path) => {
2392 let baseline_path = resolve_manifest_path(base_dir, path);
2393 let baseline = load_run_record(&baseline_path)?;
2394 let diff = diff_run_records(&baseline, &run);
2395 if !diff.identical {
2396 failures.push(format!(
2397 "run differs from baseline {} with {} stage changes",
2398 baseline_path.display(),
2399 diff.stage_diffs.len()
2400 ));
2401 }
2402 Some(diff)
2403 }
2404 None => None,
2405 };
2406
2407 for rubric_id in &case.rubrics {
2408 let Some(rubric) = rubrics_by_id.get(rubric_id.as_str()) else {
2409 failures.push(format!("case references unknown rubric '{rubric_id}'"));
2410 continue;
2411 };
2412 apply_eval_pack_rubric(rubric, &run, &mut failures, &mut warnings);
2413 }
2414
2415 Ok(eval_pack_trial_report(
2416 trial,
2417 severity,
2418 blocking,
2419 run.id.clone(),
2420 run.workflow_id.clone(),
2421 eval_pack_case_source_path(case, base_dir, fixture_base_dir, fixtures_by_id),
2422 eval.stage_count,
2423 run.status.to_ascii_lowercase().contains("timeout"),
2424 run.usage
2425 .as_ref()
2426 .map(|usage| usage.total_duration_ms as f64 / 1000.0)
2427 .unwrap_or_default(),
2428 run.usage
2429 .as_ref()
2430 .map(|usage| usage.total_cost)
2431 .unwrap_or_default(),
2432 failures,
2433 warnings,
2434 informational,
2435 comparison,
2436 ))
2437}
2438
2439#[allow(clippy::too_many_arguments)]
2440fn evaluate_eval_pack_friction_trial(
2441 manifest: &EvalPackManifest,
2442 case: &EvalPackCase,
2443 trial: usize,
2444 severity: &str,
2445 blocking: bool,
2446 base_dir: Option<&Path>,
2447 fixture_base_dir: Option<&Path>,
2448 fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2449 rubrics_by_id: &BTreeMap<&str, &EvalPackRubric>,
2450) -> Result<EvalPackTrialReport, VmError> {
2451 let mut failures = Vec::new();
2452 let mut warnings = Vec::new();
2453 let informational = Vec::new();
2454 let events =
2455 load_eval_pack_case_friction_events(case, base_dir, fixture_base_dir, fixtures_by_id)?;
2456 let options = friction_suggestion_options(case, manifest);
2457 let suggestions = generate_context_pack_suggestions(&events, &options);
2458
2459 for rubric_id in &case.rubrics {
2460 let Some(rubric) = rubrics_by_id.get(rubric_id.as_str()) else {
2461 failures.push(format!("case references unknown rubric '{rubric_id}'"));
2462 continue;
2463 };
2464 apply_eval_pack_friction_rubric(rubric, &suggestions, &mut failures, &mut warnings);
2465 }
2466
2467 if case.rubrics.is_empty() && suggestions.is_empty() {
2468 failures.push("friction fixture produced no context-pack suggestions".to_string());
2469 }
2470
2471 Ok(eval_pack_trial_report(
2472 trial,
2473 severity,
2474 blocking,
2475 "friction_events".to_string(),
2476 String::new(),
2477 eval_pack_case_friction_source_path(case, base_dir, fixture_base_dir, fixtures_by_id),
2478 events.len(),
2479 false,
2480 0.0,
2481 0.0,
2482 failures,
2483 warnings,
2484 informational,
2485 None,
2486 ))
2487}
2488
2489#[allow(clippy::too_many_arguments)]
2490fn eval_pack_trial_report(
2491 trial: usize,
2492 severity: &str,
2493 blocking: bool,
2494 run_id: String,
2495 workflow_id: String,
2496 source_path: Option<String>,
2497 stage_count: usize,
2498 timed_out: bool,
2499 wall_time_seconds: f64,
2500 cost_usd: f64,
2501 mut failures: Vec<String>,
2502 mut warnings: Vec<String>,
2503 mut informational: Vec<String>,
2504 comparison: Option<RunDiffReport>,
2505) -> EvalPackTrialReport {
2506 let verification = if failures.is_empty() { "PASS" } else { "FAIL" }.to_string();
2507 let pass = failures.is_empty() || !blocking;
2508 if !failures.is_empty() && !blocking {
2509 if severity == "warning" {
2510 warnings.append(&mut failures);
2511 } else {
2512 informational.append(&mut failures);
2513 }
2514 }
2515 EvalPackTrialReport {
2516 trial,
2517 verification,
2518 verification_exit_code: None,
2519 pass,
2520 blocking,
2521 run_id,
2522 workflow_id,
2523 source_path,
2524 stage_count,
2525 failures,
2526 warnings,
2527 informational,
2528 comparison,
2529 timed_out,
2530 wall_time_seconds,
2531 cost_usd,
2532 produced_paths: Vec::new(),
2533 tool_call_summary: serde_json::Value::Null,
2534 }
2535}
2536
2537#[allow(clippy::too_many_arguments)]
2538fn eval_ledger_row_from_trial(
2539 case: &EvalPackCase,
2540 case_id: &str,
2541 split: Option<String>,
2542 suite: &str,
2543 model: &str,
2544 commit: &str,
2545 provenance: &EvalLedgerProvenance,
2546 harness_config_fingerprint: &str,
2547 trial: &EvalPackTrialReport,
2548) -> EvalLedgerRow {
2549 let passes = usize::from(trial.verification == "PASS");
2550 let fails = usize::from(trial.verification == "FAIL");
2551 let skips = usize::from(trial.verification.eq_ignore_ascii_case("skip"));
2552 let timeouts = usize::from(trial.timed_out);
2553 EvalLedgerRow {
2554 schema: EVAL_LEDGER_ROW_SCHEMA.to_string(),
2555 suite: suite.to_string(),
2556 model: model.to_string(),
2557 split,
2558 commit: commit.to_string(),
2559 case_name: case_id.to_string(),
2560 name: case_id.to_string(),
2561 case_fingerprint: case.case_fingerprint.clone(),
2562 harness_config_fingerprint: harness_config_fingerprint.to_string(),
2563 trial: trial.trial,
2564 trials: 1,
2565 passes,
2566 fails,
2567 skips,
2568 timeouts,
2569 pass_rate: passes as f64,
2570 status: trial.verification.clone(),
2571 verification: trial.verification.clone(),
2572 skipped: false,
2573 wall_time_seconds: trial.wall_time_seconds,
2574 cost_usd: trial.cost_usd,
2575 mean_wall_time_seconds: trial.wall_time_seconds,
2576 total_cost_usd: trial.cost_usd,
2577 run_id: trial.run_id.clone(),
2578 workflow_id: trial.workflow_id.clone(),
2579 source_path: trial.source_path.clone(),
2580 trial_report: Some(trial.clone()),
2581 provenance: provenance.clone(),
2582 metadata: case.metadata.clone(),
2583 ..EvalLedgerRow::default()
2584 }
2585}
2586
2587fn eval_pack_trial_report_from_ledger_row(
2588 row: &EvalLedgerRow,
2589 blocking: bool,
2590) -> EvalPackTrialReport {
2591 if let Some(mut report) = row.trial_report.clone() {
2592 report.trial = row.trial;
2593 return report;
2594 }
2595 let mut failures = Vec::new();
2596 let verification = if row.verification.is_empty() {
2597 row.status.clone()
2598 } else {
2599 row.verification.clone()
2600 };
2601 if verification == "FAIL" {
2602 failures.push("ledger row recorded a failed trial".to_string());
2603 }
2604 EvalPackTrialReport {
2605 trial: row.trial,
2606 verification: verification.clone(),
2607 verification_exit_code: None,
2608 pass: verification != "FAIL" || !blocking,
2609 blocking,
2610 run_id: row.run_id.clone(),
2611 workflow_id: row.workflow_id.clone(),
2612 source_path: row.source_path.clone(),
2613 stage_count: 0,
2614 failures,
2615 warnings: Vec::new(),
2616 informational: Vec::new(),
2617 comparison: None,
2618 timed_out: row.timeouts > 0,
2619 wall_time_seconds: row.wall_time_seconds,
2620 cost_usd: row.cost_usd,
2621 produced_paths: Vec::new(),
2622 tool_call_summary: serde_json::Value::Null,
2623 }
2624}
2625
2626#[allow(clippy::too_many_arguments)]
2627fn eval_pack_case_report_from_trials(
2628 case: &EvalPackCase,
2629 case_id: String,
2630 label: String,
2631 severity: String,
2632 split: Option<String>,
2633 blocking: bool,
2634 harness_config_fingerprint: String,
2635 trials: Vec<EvalPackTrialReport>,
2636) -> EvalPackCaseReport {
2637 let reliability = eval_pack_reliability_report(&trials);
2638 let stats_row = eval_pack_stats_row(
2639 case,
2640 &case_id,
2641 &harness_config_fingerprint,
2642 split.clone(),
2643 &trials,
2644 &reliability,
2645 );
2646 let first = trials.first();
2647 let pass = if blocking {
2648 reliability.status == "all-pass"
2649 } else {
2650 true
2651 };
2652 let failures = prefixed_trial_messages(&trials, |trial| &trial.failures);
2653 let warnings = prefixed_trial_messages(&trials, |trial| &trial.warnings);
2654 let informational = prefixed_trial_messages(&trials, |trial| &trial.informational);
2655 EvalPackCaseReport {
2656 id: case_id,
2657 label,
2658 severity,
2659 split,
2660 case_fingerprint: case.case_fingerprint.clone(),
2661 harness_config_fingerprint,
2662 pass,
2663 blocking,
2664 run_id: first.map(|trial| trial.run_id.clone()).unwrap_or_default(),
2665 workflow_id: first
2666 .map(|trial| trial.workflow_id.clone())
2667 .unwrap_or_default(),
2668 source_path: first.and_then(|trial| trial.source_path.clone()),
2669 stage_count: first.map(|trial| trial.stage_count).unwrap_or_default(),
2670 trial_count: trials.len(),
2671 total_stage_count: trials.iter().map(|trial| trial.stage_count).sum(),
2672 reliability,
2673 stats_row,
2674 comparison: first.and_then(|trial| trial.comparison.clone()),
2675 trials,
2676 failures,
2677 warnings,
2678 informational,
2679 }
2680}
2681
2682fn prefixed_trial_messages<F>(trials: &[EvalPackTrialReport], messages: F) -> Vec<String>
2683where
2684 F: Fn(&EvalPackTrialReport) -> &Vec<String>,
2685{
2686 let include_prefix = trials.len() > 1;
2687 let mut out = Vec::new();
2688 for trial in trials {
2689 for message in messages(trial) {
2690 if include_prefix {
2691 out.push(format!("trial {}: {message}", trial.trial));
2692 } else {
2693 out.push(message.clone());
2694 }
2695 }
2696 }
2697 out
2698}
2699
2700fn eval_pack_reliability_report(trials: &[EvalPackTrialReport]) -> EvalPackReliabilityReport {
2701 let passes = trials
2702 .iter()
2703 .filter(|trial| trial.verification == "PASS")
2704 .count();
2705 let fails = trials
2706 .iter()
2707 .filter(|trial| trial.verification == "FAIL")
2708 .count();
2709 let skips = trials
2710 .iter()
2711 .filter(|trial| trial.verification.eq_ignore_ascii_case("skip"))
2712 .count();
2713 let timeouts = trials.iter().filter(|trial| trial.timed_out).count();
2714 let decided = passes + fails;
2715 let majority = if passes > 0 && fails > 0 {
2716 Some(if passes >= fails { "PASS" } else { "FAIL" }.to_string())
2717 } else {
2718 None
2719 };
2720 let status = if decided == 0 {
2721 "no-decision"
2722 } else if fails == 0 {
2723 "all-pass"
2724 } else if passes == 0 {
2725 "all-fail"
2726 } else {
2727 "flaky"
2728 };
2729 EvalPackReliabilityReport {
2730 status: status.to_string(),
2731 trials: trials.len(),
2732 passes,
2733 fails,
2734 skips,
2735 timeouts,
2736 decided,
2737 pass_rate: if trials.is_empty() {
2738 0.0
2739 } else {
2740 passes as f64 / trials.len() as f64
2741 },
2742 majority,
2743 }
2744}
2745
2746fn eval_pack_stats_row(
2747 case: &EvalPackCase,
2748 case_id: &str,
2749 harness_config_fingerprint: &str,
2750 split: Option<String>,
2751 trials: &[EvalPackTrialReport],
2752 reliability: &EvalPackReliabilityReport,
2753) -> EvalPackStatsRow {
2754 let wall_times = trials
2755 .iter()
2756 .map(|trial| trial.wall_time_seconds)
2757 .collect::<Vec<_>>();
2758 let costs = trials
2759 .iter()
2760 .map(|trial| trial.cost_usd)
2761 .collect::<Vec<_>>();
2762 let group = case
2763 .metadata
2764 .get("group")
2765 .or_else(|| case.metadata.get("language"))
2766 .or_else(|| case.metadata.get("bucket"))
2767 .and_then(|value| value.as_str())
2768 .unwrap_or_default()
2769 .to_string();
2770 EvalPackStatsRow {
2771 name: case_id.to_string(),
2772 case_name: case_id.to_string(),
2773 case_fingerprint: case.case_fingerprint.clone(),
2774 harness_config_fingerprint: harness_config_fingerprint.to_string(),
2775 group,
2776 split,
2777 trials: trials.len(),
2778 passes: reliability.passes,
2779 fails: reliability.fails,
2780 skips: reliability.skips,
2781 timeouts: reliability.timeouts,
2782 pass_rate: reliability.pass_rate,
2783 status: match reliability.status.as_str() {
2784 "all-pass" => "PASS",
2785 "all-fail" => "FAIL",
2786 "flaky" => "FLAKY",
2787 _ => "skip",
2788 }
2789 .to_string(),
2790 majority: reliability.majority.clone(),
2791 wall_time_seconds: mean(&wall_times),
2792 cost_usd: costs.iter().sum(),
2793 mean_wall_time_seconds: mean(&wall_times),
2794 stdev_wall_time_seconds: stdev(&wall_times),
2795 total_cost_usd: costs.iter().sum(),
2796 }
2797}
2798
2799fn eval_pack_stats_report(rows: &[EvalPackStatsRow]) -> EvalPackStatsReport {
2800 EvalPackStatsReport {
2801 macro_pass_at_1: macro_pass_at_1(rows),
2802 reliability: eval_pack_reliability_breakdown(rows),
2803 }
2804}
2805
2806fn macro_pass_at_1(rows: &[EvalPackStatsRow]) -> f64 {
2807 let decided = rows
2808 .iter()
2809 .filter(|row| row.passes + row.fails > 0)
2810 .collect::<Vec<_>>();
2811 if decided.is_empty() {
2812 return 0.0;
2813 }
2814 decided.iter().map(|row| row.pass_rate).sum::<f64>() / decided.len() as f64
2815}
2816
2817fn eval_pack_reliability_breakdown(rows: &[EvalPackStatsRow]) -> EvalPackReliabilityBreakdown {
2818 let total_cases = rows.len();
2819 let all_pass_cases = rows
2820 .iter()
2821 .filter(|row| row.passes > 0 && row.fails == 0)
2822 .count();
2823 let flaky_cases = rows
2824 .iter()
2825 .filter(|row| row.passes > 0 && row.fails > 0)
2826 .count();
2827 let all_fail_cases = rows
2828 .iter()
2829 .filter(|row| row.passes == 0 && row.fails > 0)
2830 .count();
2831 let no_decision_cases = rows
2832 .iter()
2833 .filter(|row| row.passes + row.fails == 0)
2834 .count();
2835 EvalPackReliabilityBreakdown {
2836 all_pass_cases,
2837 flaky_cases,
2838 all_fail_cases,
2839 no_decision_cases,
2840 total_cases,
2841 all_pass_fraction: rate(all_pass_cases, total_cases),
2842 flaky_fraction: rate(flaky_cases, total_cases),
2843 all_fail_fraction: rate(all_fail_cases, total_cases),
2844 no_decision_fraction: rate(no_decision_cases, total_cases),
2845 }
2846}
2847
2848fn split_by_case_id(report: &EvalPackSplitValidationReport) -> BTreeMap<String, String> {
2849 let mut out = BTreeMap::new();
2850 for (partition, cases) in &report.partitions {
2851 for case_id in cases {
2852 out.insert(case_id.clone(), partition.clone());
2853 }
2854 }
2855 out
2856}
2857
/// Arithmetic mean of `values`, or 0.0 for an empty slice.
fn mean(values: &[f64]) -> f64 {
    match values.len() {
        0 => 0.0,
        count => values.iter().sum::<f64>() / count as f64,
    }
}
2864
2865fn stdev(values: &[f64]) -> f64 {
2866 if values.is_empty() {
2867 return 0.0;
2868 }
2869 let mean = mean(values);
2870 let variance = values
2871 .iter()
2872 .map(|value| {
2873 let diff = value - mean;
2874 diff * diff
2875 })
2876 .sum::<f64>()
2877 / values.len() as f64;
2878 variance.sqrt()
2879}
2880
/// Fraction `count / denom` as a float, or 0.0 when `denom` is zero.
fn rate(count: usize, denom: usize) -> f64 {
    match denom {
        0 => 0.0,
        total => count as f64 / total as f64,
    }
}
2888
2889fn eval_pack_case_severity(manifest: &EvalPackManifest, case: &EvalPackCase) -> String {
2890 normalize_eval_pack_severity(
2891 case.severity
2892 .as_deref()
2893 .or(case.thresholds.severity.as_deref())
2894 .or(manifest.defaults.severity.as_deref())
2895 .or(manifest.defaults.thresholds.severity.as_deref())
2896 .unwrap_or("blocking"),
2897 )
2898}
2899
/// Maps a free-form severity string onto one of the canonical labels.
///
/// Matching ignores surrounding whitespace and ASCII case. `warn`/`warning`
/// become `"warning"`, `info`/`informational` become `"informational"`, and
/// every other input (including the empty string) becomes `"blocking"`.
fn normalize_eval_pack_severity(value: &str) -> String {
    let lowered = value.trim().to_ascii_lowercase();
    let canonical = if matches!(lowered.as_str(), "warn" | "warning") {
        "warning"
    } else if matches!(lowered.as_str(), "info" | "informational") {
        "informational"
    } else {
        "blocking"
    };
    canonical.to_string()
}
2907
2908fn load_eval_pack_case_run(
2909 case: &EvalPackCase,
2910 base_dir: Option<&Path>,
2911 fixture_base_dir: Option<&Path>,
2912 fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2913) -> Result<RunRecord, VmError> {
2914 if let Some(run_ref) = case.run.as_deref().or(case.run_path.as_deref()) {
2915 if let Some(fixture) = fixtures_by_id.get(run_ref) {
2916 return load_run_record_from_fixture_ref(fixture, fixture_base_dir);
2917 }
2918 return load_run_record(&resolve_manifest_path(base_dir, run_ref));
2919 }
2920 Err(VmError::Runtime(
2921 "eval pack case is missing run or run_path".to_string(),
2922 ))
2923}
2924
2925fn load_eval_pack_case_fixture(
2926 case: &EvalPackCase,
2927 base_dir: Option<&Path>,
2928 fixture_base_dir: Option<&Path>,
2929 fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2930 run: &RunRecord,
2931) -> Result<ReplayFixture, VmError> {
2932 if let Some(fixture_ref) = case.fixture.as_deref().or(case.fixture_path.as_deref()) {
2933 if let Some(fixture) = fixtures_by_id.get(fixture_ref) {
2934 return load_replay_fixture_from_ref(fixture, fixture_base_dir);
2935 }
2936 return load_replay_fixture(&resolve_manifest_path(base_dir, fixture_ref));
2937 }
2938 Ok(run
2939 .replay_fixture
2940 .clone()
2941 .unwrap_or_else(|| replay_fixture_from_run(run)))
2942}
2943
2944fn eval_pack_case_source_path(
2945 case: &EvalPackCase,
2946 base_dir: Option<&Path>,
2947 fixture_base_dir: Option<&Path>,
2948 fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2949) -> Option<String> {
2950 let run_ref = case.run.as_deref().or(case.run_path.as_deref())?;
2951 if let Some(fixture) = fixtures_by_id.get(run_ref) {
2952 return fixture.path.as_ref().map(|path| {
2953 resolve_manifest_path(fixture_base_dir, path)
2954 .display()
2955 .to_string()
2956 });
2957 }
2958 Some(
2959 resolve_manifest_path(base_dir, run_ref)
2960 .display()
2961 .to_string(),
2962 )
2963}
2964
2965fn load_eval_pack_case_friction_events(
2966 case: &EvalPackCase,
2967 base_dir: Option<&Path>,
2968 fixture_base_dir: Option<&Path>,
2969 fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2970) -> Result<Vec<FrictionEvent>, VmError> {
2971 let event_ref = case.friction_events.as_deref().ok_or_else(|| {
2972 VmError::Runtime("eval pack friction case is missing friction_events".to_string())
2973 })?;
2974 if let Some(fixture) = fixtures_by_id.get(event_ref) {
2975 return load_friction_events_from_fixture_ref(fixture, fixture_base_dir);
2976 }
2977 load_friction_events_from_path(&resolve_manifest_path(base_dir, event_ref))
2978}
2979
2980fn load_friction_events_from_fixture_ref(
2981 fixture: &EvalPackFixtureRef,
2982 base_dir: Option<&Path>,
2983) -> Result<Vec<FrictionEvent>, VmError> {
2984 if let Some(inline) = &fixture.inline {
2985 return normalize_friction_events_json(inline.clone());
2986 }
2987 let path = fixture.path.as_deref().ok_or_else(|| {
2988 VmError::Runtime(format!(
2989 "fixture '{}' is missing path or inline friction events",
2990 fixture.id
2991 ))
2992 })?;
2993 load_friction_events_from_path(&resolve_manifest_path(base_dir, path))
2994}
2995
2996fn load_friction_events_from_path(path: &Path) -> Result<Vec<FrictionEvent>, VmError> {
2997 let content = std::fs::read_to_string(path)
2998 .map_err(|e| VmError::Runtime(format!("failed to read friction events fixture: {e}")))?;
2999 let value: serde_json::Value = serde_json::from_str(&content)
3000 .map_err(|e| VmError::Runtime(format!("failed to parse friction events fixture: {e}")))?;
3001 normalize_friction_events_json(value)
3002}
3003
3004fn eval_pack_case_friction_source_path(
3005 case: &EvalPackCase,
3006 base_dir: Option<&Path>,
3007 fixture_base_dir: Option<&Path>,
3008 fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
3009) -> Option<String> {
3010 let event_ref = case.friction_events.as_deref()?;
3011 if let Some(fixture) = fixtures_by_id.get(event_ref) {
3012 return fixture.path.as_ref().map(|path| {
3013 resolve_manifest_path(fixture_base_dir, path)
3014 .display()
3015 .to_string()
3016 });
3017 }
3018 Some(
3019 resolve_manifest_path(base_dir, event_ref)
3020 .display()
3021 .to_string(),
3022 )
3023}
3024
3025fn friction_suggestion_options(
3026 case: &EvalPackCase,
3027 manifest: &EvalPackManifest,
3028) -> ContextPackSuggestionOptions {
3029 let min_occurrences = case
3030 .metadata
3031 .get("min_occurrences")
3032 .or_else(|| manifest.metadata.get("min_occurrences"))
3033 .and_then(|value| value.as_u64())
3034 .unwrap_or(2) as usize;
3035 let owner = case
3036 .metadata
3037 .get("owner")
3038 .or_else(|| manifest.metadata.get("owner"))
3039 .and_then(|value| value.as_str())
3040 .map(str::to_string)
3041 .or_else(|| {
3042 manifest
3043 .package
3044 .as_ref()
3045 .and_then(|package| package.name.clone())
3046 });
3047 ContextPackSuggestionOptions {
3048 min_occurrences,
3049 owner,
3050 }
3051}
3052
3053fn apply_eval_pack_thresholds(
3054 run: &RunRecord,
3055 thresholds: &super::types::EvalPackThresholds,
3056 failures: &mut Vec<String>,
3057) {
3058 if let Some(max_stage_count) = thresholds.max_stage_count {
3059 if run.stages.len() > max_stage_count {
3060 failures.push(format!(
3061 "stage count {} exceeds threshold {}",
3062 run.stages.len(),
3063 max_stage_count
3064 ));
3065 }
3066 }
3067 if let Some(max_latency_ms) = thresholds.max_latency_ms {
3068 let actual = run
3069 .usage
3070 .as_ref()
3071 .map(|usage| usage.total_duration_ms)
3072 .unwrap_or_default();
3073 if actual > max_latency_ms {
3074 failures.push(format!(
3075 "latency {actual}ms exceeds threshold {max_latency_ms}ms"
3076 ));
3077 }
3078 }
3079 if let Some(max_cost_usd) = thresholds.max_cost_usd {
3080 let actual = run
3081 .usage
3082 .as_ref()
3083 .map(|usage| usage.total_cost)
3084 .unwrap_or_default();
3085 if actual > max_cost_usd {
3086 failures.push(format!(
3087 "cost ${actual:.6} exceeds threshold ${max_cost_usd:.6}"
3088 ));
3089 }
3090 }
3091 if let Some(max_tokens) = thresholds.max_tokens {
3092 let actual = run
3093 .usage
3094 .as_ref()
3095 .map(|usage| usage.input_tokens + usage.output_tokens)
3096 .unwrap_or_default();
3097 if actual > max_tokens {
3098 failures.push(format!(
3099 "token count {actual} exceeds threshold {max_tokens}"
3100 ));
3101 }
3102 }
3103}
3104
3105fn apply_eval_pack_rubric(
3106 rubric: &EvalPackRubric,
3107 run: &RunRecord,
3108 failures: &mut Vec<String>,
3109 warnings: &mut Vec<String>,
3110) {
3111 match rubric.kind.as_str() {
3112 "" | "deterministic" | "replay" | "budget" | "hitl" | "side-effect" => {
3113 apply_eval_pack_thresholds(run, &rubric.thresholds, failures);
3114 for assertion in &rubric.assertions {
3115 apply_eval_pack_assertion(rubric, assertion, run, failures);
3116 }
3117 }
3118 "llm-judge" | "llm_as_judge" | "judge" => {
3119 let severity = normalize_eval_pack_severity(
3120 rubric.thresholds.severity.as_deref().unwrap_or("blocking"),
3121 );
3122 let message = format!(
3123 "rubric '{}' requires an external LLM judge and was not run locally",
3124 rubric.id
3125 );
3126 if severity == "blocking" {
3127 failures.push(message);
3128 } else {
3129 warnings.push(message);
3130 }
3131 }
3132 other => warnings.push(format!(
3133 "rubric '{}' has unknown kind '{}' and was not run locally",
3134 rubric.id, other
3135 )),
3136 }
3137}
3138
3139fn apply_eval_pack_friction_rubric(
3140 rubric: &EvalPackRubric,
3141 suggestions: &[super::super::ContextPackSuggestion],
3142 failures: &mut Vec<String>,
3143 warnings: &mut Vec<String>,
3144) {
3145 match rubric.kind.as_str() {
3146 "" | "deterministic" | "friction" | "context-pack-suggestion" => {
3147 let mut expectations = Vec::new();
3148 for assertion in &rubric.assertions {
3149 match assertion.kind.as_str() {
3150 "context-pack-suggestion" | "context_pack_suggestion" | "suggestion" => {
3151 let expectation = context_pack_expectation_from_assertion(assertion);
3152 expectations.push(expectation);
3153 }
3154 other => failures.push(format!(
3155 "rubric '{}' has unsupported friction assertion kind '{}'",
3156 rubric.id, other
3157 )),
3158 }
3159 }
3160 failures.extend(evaluate_context_pack_suggestion_expectations(
3161 suggestions,
3162 &expectations,
3163 ));
3164 }
3165 other => warnings.push(format!(
3166 "rubric '{}' has unknown friction kind '{}' and was not run locally",
3167 rubric.id, other
3168 )),
3169 }
3170}
3171
3172fn context_pack_expectation_from_assertion(
3173 assertion: &EvalPackAssertion,
3174) -> ContextPackSuggestionExpectation {
3175 let expected = assertion
3176 .expected
3177 .as_ref()
3178 .and_then(|value| value.as_object());
3179 let expected_string = assertion.expected.as_ref().and_then(|value| value.as_str());
3180 ContextPackSuggestionExpectation {
3181 min_suggestions: expected
3182 .and_then(|map| map.get("min_suggestions"))
3183 .and_then(|value| value.as_u64())
3184 .map(|value| value as usize),
3185 recommended_artifact: expected
3186 .and_then(|map| map.get("recommended_artifact"))
3187 .and_then(|value| value.as_str())
3188 .map(str::to_string)
3189 .or_else(|| expected_string.map(str::to_string)),
3190 title_contains: assertion.contains.clone().or_else(|| {
3191 expected
3192 .and_then(|map| map.get("title_contains"))
3193 .and_then(|value| value.as_str())
3194 .map(str::to_string)
3195 }),
3196 manifest_name_contains: expected
3197 .and_then(|map| map.get("manifest_name_contains"))
3198 .and_then(|value| value.as_str())
3199 .map(str::to_string),
3200 required_capability: expected
3201 .and_then(|map| map.get("required_capability"))
3202 .and_then(|value| value.as_str())
3203 .map(str::to_string),
3204 required_output_slot: expected
3205 .and_then(|map| map.get("required_output_slot"))
3206 .and_then(|value| value.as_str())
3207 .map(str::to_string),
3208 }
3209}
3210
3211fn apply_eval_pack_assertion(
3212 rubric: &EvalPackRubric,
3213 assertion: &EvalPackAssertion,
3214 run: &RunRecord,
3215 failures: &mut Vec<String>,
3216) {
3217 match assertion.kind.as_str() {
3218 "run-status" | "run_status" | "status" => {
3219 let expected = assertion.expected.as_ref().and_then(|value| value.as_str());
3220 if let Some(expected) = expected {
3221 if run.status != expected {
3222 failures.push(format!(
3223 "rubric '{}' expected run status {}, got {}",
3224 rubric.id, expected, run.status
3225 ));
3226 }
3227 }
3228 }
3229 "stage-status" | "stage_status" => {
3230 let Some(stage_id) = assertion.stage.as_deref() else {
3231 failures.push(format!(
3232 "rubric '{}' stage-status assertion missing stage",
3233 rubric.id
3234 ));
3235 return;
3236 };
3237 let expected = assertion.expected.as_ref().and_then(|value| value.as_str());
3238 let Some(expected) = expected else {
3239 failures.push(format!(
3240 "rubric '{}' stage-status assertion missing expected string",
3241 rubric.id
3242 ));
3243 return;
3244 };
3245 match run.stages.iter().find(|stage| stage.node_id == stage_id) {
3246 Some(stage) if stage.status == expected => {}
3247 Some(stage) => failures.push(format!(
3248 "rubric '{}' expected stage {} status {}, got {}",
3249 rubric.id, stage_id, expected, stage.status
3250 )),
3251 None => failures.push(format!(
3252 "rubric '{}' expected stage {} to exist",
3253 rubric.id, stage_id
3254 )),
3255 }
3256 }
3257 "visible-text-contains" | "visible_text_contains" => {
3258 let Some(needle) = assertion.contains.as_deref() else {
3259 failures.push(format!(
3260 "rubric '{}' visible-text assertion missing contains",
3261 rubric.id
3262 ));
3263 return;
3264 };
3265 let matched = match assertion.stage.as_deref() {
3266 Some(stage_id) => run
3267 .stages
3268 .iter()
3269 .find(|stage| stage.node_id == stage_id)
3270 .and_then(|stage| stage.visible_text.as_deref())
3271 .is_some_and(|text| text.contains(needle)),
3272 None => run
3273 .stages
3274 .iter()
3275 .filter_map(|stage| stage.visible_text.as_deref())
3276 .any(|text| text.contains(needle)),
3277 };
3278 if !matched {
3279 failures.push(format!(
3280 "rubric '{}' expected visible text to contain {:?}",
3281 rubric.id, needle
3282 ));
3283 }
3284 }
3285 "hitl-question-contains" | "hitl_question_contains" => {
3286 let Some(needle) = assertion.contains.as_deref() else {
3287 failures.push(format!(
3288 "rubric '{}' HITL assertion missing contains",
3289 rubric.id
3290 ));
3291 return;
3292 };
3293 if !run
3294 .hitl_questions
3295 .iter()
3296 .any(|question| question.prompt.contains(needle))
3297 {
3298 failures.push(format!(
3299 "rubric '{}' expected HITL question to contain {:?}",
3300 rubric.id, needle
3301 ));
3302 }
3303 }
3304 "" => {}
3305 other => failures.push(format!(
3306 "rubric '{}' has unsupported assertion kind '{}'",
3307 rubric.id, other
3308 )),
3309 }
3310}
3311
3312pub fn replay_fixture_from_run(run: &RunRecord) -> ReplayFixture {
3313 ReplayFixture {
3314 type_name: "replay_fixture".to_string(),
3315 id: new_id("fixture"),
3316 source_run_id: run.id.clone(),
3317 workflow_id: run.workflow_id.clone(),
3318 workflow_name: run.workflow_name.clone(),
3319 created_at: now_rfc3339(),
3320 eval_kind: Some("replay".to_string()),
3321 clarifying_question: None,
3322 expected_status: run.status.clone(),
3323 stage_assertions: run
3324 .stages
3325 .iter()
3326 .map(|stage| ReplayStageAssertion {
3327 node_id: stage.node_id.clone(),
3328 expected_status: stage.status.clone(),
3329 expected_outcome: stage.outcome.clone(),
3330 expected_branch: stage.branch.clone(),
3331 required_artifact_kinds: stage
3332 .artifacts
3333 .iter()
3334 .map(|artifact| artifact.kind.clone())
3335 .collect(),
3336 visible_text_contains: stage
3337 .visible_text
3338 .as_ref()
3339 .filter(|text| !text.is_empty())
3340 .map(|text| text.chars().take(80).collect()),
3341 })
3342 .collect(),
3343 }
3344}
3345
3346pub fn evaluate_run_against_fixture(run: &RunRecord, fixture: &ReplayFixture) -> ReplayEvalReport {
3347 if fixture.eval_kind.as_deref() == Some("clarifying_question") {
3348 return evaluate_clarifying_question(run, fixture);
3349 }
3350 let mut failures = Vec::new();
3351 if run.status != fixture.expected_status {
3352 failures.push(format!(
3353 "run status mismatch: expected {}, got {}",
3354 fixture.expected_status, run.status
3355 ));
3356 }
3357 let stages_by_id: BTreeMap<&str, &RunStageRecord> =
3358 run.stages.iter().map(|s| (s.node_id.as_str(), s)).collect();
3359 for assertion in &fixture.stage_assertions {
3360 let Some(stage) = stages_by_id.get(assertion.node_id.as_str()) else {
3361 failures.push(format!("missing stage {}", assertion.node_id));
3362 continue;
3363 };
3364 if stage.status != assertion.expected_status {
3365 failures.push(format!(
3366 "stage {} status mismatch: expected {}, got {}",
3367 assertion.node_id, assertion.expected_status, stage.status
3368 ));
3369 }
3370 if stage.outcome != assertion.expected_outcome {
3371 failures.push(format!(
3372 "stage {} outcome mismatch: expected {}, got {}",
3373 assertion.node_id, assertion.expected_outcome, stage.outcome
3374 ));
3375 }
3376 if stage.branch != assertion.expected_branch {
3377 failures.push(format!(
3378 "stage {} branch mismatch: expected {:?}, got {:?}",
3379 assertion.node_id, assertion.expected_branch, stage.branch
3380 ));
3381 }
3382 for required_kind in &assertion.required_artifact_kinds {
3383 if !stage
3384 .artifacts
3385 .iter()
3386 .any(|artifact| &artifact.kind == required_kind)
3387 {
3388 failures.push(format!(
3389 "stage {} missing artifact kind {}",
3390 assertion.node_id, required_kind
3391 ));
3392 }
3393 }
3394 if let Some(snippet) = &assertion.visible_text_contains {
3395 let actual = stage.visible_text.clone().unwrap_or_default();
3396 if !actual.contains(snippet) {
3397 failures.push(format!(
3398 "stage {} visible text does not contain expected snippet {:?}",
3399 assertion.node_id, snippet
3400 ));
3401 }
3402 }
3403 }
3404
3405 ReplayEvalReport {
3406 pass: failures.is_empty(),
3407 failures,
3408 stage_count: run.stages.len(),
3409 }
3410}
3411
3412fn evaluate_clarifying_question(run: &RunRecord, fixture: &ReplayFixture) -> ReplayEvalReport {
3413 let mut failures = Vec::new();
3414 let spec = fixture.clarifying_question.clone().unwrap_or_default();
3415 let min_questions = clarifying_min_questions(&spec);
3416 let max_questions = clarifying_max_questions(&spec);
3417 let questions = &run.hitl_questions;
3418
3419 if run.status != fixture.expected_status {
3420 failures.push(format!(
3421 "run status mismatch: expected {}, got {}",
3422 fixture.expected_status, run.status
3423 ));
3424 }
3425 if questions.len() < min_questions {
3426 failures.push(format!(
3427 "expected at least {min_questions} clarifying question(s), got {}",
3428 questions.len()
3429 ));
3430 }
3431 if questions.len() > max_questions {
3432 failures.push(format!(
3433 "expected at most {max_questions} clarifying question(s), got {}",
3434 questions.len()
3435 ));
3436 }
3437
3438 let normalized_expected = spec
3439 .expected_question
3440 .as_deref()
3441 .map(normalize_question_text);
3442 let normalized_accepted = spec
3443 .accepted_questions
3444 .iter()
3445 .map(|question| normalize_question_text(question))
3446 .collect::<Vec<_>>();
3447 let required_terms = spec
3448 .required_terms
3449 .iter()
3450 .map(|term| normalize_question_text(term))
3451 .collect::<Vec<_>>();
3452 let forbidden_terms = spec
3453 .forbidden_terms
3454 .iter()
3455 .map(|term| normalize_question_text(term))
3456 .collect::<Vec<_>>();
3457
3458 let matched = questions.iter().any(|question| {
3459 let normalized = normalize_question_text(&question.prompt);
3460 let matches_expected = normalized_expected
3461 .as_ref()
3462 .is_none_or(|expected| &normalized == expected)
3463 && (normalized_accepted.is_empty()
3464 || normalized_accepted
3465 .iter()
3466 .any(|candidate| candidate == &normalized));
3467 let has_required_terms = required_terms
3468 .iter()
3469 .all(|term| normalized.contains(term.as_str()));
3470 let avoids_forbidden_terms = forbidden_terms
3471 .iter()
3472 .all(|term| !normalized.contains(term.as_str()));
3473 matches_expected && has_required_terms && avoids_forbidden_terms
3474 });
3475
3476 if !questions.is_empty()
3477 && (!normalized_accepted.is_empty()
3478 || normalized_expected.is_some()
3479 || !required_terms.is_empty()
3480 || !forbidden_terms.is_empty())
3481 && !matched
3482 {
3483 failures.push(format!(
3484 "no clarifying question matched fixture; actual questions: {}",
3485 questions
3486 .iter()
3487 .map(|question| format!("{:?}", question.prompt))
3488 .collect::<Vec<_>>()
3489 .join(", ")
3490 ));
3491 }
3492
3493 ReplayEvalReport {
3494 pass: failures.is_empty(),
3495 failures,
3496 stage_count: run.stages.len(),
3497 }
3498}
3499
3500pub fn evaluate_run_suite(
3501 cases: Vec<(RunRecord, ReplayFixture, Option<String>)>,
3502) -> ReplayEvalSuiteReport {
3503 let mut reports = Vec::new();
3504 for (run, fixture, source_path) in cases {
3505 let report = evaluate_run_against_fixture(&run, &fixture);
3506 reports.push(ReplayEvalCaseReport {
3507 run_id: run.id.clone(),
3508 workflow_id: run.workflow_id.clone(),
3509 label: None,
3510 pass: report.pass,
3511 failures: report.failures,
3512 stage_count: report.stage_count,
3513 source_path,
3514 comparison: None,
3515 });
3516 }
3517 let total = reports.len();
3518 let passed = reports.iter().filter(|report| report.pass).count();
3519 let failed = total.saturating_sub(passed);
3520 ReplayEvalSuiteReport {
3521 pass: failed == 0,
3522 total,
3523 passed,
3524 failed,
3525 cases: reports,
3526 }
3527}
3528
#[cfg(test)]
mod live_tool_budget_tests {
    use super::*;
    use std::collections::BTreeMap;

    /// A summary without a `byTool` map still yields per-tool counts, and the
    /// `total` key reports the overall count.
    #[test]
    fn per_tool_budget_counts_from_sequence_when_no_by_tool_map() {
        let summary = serde_json::json!({
            "total": 4,
            "rejected": 0,
            "sequence": ["read", "edit", "edit", "run"],
            "successful": ["read", "edit", "edit", "run"],
        });
        let expected_counts = [("edit", 2), ("read", 1), ("delete", 0), ("total", 4)];
        for (tool, expected) in expected_counts {
            assert_eq!(live_tool_summary_count(&summary, tool), Some(expected));
        }
    }

    /// A per-tool budget is enforced even when the summary only has `sequence`.
    #[test]
    fn per_tool_budget_is_enforced_against_sequence_only_summary() {
        let summary = serde_json::json!({
            "total": 3,
            "sequence": ["edit", "edit", "run"],
        });

        let too_tight = BTreeMap::from([("edit".to_string(), 1usize)]);
        let failures = eval_pack_live_tool_budget_failures(&too_tight, &summary);
        assert_eq!(failures.len(), 1, "edit budget of 1 must trip on 2 edits");
        assert!(failures[0].contains("edit"));

        let sufficient = BTreeMap::from([("edit".to_string(), 2usize)]);
        let no_failures = eval_pack_live_tool_budget_failures(&sufficient, &summary);
        assert!(no_failures.is_empty());
    }

    /// An explicit `byTool` entry is used for the per-tool count.
    #[test]
    fn explicit_by_tool_map_still_takes_precedence() {
        let summary = serde_json::json!({
            "total": 1,
            "byTool": {"edit": 1},
        });
        let edit_count = live_tool_summary_count(&summary, "edit");
        assert_eq!(edit_count, Some(1));
    }
}