Skip to main content

harn_vm/orchestration/records/
eval_pack.rs

//! Eval-suite manifest and eval-pack manifest loading, evaluation, and replay-fixture comparison.
2
3use std::collections::{BTreeMap, BTreeSet};
4use std::io::{Read, Write};
5use std::path::{Path, PathBuf};
6use std::process::{Command, Stdio};
7use std::sync::Arc;
8use std::time::Duration;
9
10use crate::event_log::EventLog;
11use sha2::{Digest, Sha256};
12use wait_timeout::ChildExt;
13
14use super::super::{
15    evaluate_context_pack_suggestion_expectations, generate_context_pack_suggestions, new_id,
16    normalize_friction_events_json, now_rfc3339, parse_json_value, run_persona_eval_ladder,
17    ContextPackSuggestionExpectation, ContextPackSuggestionOptions, FrictionEvent,
18};
19use super::diff::diff_run_records;
20use super::json::{clarifying_max_questions, clarifying_min_questions, normalize_question_text};
21use super::persistence::load_run_record;
22use super::types::{
23    EvalLedgerAppendReport, EvalLedgerFingerprintMismatch, EvalLedgerPriorCommitReport,
24    EvalLedgerProvenance, EvalLedgerReadReport, EvalLedgerResumeCell, EvalLedgerResumePlan,
25    EvalLedgerRow, EvalPackAssertion, EvalPackCase, EvalPackCaseReport, EvalPackCommandObject,
26    EvalPackCommandSpec, EvalPackFixtureRef, EvalPackManifest, EvalPackReliabilityBreakdown,
27    EvalPackReliabilityReport, EvalPackReport, EvalPackRubric, EvalPackRunState,
28    EvalPackSplitValidationReport, EvalPackStatsReport, EvalPackStatsRow, EvalPackTrialReport,
29    EvalSuiteManifest, ReplayEvalCaseReport, ReplayEvalReport, ReplayEvalSuiteReport,
30    ReplayFixture, ReplayStageAssertion, RunDiffReport, RunRecord, RunStageRecord,
31};
32use crate::value::{VmError, VmValue};
33
34const EVAL_LEDGER_ROW_SCHEMA: &str = "harn.eval.ledger.row.v1";
35const EVAL_LEDGER_RUN_STATE_SCHEMA: &str = "harn.eval.run-state.v1";
36const EVAL_LEDGER_RESUME_PLAN_SCHEMA: &str = "harn.eval.resume-plan.v1";
37const EVAL_LEDGER_ROW_KIND: &str = "eval.ledger.row";
38const EVAL_LEDGER_RUN_STATE_KIND: &str = "eval.ledger.run_state";
39const EVAL_LEDGER_TOPIC_PREFIX: &str = "eval.ledger";
40const EVAL_LEDGER_IDENTITY_HEADER: &str = "eval_ledger_identity";
41const EVAL_LEDGER_QUEUE_DEPTH: usize =
42    crate::runtime_limits::RuntimeLimits::DEFAULT.default_event_log_queue_depth;
43const EVAL_LEDGER_READ_BATCH_LIMIT: usize = 1024;
44const LIVE_EXECUTOR_REQUEST_SCHEMA: &str = "harn.eval.live_verify.executor_request.v1";
45const DEFAULT_LIVE_EXECUTOR_TIMEOUT_SECONDS: f64 = 600.0;
46const DEFAULT_LIVE_VERIFY_TIMEOUT_SECONDS: f64 = 120.0;
47
48#[derive(Clone, Debug, Default, serde::Deserialize)]
49#[serde(default)]
50struct EvalLedgerOptions {
51    namespace: Option<String>,
52    suite: Option<String>,
53    model: Option<String>,
54    split: Option<String>,
55    commit: Option<String>,
56    branch: Option<String>,
57    #[serde(alias = "case")]
58    case_name: Option<String>,
59    case_fingerprint: Option<String>,
60    harness_config_fingerprint: Option<String>,
61    limit: Option<usize>,
62}
63
/// The three execution kinds an eval-pack case resolves to.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum EvalPackCaseKind {
    /// Replay-style case; also the fallback for unrecognised kinds.
    Replay,
    /// Case resolved from the normalized kind `"friction"`.
    Friction,
    /// Case resolved from the normalized kind `"live-verify"`.
    LiveVerify,
}
70
71#[derive(Clone, Debug, Default, serde::Deserialize, serde::Serialize)]
72#[serde(default)]
73pub struct EvalPackLiveVerifyOutcome {
74    pub verification: Option<String>,
75    #[serde(alias = "verificationExitCode")]
76    pub verification_exit_code: Option<i64>,
77    #[serde(alias = "pass", alias = "success")]
78    pub passed: Option<bool>,
79    #[serde(alias = "timedOut")]
80    pub timed_out: bool,
81    #[serde(alias = "wallTimeSeconds")]
82    pub wall_time_seconds: f64,
83    #[serde(alias = "costUsd")]
84    pub cost_usd: f64,
85    #[serde(default, alias = "producedPaths")]
86    pub produced_paths: Vec<String>,
87    #[serde(default, alias = "toolCallSummary", alias = "tool_summary")]
88    pub tool_call_summary: serde_json::Value,
89    pub failures: Vec<String>,
90    pub warnings: Vec<String>,
91    pub informational: Vec<String>,
92    #[serde(alias = "runId")]
93    pub run_id: Option<String>,
94    #[serde(alias = "workflowId")]
95    pub workflow_id: Option<String>,
96    #[serde(alias = "sourcePath")]
97    pub source_path: Option<String>,
98    #[serde(alias = "stageCount")]
99    pub stage_count: Option<usize>,
100}
101
102#[derive(Clone, Debug)]
103pub struct EvalPackLiveExecutorRequest {
104    pub executor: EvalPackCommandSpec,
105    pub payload: serde_json::Value,
106    pub manifest_id: String,
107    pub case: EvalPackCase,
108    pub case_id: String,
109    pub trial: usize,
110    pub trials: usize,
111    pub workspace: PathBuf,
112    pub base_dir: Option<PathBuf>,
113}
114
115pub trait EvalPackLiveExecutor {
116    fn execute(
117        &mut self,
118        request: EvalPackLiveExecutorRequest,
119    ) -> Result<EvalPackLiveVerifyOutcome, VmError>;
120}
121
122struct EvalPackShellLiveExecutor;
123
124impl EvalPackLiveExecutor for EvalPackShellLiveExecutor {
125    fn execute(
126        &mut self,
127        request: EvalPackLiveExecutorRequest,
128    ) -> Result<EvalPackLiveVerifyOutcome, VmError> {
129        let output = run_eval_pack_command(
130            &request.executor,
131            &request.workspace,
132            Some(&request.payload),
133            DEFAULT_LIVE_EXECUTOR_TIMEOUT_SECONDS,
134        )?;
135        let mut failures = Vec::new();
136        let mut outcome = live_outcome_from_executor_output(output, &mut failures);
137        outcome.failures.extend(failures);
138        Ok(outcome)
139    }
140}
141
/// Captured result of one external command run.
// NOTE(review): presumably what `run_eval_pack_command` returns — confirm
// against its definition, which is outside this excerpt.
#[derive(Debug, Clone)]
struct EvalPackCommandOutput {
    /// Exit code of the command.
    exit_code: i64,
    /// Captured standard output.
    stdout: String,
    /// Captured standard error.
    stderr: String,
    /// Whether the command timed out.
    timed_out: bool,
    /// Wall-clock duration of the run, in seconds.
    wall_time_seconds: f64,
}
150
151struct EvalPackLedgerRun {
152    log: Arc<crate::event_log::AnyEventLog>,
153    topic: crate::event_log::Topic,
154    rows: Vec<EvalLedgerRow>,
155    suite: String,
156    model: String,
157    commit: String,
158    branch: Option<String>,
159    provenance: EvalLedgerProvenance,
160    inserted: usize,
161    duplicates: usize,
162    fingerprint_refusals: Vec<EvalLedgerFingerprintMismatch>,
163}
164
165pub fn normalize_eval_suite_manifest(value: &VmValue) -> Result<EvalSuiteManifest, VmError> {
166    let mut manifest: EvalSuiteManifest = parse_json_value(value)?;
167    if manifest.type_name.is_empty() {
168        manifest.type_name = "eval_suite_manifest".to_string();
169    }
170    if manifest.id.is_empty() {
171        manifest.id = new_id("eval_suite");
172    }
173    Ok(manifest)
174}
175
176pub fn load_eval_suite_manifest(path: &Path) -> Result<EvalSuiteManifest, VmError> {
177    let content = std::fs::read_to_string(path)
178        .map_err(|e| VmError::Runtime(format!("failed to read eval suite manifest: {e}")))?;
179    let mut manifest: EvalSuiteManifest = serde_json::from_str(&content)
180        .map_err(|e| VmError::Runtime(format!("failed to parse eval suite manifest: {e}")))?;
181    if manifest.base_dir.is_none() {
182        manifest.base_dir = path.parent().map(|parent| parent.display().to_string());
183    }
184    Ok(manifest)
185}
186
187pub fn load_eval_pack_manifest(path: &Path) -> Result<EvalPackManifest, VmError> {
188    let content = std::fs::read_to_string(path)
189        .map_err(|e| VmError::Runtime(format!("failed to read eval pack manifest: {e}")))?;
190    let mut manifest: EvalPackManifest =
191        if path.extension().and_then(|ext| ext.to_str()) == Some("json") {
192            serde_json::from_str(&content)
193                .map_err(|e| VmError::Runtime(format!("failed to parse eval pack JSON: {e}")))?
194        } else {
195            toml::from_str(&content)
196                .map_err(|e| VmError::Runtime(format!("failed to parse eval pack TOML: {e}")))?
197        };
198    normalize_eval_pack_manifest(&mut manifest)?;
199    if manifest.base_dir.is_none() {
200        manifest.base_dir = path.parent().map(|parent| parent.display().to_string());
201    }
202    Ok(manifest)
203}
204
205pub fn normalize_eval_pack_manifest_value(value: &VmValue) -> Result<EvalPackManifest, VmError> {
206    let mut manifest: EvalPackManifest = parse_json_value(value)?;
207    normalize_eval_pack_manifest(&mut manifest)?;
208    Ok(manifest)
209}
210
211fn normalize_eval_pack_manifest(manifest: &mut EvalPackManifest) -> Result<(), VmError> {
212    if manifest.version == 0 {
213        manifest.version = 1;
214    }
215    if manifest.trials == 0 {
216        manifest.trials = 1;
217    }
218    if manifest.id.is_empty() {
219        manifest.id = manifest
220            .name
221            .clone()
222            .filter(|name| !name.trim().is_empty())
223            .unwrap_or_else(|| new_id("eval_pack"));
224    }
225    let rubrics_by_id = manifest
226        .rubrics
227        .iter()
228        .filter(|rubric| !rubric.id.is_empty())
229        .map(|rubric| (rubric.id.as_str(), rubric))
230        .collect::<BTreeMap<_, _>>();
231    let fixtures_by_id = manifest
232        .fixtures
233        .iter()
234        .filter(|fixture| !fixture.id.is_empty())
235        .map(|fixture| (fixture.id.as_str(), fixture))
236        .collect::<BTreeMap<_, _>>();
237    for case in &mut manifest.cases {
238        if case.trials == Some(0) {
239            return Err(VmError::Runtime(format!(
240                "eval pack case '{}' has trials = 0",
241                case.id.as_deref().unwrap_or("<unnamed>")
242            )));
243        }
244        case.case_fingerprint =
245            eval_pack_case_fingerprint_with_refs(case, &rubrics_by_id, &fixtures_by_id)?;
246    }
247    for ladder in &mut manifest.ladders {
248        super::super::normalize_persona_eval_ladder_manifest(ladder);
249    }
250    Ok(())
251}
252
253pub fn eval_pack_case_fingerprint(case: &EvalPackCase) -> Result<String, VmError> {
254    eval_pack_case_fingerprint_with_refs(case, &BTreeMap::new(), &BTreeMap::new())
255}
256
257fn eval_pack_case_fingerprint_with_refs(
258    case: &EvalPackCase,
259    rubrics_by_id: &BTreeMap<&str, &EvalPackRubric>,
260    fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
261) -> Result<String, VmError> {
262    let mut task = BTreeMap::new();
263    insert_json_field(&mut task, "kind", &normalized_eval_pack_case_kind(case))?;
264    insert_json_field(&mut task, "run", &case.run)?;
265    insert_json_field(&mut task, "run_path", &case.run_path)?;
266    insert_json_field(&mut task, "friction_events", &case.friction_events)?;
267    insert_json_field(&mut task, "task", &case.task)?;
268    insert_json_field(&mut task, "workspace", &case.workspace)?;
269    insert_json_field(&mut task, "project", &case.project)?;
270
271    let mut expected_outputs = BTreeMap::new();
272    insert_json_field(&mut expected_outputs, "fixture", &case.fixture)?;
273    insert_json_field(&mut expected_outputs, "fixture_path", &case.fixture_path)?;
274    insert_json_field(
275        &mut expected_outputs,
276        "expected_output_paths",
277        &case.expected_output_paths,
278    )?;
279    insert_json_field(
280        &mut expected_outputs,
281        "required_output_snippets",
282        &case.required_output_snippets,
283    )?;
284    if let Some(fixture_ref) = case.fixture.as_deref().or(case.fixture_path.as_deref()) {
285        if let Some(fixture) = fixtures_by_id.get(fixture_ref) {
286            insert_json_field(&mut expected_outputs, "fixture_ref", *fixture)?;
287        }
288    }
289
290    let resolved_rubrics = case
291        .rubrics
292        .iter()
293        .filter_map(|rubric_id| rubrics_by_id.get(rubric_id.as_str()))
294        .map(|rubric| {
295            serde_json::to_value(rubric)
296                .map_err(|e| VmError::Runtime(format!("failed to encode eval pack rubric: {e}")))
297        })
298        .collect::<Result<Vec<_>, _>>()?;
299    let mut verify = BTreeMap::new();
300    insert_json_field(&mut verify, "compare_to", &case.compare_to)?;
301    insert_json_field(&mut verify, "verify_command", &case.verify_command)?;
302    insert_json_field(&mut verify, "tool_budgets", &case.tool_budgets)?;
303    insert_json_field(&mut verify, "rubric_ids", &case.rubrics)?;
304    verify.insert(
305        "rubrics".to_string(),
306        serde_json::Value::Array(resolved_rubrics),
307    );
308
309    let mut flags = BTreeMap::new();
310    insert_json_field(&mut flags, "severity", &case.severity)?;
311    insert_json_field(&mut flags, "thresholds", &case.thresholds)?;
312    insert_json_field(&mut flags, "metadata", &case.metadata)?;
313    insert_json_field(&mut flags, "executor", &case.executor)?;
314
315    let mut payload = BTreeMap::new();
316    payload.insert("task".to_string(), encode_json(&task)?);
317    payload.insert(
318        "expected_outputs".to_string(),
319        encode_json(&expected_outputs)?,
320    );
321    payload.insert("verify".to_string(), encode_json(&verify)?);
322    payload.insert("flags".to_string(), encode_json(&flags)?);
323    fingerprint_json(&payload)
324}
325
326pub fn eval_pack_harness_config_fingerprint(
327    manifest: &EvalPackManifest,
328) -> Result<String, VmError> {
329    let rubric_harness = manifest
330        .rubrics
331        .iter()
332        .map(|rubric| {
333            let mut item = BTreeMap::new();
334            insert_json_field(&mut item, "id", &rubric.id)?;
335            insert_json_field(&mut item, "kind", &rubric.kind)?;
336            insert_json_field(&mut item, "prompt", &rubric.prompt)?;
337            insert_json_field(&mut item, "judge", &rubric.judge)?;
338            encode_json(&item)
339        })
340        .collect::<Result<Vec<_>, VmError>>()?;
341    let mut harness_metadata = BTreeMap::new();
342    for key in [
343        "model",
344        "provider",
345        "route",
346        "prompt",
347        "promptVersion",
348        "prompt_version",
349        "toolFormat",
350        "tool_format",
351        "pipelineRev",
352        "pipeline_rev",
353        "pipelineRevision",
354        "pipeline_revision",
355        "harnVersion",
356        "harn_version",
357        "harness",
358        "harnessConfig",
359        "harness_config",
360    ] {
361        if let Some(value) = manifest.metadata.get(key) {
362            harness_metadata.insert(key.to_string(), value.clone());
363        }
364    }
365
366    let mut payload = BTreeMap::new();
367    insert_json_field(&mut payload, "executor", &manifest.executor)?;
368    insert_json_field(&mut payload, "manifest_judge", &manifest.judge)?;
369    insert_json_field(&mut payload, "default_judge", &manifest.defaults.judge)?;
370    insert_json_field(&mut payload, "package", &manifest.package)?;
371    payload.insert(
372        "harness_metadata".to_string(),
373        encode_json(&harness_metadata)?,
374    );
375    payload.insert(
376        "rubric_harness".to_string(),
377        serde_json::Value::Array(rubric_harness),
378    );
379    fingerprint_json(&payload)
380}
381
382fn insert_json_field<T: serde::Serialize>(
383    map: &mut BTreeMap<String, serde_json::Value>,
384    key: &str,
385    value: &T,
386) -> Result<(), VmError> {
387    map.insert(key.to_string(), encode_json(value)?);
388    Ok(())
389}
390
391fn encode_json<T: serde::Serialize>(value: &T) -> Result<serde_json::Value, VmError> {
392    serde_json::to_value(value)
393        .map_err(|e| VmError::Runtime(format!("failed to encode eval pack fingerprint: {e}")))
394}
395
396fn fingerprint_json<T: serde::Serialize>(value: &T) -> Result<String, VmError> {
397    let bytes = serde_json::to_vec(value)
398        .map_err(|e| VmError::Runtime(format!("failed to encode eval pack fingerprint: {e}")))?;
399    let digest = hex::encode(Sha256::digest(bytes));
400    Ok(digest.chars().take(16).collect())
401}
402
403fn eval_pack_case_kind(case: &EvalPackCase) -> EvalPackCaseKind {
404    match normalized_eval_pack_case_kind(case).as_str() {
405        "live-verify" => EvalPackCaseKind::LiveVerify,
406        "friction" => EvalPackCaseKind::Friction,
407        _ => EvalPackCaseKind::Replay,
408    }
409}
410
411fn normalized_eval_pack_case_kind(case: &EvalPackCase) -> String {
412    match case
413        .kind
414        .as_deref()
415        .map(|kind| kind.trim().to_ascii_lowercase().replace('_', "-"))
416        .as_deref()
417    {
418        Some("live") | Some("live-verify") | Some("verify-live") => "live-verify".to_string(),
419        Some("friction") | Some("context-pack-friction") => "friction".to_string(),
420        Some("replay") | Some("fixture") | Some("run-record") => "replay".to_string(),
421        Some(other) if !other.is_empty() => other.to_string(),
422        _ if case.task.is_some()
423            || case.workspace.is_some()
424            || case.project.is_some()
425            || case.verify_command.is_some()
426            || !case.expected_output_paths.is_empty()
427            || !case.required_output_snippets.is_empty() =>
428        {
429            "live-verify".to_string()
430        }
431        _ if case.friction_events.is_some() => "friction".to_string(),
432        _ => "replay".to_string(),
433    }
434}
435
436pub fn eval_ledger_read_report(
437    options: Option<serde_json::Value>,
438) -> Result<EvalLedgerReadReport, VmError> {
439    let options = eval_ledger_options(options)?;
440    let namespace = eval_ledger_namespace(&options);
441    let topic = eval_ledger_topic(&namespace)?;
442    let log = ensure_eval_ledger_event_log(None);
443    let rows = futures::executor::block_on(read_eval_ledger_rows(&log, &topic, &options))?;
444    Ok(EvalLedgerReadReport { rows })
445}
446
447pub fn eval_ledger_append_rows_report(
448    rows: serde_json::Value,
449    options: Option<serde_json::Value>,
450) -> Result<EvalLedgerAppendReport, VmError> {
451    let options = eval_ledger_options(options)?;
452    let namespace = eval_ledger_namespace(&options);
453    let topic = eval_ledger_topic(&namespace)?;
454    let provenance = eval_ledger_provenance(None, &options, None);
455    let rows = parse_eval_ledger_rows(rows)?
456        .into_iter()
457        .map(|mut row| {
458            normalize_eval_ledger_row(&mut row, &options, &provenance);
459            row
460        })
461        .collect::<Vec<_>>();
462    let log = ensure_eval_ledger_event_log(None);
463    futures::executor::block_on(append_eval_ledger_rows(&log, &topic, rows))
464}
465
466pub fn eval_ledger_prior_commit_rows_report(
467    options: serde_json::Value,
468) -> Result<EvalLedgerPriorCommitReport, VmError> {
469    let options = eval_ledger_options(Some(options))?;
470    let namespace = eval_ledger_namespace(&options);
471    let topic = eval_ledger_topic(&namespace)?;
472    let log = ensure_eval_ledger_event_log(None);
473    let mut read_options = options.clone();
474    read_options.commit = None;
475    read_options.case_fingerprint = None;
476    read_options.harness_config_fingerprint = None;
477    let rows = futures::executor::block_on(read_eval_ledger_rows(&log, &topic, &read_options))?;
478    Ok(prior_commit_report(rows, &options))
479}
480
481pub fn eval_ledger_resume_plan_report(
482    manifest: &EvalPackManifest,
483    options: Option<serde_json::Value>,
484) -> Result<EvalLedgerResumePlan, VmError> {
485    let split_report = validate_eval_pack_split(manifest)?;
486    let harness_config_fingerprint = eval_pack_harness_config_fingerprint(manifest)?;
487    let options = eval_ledger_options(options)?;
488    let base_dir = manifest.base_dir.as_deref().map(Path::new);
489    let suite = options.suite.clone().unwrap_or_else(|| manifest.id.clone());
490    let model = options
491        .model
492        .clone()
493        .or_else(|| eval_pack_manifest_model(manifest))
494        .unwrap_or_else(|| "unknown".to_string());
495    let provenance = eval_ledger_provenance(base_dir, &options, Some(&manifest.metadata));
496    let commit = options
497        .commit
498        .clone()
499        .unwrap_or_else(|| provenance.commit.clone());
500    let namespace = eval_pack_ledger_namespace(manifest, &options);
501    let topic = eval_ledger_topic(&namespace)?;
502    let log = ensure_eval_ledger_event_log(base_dir);
503    let read_options = eval_pack_ledger_read_options(&suite, &model, &commit);
504    let rows = futures::executor::block_on(read_eval_ledger_rows(&log, &topic, &read_options))?;
505    Ok(build_eval_ledger_resume_plan(
506        manifest,
507        &split_report,
508        &rows,
509        &suite,
510        &model,
511        &commit,
512        &harness_config_fingerprint,
513    ))
514}
515
516fn eval_ledger_options(value: Option<serde_json::Value>) -> Result<EvalLedgerOptions, VmError> {
517    let mut options = match value {
518        None | Some(serde_json::Value::Null) => EvalLedgerOptions::default(),
519        Some(value) => serde_json::from_value(value)
520            .map_err(|e| VmError::Runtime(format!("eval ledger options parse error: {e}")))?,
521    };
522    normalize_optional_string(&mut options.namespace);
523    normalize_optional_string(&mut options.suite);
524    normalize_optional_string(&mut options.model);
525    normalize_optional_string(&mut options.split);
526    normalize_optional_string(&mut options.commit);
527    normalize_optional_string(&mut options.branch);
528    normalize_optional_string(&mut options.case_name);
529    normalize_optional_string(&mut options.case_fingerprint);
530    normalize_optional_string(&mut options.harness_config_fingerprint);
531    Ok(options)
532}
533
/// Resets `value` to `None` when it holds an empty or whitespace-only string.
///
/// Non-blank strings are left untouched (they are not trimmed).
fn normalize_optional_string(value: &mut Option<String>) {
    let is_blank = matches!(value.as_deref(), Some(text) if text.trim().is_empty());
    if is_blank {
        *value = None;
    }
}
539
540fn parse_eval_ledger_rows(value: serde_json::Value) -> Result<Vec<EvalLedgerRow>, VmError> {
541    match value {
542        serde_json::Value::Array(_) => serde_json::from_value(value)
543            .map_err(|e| VmError::Runtime(format!("eval ledger rows parse error: {e}"))),
544        serde_json::Value::Object(_) => serde_json::from_value(value)
545            .map(|row| vec![row])
546            .map_err(|e| VmError::Runtime(format!("eval ledger row parse error: {e}"))),
547        _ => Err(VmError::Runtime(
548            "eval ledger rows must be a row dict or list of row dicts".to_string(),
549        )),
550    }
551}
552
553fn eval_ledger_namespace(options: &EvalLedgerOptions) -> String {
554    options
555        .namespace
556        .clone()
557        .or_else(|| options.suite.clone())
558        .unwrap_or_else(|| "default".to_string())
559}
560
561fn eval_pack_ledger_namespace(manifest: &EvalPackManifest, options: &EvalLedgerOptions) -> String {
562    options
563        .namespace
564        .clone()
565        .or_else(|| metadata_string(&manifest.metadata, &["ledger_namespace", "ledgerNamespace"]))
566        .or_else(|| options.suite.clone())
567        .unwrap_or_else(|| manifest.id.clone())
568}
569
570fn eval_pack_ledger_read_options(suite: &str, model: &str, commit: &str) -> EvalLedgerOptions {
571    EvalLedgerOptions {
572        suite: Some(suite.to_string()),
573        model: Some(model.to_string()),
574        commit: Some(commit.to_string()),
575        ..EvalLedgerOptions::default()
576    }
577}
578
579fn eval_ledger_topic(namespace: &str) -> Result<crate::event_log::Topic, VmError> {
580    let safe_namespace = crate::event_log::sanitize_topic_component(namespace);
581    crate::event_log::Topic::new(format!("{EVAL_LEDGER_TOPIC_PREFIX}.{safe_namespace}"))
582        .map_err(eval_ledger_log_error)
583}
584
585fn ensure_eval_ledger_event_log(base_dir: Option<&Path>) -> Arc<crate::event_log::AnyEventLog> {
586    if let Some(log) = crate::event_log::active_event_log() {
587        return log;
588    }
589    if let Some(base_dir) = base_dir {
590        if crate::event_log::install_lazy_default_for_base_dir(base_dir).is_ok() {
591            if let Some(log) = crate::event_log::active_event_log() {
592                return log;
593            }
594        }
595    } else if let Ok(cwd) = std::env::current_dir() {
596        if crate::event_log::install_lazy_default_for_base_dir(&cwd).is_ok() {
597            if let Some(log) = crate::event_log::active_event_log() {
598                return log;
599            }
600        }
601    }
602    crate::event_log::install_memory_for_current_thread(EVAL_LEDGER_QUEUE_DEPTH)
603}
604
605async fn read_eval_ledger_rows(
606    log: &Arc<crate::event_log::AnyEventLog>,
607    topic: &crate::event_log::Topic,
608    options: &EvalLedgerOptions,
609) -> Result<Vec<EvalLedgerRow>, VmError> {
610    let mut rows = Vec::new();
611    let mut cursor = None;
612    loop {
613        let batch = log
614            .read_range(topic, cursor, EVAL_LEDGER_READ_BATCH_LIMIT)
615            .await
616            .map_err(eval_ledger_log_error)?;
617        if batch.is_empty() {
618            break;
619        }
620        for (event_id, event) in batch {
621            cursor = Some(event_id);
622            if let Some(row) = parse_eval_ledger_row(event_id, event) {
623                if eval_ledger_row_matches(&row, options) {
624                    rows.push(row);
625                    if options.limit.is_some_and(|limit| rows.len() >= limit) {
626                        return Ok(rows);
627                    }
628                }
629            }
630        }
631    }
632    Ok(rows)
633}
634
635async fn append_eval_ledger_rows(
636    log: &Arc<crate::event_log::AnyEventLog>,
637    topic: &crate::event_log::Topic,
638    rows: Vec<EvalLedgerRow>,
639) -> Result<EvalLedgerAppendReport, VmError> {
640    let mut report = EvalLedgerAppendReport {
641        appended: rows.len(),
642        all_skipped: !rows.is_empty() && rows.iter().all(eval_ledger_row_is_skip),
643        ..EvalLedgerAppendReport::default()
644    };
645    for row in rows {
646        let identity = eval_ledger_row_identity(&row)?;
647        let mut headers = BTreeMap::new();
648        headers.insert(EVAL_LEDGER_IDENTITY_HEADER.to_string(), identity.clone());
649        headers.insert("suite".to_string(), row.suite.clone());
650        headers.insert("model".to_string(), row.model.clone());
651        headers.insert("commit".to_string(), row.commit.clone());
652        headers.insert("case_name".to_string(), row.case_name.clone());
653        headers.insert("trial".to_string(), row.trial.to_string());
654        let payload = serde_json::to_value(&row)
655            .map_err(|e| VmError::Runtime(format!("eval ledger row encode error: {e}")))?;
656        let outcome = log
657            .append_idempotent_by_header(
658                topic,
659                EVAL_LEDGER_IDENTITY_HEADER,
660                &identity,
661                crate::event_log::LogEvent::new(EVAL_LEDGER_ROW_KIND, payload)
662                    .with_headers(headers),
663            )
664            .await
665            .map_err(eval_ledger_log_error)?;
666        if outcome.inserted {
667            report.inserted += 1;
668        } else {
669            report.duplicates += 1;
670        }
671        report.event_ids.push(outcome.event_id);
672        if let Some(stored) = parse_eval_ledger_row(outcome.event_id, outcome.event) {
673            report.rows.push(stored);
674        }
675    }
676    log.flush().await.map_err(eval_ledger_log_error)?;
677    Ok(report)
678}
679
680fn parse_eval_ledger_row(
681    event_id: crate::event_log::EventId,
682    event: crate::event_log::LogEvent,
683) -> Option<EvalLedgerRow> {
684    if event.kind != EVAL_LEDGER_ROW_KIND {
685        return None;
686    }
687    let mut row: EvalLedgerRow = serde_json::from_value(event.payload).ok()?;
688    if row.schema != EVAL_LEDGER_ROW_SCHEMA {
689        return None;
690    }
691    row.event_id = Some(event_id);
692    Some(row)
693}
694
695fn eval_ledger_row_matches(row: &EvalLedgerRow, options: &EvalLedgerOptions) -> bool {
696    option_matches(options.suite.as_deref(), &row.suite)
697        && option_matches(options.model.as_deref(), &row.model)
698        && option_matches(options.commit.as_deref(), &row.commit)
699        && option_matches(options.case_name.as_deref(), &row.case_name)
700        && option_matches(options.case_fingerprint.as_deref(), &row.case_fingerprint)
701        && option_matches(
702            options.harness_config_fingerprint.as_deref(),
703            &row.harness_config_fingerprint,
704        )
705        && match options.split.as_deref() {
706            Some(expected) => row.split.as_deref() == Some(expected),
707            None => true,
708        }
709}
710
/// Returns `true` when no expectation is set or when it equals `actual`.
fn option_matches(expected: Option<&str>, actual: &str) -> bool {
    match expected {
        Some(wanted) => wanted == actual,
        None => true,
    }
}
714
715fn normalize_eval_ledger_row(
716    row: &mut EvalLedgerRow,
717    options: &EvalLedgerOptions,
718    provenance: &EvalLedgerProvenance,
719) {
720    if row.schema.is_empty() {
721        row.schema = EVAL_LEDGER_ROW_SCHEMA.to_string();
722    }
723    if row.suite.is_empty() {
724        row.suite = options
725            .suite
726            .clone()
727            .unwrap_or_else(|| eval_ledger_namespace(options));
728    }
729    if row.model.is_empty() {
730        row.model = options
731            .model
732            .clone()
733            .unwrap_or_else(|| "unknown".to_string());
734    }
735    if row.split.is_none() {
736        row.split = options.split.clone();
737    }
738    if row.commit.is_empty() {
739        row.commit = options
740            .commit
741            .clone()
742            .unwrap_or_else(|| provenance.commit.clone());
743    }
744    if row.case_name.is_empty() {
745        row.case_name = options
746            .case_name
747            .clone()
748            .filter(|name| !name.is_empty())
749            .unwrap_or_else(|| row.name.clone());
750    }
751    if row.name.is_empty() {
752        row.name = row.case_name.clone();
753    }
754    if row.case_fingerprint.is_empty() {
755        row.case_fingerprint = options.case_fingerprint.clone().unwrap_or_default();
756    }
757    if row.harness_config_fingerprint.is_empty() {
758        row.harness_config_fingerprint = options
759            .harness_config_fingerprint
760            .clone()
761            .unwrap_or_default();
762    }
763    if row.trial == 0 {
764        row.trial = 1;
765    }
766    if row.trials == 0 {
767        row.trials = 1;
768    }
769    if row.status.is_empty() {
770        row.status = if row.passes > 0 {
771            "PASS"
772        } else if row.fails > 0 {
773            "FAIL"
774        } else {
775            "skip"
776        }
777        .to_string();
778    }
779    if row.verification.is_empty() {
780        row.verification = row.status.clone();
781    }
782    if row.passes + row.fails + row.skips == 0 {
783        match row.status.to_ascii_uppercase().as_str() {
784            "PASS" => row.passes = 1,
785            "FAIL" => row.fails = 1,
786            _ => row.skips = 1,
787        }
788    }
789    if row.pass_rate == 0.0 && row.passes > 0 {
790        row.pass_rate = row.passes as f64 / row.trials.max(1) as f64;
791    }
792    if row.provenance.commit.is_empty() {
793        row.provenance.commit = row.commit.clone();
794    }
795    if row.provenance.branch.is_none() {
796        row.provenance.branch = provenance.branch.clone();
797    }
798    if row.provenance.ts.is_empty() {
799        row.provenance.ts = provenance.ts.clone();
800    }
801    if row.provenance.harn_version.is_empty() {
802        row.provenance.harn_version = provenance.harn_version.clone();
803    }
804    if row.provenance.host.is_empty() {
805        row.provenance.host = provenance.host.clone();
806    }
807}
808
809fn eval_ledger_row_identity(row: &EvalLedgerRow) -> Result<String, VmError> {
810    let material = serde_json::json!({
811        "schema": EVAL_LEDGER_ROW_SCHEMA,
812        "suite": row.suite,
813        "model": row.model,
814        "split": row.split,
815        "commit": row.commit,
816        "case_name": row.case_name,
817        "case_fingerprint": row.case_fingerprint,
818        "harness_config_fingerprint": row.harness_config_fingerprint,
819        "trial": row.trial,
820    });
821    let bytes = serde_json::to_vec(&material)
822        .map_err(|e| VmError::Runtime(format!("eval ledger identity encode error: {e}")))?;
823    Ok(format!("sha256:{}", hex::encode(Sha256::digest(bytes))))
824}
825
826fn eval_ledger_row_is_skip(row: &EvalLedgerRow) -> bool {
827    row.skipped || row.skips > 0 || row.status.eq_ignore_ascii_case("skip")
828}
829
830fn eval_ledger_provenance(
831    base_dir: Option<&Path>,
832    options: &EvalLedgerOptions,
833    metadata: Option<&BTreeMap<String, serde_json::Value>>,
834) -> EvalLedgerProvenance {
835    let commit = options
836        .commit
837        .clone()
838        .or_else(|| {
839            metadata.and_then(|metadata| {
840                metadata_string(metadata, &["commit", "git_commit", "source_commit"])
841            })
842        })
843        .or_else(|| env_string(&["HARN_EVAL_COMMIT", "HARN_GIT_COMMIT", "GITHUB_SHA"]))
844        .or_else(|| git_output(base_dir, &["rev-parse", "HEAD"]))
845        .unwrap_or_else(|| "unknown".to_string());
846    let branch = options
847        .branch
848        .clone()
849        .or_else(|| {
850            metadata.and_then(|metadata| {
851                metadata_string(metadata, &["branch", "git_branch", "source_branch"])
852            })
853        })
854        .or_else(|| env_string(&["HARN_EVAL_BRANCH", "HARN_GIT_BRANCH", "GITHUB_REF_NAME"]))
855        .or_else(|| git_output(base_dir, &["rev-parse", "--abbrev-ref", "HEAD"]));
856    EvalLedgerProvenance {
857        commit,
858        branch,
859        ts: now_rfc3339(),
860        harn_version: crate::bytecode_cache::HARN_VERSION.to_string(),
861        host: env_string(&["HOSTNAME", "COMPUTERNAME"]).unwrap_or_else(|| "unknown".to_string()),
862    }
863}
864
/// Returns the trimmed value of the first environment variable in `keys`
/// that is set, valid Unicode, and non-empty after trimming.
///
/// Variables that are unset, not valid Unicode, or blank are skipped.
/// Returns `None` when no key yields a usable value.
fn env_string(keys: &[&str]) -> Option<String> {
    for key in keys {
        if let Ok(raw) = std::env::var(key) {
            let trimmed = raw.trim();
            if !trimmed.is_empty() {
                return Some(trimmed.to_string());
            }
        }
    }
    None
}
873
/// Runs `git` with `args` and returns its trimmed standard output.
///
/// When `base_dir` is given the command is invoked as `git -C <base_dir> ...`.
/// Returns `None` if the process cannot be run, exits unsuccessfully, prints
/// output that is not valid UTF-8, or prints only whitespace.
fn git_output(base_dir: Option<&Path>, args: &[&str]) -> Option<String> {
    let mut git = std::process::Command::new("git");
    if let Some(dir) = base_dir {
        git.arg("-C").arg(dir);
    }
    git.args(args);
    match git.output() {
        Ok(output) if output.status.success() => {
            let text = String::from_utf8(output.stdout).ok()?;
            let text = text.trim();
            if text.is_empty() {
                None
            } else {
                Some(text.to_string())
            }
        }
        _ => None,
    }
}
888
889fn metadata_string(
890    metadata: &BTreeMap<String, serde_json::Value>,
891    keys: &[&str],
892) -> Option<String> {
893    keys.iter()
894        .find_map(|key| json_value_string(metadata.get(*key)?))
895}
896
897fn json_value_string(value: &serde_json::Value) -> Option<String> {
898    match value {
899        serde_json::Value::String(value) => Some(value.trim().to_string()),
900        serde_json::Value::Number(value) => Some(value.to_string()),
901        serde_json::Value::Bool(value) => Some(value.to_string()),
902        _ => None,
903    }
904    .filter(|value| !value.is_empty())
905}
906
907fn eval_pack_manifest_model(manifest: &EvalPackManifest) -> Option<String> {
908    metadata_string(&manifest.metadata, &["model", "provider_model", "route"])
909        .or_else(|| {
910            manifest
911                .judge
912                .as_ref()
913                .and_then(|judge| judge.model.clone())
914        })
915        .or_else(|| {
916            manifest
917                .defaults
918                .judge
919                .as_ref()
920                .and_then(|judge| judge.model.clone())
921        })
922}
923
924fn prior_commit_report(
925    rows: Vec<EvalLedgerRow>,
926    options: &EvalLedgerOptions,
927) -> EvalLedgerPriorCommitReport {
928    let current_commit = options.commit.as_deref().unwrap_or_default();
929    let mut fingerprint_mismatches = Vec::new();
930    let mut candidates = Vec::new();
931    let mut latest_event_by_commit = BTreeMap::<String, u64>::new();
932    for row in rows {
933        if row.commit == current_commit {
934            continue;
935        }
936        if let Some(mismatch) = fingerprint_mismatch_for_row(&row, options) {
937            fingerprint_mismatches.push(mismatch);
938            continue;
939        }
940        let event_id = row.event_id.unwrap_or_default();
941        latest_event_by_commit
942            .entry(row.commit.clone())
943            .and_modify(|existing| *existing = (*existing).max(event_id))
944            .or_insert(event_id);
945        candidates.push(row);
946    }
947    let selected_commit = latest_event_by_commit
948        .iter()
949        .max_by_key(|(_, event_id)| *event_id)
950        .map(|(commit, _)| commit.clone());
951    let rows = selected_commit
952        .as_ref()
953        .map(|commit| {
954            candidates
955                .into_iter()
956                .filter(|row| &row.commit == commit)
957                .collect()
958        })
959        .unwrap_or_default();
960    EvalLedgerPriorCommitReport {
961        commit: selected_commit,
962        model: options.model.clone().unwrap_or_default(),
963        split: options.split.clone(),
964        rows,
965        fingerprint_mismatches,
966    }
967}
968
969fn fingerprint_mismatch_for_row(
970    row: &EvalLedgerRow,
971    options: &EvalLedgerOptions,
972) -> Option<EvalLedgerFingerprintMismatch> {
973    let expected_case = options.case_fingerprint.as_deref();
974    let expected_harness = options.harness_config_fingerprint.as_deref();
975    let case_mismatch = expected_case.is_some_and(|expected| expected != row.case_fingerprint);
976    let harness_mismatch =
977        expected_harness.is_some_and(|expected| expected != row.harness_config_fingerprint);
978    if !(case_mismatch || harness_mismatch) {
979        return None;
980    }
981    Some(EvalLedgerFingerprintMismatch {
982        case_name: row.case_name.clone(),
983        split: row.split.clone(),
984        commit: row.commit.clone(),
985        trial: row.trial,
986        case_fingerprint: row.case_fingerprint.clone(),
987        harness_config_fingerprint: row.harness_config_fingerprint.clone(),
988        expected_case_fingerprint: expected_case.unwrap_or_default().to_string(),
989        expected_harness_config_fingerprint: expected_harness.unwrap_or_default().to_string(),
990    })
991}
992
993fn build_eval_ledger_resume_plan(
994    manifest: &EvalPackManifest,
995    split_report: &EvalPackSplitValidationReport,
996    rows: &[EvalLedgerRow],
997    suite: &str,
998    model: &str,
999    commit: &str,
1000    harness_config_fingerprint: &str,
1001) -> EvalLedgerResumePlan {
1002    let split_by_case = split_by_case_id(split_report);
1003    let mut cells = Vec::new();
1004    let mut fingerprint_refusals = Vec::new();
1005    let mut skipped_cells = 0usize;
1006    for (index, case) in manifest.cases.iter().enumerate() {
1007        let case_id = eval_pack_case_id(case, index);
1008        let split = split_by_case.get(&case_id).cloned();
1009        let trial_count = case.trials.unwrap_or(manifest.trials);
1010        for trial in 1..=trial_count {
1011            let matching = ledger_rows_for_cell(
1012                rows,
1013                suite,
1014                model,
1015                split.as_deref(),
1016                commit,
1017                &case_id,
1018                trial,
1019            );
1020            let exact = matching
1021                .iter()
1022                .copied()
1023                .filter(|row| {
1024                    row.case_fingerprint == case.case_fingerprint
1025                        && row.harness_config_fingerprint == harness_config_fingerprint
1026                })
1027                .max_by_key(|row| row.event_id.unwrap_or_default());
1028            if let Some(row) = exact {
1029                skipped_cells += 1;
1030                cells.push(EvalLedgerResumeCell {
1031                    case_name: case_id.clone(),
1032                    split: split.clone(),
1033                    trial,
1034                    status: "skip".to_string(),
1035                    reason: "matching ledger row".to_string(),
1036                    event_id: row.event_id,
1037                });
1038                continue;
1039            }
1040            let mut refused = false;
1041            for row in matching {
1042                if row.case_fingerprint != case.case_fingerprint
1043                    || row.harness_config_fingerprint != harness_config_fingerprint
1044                {
1045                    fingerprint_refusals.push(EvalLedgerFingerprintMismatch {
1046                        case_name: case_id.clone(),
1047                        split: split.clone(),
1048                        commit: row.commit.clone(),
1049                        trial,
1050                        case_fingerprint: row.case_fingerprint.clone(),
1051                        harness_config_fingerprint: row.harness_config_fingerprint.clone(),
1052                        expected_case_fingerprint: case.case_fingerprint.clone(),
1053                        expected_harness_config_fingerprint: harness_config_fingerprint.to_string(),
1054                    });
1055                    refused = true;
1056                }
1057            }
1058            cells.push(EvalLedgerResumeCell {
1059                case_name: case_id.clone(),
1060                split: split.clone(),
1061                trial,
1062                status: "run".to_string(),
1063                reason: if refused {
1064                    "fingerprint mismatch".to_string()
1065                } else {
1066                    "missing ledger row".to_string()
1067                },
1068                event_id: None,
1069            });
1070        }
1071    }
1072    let requested_cells = cells.len();
1073    let remaining_cells = requested_cells.saturating_sub(skipped_cells);
1074    EvalLedgerResumePlan {
1075        schema: EVAL_LEDGER_RESUME_PLAN_SCHEMA.to_string(),
1076        suite: suite.to_string(),
1077        model: model.to_string(),
1078        commit: commit.to_string(),
1079        harness_config_fingerprint: harness_config_fingerprint.to_string(),
1080        requested_cells,
1081        completed_cells: skipped_cells,
1082        skipped_cells,
1083        remaining_cells,
1084        all_skipped: requested_cells > 0 && remaining_cells == 0,
1085        fingerprint_refusals,
1086        cells,
1087    }
1088}
1089
1090fn ledger_rows_for_cell<'a>(
1091    rows: &'a [EvalLedgerRow],
1092    suite: &str,
1093    model: &str,
1094    split: Option<&str>,
1095    commit: &str,
1096    case_name: &str,
1097    trial: usize,
1098) -> Vec<&'a EvalLedgerRow> {
1099    rows.iter()
1100        .filter(|row| {
1101            row.suite == suite
1102                && row.model == model
1103                && row.split.as_deref() == split
1104                && row.commit == commit
1105                && row.case_name == case_name
1106                && row.trial == trial
1107        })
1108        .collect()
1109}
1110
1111fn eval_ledger_log_error(error: crate::event_log::LogError) -> VmError {
1112    VmError::Runtime(format!("eval ledger: event log: {error}"))
1113}
1114
1115impl EvalPackLedgerRun {
1116    fn start(
1117        manifest: &EvalPackManifest,
1118        base_dir: Option<&Path>,
1119        options: Option<serde_json::Value>,
1120    ) -> Result<Self, VmError> {
1121        let options = eval_ledger_options(options)?;
1122        let suite = options.suite.clone().unwrap_or_else(|| manifest.id.clone());
1123        let model = options
1124            .model
1125            .clone()
1126            .or_else(|| eval_pack_manifest_model(manifest))
1127            .unwrap_or_else(|| "unknown".to_string());
1128        let provenance = eval_ledger_provenance(base_dir, &options, Some(&manifest.metadata));
1129        let commit = options
1130            .commit
1131            .clone()
1132            .unwrap_or_else(|| provenance.commit.clone());
1133        let namespace = eval_pack_ledger_namespace(manifest, &options);
1134        let topic = eval_ledger_topic(&namespace)?;
1135        let log = ensure_eval_ledger_event_log(base_dir);
1136        let read_options = eval_pack_ledger_read_options(&suite, &model, &commit);
1137        let rows = futures::executor::block_on(read_eval_ledger_rows(&log, &topic, &read_options))?;
1138        Ok(Self {
1139            log,
1140            topic,
1141            rows,
1142            suite,
1143            model,
1144            commit,
1145            branch: provenance.branch.clone(),
1146            provenance,
1147            inserted: 0,
1148            duplicates: 0,
1149            fingerprint_refusals: Vec::new(),
1150        })
1151    }
1152
1153    fn replay_row_for_cell(
1154        &mut self,
1155        case_id: &str,
1156        split: Option<&str>,
1157        trial: usize,
1158        case_fingerprint: &str,
1159        harness_config_fingerprint: &str,
1160    ) -> Option<EvalLedgerRow> {
1161        let matching = ledger_rows_for_cell(
1162            &self.rows,
1163            &self.suite,
1164            &self.model,
1165            split,
1166            &self.commit,
1167            case_id,
1168            trial,
1169        );
1170        let exact = matching
1171            .iter()
1172            .copied()
1173            .filter(|row| {
1174                row.case_fingerprint == case_fingerprint
1175                    && row.harness_config_fingerprint == harness_config_fingerprint
1176            })
1177            .max_by_key(|row| row.event_id.unwrap_or_default())
1178            .cloned();
1179        if exact.is_some() {
1180            return exact;
1181        }
1182        for row in matching {
1183            if row.case_fingerprint != case_fingerprint
1184                || row.harness_config_fingerprint != harness_config_fingerprint
1185            {
1186                self.fingerprint_refusals
1187                    .push(EvalLedgerFingerprintMismatch {
1188                        case_name: case_id.to_string(),
1189                        split: split.map(str::to_string),
1190                        commit: row.commit.clone(),
1191                        trial,
1192                        case_fingerprint: row.case_fingerprint.clone(),
1193                        harness_config_fingerprint: row.harness_config_fingerprint.clone(),
1194                        expected_case_fingerprint: case_fingerprint.to_string(),
1195                        expected_harness_config_fingerprint: harness_config_fingerprint.to_string(),
1196                    });
1197            }
1198        }
1199        None
1200    }
1201
1202    fn append_trial_row(&mut self, row: EvalLedgerRow) -> Result<(), VmError> {
1203        let report = futures::executor::block_on(append_eval_ledger_rows(
1204            &self.log,
1205            &self.topic,
1206            vec![row],
1207        ))?;
1208        self.inserted += report.inserted;
1209        self.duplicates += report.duplicates;
1210        self.rows.extend(report.rows);
1211        Ok(())
1212    }
1213
1214    fn finish(
1215        &self,
1216        requested_cells: usize,
1217        skipped_cells: usize,
1218        executed_cells: usize,
1219    ) -> Result<EvalPackRunState, VmError> {
1220        let remaining_cells = requested_cells.saturating_sub(skipped_cells + executed_cells);
1221        let mut state = EvalPackRunState {
1222            schema: EVAL_LEDGER_RUN_STATE_SCHEMA.to_string(),
1223            suite: self.suite.clone(),
1224            model: self.model.clone(),
1225            commit: self.commit.clone(),
1226            branch: self.branch.clone(),
1227            requested_cells,
1228            completed_cells: skipped_cells + executed_cells,
1229            skipped_cells,
1230            executed_cells,
1231            remaining_cells,
1232            ledger_rows_inserted: self.inserted,
1233            ledger_rows_duplicate: self.duplicates,
1234            fingerprint_refusals: self.fingerprint_refusals.len(),
1235            all_skipped: requested_cells > 0 && skipped_cells == requested_cells,
1236            heartbeat_event_id: None,
1237        };
1238        let event_id = self.append_run_state(&state)?;
1239        state.heartbeat_event_id = Some(event_id);
1240        Ok(state)
1241    }
1242
1243    fn append_run_state(&self, state: &EvalPackRunState) -> Result<u64, VmError> {
1244        let payload = serde_json::to_value(state)
1245            .map_err(|e| VmError::Runtime(format!("eval run-state encode error: {e}")))?;
1246        let event_id = futures::executor::block_on(self.log.append(
1247            &self.topic,
1248            crate::event_log::LogEvent::new(EVAL_LEDGER_RUN_STATE_KIND, payload),
1249        ))
1250        .map_err(eval_ledger_log_error)?;
1251        futures::executor::block_on(self.log.flush()).map_err(eval_ledger_log_error)?;
1252        Ok(event_id)
1253    }
1254}
1255
1256pub fn validate_eval_pack_split(
1257    manifest: &EvalPackManifest,
1258) -> Result<EvalPackSplitValidationReport, VmError> {
1259    let report = eval_pack_split_validation_report(manifest);
1260    if !report.valid {
1261        return Err(VmError::Runtime(format!(
1262            "eval pack split invalid: {}",
1263            render_split_validation_errors(&report).join("; ")
1264        )));
1265    }
1266    Ok(report)
1267}
1268
1269fn eval_pack_split_validation_report(manifest: &EvalPackManifest) -> EvalPackSplitValidationReport {
1270    let case_ids = eval_pack_case_ids(manifest);
1271    let mut duplicate_case_ids = duplicates(&case_ids);
1272    duplicate_case_ids.sort();
1273
1274    let case_set = case_ids.iter().cloned().collect::<BTreeSet<_>>();
1275    let Some(split) = &manifest.split else {
1276        return EvalPackSplitValidationReport {
1277            valid: duplicate_case_ids.is_empty(),
1278            case_count: case_ids.len(),
1279            covered_count: 0,
1280            duplicate_case_ids,
1281            ..EvalPackSplitValidationReport::default()
1282        };
1283    };
1284
1285    let mut duplicate_partition_cases = Vec::new();
1286    let mut unknown_cases = Vec::new();
1287    let mut seen_by_case: BTreeMap<String, Vec<String>> = BTreeMap::new();
1288    for (partition, cases) in &split.partitions {
1289        let mut local_seen = BTreeSet::new();
1290        for case_id in cases {
1291            if !local_seen.insert(case_id.clone()) {
1292                duplicate_partition_cases.push(format!("{partition}:{case_id}"));
1293            }
1294            if !case_set.contains(case_id) {
1295                unknown_cases.push(format!("{partition}:{case_id}"));
1296            }
1297            let partitions = seen_by_case.entry(case_id.clone()).or_default();
1298            if !partitions.contains(partition) {
1299                partitions.push(partition.clone());
1300            }
1301        }
1302    }
1303
1304    let mut overlap_cases = seen_by_case
1305        .iter()
1306        .filter(|(case_id, partitions)| case_set.contains(*case_id) && partitions.len() > 1)
1307        .map(|(case_id, partitions)| format!("{case_id}:{}", partitions.join(",")))
1308        .collect::<Vec<_>>();
1309    let mut missing_cases = case_set
1310        .iter()
1311        .filter(|case_id| !seen_by_case.contains_key(*case_id))
1312        .cloned()
1313        .collect::<Vec<_>>();
1314    duplicate_partition_cases.sort();
1315    unknown_cases.sort();
1316    overlap_cases.sort();
1317    missing_cases.sort();
1318
1319    let covered_count = case_set
1320        .iter()
1321        .filter(|case_id| seen_by_case.contains_key(*case_id))
1322        .count();
1323    let valid = duplicate_case_ids.is_empty()
1324        && duplicate_partition_cases.is_empty()
1325        && unknown_cases.is_empty()
1326        && overlap_cases.is_empty()
1327        && missing_cases.is_empty();
1328    EvalPackSplitValidationReport {
1329        valid,
1330        partitions: split.partitions.clone(),
1331        case_count: case_ids.len(),
1332        covered_count,
1333        duplicate_case_ids,
1334        duplicate_partition_cases,
1335        overlap_cases,
1336        unknown_cases,
1337        missing_cases,
1338    }
1339}
1340
1341fn eval_pack_case_ids(manifest: &EvalPackManifest) -> Vec<String> {
1342    manifest
1343        .cases
1344        .iter()
1345        .enumerate()
1346        .map(|(index, case)| eval_pack_case_id(case, index))
1347        .collect()
1348}
1349
1350fn eval_pack_case_id(case: &EvalPackCase, index: usize) -> String {
1351    case.id
1352        .clone()
1353        .filter(|id| !id.trim().is_empty())
1354        .unwrap_or_else(|| format!("case_{}", index + 1))
1355}
1356
/// Returns the values that occur more than once in `values`.
///
/// Each duplicated value appears once in the result, in ascending order.
fn duplicates(values: &[String]) -> Vec<String> {
    let mut first_seen = BTreeSet::new();
    let repeated: BTreeSet<String> = values
        .iter()
        .filter(|value| !first_seen.insert(value.as_str()))
        .cloned()
        .collect();
    repeated.into_iter().collect()
}
1367
1368fn render_split_validation_errors(report: &EvalPackSplitValidationReport) -> Vec<String> {
1369    let mut errors = Vec::new();
1370    if !report.duplicate_case_ids.is_empty() {
1371        errors.push(format!(
1372            "duplicate case ids: {}",
1373            report.duplicate_case_ids.join(", ")
1374        ));
1375    }
1376    if !report.duplicate_partition_cases.is_empty() {
1377        errors.push(format!(
1378            "duplicate partition entries: {}",
1379            report.duplicate_partition_cases.join(", ")
1380        ));
1381    }
1382    if !report.overlap_cases.is_empty() {
1383        errors.push(format!(
1384            "overlapping cases: {}",
1385            report.overlap_cases.join(", ")
1386        ));
1387    }
1388    if !report.unknown_cases.is_empty() {
1389        errors.push(format!(
1390            "unknown cases: {}",
1391            report.unknown_cases.join(", ")
1392        ));
1393    }
1394    if !report.missing_cases.is_empty() {
1395        errors.push(format!(
1396            "missing cases: {}",
1397            report.missing_cases.join(", ")
1398        ));
1399    }
1400    if errors.is_empty() {
1401        errors.push("unknown split validation error".to_string());
1402    }
1403    errors
1404}
1405
1406fn load_replay_fixture(path: &Path) -> Result<ReplayFixture, VmError> {
1407    let content = std::fs::read_to_string(path)
1408        .map_err(|e| VmError::Runtime(format!("failed to read replay fixture: {e}")))?;
1409    serde_json::from_str(&content)
1410        .map_err(|e| VmError::Runtime(format!("failed to parse replay fixture: {e}")))
1411}
1412
1413fn load_run_record_from_fixture_ref(
1414    fixture: &EvalPackFixtureRef,
1415    base_dir: Option<&Path>,
1416) -> Result<RunRecord, VmError> {
1417    if let Some(inline) = &fixture.inline {
1418        let run: RunRecord = serde_json::from_value(inline.clone())
1419            .map_err(|e| VmError::Runtime(format!("failed to parse inline run record: {e}")))?;
1420        return Ok(run);
1421    }
1422    let path = fixture.path.as_deref().ok_or_else(|| {
1423        VmError::Runtime(format!(
1424            "fixture '{}' is missing path or inline run",
1425            fixture.id
1426        ))
1427    })?;
1428    load_run_record(&resolve_manifest_path(base_dir, path))
1429}
1430
1431fn load_replay_fixture_from_ref(
1432    fixture: &EvalPackFixtureRef,
1433    base_dir: Option<&Path>,
1434) -> Result<ReplayFixture, VmError> {
1435    if let Some(inline) = &fixture.inline {
1436        return serde_json::from_value(inline.clone())
1437            .map_err(|e| VmError::Runtime(format!("failed to parse inline replay fixture: {e}")));
1438    }
1439    let path = fixture.path.as_deref().ok_or_else(|| {
1440        VmError::Runtime(format!(
1441            "fixture '{}' is missing path or inline replay fixture",
1442            fixture.id
1443        ))
1444    })?;
1445    load_replay_fixture(&resolve_manifest_path(base_dir, path))
1446}
1447
/// Resolves a manifest-relative `path` against `base_dir`.
///
/// Absolute paths are returned unchanged; relative paths are joined onto
/// `base_dir` when one is given and returned as-is otherwise.
fn resolve_manifest_path(base_dir: Option<&Path>, path: &str) -> PathBuf {
    match base_dir {
        // `Path::join` already yields the argument itself when it is absolute.
        Some(root) => root.join(path),
        None => PathBuf::from(path),
    }
}
1458
1459fn run_eval_pack_command(
1460    spec: &EvalPackCommandSpec,
1461    default_cwd: &Path,
1462    stdin_payload: Option<&serde_json::Value>,
1463    default_timeout_seconds: f64,
1464) -> Result<EvalPackCommandOutput, VmError> {
1465    let timeout_seconds = command_timeout(spec).unwrap_or(default_timeout_seconds);
1466    let timeout = timeout_seconds
1467        .is_finite()
1468        .then_some(timeout_seconds)
1469        .filter(|seconds| *seconds > 0.0)
1470        .ok_or_else(|| {
1471            VmError::Runtime(
1472                "eval pack command timeout must be a positive finite number of seconds".to_string(),
1473            )
1474        })?;
1475    let mut command = eval_pack_command(spec)?;
1476    command
1477        .current_dir(command_cwd(spec, default_cwd))
1478        .stdin(if stdin_payload.is_some() {
1479            Stdio::piped()
1480        } else {
1481            Stdio::null()
1482        })
1483        .stdout(Stdio::piped())
1484        .stderr(Stdio::piped());
1485    apply_command_env(spec, &mut command);
1486
1487    let started = crate::clock_mock::leak_audit::instant_now("eval_pack.command.started");
1488    let mut child = command
1489        .spawn()
1490        .map_err(|e| VmError::Runtime(format!("eval pack command spawn failed: {e}")))?;
1491    let stdout_reader = child.stdout.take().map(|mut pipe| {
1492        std::thread::spawn(move || {
1493            let mut bytes = Vec::new();
1494            pipe.read_to_end(&mut bytes).map(|_| bytes)
1495        })
1496    });
1497    let stderr_reader = child.stderr.take().map(|mut pipe| {
1498        std::thread::spawn(move || {
1499            let mut bytes = Vec::new();
1500            pipe.read_to_end(&mut bytes).map(|_| bytes)
1501        })
1502    });
1503
1504    let mut stdin_error = None;
1505    if let Some(payload) = stdin_payload {
1506        match child.stdin.take() {
1507            Some(mut stdin) => {
1508                if let Err(error) = serde_json::to_writer(&mut stdin, payload) {
1509                    stdin_error = Some(format!("eval pack command stdin encode failed: {error}"));
1510                } else if let Err(error) = stdin.write_all(b"\n") {
1511                    stdin_error = Some(format!("eval pack command stdin write failed: {error}"));
1512                }
1513            }
1514            None => {
1515                stdin_error = Some("eval pack command stdin pipe was unavailable".to_string());
1516            }
1517        }
1518    }
1519
1520    let timeout = Duration::from_secs_f64(timeout);
1521    let status = if stdin_error.is_some() {
1522        let _ = child.kill();
1523        let _ = child.wait();
1524        None
1525    } else {
1526        match child
1527            .wait_timeout(timeout)
1528            .map_err(|e| VmError::Runtime(format!("eval pack command wait failed: {e}")))?
1529        {
1530            Some(status) => Some(status),
1531            None => {
1532                let _ = child.kill();
1533                let _ = child.wait();
1534                None
1535            }
1536        }
1537    };
1538    if let Some(error) = stdin_error {
1539        let _ = join_command_reader(stdout_reader, "stdout")?;
1540        let _ = join_command_reader(stderr_reader, "stderr")?;
1541        return Err(VmError::Runtime(error));
1542    }
1543    let wall_time_seconds = started.elapsed().as_secs_f64();
1544    let stdout = join_command_reader(stdout_reader, "stdout")?;
1545    let stderr = join_command_reader(stderr_reader, "stderr")?;
1546    let timed_out = status.is_none();
1547    let exit_code = status.and_then(|status| status.code()).unwrap_or(-1) as i64;
1548    Ok(EvalPackCommandOutput {
1549        exit_code,
1550        stdout,
1551        stderr,
1552        timed_out,
1553        wall_time_seconds,
1554    })
1555}
1556
1557fn eval_pack_command(spec: &EvalPackCommandSpec) -> Result<Command, VmError> {
1558    match spec {
1559        EvalPackCommandSpec::Shell(command) => shell_command(command),
1560        EvalPackCommandSpec::Argv(argv) => argv_command(argv),
1561        EvalPackCommandSpec::Object(object) => {
1562            if let Some(command) = object.command.as_deref() {
1563                shell_command(command)
1564            } else {
1565                argv_command(&object.argv)
1566            }
1567        }
1568    }
1569}
1570
1571fn shell_command(command: &str) -> Result<Command, VmError> {
1572    let command = command.trim();
1573    if command.is_empty() {
1574        return Err(VmError::Runtime(
1575            "eval pack shell command must not be empty".to_string(),
1576        ));
1577    }
1578    #[cfg(windows)]
1579    {
1580        let mut cmd = Command::new("cmd");
1581        cmd.args(["/C", command]);
1582        Ok(cmd)
1583    }
1584    #[cfg(not(windows))]
1585    {
1586        let mut cmd = Command::new("/bin/sh");
1587        cmd.args(["-c", command]);
1588        Ok(cmd)
1589    }
1590}
1591
1592fn argv_command(argv: &[String]) -> Result<Command, VmError> {
1593    let Some((program, args)) = argv.split_first() else {
1594        return Err(VmError::Runtime(
1595            "eval pack argv command must include a program".to_string(),
1596        ));
1597    };
1598    if program.trim().is_empty() {
1599        return Err(VmError::Runtime(
1600            "eval pack argv command program must not be empty".to_string(),
1601        ));
1602    }
1603    let mut command = Command::new(program);
1604    command.args(args);
1605    Ok(command)
1606}
1607
1608fn command_cwd(spec: &EvalPackCommandSpec, default_cwd: &Path) -> PathBuf {
1609    let cwd = match spec {
1610        EvalPackCommandSpec::Object(EvalPackCommandObject { cwd: Some(cwd), .. }) => cwd.as_str(),
1611        _ => return default_cwd.to_path_buf(),
1612    };
1613    let path = PathBuf::from(cwd);
1614    if path.is_absolute() {
1615        path
1616    } else {
1617        default_cwd.join(path)
1618    }
1619}
1620
1621fn apply_command_env(spec: &EvalPackCommandSpec, command: &mut Command) {
1622    if let EvalPackCommandSpec::Object(object) = spec {
1623        command.envs(&object.env);
1624    }
1625}
1626
1627fn command_timeout(spec: &EvalPackCommandSpec) -> Option<f64> {
1628    match spec {
1629        EvalPackCommandSpec::Object(object) => object.timeout_seconds,
1630        _ => None,
1631    }
1632}
1633
1634fn join_command_reader(
1635    reader: Option<std::thread::JoinHandle<std::io::Result<Vec<u8>>>>,
1636    stream: &str,
1637) -> Result<String, VmError> {
1638    let Some(reader) = reader else {
1639        return Ok(String::new());
1640    };
1641    let bytes = reader
1642        .join()
1643        .map_err(|_| VmError::Runtime(format!("eval pack command {stream} reader panicked")))?
1644        .map_err(|e| VmError::Runtime(format!("eval pack command {stream} read failed: {e}")))?;
1645    Ok(String::from_utf8_lossy(&bytes).to_string())
1646}
1647
1648fn eval_pack_live_workspace(
1649    case: &EvalPackCase,
1650    base_dir: Option<&Path>,
1651) -> Result<PathBuf, VmError> {
1652    let workspace = case
1653        .workspace
1654        .as_deref()
1655        .or(case.project.as_deref())
1656        .ok_or_else(|| {
1657            VmError::Runtime("eval pack live-verify case is missing workspace".to_string())
1658        })?;
1659    let workspace = resolve_manifest_path(base_dir, workspace);
1660    if !workspace.is_dir() {
1661        return Err(VmError::Runtime(format!(
1662            "eval pack live-verify workspace does not exist: {}",
1663            workspace.display()
1664        )));
1665    }
1666    Ok(workspace)
1667}
1668
1669fn eval_pack_live_executor_request(
1670    manifest: &EvalPackManifest,
1671    case: &EvalPackCase,
1672    case_id: &str,
1673    trial: usize,
1674    trial_count: usize,
1675    workspace: &Path,
1676    base_dir: Option<&Path>,
1677) -> Result<serde_json::Value, VmError> {
1678    Ok(serde_json::json!({
1679        "schema": LIVE_EXECUTOR_REQUEST_SCHEMA,
1680        "manifest": {
1681            "id": &manifest.id,
1682            "base_dir": base_dir.map(|path| path.display().to_string()),
1683            "metadata": &manifest.metadata,
1684        },
1685        "case": {
1686            "id": case_id,
1687            "name": &case.name,
1688            "task": &case.task,
1689            "workspace": workspace.display().to_string(),
1690            "project": &case.project,
1691            "verify_command": command_spec_json(case.verify_command.as_ref())?,
1692            "expected_output_paths": &case.expected_output_paths,
1693            "required_output_snippets": &case.required_output_snippets,
1694            "tool_budgets": &case.tool_budgets,
1695            "metadata": &case.metadata,
1696            "case_fingerprint": &case.case_fingerprint,
1697        },
1698        "trial": trial,
1699        "trials": trial_count,
1700    }))
1701}
1702
1703fn command_spec_json(spec: Option<&EvalPackCommandSpec>) -> Result<serde_json::Value, VmError> {
1704    match spec {
1705        Some(spec) => serde_json::to_value(spec)
1706            .map_err(|e| VmError::Runtime(format!("eval pack command encode failed: {e}"))),
1707        None => Ok(serde_json::Value::Null),
1708    }
1709}
1710
1711fn live_outcome_from_executor_output(
1712    output: EvalPackCommandOutput,
1713    failures: &mut Vec<String>,
1714) -> EvalPackLiveVerifyOutcome {
1715    let mut outcome = parse_live_outcome_stdout(&output.stdout).unwrap_or_else(|error| {
1716        if !output.stdout.trim().is_empty() {
1717            failures.push(error);
1718        }
1719        EvalPackLiveVerifyOutcome::default()
1720    });
1721    if output.timed_out {
1722        outcome.timed_out = true;
1723    }
1724    if output.exit_code != 0 {
1725        failures.push(format!(
1726            "live executor exited {}{}",
1727            output.exit_code,
1728            command_failure_excerpt(&output)
1729        ));
1730    }
1731    if outcome.wall_time_seconds == 0.0 {
1732        outcome.wall_time_seconds = output.wall_time_seconds;
1733    }
1734    outcome
1735}
1736
1737fn parse_live_outcome_stdout(stdout: &str) -> Result<EvalPackLiveVerifyOutcome, String> {
1738    let trimmed = stdout.trim();
1739    if trimmed.is_empty() {
1740        return Ok(EvalPackLiveVerifyOutcome::default());
1741    }
1742    serde_json::from_str(trimmed)
1743        .or_else(|_| {
1744            trimmed
1745                .lines()
1746                .rev()
1747                .find(|line| !line.trim().is_empty())
1748                .ok_or_else(|| serde_json::Error::io(std::io::ErrorKind::UnexpectedEof.into()))
1749                .and_then(|line| serde_json::from_str(line.trim()))
1750        })
1751        .map_err(|error| format!("live executor stdout did not contain a JSON outcome: {error}"))
1752}
1753
1754fn live_outcome_verification(outcome: &EvalPackLiveVerifyOutcome) -> String {
1755    if let Some(verification) = outcome.verification.as_deref() {
1756        return normalize_live_verification(verification);
1757    }
1758    if outcome.timed_out {
1759        return "FAIL".to_string();
1760    }
1761    if let Some(exit_code) = outcome.verification_exit_code {
1762        return if exit_code == 0 { "PASS" } else { "FAIL" }.to_string();
1763    }
1764    if let Some(passed) = outcome.passed {
1765        return if passed { "PASS" } else { "FAIL" }.to_string();
1766    }
1767    "PASS".to_string()
1768}
1769
/// Maps a free-form verification string onto the canonical labels.
///
/// Matching ignores surrounding whitespace and ASCII case: `pass`, `passed`,
/// `success` and `ok` become `"PASS"`; `skip` and `skipped` become `"skip"`;
/// anything else (including an empty string) becomes `"FAIL"`.
fn normalize_live_verification(verification: &str) -> String {
    const PASS_ALIASES: [&str; 4] = ["pass", "passed", "success", "ok"];
    const SKIP_ALIASES: [&str; 2] = ["skip", "skipped"];

    let token = verification.trim();
    let is_one_of = |aliases: &[&str]| aliases.iter().any(|alias| token.eq_ignore_ascii_case(alias));

    let label = if is_one_of(&PASS_ALIASES) {
        "PASS"
    } else if is_one_of(&SKIP_ALIASES) {
        "skip"
    } else {
        "FAIL"
    };
    label.to_string()
}
1777
1778fn command_failure_excerpt(output: &EvalPackCommandOutput) -> String {
1779    let stderr = compact_output_excerpt(&output.stderr);
1780    if !stderr.is_empty() {
1781        return format!("; stderr: {stderr}");
1782    }
1783    let stdout = compact_output_excerpt(&output.stdout);
1784    if stdout.is_empty() {
1785        String::new()
1786    } else {
1787        format!("; stdout: {stdout}")
1788    }
1789}
1790
/// Collapses every run of whitespace in `output` to a single space and caps
/// the result at 240 characters, appending `...` when text was cut off.
///
/// The cap counts `char`s rather than bytes, so multi-byte text is never split
/// inside a code point.
fn compact_output_excerpt(output: &str) -> String {
    const MAX_CHARS: usize = 240;
    let collapsed = output.split_whitespace().collect::<Vec<_>>().join(" ");
    // A character at index MAX_CHARS exists only when the text is too long;
    // its byte offset is exactly where the excerpt must end.
    match collapsed.char_indices().nth(MAX_CHARS) {
        Some((cut, _)) => format!("{}...", &collapsed[..cut]),
        None => collapsed,
    }
}
1800
1801fn normalized_live_produced_paths(
1802    case: &EvalPackCase,
1803    outcome: &EvalPackLiveVerifyOutcome,
1804) -> Vec<String> {
1805    let mut seen = BTreeSet::new();
1806    let mut paths = Vec::new();
1807    for path in outcome
1808        .produced_paths
1809        .iter()
1810        .chain(case.expected_output_paths.iter())
1811    {
1812        if !path.trim().is_empty() && seen.insert(path.clone()) {
1813            paths.push(path.clone());
1814        }
1815    }
1816    paths
1817}
1818
1819fn eval_pack_live_expected_path_failures(workspace: &Path, paths: &[String]) -> Vec<String> {
1820    paths
1821        .iter()
1822        .filter_map(|path| {
1823            let resolved = resolve_manifest_path(Some(workspace), path);
1824            (!resolved.exists()).then(|| {
1825                format!(
1826                    "expected output path does not exist: {}",
1827                    resolved.display()
1828                )
1829            })
1830        })
1831        .collect()
1832}
1833
1834fn eval_pack_live_required_snippet_failures(
1835    workspace: &Path,
1836    paths: &[String],
1837    snippets: &[String],
1838) -> Vec<String> {
1839    let readable_outputs = paths
1840        .iter()
1841        .map(|path| resolve_manifest_path(Some(workspace), path))
1842        .filter(|path| path.is_file())
1843        .collect::<Vec<_>>();
1844    snippets
1845        .iter()
1846        .filter(|snippet| !snippet.is_empty())
1847        .filter_map(|snippet| {
1848            let found = readable_outputs.iter().any(|path| {
1849                std::fs::read_to_string(path)
1850                    .map(|content| content.contains(snippet))
1851                    .unwrap_or(false)
1852            });
1853            (!found).then(|| format!("required output snippet not found: {snippet:?}"))
1854        })
1855        .collect()
1856}
1857
1858fn eval_pack_live_tool_budget_failures(
1859    budgets: &BTreeMap<String, usize>,
1860    summary: &serde_json::Value,
1861) -> Vec<String> {
1862    budgets
1863        .iter()
1864        .filter_map(|(name, limit)| {
1865            let count = live_tool_summary_count(summary, name)?;
1866            (count > *limit)
1867                .then(|| format!("tool budget {name} exceeded: {count} calls > {limit}"))
1868        })
1869        .collect()
1870}
1871
1872fn live_tool_summary_count(summary: &serde_json::Value, name: &str) -> Option<usize> {
1873    let normalized = name.trim();
1874    if normalized.is_empty() {
1875        return None;
1876    }
1877    if normalized == "total" {
1878        return json_usize_from_keys(summary, &["total", "calls", "tool_calls", "toolCalls"]);
1879    }
1880    json_usize_from_keys(summary, &[normalized])
1881        .or_else(|| {
1882            summary
1883                .get("by_tool")
1884                .or_else(|| summary.get("byTool"))
1885                .and_then(|value| json_usize_from_keys(value, &[normalized]))
1886        })
1887        .or_else(|| {
1888            summary
1889                .get("tools")
1890                .and_then(|value| json_usize_from_keys(value, &[normalized]))
1891        })
1892        .or_else(|| {
1893            // Fall back to counting occurrences in the per-call `sequence`
1894            // array. The in-process coding-agent executor only emits
1895            // `{total, rejected, sequence, successful}` (no `by_tool` map), so
1896            // without this a named per-tool budget like `{edit: 1}` would
1897            // silently never be enforced for live coding-agent evals.
1898            summary
1899                .get("sequence")
1900                .and_then(serde_json::Value::as_array)
1901                .map(|calls| {
1902                    calls
1903                        .iter()
1904                        .filter(|call| call.as_str() == Some(normalized))
1905                        .count()
1906                })
1907        })
1908}
1909
1910fn json_usize_from_keys(value: &serde_json::Value, keys: &[&str]) -> Option<usize> {
1911    keys.iter()
1912        .find_map(|key| value.get(*key))
1913        .and_then(json_value_usize)
1914}
1915
1916fn json_value_usize(value: &serde_json::Value) -> Option<usize> {
1917    value
1918        .as_u64()
1919        .and_then(|value| usize::try_from(value).ok())
1920        .or_else(|| value.as_i64().and_then(|value| usize::try_from(value).ok()))
1921}
1922
1923pub fn evaluate_run_suite_manifest(
1924    manifest: &EvalSuiteManifest,
1925) -> Result<ReplayEvalSuiteReport, VmError> {
1926    let base_dir = manifest.base_dir.as_deref().map(Path::new);
1927    let mut reports = Vec::new();
1928    for case in &manifest.cases {
1929        let run_path = resolve_manifest_path(base_dir, &case.run_path);
1930        let run = load_run_record(&run_path)?;
1931        let fixture = match &case.fixture_path {
1932            Some(path) => load_replay_fixture(&resolve_manifest_path(base_dir, path))?,
1933            None => run
1934                .replay_fixture
1935                .clone()
1936                .unwrap_or_else(|| replay_fixture_from_run(&run)),
1937        };
1938        let eval = evaluate_run_against_fixture(&run, &fixture);
1939        let mut pass = eval.pass;
1940        let mut failures = eval.failures;
1941        let comparison = match &case.compare_to {
1942            Some(path) => {
1943                let baseline_path = resolve_manifest_path(base_dir, path);
1944                let baseline = load_run_record(&baseline_path)?;
1945                let diff = diff_run_records(&baseline, &run);
1946                if !diff.identical {
1947                    pass = false;
1948                    failures.push(format!(
1949                        "run differs from baseline {} with {} stage changes",
1950                        baseline_path.display(),
1951                        diff.stage_diffs.len()
1952                    ));
1953                }
1954                Some(diff)
1955            }
1956            None => None,
1957        };
1958        reports.push(ReplayEvalCaseReport {
1959            run_id: run.id.clone(),
1960            workflow_id: run.workflow_id.clone(),
1961            label: case.label.clone(),
1962            pass,
1963            failures,
1964            stage_count: eval.stage_count,
1965            source_path: Some(run_path.display().to_string()),
1966            comparison,
1967        });
1968    }
1969    let total = reports.len();
1970    let passed = reports.iter().filter(|report| report.pass).count();
1971    let failed = total.saturating_sub(passed);
1972    Ok(ReplayEvalSuiteReport {
1973        pass: failed == 0,
1974        total,
1975        passed,
1976        failed,
1977        cases: reports,
1978    })
1979}
1980
1981pub fn evaluate_eval_pack_manifest(manifest: &EvalPackManifest) -> Result<EvalPackReport, VmError> {
1982    let mut live_executor = EvalPackShellLiveExecutor;
1983    evaluate_eval_pack_manifest_inner(manifest, false, None, &mut live_executor)
1984}
1985
1986pub fn evaluate_eval_pack_manifest_resumable(
1987    manifest: &EvalPackManifest,
1988    ledger_options: Option<serde_json::Value>,
1989) -> Result<EvalPackReport, VmError> {
1990    let mut live_executor = EvalPackShellLiveExecutor;
1991    evaluate_eval_pack_manifest_inner(manifest, true, ledger_options, &mut live_executor)
1992}
1993
1994pub fn evaluate_eval_pack_manifest_with_live_executor(
1995    manifest: &EvalPackManifest,
1996    live_executor: &mut dyn EvalPackLiveExecutor,
1997) -> Result<EvalPackReport, VmError> {
1998    evaluate_eval_pack_manifest_inner(manifest, false, None, live_executor)
1999}
2000
2001pub fn evaluate_eval_pack_manifest_resumable_with_live_executor(
2002    manifest: &EvalPackManifest,
2003    ledger_options: Option<serde_json::Value>,
2004    live_executor: &mut dyn EvalPackLiveExecutor,
2005) -> Result<EvalPackReport, VmError> {
2006    evaluate_eval_pack_manifest_inner(manifest, true, ledger_options, live_executor)
2007}
2008
/// Shared implementation behind the public `evaluate_eval_pack_manifest*`
/// entry points.
///
/// Runs every trial of every case in `manifest`, then every persona ladder,
/// and aggregates the results into an [`EvalPackReport`].
///
/// * `ledger_enabled` - when `true`, a ledger run is started with
///   `ledger_options`; cells for which the ledger returns a row are replayed
///   from that row instead of being executed, and every executed trial is
///   appended to the ledger.
/// * `ledger_options` - forwarded to `EvalPackLedgerRun::start`; unused when
///   the ledger is disabled.
/// * `live_executor` - executor handed to live-verify trials.
///
/// # Errors
/// Propagates errors from split validation, fingerprinting, ledger start /
/// append / finish, any trial evaluation, and any ladder run.
fn evaluate_eval_pack_manifest_inner(
    manifest: &EvalPackManifest,
    ledger_enabled: bool,
    ledger_options: Option<serde_json::Value>,
    live_executor: &mut dyn EvalPackLiveExecutor,
) -> Result<EvalPackReport, VmError> {
    let base_dir = manifest.base_dir.as_deref().map(Path::new);
    // Fixtures resolve against `defaults.fixture_root` when it is set,
    // otherwise against the manifest base directory.
    let fixture_base_dir_buf = manifest
        .defaults
        .fixture_root
        .as_deref()
        .map(|root| resolve_manifest_path(base_dir, root));
    let fixture_base_dir = fixture_base_dir_buf.as_deref().or(base_dir);
    // Index fixtures and rubrics by id; entries with an empty id are not
    // addressable and are left out of the lookup tables.
    let fixtures_by_id: BTreeMap<&str, &EvalPackFixtureRef> = manifest
        .fixtures
        .iter()
        .filter(|fixture| !fixture.id.is_empty())
        .map(|fixture| (fixture.id.as_str(), fixture))
        .collect();
    let rubrics_by_id: BTreeMap<&str, &EvalPackRubric> = manifest
        .rubrics
        .iter()
        .filter(|rubric| !rubric.id.is_empty())
        .map(|rubric| (rubric.id.as_str(), rubric))
        .collect();

    let split_report = validate_eval_pack_split(manifest)?;
    let split_by_case = split_by_case_id(&split_report);
    let harness_config_fingerprint = eval_pack_harness_config_fingerprint(manifest)?;
    let mut ledger = if ledger_enabled {
        Some(EvalPackLedgerRun::start(
            manifest,
            base_dir,
            ledger_options,
        )?)
    } else {
        None
    };
    // A "cell" is one (case, trial) pair. Requested = skipped + executed.
    let mut requested_cells = 0usize;
    let mut skipped_cells = 0usize;
    let mut executed_cells = 0usize;
    let mut reports = Vec::new();
    for (index, case) in manifest.cases.iter().enumerate() {
        let case_id = eval_pack_case_id(case, index);
        // Display label: case name, then declared id, then the computed id.
        let label = case
            .name
            .clone()
            .or_else(|| case.id.clone())
            .unwrap_or_else(|| case_id.clone());
        let severity = eval_pack_case_severity(manifest, case);
        let blocking = severity == "blocking";
        // A per-case trial count overrides the manifest-wide one.
        let trial_count = case.trials.unwrap_or(manifest.trials);
        let split = split_by_case.get(&case_id).cloned();
        requested_cells += trial_count;
        let mut trials = Vec::with_capacity(trial_count);
        // Trials are numbered from 1.
        for trial in 1..=trial_count {
            if let Some(ledger) = ledger.as_mut() {
                // When the ledger yields a row for this cell, reuse it: the
                // cell counts as skipped and nothing is executed.
                if let Some(row) = ledger.replay_row_for_cell(
                    &case_id,
                    split.as_deref(),
                    trial,
                    &case.case_fingerprint,
                    &harness_config_fingerprint,
                ) {
                    skipped_cells += 1;
                    trials.push(eval_pack_trial_report_from_ledger_row(&row, blocking));
                    continue;
                }
            }
            // Dispatch on the kind of case to the matching trial evaluator.
            let report = match eval_pack_case_kind(case) {
                EvalPackCaseKind::LiveVerify => evaluate_eval_pack_live_verify_trial(
                    manifest,
                    case,
                    &case_id,
                    trial,
                    trial_count,
                    &severity,
                    blocking,
                    base_dir,
                    live_executor,
                )?,
                EvalPackCaseKind::Friction => evaluate_eval_pack_friction_trial(
                    manifest,
                    case,
                    trial,
                    &severity,
                    blocking,
                    base_dir,
                    fixture_base_dir,
                    &fixtures_by_id,
                    &rubrics_by_id,
                )?,
                EvalPackCaseKind::Replay => evaluate_eval_pack_run_trial(
                    manifest,
                    case,
                    trial,
                    &severity,
                    blocking,
                    base_dir,
                    fixture_base_dir,
                    &fixtures_by_id,
                    &rubrics_by_id,
                )?,
            };
            // Record the freshly executed trial in the ledger before counting
            // it, so an append failure aborts without inflating the counters.
            if let Some(ledger) = ledger.as_mut() {
                let row = eval_ledger_row_from_trial(
                    case,
                    &case_id,
                    split.clone(),
                    &ledger.suite,
                    &ledger.model,
                    &ledger.commit,
                    &ledger.provenance,
                    &harness_config_fingerprint,
                    &report,
                );
                ledger.append_trial_row(row)?;
            }
            executed_cells += 1;
            trials.push(report);
        }
        reports.push(eval_pack_case_report_from_trials(
            case,
            case_id,
            label,
            severity,
            split,
            blocking,
            harness_config_fingerprint.clone(),
            trials,
        ));
    }

    let mut ladder_reports = Vec::new();
    for ladder in &manifest.ladders {
        // Ladders without their own base directory inherit the manifest's.
        let mut ladder = ladder.clone();
        if ladder.base_dir.is_none() {
            ladder.base_dir = manifest.base_dir.clone();
        }
        ladder_reports.push(run_persona_eval_ladder(&ladder)?);
    }

    let stats_rows = reports
        .iter()
        .map(|report| report.stats_row.clone())
        .collect::<Vec<_>>();
    let stats = eval_pack_stats_report(&stats_rows);
    // Totals count cases and ladders together.
    let case_total = reports.len();
    let ladder_total = ladder_reports.len();
    let total = case_total + ladder_total;
    let trial_count = reports.iter().map(|report| report.trial_count).sum();
    // A blocking case fails the pack unless its reliability status is
    // "all-pass"; a blocking ladder fails the pack when it did not pass.
    let case_blocking_failed = reports
        .iter()
        .filter(|report| report.blocking && report.reliability.status != "all-pass")
        .count();
    let ladder_blocking_failed = ladder_reports
        .iter()
        .filter(|report| report.blocking && !report.pass)
        .count();
    let blocking_failed = case_blocking_failed + ladder_blocking_failed;
    let warning_failed = reports
        .iter()
        .filter(|report| !report.warnings.is_empty())
        .count()
        + ladder_reports
            .iter()
            .filter(|report| !report.pass && report.severity == "warning")
            .count();
    let informational_failed = reports
        .iter()
        .filter(|report| !report.informational.is_empty())
        .count()
        + ladder_reports
            .iter()
            .filter(|report| !report.pass && report.severity == "informational")
            .count();
    let passed = reports.iter().filter(|report| report.pass).count()
        + ladder_reports.iter().filter(|report| report.pass).count();
    // With a ledger the run state comes from the ledger itself; without one it
    // is synthesized, and every requested cell counts as completed and
    // executed because nothing can have been skipped.
    let run_state = match ledger.as_ref() {
        Some(ledger) => ledger.finish(requested_cells, skipped_cells, executed_cells)?,
        None => EvalPackRunState {
            schema: EVAL_LEDGER_RUN_STATE_SCHEMA.to_string(),
            suite: manifest.id.clone(),
            model: eval_pack_manifest_model(manifest).unwrap_or_else(|| "unknown".to_string()),
            requested_cells,
            completed_cells: requested_cells,
            executed_cells: requested_cells,
            ..EvalPackRunState::default()
        },
    };
    Ok(EvalPackReport {
        pack_id: manifest.id.clone(),
        harness_config_fingerprint,
        // Only blocking failures fail the pack as a whole.
        pass: blocking_failed == 0,
        total,
        passed,
        failed: total.saturating_sub(passed),
        blocking_failed,
        warning_failed,
        informational_failed,
        trial_count,
        run_state,
        // The split report is surfaced only when the manifest declares a split.
        split: manifest.split.as_ref().map(|_| split_report),
        stats,
        stats_rows,
        cases: reports,
        ladders: ladder_reports,
    })
}
2218
2219#[allow(clippy::too_many_arguments)]
2220fn evaluate_eval_pack_live_verify_trial(
2221    manifest: &EvalPackManifest,
2222    case: &EvalPackCase,
2223    case_id: &str,
2224    trial: usize,
2225    trial_count: usize,
2226    severity: &str,
2227    blocking: bool,
2228    base_dir: Option<&Path>,
2229    live_executor: &mut dyn EvalPackLiveExecutor,
2230) -> Result<EvalPackTrialReport, VmError> {
2231    let workspace = eval_pack_live_workspace(case, base_dir)?;
2232    let executor = case.executor.as_ref().or(manifest.executor.as_ref());
2233    let verify_command = case.verify_command.as_ref().ok_or_else(|| {
2234        VmError::Runtime(format!(
2235            "eval pack live-verify case '{case_id}' is missing verify_command"
2236        ))
2237    })?;
2238    let Some(executor) = executor else {
2239        return Err(VmError::Runtime(format!(
2240            "eval pack live-verify case '{case_id}' is missing executor"
2241        )));
2242    };
2243
2244    let mut failures = Vec::new();
2245    let mut warnings = Vec::new();
2246    let mut informational = Vec::new();
2247    let request_payload = eval_pack_live_executor_request(
2248        manifest,
2249        case,
2250        case_id,
2251        trial,
2252        trial_count,
2253        &workspace,
2254        base_dir,
2255    )?;
2256    let request = EvalPackLiveExecutorRequest {
2257        executor: executor.clone(),
2258        payload: request_payload,
2259        manifest_id: manifest.id.clone(),
2260        case: case.clone(),
2261        case_id: case_id.to_string(),
2262        trial,
2263        trials: trial_count,
2264        workspace: workspace.clone(),
2265        base_dir: base_dir.map(Path::to_path_buf),
2266    };
2267    let mut outcome = match live_executor.execute(request) {
2268        Ok(outcome) => outcome,
2269        Err(error) => {
2270            failures.push(format!("live executor failed: {error}"));
2271            EvalPackLiveVerifyOutcome::default()
2272        }
2273    };
2274    failures.append(&mut outcome.failures);
2275    warnings.append(&mut outcome.warnings);
2276    informational.append(&mut outcome.informational);
2277    if outcome.timed_out {
2278        failures.push("live executor timed out".to_string());
2279    }
2280    if live_outcome_verification(&outcome) == "FAIL" {
2281        failures.push("live executor reported verification FAIL".to_string());
2282    }
2283
2284    let verify_output = run_eval_pack_command(
2285        verify_command,
2286        &workspace,
2287        None,
2288        DEFAULT_LIVE_VERIFY_TIMEOUT_SECONDS,
2289    );
2290    let verification_exit_code = match verify_output {
2291        Ok(output) => {
2292            let exit_code = output.exit_code;
2293            if output.timed_out {
2294                outcome.timed_out = true;
2295                failures.push("verify command timed out".to_string());
2296            }
2297            if exit_code != 0 {
2298                failures.push(format!(
2299                    "verify command exited {exit_code}{}",
2300                    command_failure_excerpt(&output)
2301                ));
2302            }
2303            if outcome.wall_time_seconds == 0.0 {
2304                outcome.wall_time_seconds = output.wall_time_seconds;
2305            }
2306            Some(exit_code)
2307        }
2308        Err(error) => {
2309            failures.push(format!("verify command failed: {error}"));
2310            None
2311        }
2312    };
2313
2314    let produced_paths = normalized_live_produced_paths(case, &outcome);
2315    failures.extend(eval_pack_live_expected_path_failures(
2316        &workspace,
2317        &case.expected_output_paths,
2318    ));
2319    failures.extend(eval_pack_live_required_snippet_failures(
2320        &workspace,
2321        &produced_paths,
2322        &case.required_output_snippets,
2323    ));
2324    failures.extend(eval_pack_live_tool_budget_failures(
2325        &case.tool_budgets,
2326        &outcome.tool_call_summary,
2327    ));
2328
2329    let mut report = eval_pack_trial_report(
2330        trial,
2331        severity,
2332        blocking,
2333        outcome
2334            .run_id
2335            .clone()
2336            .unwrap_or_else(|| format!("live:{case_id}:{trial}")),
2337        outcome
2338            .workflow_id
2339            .clone()
2340            .unwrap_or_else(|| "live-verify".to_string()),
2341        outcome
2342            .source_path
2343            .clone()
2344            .or_else(|| Some(workspace.display().to_string())),
2345        outcome.stage_count.unwrap_or_default(),
2346        outcome.timed_out,
2347        outcome.wall_time_seconds,
2348        outcome.cost_usd,
2349        failures,
2350        warnings,
2351        informational,
2352        None,
2353    );
2354    let outcome_verification = live_outcome_verification(&outcome);
2355    if report.failures.is_empty()
2356        && outcome_verification.eq_ignore_ascii_case("skip")
2357        && verification_exit_code.unwrap_or_default() == 0
2358    {
2359        report.verification = "skip".to_string();
2360    }
2361    report.verification_exit_code = verification_exit_code;
2362    report.produced_paths = produced_paths;
2363    report.tool_call_summary = outcome.tool_call_summary;
2364    Ok(report)
2365}
2366
2367#[allow(clippy::too_many_arguments)]
2368fn evaluate_eval_pack_run_trial(
2369    manifest: &EvalPackManifest,
2370    case: &EvalPackCase,
2371    trial: usize,
2372    severity: &str,
2373    blocking: bool,
2374    base_dir: Option<&Path>,
2375    fixture_base_dir: Option<&Path>,
2376    fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2377    rubrics_by_id: &BTreeMap<&str, &EvalPackRubric>,
2378) -> Result<EvalPackTrialReport, VmError> {
2379    let mut failures = Vec::new();
2380    let mut warnings = Vec::new();
2381    let informational = Vec::new();
2382    let run = load_eval_pack_case_run(case, base_dir, fixture_base_dir, fixtures_by_id)?;
2383    let fixture =
2384        load_eval_pack_case_fixture(case, base_dir, fixture_base_dir, fixtures_by_id, &run)?;
2385    let eval = evaluate_run_against_fixture(&run, &fixture);
2386    failures.extend(eval.failures);
2387    apply_eval_pack_thresholds(&run, &manifest.defaults.thresholds, &mut failures);
2388    apply_eval_pack_thresholds(&run, &case.thresholds, &mut failures);
2389
2390    let comparison = match case.compare_to.as_ref().or(manifest.baseline.as_ref()) {
2391        Some(path) => {
2392            let baseline_path = resolve_manifest_path(base_dir, path);
2393            let baseline = load_run_record(&baseline_path)?;
2394            let diff = diff_run_records(&baseline, &run);
2395            if !diff.identical {
2396                failures.push(format!(
2397                    "run differs from baseline {} with {} stage changes",
2398                    baseline_path.display(),
2399                    diff.stage_diffs.len()
2400                ));
2401            }
2402            Some(diff)
2403        }
2404        None => None,
2405    };
2406
2407    for rubric_id in &case.rubrics {
2408        let Some(rubric) = rubrics_by_id.get(rubric_id.as_str()) else {
2409            failures.push(format!("case references unknown rubric '{rubric_id}'"));
2410            continue;
2411        };
2412        apply_eval_pack_rubric(rubric, &run, &mut failures, &mut warnings);
2413    }
2414
2415    Ok(eval_pack_trial_report(
2416        trial,
2417        severity,
2418        blocking,
2419        run.id.clone(),
2420        run.workflow_id.clone(),
2421        eval_pack_case_source_path(case, base_dir, fixture_base_dir, fixtures_by_id),
2422        eval.stage_count,
2423        run.status.to_ascii_lowercase().contains("timeout"),
2424        run.usage
2425            .as_ref()
2426            .map(|usage| usage.total_duration_ms as f64 / 1000.0)
2427            .unwrap_or_default(),
2428        run.usage
2429            .as_ref()
2430            .map(|usage| usage.total_cost)
2431            .unwrap_or_default(),
2432        failures,
2433        warnings,
2434        informational,
2435        comparison,
2436    ))
2437}
2438
2439#[allow(clippy::too_many_arguments)]
2440fn evaluate_eval_pack_friction_trial(
2441    manifest: &EvalPackManifest,
2442    case: &EvalPackCase,
2443    trial: usize,
2444    severity: &str,
2445    blocking: bool,
2446    base_dir: Option<&Path>,
2447    fixture_base_dir: Option<&Path>,
2448    fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2449    rubrics_by_id: &BTreeMap<&str, &EvalPackRubric>,
2450) -> Result<EvalPackTrialReport, VmError> {
2451    let mut failures = Vec::new();
2452    let mut warnings = Vec::new();
2453    let informational = Vec::new();
2454    let events =
2455        load_eval_pack_case_friction_events(case, base_dir, fixture_base_dir, fixtures_by_id)?;
2456    let options = friction_suggestion_options(case, manifest);
2457    let suggestions = generate_context_pack_suggestions(&events, &options);
2458
2459    for rubric_id in &case.rubrics {
2460        let Some(rubric) = rubrics_by_id.get(rubric_id.as_str()) else {
2461            failures.push(format!("case references unknown rubric '{rubric_id}'"));
2462            continue;
2463        };
2464        apply_eval_pack_friction_rubric(rubric, &suggestions, &mut failures, &mut warnings);
2465    }
2466
2467    if case.rubrics.is_empty() && suggestions.is_empty() {
2468        failures.push("friction fixture produced no context-pack suggestions".to_string());
2469    }
2470
2471    Ok(eval_pack_trial_report(
2472        trial,
2473        severity,
2474        blocking,
2475        "friction_events".to_string(),
2476        String::new(),
2477        eval_pack_case_friction_source_path(case, base_dir, fixture_base_dir, fixtures_by_id),
2478        events.len(),
2479        false,
2480        0.0,
2481        0.0,
2482        failures,
2483        warnings,
2484        informational,
2485        None,
2486    ))
2487}
2488
2489#[allow(clippy::too_many_arguments)]
2490fn eval_pack_trial_report(
2491    trial: usize,
2492    severity: &str,
2493    blocking: bool,
2494    run_id: String,
2495    workflow_id: String,
2496    source_path: Option<String>,
2497    stage_count: usize,
2498    timed_out: bool,
2499    wall_time_seconds: f64,
2500    cost_usd: f64,
2501    mut failures: Vec<String>,
2502    mut warnings: Vec<String>,
2503    mut informational: Vec<String>,
2504    comparison: Option<RunDiffReport>,
2505) -> EvalPackTrialReport {
2506    let verification = if failures.is_empty() { "PASS" } else { "FAIL" }.to_string();
2507    let pass = failures.is_empty() || !blocking;
2508    if !failures.is_empty() && !blocking {
2509        if severity == "warning" {
2510            warnings.append(&mut failures);
2511        } else {
2512            informational.append(&mut failures);
2513        }
2514    }
2515    EvalPackTrialReport {
2516        trial,
2517        verification,
2518        verification_exit_code: None,
2519        pass,
2520        blocking,
2521        run_id,
2522        workflow_id,
2523        source_path,
2524        stage_count,
2525        failures,
2526        warnings,
2527        informational,
2528        comparison,
2529        timed_out,
2530        wall_time_seconds,
2531        cost_usd,
2532        produced_paths: Vec::new(),
2533        tool_call_summary: serde_json::Value::Null,
2534    }
2535}
2536
2537#[allow(clippy::too_many_arguments)]
2538fn eval_ledger_row_from_trial(
2539    case: &EvalPackCase,
2540    case_id: &str,
2541    split: Option<String>,
2542    suite: &str,
2543    model: &str,
2544    commit: &str,
2545    provenance: &EvalLedgerProvenance,
2546    harness_config_fingerprint: &str,
2547    trial: &EvalPackTrialReport,
2548) -> EvalLedgerRow {
2549    let passes = usize::from(trial.verification == "PASS");
2550    let fails = usize::from(trial.verification == "FAIL");
2551    let skips = usize::from(trial.verification.eq_ignore_ascii_case("skip"));
2552    let timeouts = usize::from(trial.timed_out);
2553    EvalLedgerRow {
2554        schema: EVAL_LEDGER_ROW_SCHEMA.to_string(),
2555        suite: suite.to_string(),
2556        model: model.to_string(),
2557        split,
2558        commit: commit.to_string(),
2559        case_name: case_id.to_string(),
2560        name: case_id.to_string(),
2561        case_fingerprint: case.case_fingerprint.clone(),
2562        harness_config_fingerprint: harness_config_fingerprint.to_string(),
2563        trial: trial.trial,
2564        trials: 1,
2565        passes,
2566        fails,
2567        skips,
2568        timeouts,
2569        pass_rate: passes as f64,
2570        status: trial.verification.clone(),
2571        verification: trial.verification.clone(),
2572        skipped: false,
2573        wall_time_seconds: trial.wall_time_seconds,
2574        cost_usd: trial.cost_usd,
2575        mean_wall_time_seconds: trial.wall_time_seconds,
2576        total_cost_usd: trial.cost_usd,
2577        run_id: trial.run_id.clone(),
2578        workflow_id: trial.workflow_id.clone(),
2579        source_path: trial.source_path.clone(),
2580        trial_report: Some(trial.clone()),
2581        provenance: provenance.clone(),
2582        metadata: case.metadata.clone(),
2583        ..EvalLedgerRow::default()
2584    }
2585}
2586
2587fn eval_pack_trial_report_from_ledger_row(
2588    row: &EvalLedgerRow,
2589    blocking: bool,
2590) -> EvalPackTrialReport {
2591    if let Some(mut report) = row.trial_report.clone() {
2592        report.trial = row.trial;
2593        return report;
2594    }
2595    let mut failures = Vec::new();
2596    let verification = if row.verification.is_empty() {
2597        row.status.clone()
2598    } else {
2599        row.verification.clone()
2600    };
2601    if verification == "FAIL" {
2602        failures.push("ledger row recorded a failed trial".to_string());
2603    }
2604    EvalPackTrialReport {
2605        trial: row.trial,
2606        verification: verification.clone(),
2607        verification_exit_code: None,
2608        pass: verification != "FAIL" || !blocking,
2609        blocking,
2610        run_id: row.run_id.clone(),
2611        workflow_id: row.workflow_id.clone(),
2612        source_path: row.source_path.clone(),
2613        stage_count: 0,
2614        failures,
2615        warnings: Vec::new(),
2616        informational: Vec::new(),
2617        comparison: None,
2618        timed_out: row.timeouts > 0,
2619        wall_time_seconds: row.wall_time_seconds,
2620        cost_usd: row.cost_usd,
2621        produced_paths: Vec::new(),
2622        tool_call_summary: serde_json::Value::Null,
2623    }
2624}
2625
2626#[allow(clippy::too_many_arguments)]
2627fn eval_pack_case_report_from_trials(
2628    case: &EvalPackCase,
2629    case_id: String,
2630    label: String,
2631    severity: String,
2632    split: Option<String>,
2633    blocking: bool,
2634    harness_config_fingerprint: String,
2635    trials: Vec<EvalPackTrialReport>,
2636) -> EvalPackCaseReport {
2637    let reliability = eval_pack_reliability_report(&trials);
2638    let stats_row = eval_pack_stats_row(
2639        case,
2640        &case_id,
2641        &harness_config_fingerprint,
2642        split.clone(),
2643        &trials,
2644        &reliability,
2645    );
2646    let first = trials.first();
2647    let pass = if blocking {
2648        reliability.status == "all-pass"
2649    } else {
2650        true
2651    };
2652    let failures = prefixed_trial_messages(&trials, |trial| &trial.failures);
2653    let warnings = prefixed_trial_messages(&trials, |trial| &trial.warnings);
2654    let informational = prefixed_trial_messages(&trials, |trial| &trial.informational);
2655    EvalPackCaseReport {
2656        id: case_id,
2657        label,
2658        severity,
2659        split,
2660        case_fingerprint: case.case_fingerprint.clone(),
2661        harness_config_fingerprint,
2662        pass,
2663        blocking,
2664        run_id: first.map(|trial| trial.run_id.clone()).unwrap_or_default(),
2665        workflow_id: first
2666            .map(|trial| trial.workflow_id.clone())
2667            .unwrap_or_default(),
2668        source_path: first.and_then(|trial| trial.source_path.clone()),
2669        stage_count: first.map(|trial| trial.stage_count).unwrap_or_default(),
2670        trial_count: trials.len(),
2671        total_stage_count: trials.iter().map(|trial| trial.stage_count).sum(),
2672        reliability,
2673        stats_row,
2674        comparison: first.and_then(|trial| trial.comparison.clone()),
2675        trials,
2676        failures,
2677        warnings,
2678        informational,
2679    }
2680}
2681
2682fn prefixed_trial_messages<F>(trials: &[EvalPackTrialReport], messages: F) -> Vec<String>
2683where
2684    F: Fn(&EvalPackTrialReport) -> &Vec<String>,
2685{
2686    let include_prefix = trials.len() > 1;
2687    let mut out = Vec::new();
2688    for trial in trials {
2689        for message in messages(trial) {
2690            if include_prefix {
2691                out.push(format!("trial {}: {message}", trial.trial));
2692            } else {
2693                out.push(message.clone());
2694            }
2695        }
2696    }
2697    out
2698}
2699
2700fn eval_pack_reliability_report(trials: &[EvalPackTrialReport]) -> EvalPackReliabilityReport {
2701    let passes = trials
2702        .iter()
2703        .filter(|trial| trial.verification == "PASS")
2704        .count();
2705    let fails = trials
2706        .iter()
2707        .filter(|trial| trial.verification == "FAIL")
2708        .count();
2709    let skips = trials
2710        .iter()
2711        .filter(|trial| trial.verification.eq_ignore_ascii_case("skip"))
2712        .count();
2713    let timeouts = trials.iter().filter(|trial| trial.timed_out).count();
2714    let decided = passes + fails;
2715    let majority = if passes > 0 && fails > 0 {
2716        Some(if passes >= fails { "PASS" } else { "FAIL" }.to_string())
2717    } else {
2718        None
2719    };
2720    let status = if decided == 0 {
2721        "no-decision"
2722    } else if fails == 0 {
2723        "all-pass"
2724    } else if passes == 0 {
2725        "all-fail"
2726    } else {
2727        "flaky"
2728    };
2729    EvalPackReliabilityReport {
2730        status: status.to_string(),
2731        trials: trials.len(),
2732        passes,
2733        fails,
2734        skips,
2735        timeouts,
2736        decided,
2737        pass_rate: if trials.is_empty() {
2738            0.0
2739        } else {
2740            passes as f64 / trials.len() as f64
2741        },
2742        majority,
2743    }
2744}
2745
2746fn eval_pack_stats_row(
2747    case: &EvalPackCase,
2748    case_id: &str,
2749    harness_config_fingerprint: &str,
2750    split: Option<String>,
2751    trials: &[EvalPackTrialReport],
2752    reliability: &EvalPackReliabilityReport,
2753) -> EvalPackStatsRow {
2754    let wall_times = trials
2755        .iter()
2756        .map(|trial| trial.wall_time_seconds)
2757        .collect::<Vec<_>>();
2758    let costs = trials
2759        .iter()
2760        .map(|trial| trial.cost_usd)
2761        .collect::<Vec<_>>();
2762    let group = case
2763        .metadata
2764        .get("group")
2765        .or_else(|| case.metadata.get("language"))
2766        .or_else(|| case.metadata.get("bucket"))
2767        .and_then(|value| value.as_str())
2768        .unwrap_or_default()
2769        .to_string();
2770    EvalPackStatsRow {
2771        name: case_id.to_string(),
2772        case_name: case_id.to_string(),
2773        case_fingerprint: case.case_fingerprint.clone(),
2774        harness_config_fingerprint: harness_config_fingerprint.to_string(),
2775        group,
2776        split,
2777        trials: trials.len(),
2778        passes: reliability.passes,
2779        fails: reliability.fails,
2780        skips: reliability.skips,
2781        timeouts: reliability.timeouts,
2782        pass_rate: reliability.pass_rate,
2783        status: match reliability.status.as_str() {
2784            "all-pass" => "PASS",
2785            "all-fail" => "FAIL",
2786            "flaky" => "FLAKY",
2787            _ => "skip",
2788        }
2789        .to_string(),
2790        majority: reliability.majority.clone(),
2791        wall_time_seconds: mean(&wall_times),
2792        cost_usd: costs.iter().sum(),
2793        mean_wall_time_seconds: mean(&wall_times),
2794        stdev_wall_time_seconds: stdev(&wall_times),
2795        total_cost_usd: costs.iter().sum(),
2796    }
2797}
2798
2799fn eval_pack_stats_report(rows: &[EvalPackStatsRow]) -> EvalPackStatsReport {
2800    EvalPackStatsReport {
2801        macro_pass_at_1: macro_pass_at_1(rows),
2802        reliability: eval_pack_reliability_breakdown(rows),
2803    }
2804}
2805
2806fn macro_pass_at_1(rows: &[EvalPackStatsRow]) -> f64 {
2807    let decided = rows
2808        .iter()
2809        .filter(|row| row.passes + row.fails > 0)
2810        .collect::<Vec<_>>();
2811    if decided.is_empty() {
2812        return 0.0;
2813    }
2814    decided.iter().map(|row| row.pass_rate).sum::<f64>() / decided.len() as f64
2815}
2816
2817fn eval_pack_reliability_breakdown(rows: &[EvalPackStatsRow]) -> EvalPackReliabilityBreakdown {
2818    let total_cases = rows.len();
2819    let all_pass_cases = rows
2820        .iter()
2821        .filter(|row| row.passes > 0 && row.fails == 0)
2822        .count();
2823    let flaky_cases = rows
2824        .iter()
2825        .filter(|row| row.passes > 0 && row.fails > 0)
2826        .count();
2827    let all_fail_cases = rows
2828        .iter()
2829        .filter(|row| row.passes == 0 && row.fails > 0)
2830        .count();
2831    let no_decision_cases = rows
2832        .iter()
2833        .filter(|row| row.passes + row.fails == 0)
2834        .count();
2835    EvalPackReliabilityBreakdown {
2836        all_pass_cases,
2837        flaky_cases,
2838        all_fail_cases,
2839        no_decision_cases,
2840        total_cases,
2841        all_pass_fraction: rate(all_pass_cases, total_cases),
2842        flaky_fraction: rate(flaky_cases, total_cases),
2843        all_fail_fraction: rate(all_fail_cases, total_cases),
2844        no_decision_fraction: rate(no_decision_cases, total_cases),
2845    }
2846}
2847
2848fn split_by_case_id(report: &EvalPackSplitValidationReport) -> BTreeMap<String, String> {
2849    let mut out = BTreeMap::new();
2850    for (partition, cases) in &report.partitions {
2851        for case_id in cases {
2852            out.insert(case_id.clone(), partition.clone());
2853        }
2854    }
2855    out
2856}
2857
/// Arithmetic mean of `values`; an empty slice yields 0.
fn mean(values: &[f64]) -> f64 {
    match values.len() {
        0 => 0.0,
        count => values.iter().sum::<f64>() / count as f64,
    }
}
2864
2865fn stdev(values: &[f64]) -> f64 {
2866    if values.is_empty() {
2867        return 0.0;
2868    }
2869    let mean = mean(values);
2870    let variance = values
2871        .iter()
2872        .map(|value| {
2873            let diff = value - mean;
2874            diff * diff
2875        })
2876        .sum::<f64>()
2877        / values.len() as f64;
2878    variance.sqrt()
2879}
2880
/// Returns `count / denom` as a fraction, or 0 when `denom` is zero.
fn rate(count: usize, denom: usize) -> f64 {
    match denom {
        0 => 0.0,
        total => count as f64 / total as f64,
    }
}
2888
2889fn eval_pack_case_severity(manifest: &EvalPackManifest, case: &EvalPackCase) -> String {
2890    normalize_eval_pack_severity(
2891        case.severity
2892            .as_deref()
2893            .or(case.thresholds.severity.as_deref())
2894            .or(manifest.defaults.severity.as_deref())
2895            .or(manifest.defaults.thresholds.severity.as_deref())
2896            .unwrap_or("blocking"),
2897    )
2898}
2899
/// Maps a user-supplied severity string onto one of the three canonical
/// values: `"warning"`, `"informational"`, or `"blocking"`.
///
/// Matching ignores surrounding whitespace and ASCII case. Anything that is
/// not a recognized warning/informational spelling is treated as `"blocking"`.
fn normalize_eval_pack_severity(value: &str) -> String {
    let lowered = value.trim().to_ascii_lowercase();
    let canonical = if matches!(lowered.as_str(), "warn" | "warning") {
        "warning"
    } else if matches!(lowered.as_str(), "info" | "informational") {
        "informational"
    } else {
        "blocking"
    };
    canonical.to_string()
}
2907
2908fn load_eval_pack_case_run(
2909    case: &EvalPackCase,
2910    base_dir: Option<&Path>,
2911    fixture_base_dir: Option<&Path>,
2912    fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2913) -> Result<RunRecord, VmError> {
2914    if let Some(run_ref) = case.run.as_deref().or(case.run_path.as_deref()) {
2915        if let Some(fixture) = fixtures_by_id.get(run_ref) {
2916            return load_run_record_from_fixture_ref(fixture, fixture_base_dir);
2917        }
2918        return load_run_record(&resolve_manifest_path(base_dir, run_ref));
2919    }
2920    Err(VmError::Runtime(
2921        "eval pack case is missing run or run_path".to_string(),
2922    ))
2923}
2924
2925fn load_eval_pack_case_fixture(
2926    case: &EvalPackCase,
2927    base_dir: Option<&Path>,
2928    fixture_base_dir: Option<&Path>,
2929    fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2930    run: &RunRecord,
2931) -> Result<ReplayFixture, VmError> {
2932    if let Some(fixture_ref) = case.fixture.as_deref().or(case.fixture_path.as_deref()) {
2933        if let Some(fixture) = fixtures_by_id.get(fixture_ref) {
2934            return load_replay_fixture_from_ref(fixture, fixture_base_dir);
2935        }
2936        return load_replay_fixture(&resolve_manifest_path(base_dir, fixture_ref));
2937    }
2938    Ok(run
2939        .replay_fixture
2940        .clone()
2941        .unwrap_or_else(|| replay_fixture_from_run(run)))
2942}
2943
2944fn eval_pack_case_source_path(
2945    case: &EvalPackCase,
2946    base_dir: Option<&Path>,
2947    fixture_base_dir: Option<&Path>,
2948    fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2949) -> Option<String> {
2950    let run_ref = case.run.as_deref().or(case.run_path.as_deref())?;
2951    if let Some(fixture) = fixtures_by_id.get(run_ref) {
2952        return fixture.path.as_ref().map(|path| {
2953            resolve_manifest_path(fixture_base_dir, path)
2954                .display()
2955                .to_string()
2956        });
2957    }
2958    Some(
2959        resolve_manifest_path(base_dir, run_ref)
2960            .display()
2961            .to_string(),
2962    )
2963}
2964
2965fn load_eval_pack_case_friction_events(
2966    case: &EvalPackCase,
2967    base_dir: Option<&Path>,
2968    fixture_base_dir: Option<&Path>,
2969    fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2970) -> Result<Vec<FrictionEvent>, VmError> {
2971    let event_ref = case.friction_events.as_deref().ok_or_else(|| {
2972        VmError::Runtime("eval pack friction case is missing friction_events".to_string())
2973    })?;
2974    if let Some(fixture) = fixtures_by_id.get(event_ref) {
2975        return load_friction_events_from_fixture_ref(fixture, fixture_base_dir);
2976    }
2977    load_friction_events_from_path(&resolve_manifest_path(base_dir, event_ref))
2978}
2979
2980fn load_friction_events_from_fixture_ref(
2981    fixture: &EvalPackFixtureRef,
2982    base_dir: Option<&Path>,
2983) -> Result<Vec<FrictionEvent>, VmError> {
2984    if let Some(inline) = &fixture.inline {
2985        return normalize_friction_events_json(inline.clone());
2986    }
2987    let path = fixture.path.as_deref().ok_or_else(|| {
2988        VmError::Runtime(format!(
2989            "fixture '{}' is missing path or inline friction events",
2990            fixture.id
2991        ))
2992    })?;
2993    load_friction_events_from_path(&resolve_manifest_path(base_dir, path))
2994}
2995
2996fn load_friction_events_from_path(path: &Path) -> Result<Vec<FrictionEvent>, VmError> {
2997    let content = std::fs::read_to_string(path)
2998        .map_err(|e| VmError::Runtime(format!("failed to read friction events fixture: {e}")))?;
2999    let value: serde_json::Value = serde_json::from_str(&content)
3000        .map_err(|e| VmError::Runtime(format!("failed to parse friction events fixture: {e}")))?;
3001    normalize_friction_events_json(value)
3002}
3003
3004fn eval_pack_case_friction_source_path(
3005    case: &EvalPackCase,
3006    base_dir: Option<&Path>,
3007    fixture_base_dir: Option<&Path>,
3008    fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
3009) -> Option<String> {
3010    let event_ref = case.friction_events.as_deref()?;
3011    if let Some(fixture) = fixtures_by_id.get(event_ref) {
3012        return fixture.path.as_ref().map(|path| {
3013            resolve_manifest_path(fixture_base_dir, path)
3014                .display()
3015                .to_string()
3016        });
3017    }
3018    Some(
3019        resolve_manifest_path(base_dir, event_ref)
3020            .display()
3021            .to_string(),
3022    )
3023}
3024
3025fn friction_suggestion_options(
3026    case: &EvalPackCase,
3027    manifest: &EvalPackManifest,
3028) -> ContextPackSuggestionOptions {
3029    let min_occurrences = case
3030        .metadata
3031        .get("min_occurrences")
3032        .or_else(|| manifest.metadata.get("min_occurrences"))
3033        .and_then(|value| value.as_u64())
3034        .unwrap_or(2) as usize;
3035    let owner = case
3036        .metadata
3037        .get("owner")
3038        .or_else(|| manifest.metadata.get("owner"))
3039        .and_then(|value| value.as_str())
3040        .map(str::to_string)
3041        .or_else(|| {
3042            manifest
3043                .package
3044                .as_ref()
3045                .and_then(|package| package.name.clone())
3046        });
3047    ContextPackSuggestionOptions {
3048        min_occurrences,
3049        owner,
3050    }
3051}
3052
3053fn apply_eval_pack_thresholds(
3054    run: &RunRecord,
3055    thresholds: &super::types::EvalPackThresholds,
3056    failures: &mut Vec<String>,
3057) {
3058    if let Some(max_stage_count) = thresholds.max_stage_count {
3059        if run.stages.len() > max_stage_count {
3060            failures.push(format!(
3061                "stage count {} exceeds threshold {}",
3062                run.stages.len(),
3063                max_stage_count
3064            ));
3065        }
3066    }
3067    if let Some(max_latency_ms) = thresholds.max_latency_ms {
3068        let actual = run
3069            .usage
3070            .as_ref()
3071            .map(|usage| usage.total_duration_ms)
3072            .unwrap_or_default();
3073        if actual > max_latency_ms {
3074            failures.push(format!(
3075                "latency {actual}ms exceeds threshold {max_latency_ms}ms"
3076            ));
3077        }
3078    }
3079    if let Some(max_cost_usd) = thresholds.max_cost_usd {
3080        let actual = run
3081            .usage
3082            .as_ref()
3083            .map(|usage| usage.total_cost)
3084            .unwrap_or_default();
3085        if actual > max_cost_usd {
3086            failures.push(format!(
3087                "cost ${actual:.6} exceeds threshold ${max_cost_usd:.6}"
3088            ));
3089        }
3090    }
3091    if let Some(max_tokens) = thresholds.max_tokens {
3092        let actual = run
3093            .usage
3094            .as_ref()
3095            .map(|usage| usage.input_tokens + usage.output_tokens)
3096            .unwrap_or_default();
3097        if actual > max_tokens {
3098            failures.push(format!(
3099                "token count {actual} exceeds threshold {max_tokens}"
3100            ));
3101        }
3102    }
3103}
3104
3105fn apply_eval_pack_rubric(
3106    rubric: &EvalPackRubric,
3107    run: &RunRecord,
3108    failures: &mut Vec<String>,
3109    warnings: &mut Vec<String>,
3110) {
3111    match rubric.kind.as_str() {
3112        "" | "deterministic" | "replay" | "budget" | "hitl" | "side-effect" => {
3113            apply_eval_pack_thresholds(run, &rubric.thresholds, failures);
3114            for assertion in &rubric.assertions {
3115                apply_eval_pack_assertion(rubric, assertion, run, failures);
3116            }
3117        }
3118        "llm-judge" | "llm_as_judge" | "judge" => {
3119            let severity = normalize_eval_pack_severity(
3120                rubric.thresholds.severity.as_deref().unwrap_or("blocking"),
3121            );
3122            let message = format!(
3123                "rubric '{}' requires an external LLM judge and was not run locally",
3124                rubric.id
3125            );
3126            if severity == "blocking" {
3127                failures.push(message);
3128            } else {
3129                warnings.push(message);
3130            }
3131        }
3132        other => warnings.push(format!(
3133            "rubric '{}' has unknown kind '{}' and was not run locally",
3134            rubric.id, other
3135        )),
3136    }
3137}
3138
3139fn apply_eval_pack_friction_rubric(
3140    rubric: &EvalPackRubric,
3141    suggestions: &[super::super::ContextPackSuggestion],
3142    failures: &mut Vec<String>,
3143    warnings: &mut Vec<String>,
3144) {
3145    match rubric.kind.as_str() {
3146        "" | "deterministic" | "friction" | "context-pack-suggestion" => {
3147            let mut expectations = Vec::new();
3148            for assertion in &rubric.assertions {
3149                match assertion.kind.as_str() {
3150                    "context-pack-suggestion" | "context_pack_suggestion" | "suggestion" => {
3151                        let expectation = context_pack_expectation_from_assertion(assertion);
3152                        expectations.push(expectation);
3153                    }
3154                    other => failures.push(format!(
3155                        "rubric '{}' has unsupported friction assertion kind '{}'",
3156                        rubric.id, other
3157                    )),
3158                }
3159            }
3160            failures.extend(evaluate_context_pack_suggestion_expectations(
3161                suggestions,
3162                &expectations,
3163            ));
3164        }
3165        other => warnings.push(format!(
3166            "rubric '{}' has unknown friction kind '{}' and was not run locally",
3167            rubric.id, other
3168        )),
3169    }
3170}
3171
3172fn context_pack_expectation_from_assertion(
3173    assertion: &EvalPackAssertion,
3174) -> ContextPackSuggestionExpectation {
3175    let expected = assertion
3176        .expected
3177        .as_ref()
3178        .and_then(|value| value.as_object());
3179    let expected_string = assertion.expected.as_ref().and_then(|value| value.as_str());
3180    ContextPackSuggestionExpectation {
3181        min_suggestions: expected
3182            .and_then(|map| map.get("min_suggestions"))
3183            .and_then(|value| value.as_u64())
3184            .map(|value| value as usize),
3185        recommended_artifact: expected
3186            .and_then(|map| map.get("recommended_artifact"))
3187            .and_then(|value| value.as_str())
3188            .map(str::to_string)
3189            .or_else(|| expected_string.map(str::to_string)),
3190        title_contains: assertion.contains.clone().or_else(|| {
3191            expected
3192                .and_then(|map| map.get("title_contains"))
3193                .and_then(|value| value.as_str())
3194                .map(str::to_string)
3195        }),
3196        manifest_name_contains: expected
3197            .and_then(|map| map.get("manifest_name_contains"))
3198            .and_then(|value| value.as_str())
3199            .map(str::to_string),
3200        required_capability: expected
3201            .and_then(|map| map.get("required_capability"))
3202            .and_then(|value| value.as_str())
3203            .map(str::to_string),
3204        required_output_slot: expected
3205            .and_then(|map| map.get("required_output_slot"))
3206            .and_then(|value| value.as_str())
3207            .map(str::to_string),
3208    }
3209}
3210
3211fn apply_eval_pack_assertion(
3212    rubric: &EvalPackRubric,
3213    assertion: &EvalPackAssertion,
3214    run: &RunRecord,
3215    failures: &mut Vec<String>,
3216) {
3217    match assertion.kind.as_str() {
3218        "run-status" | "run_status" | "status" => {
3219            let expected = assertion.expected.as_ref().and_then(|value| value.as_str());
3220            if let Some(expected) = expected {
3221                if run.status != expected {
3222                    failures.push(format!(
3223                        "rubric '{}' expected run status {}, got {}",
3224                        rubric.id, expected, run.status
3225                    ));
3226                }
3227            }
3228        }
3229        "stage-status" | "stage_status" => {
3230            let Some(stage_id) = assertion.stage.as_deref() else {
3231                failures.push(format!(
3232                    "rubric '{}' stage-status assertion missing stage",
3233                    rubric.id
3234                ));
3235                return;
3236            };
3237            let expected = assertion.expected.as_ref().and_then(|value| value.as_str());
3238            let Some(expected) = expected else {
3239                failures.push(format!(
3240                    "rubric '{}' stage-status assertion missing expected string",
3241                    rubric.id
3242                ));
3243                return;
3244            };
3245            match run.stages.iter().find(|stage| stage.node_id == stage_id) {
3246                Some(stage) if stage.status == expected => {}
3247                Some(stage) => failures.push(format!(
3248                    "rubric '{}' expected stage {} status {}, got {}",
3249                    rubric.id, stage_id, expected, stage.status
3250                )),
3251                None => failures.push(format!(
3252                    "rubric '{}' expected stage {} to exist",
3253                    rubric.id, stage_id
3254                )),
3255            }
3256        }
3257        "visible-text-contains" | "visible_text_contains" => {
3258            let Some(needle) = assertion.contains.as_deref() else {
3259                failures.push(format!(
3260                    "rubric '{}' visible-text assertion missing contains",
3261                    rubric.id
3262                ));
3263                return;
3264            };
3265            let matched = match assertion.stage.as_deref() {
3266                Some(stage_id) => run
3267                    .stages
3268                    .iter()
3269                    .find(|stage| stage.node_id == stage_id)
3270                    .and_then(|stage| stage.visible_text.as_deref())
3271                    .is_some_and(|text| text.contains(needle)),
3272                None => run
3273                    .stages
3274                    .iter()
3275                    .filter_map(|stage| stage.visible_text.as_deref())
3276                    .any(|text| text.contains(needle)),
3277            };
3278            if !matched {
3279                failures.push(format!(
3280                    "rubric '{}' expected visible text to contain {:?}",
3281                    rubric.id, needle
3282                ));
3283            }
3284        }
3285        "hitl-question-contains" | "hitl_question_contains" => {
3286            let Some(needle) = assertion.contains.as_deref() else {
3287                failures.push(format!(
3288                    "rubric '{}' HITL assertion missing contains",
3289                    rubric.id
3290                ));
3291                return;
3292            };
3293            if !run
3294                .hitl_questions
3295                .iter()
3296                .any(|question| question.prompt.contains(needle))
3297            {
3298                failures.push(format!(
3299                    "rubric '{}' expected HITL question to contain {:?}",
3300                    rubric.id, needle
3301                ));
3302            }
3303        }
3304        "" => {}
3305        other => failures.push(format!(
3306            "rubric '{}' has unsupported assertion kind '{}'",
3307            rubric.id, other
3308        )),
3309    }
3310}
3311
3312pub fn replay_fixture_from_run(run: &RunRecord) -> ReplayFixture {
3313    ReplayFixture {
3314        type_name: "replay_fixture".to_string(),
3315        id: new_id("fixture"),
3316        source_run_id: run.id.clone(),
3317        workflow_id: run.workflow_id.clone(),
3318        workflow_name: run.workflow_name.clone(),
3319        created_at: now_rfc3339(),
3320        eval_kind: Some("replay".to_string()),
3321        clarifying_question: None,
3322        expected_status: run.status.clone(),
3323        stage_assertions: run
3324            .stages
3325            .iter()
3326            .map(|stage| ReplayStageAssertion {
3327                node_id: stage.node_id.clone(),
3328                expected_status: stage.status.clone(),
3329                expected_outcome: stage.outcome.clone(),
3330                expected_branch: stage.branch.clone(),
3331                required_artifact_kinds: stage
3332                    .artifacts
3333                    .iter()
3334                    .map(|artifact| artifact.kind.clone())
3335                    .collect(),
3336                visible_text_contains: stage
3337                    .visible_text
3338                    .as_ref()
3339                    .filter(|text| !text.is_empty())
3340                    .map(|text| text.chars().take(80).collect()),
3341            })
3342            .collect(),
3343    }
3344}
3345
3346pub fn evaluate_run_against_fixture(run: &RunRecord, fixture: &ReplayFixture) -> ReplayEvalReport {
3347    if fixture.eval_kind.as_deref() == Some("clarifying_question") {
3348        return evaluate_clarifying_question(run, fixture);
3349    }
3350    let mut failures = Vec::new();
3351    if run.status != fixture.expected_status {
3352        failures.push(format!(
3353            "run status mismatch: expected {}, got {}",
3354            fixture.expected_status, run.status
3355        ));
3356    }
3357    let stages_by_id: BTreeMap<&str, &RunStageRecord> =
3358        run.stages.iter().map(|s| (s.node_id.as_str(), s)).collect();
3359    for assertion in &fixture.stage_assertions {
3360        let Some(stage) = stages_by_id.get(assertion.node_id.as_str()) else {
3361            failures.push(format!("missing stage {}", assertion.node_id));
3362            continue;
3363        };
3364        if stage.status != assertion.expected_status {
3365            failures.push(format!(
3366                "stage {} status mismatch: expected {}, got {}",
3367                assertion.node_id, assertion.expected_status, stage.status
3368            ));
3369        }
3370        if stage.outcome != assertion.expected_outcome {
3371            failures.push(format!(
3372                "stage {} outcome mismatch: expected {}, got {}",
3373                assertion.node_id, assertion.expected_outcome, stage.outcome
3374            ));
3375        }
3376        if stage.branch != assertion.expected_branch {
3377            failures.push(format!(
3378                "stage {} branch mismatch: expected {:?}, got {:?}",
3379                assertion.node_id, assertion.expected_branch, stage.branch
3380            ));
3381        }
3382        for required_kind in &assertion.required_artifact_kinds {
3383            if !stage
3384                .artifacts
3385                .iter()
3386                .any(|artifact| &artifact.kind == required_kind)
3387            {
3388                failures.push(format!(
3389                    "stage {} missing artifact kind {}",
3390                    assertion.node_id, required_kind
3391                ));
3392            }
3393        }
3394        if let Some(snippet) = &assertion.visible_text_contains {
3395            let actual = stage.visible_text.clone().unwrap_or_default();
3396            if !actual.contains(snippet) {
3397                failures.push(format!(
3398                    "stage {} visible text does not contain expected snippet {:?}",
3399                    assertion.node_id, snippet
3400                ));
3401            }
3402        }
3403    }
3404
3405    ReplayEvalReport {
3406        pass: failures.is_empty(),
3407        failures,
3408        stage_count: run.stages.len(),
3409    }
3410}
3411
3412fn evaluate_clarifying_question(run: &RunRecord, fixture: &ReplayFixture) -> ReplayEvalReport {
3413    let mut failures = Vec::new();
3414    let spec = fixture.clarifying_question.clone().unwrap_or_default();
3415    let min_questions = clarifying_min_questions(&spec);
3416    let max_questions = clarifying_max_questions(&spec);
3417    let questions = &run.hitl_questions;
3418
3419    if run.status != fixture.expected_status {
3420        failures.push(format!(
3421            "run status mismatch: expected {}, got {}",
3422            fixture.expected_status, run.status
3423        ));
3424    }
3425    if questions.len() < min_questions {
3426        failures.push(format!(
3427            "expected at least {min_questions} clarifying question(s), got {}",
3428            questions.len()
3429        ));
3430    }
3431    if questions.len() > max_questions {
3432        failures.push(format!(
3433            "expected at most {max_questions} clarifying question(s), got {}",
3434            questions.len()
3435        ));
3436    }
3437
3438    let normalized_expected = spec
3439        .expected_question
3440        .as_deref()
3441        .map(normalize_question_text);
3442    let normalized_accepted = spec
3443        .accepted_questions
3444        .iter()
3445        .map(|question| normalize_question_text(question))
3446        .collect::<Vec<_>>();
3447    let required_terms = spec
3448        .required_terms
3449        .iter()
3450        .map(|term| normalize_question_text(term))
3451        .collect::<Vec<_>>();
3452    let forbidden_terms = spec
3453        .forbidden_terms
3454        .iter()
3455        .map(|term| normalize_question_text(term))
3456        .collect::<Vec<_>>();
3457
3458    let matched = questions.iter().any(|question| {
3459        let normalized = normalize_question_text(&question.prompt);
3460        let matches_expected = normalized_expected
3461            .as_ref()
3462            .is_none_or(|expected| &normalized == expected)
3463            && (normalized_accepted.is_empty()
3464                || normalized_accepted
3465                    .iter()
3466                    .any(|candidate| candidate == &normalized));
3467        let has_required_terms = required_terms
3468            .iter()
3469            .all(|term| normalized.contains(term.as_str()));
3470        let avoids_forbidden_terms = forbidden_terms
3471            .iter()
3472            .all(|term| !normalized.contains(term.as_str()));
3473        matches_expected && has_required_terms && avoids_forbidden_terms
3474    });
3475
3476    if !questions.is_empty()
3477        && (!normalized_accepted.is_empty()
3478            || normalized_expected.is_some()
3479            || !required_terms.is_empty()
3480            || !forbidden_terms.is_empty())
3481        && !matched
3482    {
3483        failures.push(format!(
3484            "no clarifying question matched fixture; actual questions: {}",
3485            questions
3486                .iter()
3487                .map(|question| format!("{:?}", question.prompt))
3488                .collect::<Vec<_>>()
3489                .join(", ")
3490        ));
3491    }
3492
3493    ReplayEvalReport {
3494        pass: failures.is_empty(),
3495        failures,
3496        stage_count: run.stages.len(),
3497    }
3498}
3499
3500pub fn evaluate_run_suite(
3501    cases: Vec<(RunRecord, ReplayFixture, Option<String>)>,
3502) -> ReplayEvalSuiteReport {
3503    let mut reports = Vec::new();
3504    for (run, fixture, source_path) in cases {
3505        let report = evaluate_run_against_fixture(&run, &fixture);
3506        reports.push(ReplayEvalCaseReport {
3507            run_id: run.id.clone(),
3508            workflow_id: run.workflow_id.clone(),
3509            label: None,
3510            pass: report.pass,
3511            failures: report.failures,
3512            stage_count: report.stage_count,
3513            source_path,
3514            comparison: None,
3515        });
3516    }
3517    let total = reports.len();
3518    let passed = reports.iter().filter(|report| report.pass).count();
3519    let failed = total.saturating_sub(passed);
3520    ReplayEvalSuiteReport {
3521        pass: failed == 0,
3522        total,
3523        passed,
3524        failed,
3525        cases: reports,
3526    }
3527}
3528
/// Tests for per-tool live budget accounting over live tool summaries.
#[cfg(test)]
mod live_tool_budget_tests {
    use super::*;
    use std::collections::BTreeMap;

    #[test]
    fn per_tool_budget_counts_from_sequence_when_no_by_tool_map() {
        // Shape emitted by the in-process coding-agent executor: only
        // {total, rejected, sequence, successful}, with no `by_tool` map.
        let calls = ["read", "edit", "edit", "run"];
        let summary = serde_json::json!({
            "total": 4,
            "rejected": 0,
            "sequence": calls,
            "successful": calls,
        });
        let count_of = |tool: &str| live_tool_summary_count(&summary, tool);

        assert_eq!(count_of("edit"), Some(2));
        assert_eq!(count_of("read"), Some(1));
        assert_eq!(count_of("delete"), Some(0));
        assert_eq!(count_of("total"), Some(4));
    }

    #[test]
    fn per_tool_budget_is_enforced_against_sequence_only_summary() {
        let summary = serde_json::json!({
            "total": 3,
            "sequence": ["edit", "edit", "run"],
        });
        // Budget map that limits only the `edit` tool.
        let edit_budget = |limit: usize| BTreeMap::from([(String::from("edit"), limit)]);

        let failures = eval_pack_live_tool_budget_failures(&edit_budget(1), &summary);
        assert_eq!(failures.len(), 1, "edit budget of 1 must trip on 2 edits");
        assert!(failures[0].contains("edit"));

        let within_budget = eval_pack_live_tool_budget_failures(&edit_budget(2), &summary);
        assert!(within_budget.is_empty());
    }

    #[test]
    fn explicit_by_tool_map_still_takes_precedence() {
        // Summary that carries an explicit `byTool` map and no `sequence`.
        let summary = serde_json::json!({
            "total": 1,
            "byTool": {"edit": 1},
        });
        let edit_count = live_tool_summary_count(&summary, "edit");
        assert_eq!(edit_count, Some(1));
    }
}