// harn_vm/orchestration/records.rs

//! Run records, replay fixtures, eval reports, and diff utilities.
2
3use std::collections::{BTreeMap, BTreeSet};
4use std::path::{Path, PathBuf};
5
6use serde::{Deserialize, Serialize};
7
8use super::{
9    default_run_dir, evaluate_context_pack_suggestion_expectations,
10    generate_context_pack_suggestions, new_id, normalize_friction_events_json, now_rfc3339,
11    parse_json_payload, parse_json_value, sync_run_handoffs, ArtifactRecord, CapabilityPolicy,
12    ContextPackSuggestionExpectation, ContextPackSuggestionOptions, FrictionEvent, HandoffArtifact,
13};
14use crate::event_log::{
15    active_event_log, AnyEventLog, EventLog, LogEvent as EventLogRecord, Topic,
16};
17use crate::llm::vm_value_to_json;
18use crate::triggers::{SignatureStatus, TriggerEvent};
19use crate::value::{VmError, VmValue};
20
21pub const ACTION_GRAPH_NODE_KIND_RUN: &str = "run";
22pub const ACTION_GRAPH_NODE_KIND_TRIGGER: &str = "trigger";
23pub const ACTION_GRAPH_NODE_KIND_PREDICATE: &str = "predicate";
24pub const ACTION_GRAPH_NODE_KIND_TRIGGER_PREDICATE: &str = "trigger_predicate";
25pub const ACTION_GRAPH_NODE_KIND_STAGE: &str = "stage";
26pub const ACTION_GRAPH_NODE_KIND_WORKER: &str = "worker";
27pub const ACTION_GRAPH_NODE_KIND_DISPATCH: &str = "dispatch";
28pub const ACTION_GRAPH_NODE_KIND_A2A_HOP: &str = "a2a_hop";
29pub const ACTION_GRAPH_NODE_KIND_WORKER_ENQUEUE: &str = "worker_enqueue";
30pub const ACTION_GRAPH_NODE_KIND_RETRY: &str = "retry";
31pub const ACTION_GRAPH_NODE_KIND_DLQ: &str = "dlq";
32
33pub const ACTION_GRAPH_EDGE_KIND_ENTRY: &str = "entry";
34pub const ACTION_GRAPH_EDGE_KIND_TRIGGER_DISPATCH: &str = "trigger_dispatch";
35pub const ACTION_GRAPH_EDGE_KIND_A2A_DISPATCH: &str = "a2a_dispatch";
36pub const ACTION_GRAPH_EDGE_KIND_PREDICATE_GATE: &str = "predicate_gate";
37pub const ACTION_GRAPH_EDGE_KIND_REPLAY_CHAIN: &str = "replay_chain";
38pub const ACTION_GRAPH_EDGE_KIND_TRANSITION: &str = "transition";
39pub const ACTION_GRAPH_EDGE_KIND_DELEGATES: &str = "delegates";
40pub const ACTION_GRAPH_EDGE_KIND_RETRY: &str = "retry";
41pub const ACTION_GRAPH_EDGE_KIND_DLQ_MOVE: &str = "dlq_move";
42
43#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
44#[serde(default)]
45pub struct LlmUsageRecord {
46    pub input_tokens: i64,
47    pub output_tokens: i64,
48    pub total_duration_ms: i64,
49    pub call_count: i64,
50    pub total_cost: f64,
51    pub models: Vec<String>,
52}
53
54#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
55#[serde(default)]
56pub struct RunStageRecord {
57    pub id: String,
58    pub node_id: String,
59    pub kind: String,
60    pub status: String,
61    pub outcome: String,
62    pub branch: Option<String>,
63    pub started_at: String,
64    pub finished_at: Option<String>,
65    pub visible_text: Option<String>,
66    pub private_reasoning: Option<String>,
67    pub transcript: Option<serde_json::Value>,
68    pub verification: Option<serde_json::Value>,
69    pub usage: Option<LlmUsageRecord>,
70    pub artifacts: Vec<ArtifactRecord>,
71    pub consumed_artifact_ids: Vec<String>,
72    pub produced_artifact_ids: Vec<String>,
73    pub attempts: Vec<RunStageAttemptRecord>,
74    pub metadata: BTreeMap<String, serde_json::Value>,
75}
76
77#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
78#[serde(default)]
79pub struct RunStageAttemptRecord {
80    pub attempt: usize,
81    pub status: String,
82    pub outcome: String,
83    pub branch: Option<String>,
84    pub error: Option<String>,
85    pub verification: Option<serde_json::Value>,
86    pub started_at: String,
87    pub finished_at: Option<String>,
88}
89
90#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
91#[serde(default)]
92pub struct RunTransitionRecord {
93    pub id: String,
94    pub from_stage_id: Option<String>,
95    pub from_node_id: Option<String>,
96    pub to_node_id: String,
97    pub branch: Option<String>,
98    pub timestamp: String,
99    pub consumed_artifact_ids: Vec<String>,
100    pub produced_artifact_ids: Vec<String>,
101}
102
103#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
104#[serde(default)]
105pub struct RunCheckpointRecord {
106    pub id: String,
107    pub ready_nodes: Vec<String>,
108    pub completed_nodes: Vec<String>,
109    pub last_stage_id: Option<String>,
110    pub persisted_at: String,
111    pub reason: String,
112}
113
114#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
115#[serde(default)]
116pub struct ReplayFixture {
117    #[serde(rename = "_type")]
118    pub type_name: String,
119    pub id: String,
120    pub source_run_id: String,
121    pub workflow_id: String,
122    pub workflow_name: Option<String>,
123    pub created_at: String,
124    pub eval_kind: Option<String>,
125    pub clarifying_question: Option<ClarifyingQuestionEvalSpec>,
126    pub expected_status: String,
127    pub stage_assertions: Vec<ReplayStageAssertion>,
128}
129
130#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
131#[serde(default)]
132pub struct ClarifyingQuestionEvalSpec {
133    pub expected_question: Option<String>,
134    pub accepted_questions: Vec<String>,
135    pub required_terms: Vec<String>,
136    pub forbidden_terms: Vec<String>,
137    pub min_questions: usize,
138    pub max_questions: Option<usize>,
139}
140
141#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
142#[serde(default)]
143pub struct ReplayStageAssertion {
144    pub node_id: String,
145    pub expected_status: String,
146    pub expected_outcome: String,
147    pub expected_branch: Option<String>,
148    pub required_artifact_kinds: Vec<String>,
149    pub visible_text_contains: Option<String>,
150}
151
152#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
153#[serde(default)]
154pub struct ReplayEvalReport {
155    pub pass: bool,
156    pub failures: Vec<String>,
157    pub stage_count: usize,
158}
159
160#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
161#[serde(default)]
162pub struct ReplayEvalCaseReport {
163    pub run_id: String,
164    pub workflow_id: String,
165    pub label: Option<String>,
166    pub pass: bool,
167    pub failures: Vec<String>,
168    pub stage_count: usize,
169    pub source_path: Option<String>,
170    pub comparison: Option<RunDiffReport>,
171}
172
173#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
174#[serde(default)]
175pub struct ReplayEvalSuiteReport {
176    pub pass: bool,
177    pub total: usize,
178    pub passed: usize,
179    pub failed: usize,
180    pub cases: Vec<ReplayEvalCaseReport>,
181}
182
183#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
184#[serde(default)]
185pub struct RunDeliverableSummaryRecord {
186    pub id: String,
187    pub text: String,
188    pub status: String,
189    pub note: Option<String>,
190}
191
192#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
193#[serde(default)]
194pub struct RunTaskLedgerSummaryRecord {
195    pub root_task: String,
196    pub rationale: String,
197    pub deliverables: Vec<RunDeliverableSummaryRecord>,
198    pub observations: Vec<String>,
199    pub blocking_count: usize,
200}
201
202#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
203#[serde(default)]
204pub struct RunPlannerRoundRecord {
205    pub stage_id: String,
206    pub node_id: String,
207    pub stage_kind: String,
208    pub status: String,
209    pub outcome: String,
210    pub iteration_count: usize,
211    pub llm_call_count: usize,
212    pub tool_execution_count: usize,
213    pub tool_rejection_count: usize,
214    pub intervention_count: usize,
215    pub compaction_count: usize,
216    pub native_text_tool_fallback_count: usize,
217    pub native_text_tool_fallback_rejection_count: usize,
218    pub empty_completion_retry_count: usize,
219    pub tools_used: Vec<String>,
220    pub successful_tools: Vec<String>,
221    pub ledger_done_rejections: usize,
222    pub task_ledger: Option<RunTaskLedgerSummaryRecord>,
223    pub research_facts: Vec<String>,
224}
225
226#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
227#[serde(default)]
228pub struct RunWorkerLineageRecord {
229    pub worker_id: String,
230    pub worker_name: String,
231    pub parent_stage_id: Option<String>,
232    pub task: String,
233    pub status: String,
234    pub session_id: Option<String>,
235    pub parent_session_id: Option<String>,
236    pub run_id: Option<String>,
237    pub run_path: Option<String>,
238    pub snapshot_path: Option<String>,
239}
240
241#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
242#[serde(default)]
243pub struct RunActionGraphNodeRecord {
244    pub id: String,
245    pub label: String,
246    pub kind: String,
247    pub status: String,
248    pub outcome: String,
249    pub trace_id: Option<String>,
250    pub stage_id: Option<String>,
251    pub node_id: Option<String>,
252    pub worker_id: Option<String>,
253    pub run_id: Option<String>,
254    pub run_path: Option<String>,
255    pub metadata: BTreeMap<String, serde_json::Value>,
256}
257
258#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
259#[serde(default)]
260pub struct RunActionGraphEdgeRecord {
261    pub from_id: String,
262    pub to_id: String,
263    pub kind: String,
264    pub label: Option<String>,
265}
266
267#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
268#[serde(default)]
269pub struct RunVerificationOutcomeRecord {
270    pub stage_id: String,
271    pub node_id: String,
272    pub status: String,
273    pub passed: Option<bool>,
274    pub summary: Option<String>,
275}
276
277#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
278#[serde(default)]
279pub struct RunTranscriptPointerRecord {
280    pub id: String,
281    pub label: String,
282    pub kind: String,
283    pub location: String,
284    pub path: Option<String>,
285    pub available: bool,
286}
287
288#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
289#[serde(default)]
290pub struct CompactionEventRecord {
291    pub id: String,
292    pub transcript_id: Option<String>,
293    pub stage_id: Option<String>,
294    pub node_id: Option<String>,
295    pub mode: String,
296    pub strategy: String,
297    pub archived_messages: usize,
298    pub estimated_tokens_before: usize,
299    pub estimated_tokens_after: usize,
300    pub snapshot_asset_id: Option<String>,
301    pub snapshot_location: String,
302    pub snapshot_path: Option<String>,
303    pub available: bool,
304}
305
306#[derive(Clone, Copy, Debug, Default, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
307#[serde(rename_all = "snake_case")]
308pub enum DaemonEventKindRecord {
309    #[default]
310    Spawned,
311    Triggered,
312    Snapshotted,
313    Resumed,
314    Stopped,
315}
316
317#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
318#[serde(default)]
319pub struct DaemonEventRecord {
320    pub daemon_id: String,
321    pub name: String,
322    pub kind: DaemonEventKindRecord,
323    pub timestamp: String,
324    pub persist_path: String,
325    pub payload_summary: Option<String>,
326}
327
328#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
329#[serde(default)]
330pub struct RunObservabilityRecord {
331    pub schema_version: usize,
332    pub planner_rounds: Vec<RunPlannerRoundRecord>,
333    pub research_fact_count: usize,
334    pub action_graph_nodes: Vec<RunActionGraphNodeRecord>,
335    pub action_graph_edges: Vec<RunActionGraphEdgeRecord>,
336    pub worker_lineage: Vec<RunWorkerLineageRecord>,
337    pub verification_outcomes: Vec<RunVerificationOutcomeRecord>,
338    pub transcript_pointers: Vec<RunTranscriptPointerRecord>,
339    pub compaction_events: Vec<CompactionEventRecord>,
340    pub daemon_events: Vec<DaemonEventRecord>,
341}
342
343#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
344#[serde(default)]
345pub struct RunStageDiffRecord {
346    pub node_id: String,
347    pub change: String,
348    pub details: Vec<String>,
349}
350
351#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
352#[serde(default)]
353pub struct ToolCallDiffRecord {
354    pub tool_name: String,
355    pub args_hash: String,
356    pub result_changed: bool,
357    pub left_result: Option<String>,
358    pub right_result: Option<String>,
359}
360
361#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
362#[serde(default)]
363pub struct RunObservabilityDiffRecord {
364    pub section: String,
365    pub label: String,
366    pub details: Vec<String>,
367}
368
369#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
370#[serde(default)]
371pub struct RunDiffReport {
372    pub left_run_id: String,
373    pub right_run_id: String,
374    pub identical: bool,
375    pub status_changed: bool,
376    pub left_status: String,
377    pub right_status: String,
378    pub stage_diffs: Vec<RunStageDiffRecord>,
379    pub tool_diffs: Vec<ToolCallDiffRecord>,
380    pub observability_diffs: Vec<RunObservabilityDiffRecord>,
381    pub transition_count_delta: isize,
382    pub artifact_count_delta: isize,
383    pub checkpoint_count_delta: isize,
384}
385
386#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
387#[serde(default)]
388pub struct EvalSuiteManifest {
389    #[serde(rename = "_type")]
390    pub type_name: String,
391    pub id: String,
392    pub name: Option<String>,
393    pub base_dir: Option<String>,
394    pub cases: Vec<EvalSuiteCase>,
395}
396
397#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
398#[serde(default)]
399pub struct EvalSuiteCase {
400    pub label: Option<String>,
401    pub run_path: String,
402    pub fixture_path: Option<String>,
403    pub compare_to: Option<String>,
404}
405
406#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
407#[serde(default)]
408pub struct EvalPackManifest {
409    pub version: u32,
410    pub id: String,
411    pub name: Option<String>,
412    pub description: Option<String>,
413    pub base_dir: Option<String>,
414    pub package: Option<EvalPackPackage>,
415    pub defaults: EvalPackDefaults,
416    pub fixtures: Vec<EvalPackFixtureRef>,
417    pub rubrics: Vec<EvalPackRubric>,
418    pub judge: Option<EvalPackJudgeConfig>,
419    pub cases: Vec<EvalPackCase>,
420    pub metadata: BTreeMap<String, serde_json::Value>,
421}
422
423#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
424#[serde(default)]
425pub struct EvalPackPackage {
426    pub name: Option<String>,
427    pub version: Option<String>,
428    pub source: Option<String>,
429    pub templates: Vec<String>,
430}
431
432#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
433#[serde(default)]
434pub struct EvalPackDefaults {
435    pub severity: Option<String>,
436    pub fixture_root: Option<String>,
437    pub thresholds: EvalPackThresholds,
438    pub judge: Option<EvalPackJudgeConfig>,
439}
440
441#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
442#[serde(default)]
443pub struct EvalPackFixtureRef {
444    pub id: String,
445    pub kind: String,
446    pub path: Option<String>,
447    #[serde(default, alias = "trace-id")]
448    pub trace_id: Option<String>,
449    pub provider: Option<String>,
450    #[serde(default, alias = "event-kind")]
451    pub event_kind: Option<String>,
452    pub inline: Option<serde_json::Value>,
453    pub metadata: BTreeMap<String, serde_json::Value>,
454}
455
456#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
457#[serde(default)]
458pub struct EvalPackRubric {
459    pub id: String,
460    pub kind: String,
461    pub description: Option<String>,
462    pub prompt: Option<String>,
463    pub assertions: Vec<EvalPackAssertion>,
464    pub judge: Option<EvalPackJudgeConfig>,
465    pub calibration: Vec<EvalPackGoldenExample>,
466    pub thresholds: EvalPackThresholds,
467    pub metadata: BTreeMap<String, serde_json::Value>,
468}
469
470#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
471#[serde(default)]
472pub struct EvalPackAssertion {
473    pub kind: String,
474    pub stage: Option<String>,
475    pub path: Option<String>,
476    pub op: Option<String>,
477    pub expected: Option<serde_json::Value>,
478    pub contains: Option<String>,
479    pub metadata: BTreeMap<String, serde_json::Value>,
480}
481
482#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
483#[serde(default)]
484pub struct EvalPackJudgeConfig {
485    pub model: Option<String>,
486    #[serde(default, alias = "prompt-version")]
487    pub prompt_version: Option<String>,
488    #[serde(default, alias = "tie-break")]
489    pub tie_break: Option<String>,
490    #[serde(default, alias = "confidence-min")]
491    pub confidence_min: Option<f64>,
492    pub temperature: Option<f64>,
493    pub rubric: Option<String>,
494    pub metadata: BTreeMap<String, serde_json::Value>,
495}
496
497#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
498#[serde(default)]
499pub struct EvalPackGoldenExample {
500    pub input: serde_json::Value,
501    pub output: serde_json::Value,
502    pub score: Option<f64>,
503    pub explanation: Option<String>,
504}
505
506#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
507#[serde(default)]
508pub struct EvalPackThresholds {
509    pub severity: Option<String>,
510    #[serde(default, alias = "min-score")]
511    pub min_score: Option<f64>,
512    #[serde(default, alias = "min-confidence")]
513    pub min_confidence: Option<f64>,
514    #[serde(default, alias = "max-cost-usd")]
515    pub max_cost_usd: Option<f64>,
516    #[serde(default, alias = "max-latency-ms")]
517    pub max_latency_ms: Option<i64>,
518    #[serde(default, alias = "max-tokens")]
519    pub max_tokens: Option<i64>,
520    #[serde(default, alias = "max-stage-count")]
521    pub max_stage_count: Option<usize>,
522}
523
524#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
525#[serde(default)]
526pub struct EvalPackCase {
527    pub id: Option<String>,
528    pub name: Option<String>,
529    pub description: Option<String>,
530    pub run: Option<String>,
531    #[serde(default, alias = "run-path")]
532    pub run_path: Option<String>,
533    #[serde(default, alias = "friction-events", alias = "friction_events")]
534    pub friction_events: Option<String>,
535    pub fixture: Option<String>,
536    #[serde(default, alias = "fixture-path")]
537    pub fixture_path: Option<String>,
538    #[serde(default, alias = "compare-to")]
539    pub compare_to: Option<String>,
540    pub rubrics: Vec<String>,
541    pub severity: Option<String>,
542    pub thresholds: EvalPackThresholds,
543    pub metadata: BTreeMap<String, serde_json::Value>,
544}
545
546#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
547#[serde(default)]
548pub struct EvalPackReport {
549    pub pack_id: String,
550    pub pass: bool,
551    pub total: usize,
552    pub passed: usize,
553    pub failed: usize,
554    pub blocking_failed: usize,
555    pub warning_failed: usize,
556    pub informational_failed: usize,
557    pub cases: Vec<EvalPackCaseReport>,
558}
559
560#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
561#[serde(default)]
562pub struct EvalPackCaseReport {
563    pub id: String,
564    pub label: String,
565    pub severity: String,
566    pub pass: bool,
567    pub blocking: bool,
568    pub run_id: String,
569    pub workflow_id: String,
570    pub source_path: Option<String>,
571    pub stage_count: usize,
572    pub failures: Vec<String>,
573    pub warnings: Vec<String>,
574    pub informational: Vec<String>,
575    pub comparison: Option<RunDiffReport>,
576}
577
578#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
579#[serde(default)]
580pub struct RunHitlQuestionRecord {
581    pub request_id: String,
582    pub prompt: String,
583    pub agent: String,
584    pub trace_id: Option<String>,
585    pub asked_at: String,
586}
587
588#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
589#[serde(default)]
590pub struct RunRecord {
591    #[serde(rename = "_type")]
592    pub type_name: String,
593    pub id: String,
594    pub workflow_id: String,
595    pub workflow_name: Option<String>,
596    pub task: String,
597    pub status: String,
598    pub started_at: String,
599    pub finished_at: Option<String>,
600    pub parent_run_id: Option<String>,
601    pub root_run_id: Option<String>,
602    pub stages: Vec<RunStageRecord>,
603    pub transitions: Vec<RunTransitionRecord>,
604    pub checkpoints: Vec<RunCheckpointRecord>,
605    pub pending_nodes: Vec<String>,
606    pub completed_nodes: Vec<String>,
607    pub child_runs: Vec<RunChildRecord>,
608    pub artifacts: Vec<ArtifactRecord>,
609    pub handoffs: Vec<HandoffArtifact>,
610    pub policy: CapabilityPolicy,
611    pub execution: Option<RunExecutionRecord>,
612    pub transcript: Option<serde_json::Value>,
613    pub usage: Option<LlmUsageRecord>,
614    pub replay_fixture: Option<ReplayFixture>,
615    pub observability: Option<RunObservabilityRecord>,
616    pub trace_spans: Vec<RunTraceSpanRecord>,
617    pub tool_recordings: Vec<ToolCallRecord>,
618    pub hitl_questions: Vec<RunHitlQuestionRecord>,
619    pub metadata: BTreeMap<String, serde_json::Value>,
620    pub persisted_path: Option<String>,
621}
622
623#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
624#[serde(default)]
625pub struct ToolCallRecord {
626    pub tool_name: String,
627    pub tool_use_id: String,
628    pub args_hash: String,
629    pub result: String,
630    pub is_rejected: bool,
631    pub duration_ms: u64,
632    pub iteration: usize,
633    pub timestamp: String,
634}
635
636/// Hash a tool invocation for fixture lookup (name + canonical args JSON).
637pub fn tool_fixture_hash(tool_name: &str, args: &serde_json::Value) -> String {
638    use std::hash::{Hash, Hasher};
639    let mut hasher = std::collections::hash_map::DefaultHasher::new();
640    tool_name.hash(&mut hasher);
641    let args_str = serde_json::to_string(args).unwrap_or_default();
642    args_str.hash(&mut hasher);
643    format!("{:016x}", hasher.finish())
644}
645
646#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
647#[serde(default)]
648pub struct RunTraceSpanRecord {
649    pub span_id: u64,
650    pub parent_id: Option<u64>,
651    pub kind: String,
652    pub name: String,
653    pub start_ms: u64,
654    pub duration_ms: u64,
655    pub metadata: BTreeMap<String, serde_json::Value>,
656}
657
658#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
659#[serde(default)]
660pub struct RunChildRecord {
661    pub worker_id: String,
662    pub worker_name: String,
663    pub parent_stage_id: Option<String>,
664    pub session_id: Option<String>,
665    pub parent_session_id: Option<String>,
666    pub mutation_scope: Option<String>,
667    pub approval_policy: Option<super::ToolApprovalPolicy>,
668    pub task: String,
669    pub request: Option<serde_json::Value>,
670    pub provenance: Option<serde_json::Value>,
671    pub status: String,
672    pub started_at: String,
673    pub finished_at: Option<String>,
674    pub run_id: Option<String>,
675    pub run_path: Option<String>,
676    pub snapshot_path: Option<String>,
677    pub execution: Option<RunExecutionRecord>,
678}
679
680#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
681#[serde(default)]
682pub struct RunExecutionRecord {
683    pub cwd: Option<String>,
684    pub source_dir: Option<String>,
685    pub env: BTreeMap<String, String>,
686    pub adapter: Option<String>,
687    pub repo_path: Option<String>,
688    pub worktree_path: Option<String>,
689    pub branch: Option<String>,
690    pub base_ref: Option<String>,
691    pub cleanup: Option<String>,
692}
693
694fn compact_json_value(value: &serde_json::Value) -> String {
695    serde_json::to_string(value).unwrap_or_else(|_| value.to_string())
696}
697
698fn normalize_question_text(text: &str) -> String {
699    text.chars()
700        .map(|ch| {
701            if ch.is_ascii_alphanumeric() || ch.is_whitespace() {
702                ch.to_ascii_lowercase()
703            } else {
704                ' '
705            }
706        })
707        .collect::<String>()
708        .split_whitespace()
709        .collect::<Vec<_>>()
710        .join(" ")
711}
712
713fn clarifying_min_questions(spec: &ClarifyingQuestionEvalSpec) -> usize {
714    spec.min_questions.max(1)
715}
716
717fn clarifying_max_questions(spec: &ClarifyingQuestionEvalSpec) -> usize {
718    spec.max_questions.unwrap_or(1).max(1)
719}
720
721fn read_topic_records(
722    log: &AnyEventLog,
723    topic: &Topic,
724) -> Vec<(crate::event_log::EventId, EventLogRecord)> {
725    let mut from = None;
726    let mut records = Vec::new();
727    loop {
728        let batch =
729            futures::executor::block_on(log.read_range(topic, from, 256)).unwrap_or_default();
730        if batch.is_empty() {
731            break;
732        }
733        from = batch.last().map(|(event_id, _)| *event_id);
734        records.extend(batch);
735    }
736    records
737}
738
739fn merge_hitl_questions_from_active_log(run: &mut RunRecord) {
740    let Some(log) = active_event_log() else {
741        return;
742    };
743    let topic = Topic::new(crate::HITL_QUESTIONS_TOPIC)
744        .expect("static hitl.questions topic should always be valid");
745    let mut merged = run
746        .hitl_questions
747        .iter()
748        .cloned()
749        .map(|question| (question.request_id.clone(), question))
750        .collect::<BTreeMap<_, _>>();
751
752    for (_, event) in read_topic_records(log.as_ref(), &topic) {
753        if event.kind != "hitl.question_asked" {
754            continue;
755        }
756        let payload = &event.payload;
757        let matches_run = event
758            .headers
759            .get("run_id")
760            .is_some_and(|value| value == &run.id)
761            || payload
762                .get("run_id")
763                .and_then(|value| value.as_str())
764                .is_some_and(|value| value == run.id);
765        if !matches_run {
766            continue;
767        }
768        let request_id = payload
769            .get("request_id")
770            .and_then(|value| value.as_str())
771            .or_else(|| event.headers.get("request_id").map(String::as_str))
772            .unwrap_or_default();
773        let prompt = payload
774            .get("payload")
775            .and_then(|value| value.get("prompt"))
776            .and_then(|value| value.as_str())
777            .unwrap_or_default();
778        if request_id.is_empty() || prompt.is_empty() {
779            continue;
780        }
781        merged.insert(
782            request_id.to_string(),
783            RunHitlQuestionRecord {
784                request_id: request_id.to_string(),
785                prompt: prompt.to_string(),
786                agent: payload
787                    .get("agent")
788                    .and_then(|value| value.as_str())
789                    .unwrap_or_default()
790                    .to_string(),
791                trace_id: payload
792                    .get("trace_id")
793                    .and_then(|value| value.as_str())
794                    .map(str::to_string),
795                asked_at: payload
796                    .get("requested_at")
797                    .and_then(|value| value.as_str())
798                    .unwrap_or_default()
799                    .to_string(),
800            },
801        );
802    }
803
804    run.hitl_questions = merged.into_values().collect();
805    run.hitl_questions.sort_by(|left, right| {
806        (left.asked_at.as_str(), left.request_id.as_str())
807            .cmp(&(right.asked_at.as_str(), right.request_id.as_str()))
808    });
809}
810
811fn signature_status_label(status: &SignatureStatus) -> &'static str {
812    match status {
813        SignatureStatus::Verified => "verified",
814        SignatureStatus::Unsigned => "unsigned",
815        SignatureStatus::Failed { .. } => "failed",
816    }
817}
818
819fn trigger_event_from_run(run: &RunRecord) -> Option<TriggerEvent> {
820    run.metadata
821        .get("trigger_event")
822        .cloned()
823        .and_then(|value| serde_json::from_value(value).ok())
824}
825
826fn run_trace_id(run: &RunRecord, trigger_event: Option<&TriggerEvent>) -> Option<String> {
827    trigger_event
828        .map(|event| event.trace_id.0.clone())
829        .or_else(|| {
830            run.metadata
831                .get("trace_id")
832                .and_then(|value| value.as_str())
833                .map(str::to_string)
834        })
835}
836
837fn replay_of_event_id_from_run(run: &RunRecord) -> Option<String> {
838    run.metadata
839        .get("replay_of_event_id")
840        .and_then(|value| value.as_str())
841        .map(str::to_string)
842}
843
844fn action_graph_kind_for_stage(stage: &RunStageRecord) -> &'static str {
845    if stage.kind == "condition" {
846        ACTION_GRAPH_NODE_KIND_PREDICATE
847    } else {
848        ACTION_GRAPH_NODE_KIND_STAGE
849    }
850}
851
852fn trigger_node_metadata(trigger_event: &TriggerEvent) -> BTreeMap<String, serde_json::Value> {
853    let mut metadata = BTreeMap::new();
854    metadata.insert(
855        "provider".to_string(),
856        serde_json::json!(trigger_event.provider.as_str()),
857    );
858    metadata.insert(
859        "event_kind".to_string(),
860        serde_json::json!(trigger_event.kind),
861    );
862    metadata.insert(
863        "dedupe_key".to_string(),
864        serde_json::json!(trigger_event.dedupe_key),
865    );
866    metadata.insert(
867        "signature_status".to_string(),
868        serde_json::json!(signature_status_label(&trigger_event.signature_status)),
869    );
870    metadata
871}
872
873fn stage_node_metadata(stage: &RunStageRecord) -> BTreeMap<String, serde_json::Value> {
874    let mut metadata = BTreeMap::new();
875    metadata.insert("stage_kind".to_string(), serde_json::json!(stage.kind));
876    if let Some(branch) = stage.branch.as_ref() {
877        metadata.insert("branch".to_string(), serde_json::json!(branch));
878    }
879    if let Some(worker_id) = stage
880        .metadata
881        .get("worker_id")
882        .and_then(|value| value.as_str())
883    {
884        metadata.insert("worker_id".to_string(), serde_json::json!(worker_id));
885    }
886    metadata
887}
888
889fn append_action_graph_node(
890    nodes: &mut Vec<RunActionGraphNodeRecord>,
891    record: RunActionGraphNodeRecord,
892) {
893    nodes.push(record);
894}
895
896pub async fn append_action_graph_update(
897    headers: BTreeMap<String, String>,
898    payload: serde_json::Value,
899) -> Result<(), crate::event_log::LogError> {
900    let Some(log) = active_event_log() else {
901        return Ok(());
902    };
903    let topic = Topic::new("observability.action_graph")
904        .expect("static observability.action_graph topic should always be valid");
905    let record = EventLogRecord::new("action_graph_update", payload).with_headers(headers);
906    log.append(&topic, record).await.map(|_| ())
907}
908
/// Publishes the derived observability snapshot for a run to the
/// `observability.action_graph` topic, best-effort: failures to append are
/// ignored in both execution paths.
fn publish_action_graph_event(
    run: &RunRecord,
    observability: &RunObservabilityRecord,
    path: &Path,
) {
    let trigger_event = trigger_event_from_run(run);
    // Correlation headers: run and workflow ids always; a trace id only
    // when one can be derived from the run or its trigger event.
    let mut headers = BTreeMap::new();
    headers.insert("run_id".to_string(), run.id.clone());
    headers.insert("workflow_id".to_string(), run.workflow_id.clone());
    if let Some(trace_id) = run_trace_id(run, trigger_event.as_ref()) {
        headers.insert("trace_id".to_string(), trace_id);
    }
    let payload = serde_json::json!({
        "run_id": run.id,
        "workflow_id": run.workflow_id,
        "persisted_path": path.to_string_lossy(),
        "status": run.status,
        "observability": observability,
    });
    // Inside a tokio runtime, detach the append as a spawned task so the
    // caller never blocks; otherwise fall back to blocking on the future.
    if let Ok(handle) = tokio::runtime::Handle::try_current() {
        handle.spawn(async move {
            let _ = append_action_graph_update(headers, payload).await;
        });
    } else {
        let _ = futures::executor::block_on(append_action_graph_update(headers, payload));
    }
}
936
/// Computes the LLM transcript sidecar location for a persisted run file:
/// `<run-stem>-llm/llm_transcript.jsonl` next to the run file.
///
/// Returns `None` when the run path has no UTF-8 file stem.
fn llm_transcript_sidecar_path(run_path: &Path) -> Option<PathBuf> {
    let parent = run_path.parent().unwrap_or_else(|| Path::new("."));
    run_path
        .file_stem()
        .and_then(|stem| stem.to_str())
        .map(|stem| parent.join(format!("{stem}-llm/llm_transcript.jsonl")))
}
942
943fn json_string_array(value: Option<&serde_json::Value>) -> Vec<String> {
944    value
945        .and_then(|value| value.as_array())
946        .map(|items| {
947            items
948                .iter()
949                .filter_map(|item| item.as_str().map(str::to_string))
950                .collect::<Vec<_>>()
951        })
952        .unwrap_or_default()
953}
954
955fn json_usize(value: Option<&serde_json::Value>) -> usize {
956    value.and_then(|value| value.as_u64()).unwrap_or_default() as usize
957}
958
959fn json_bool(value: Option<&serde_json::Value>) -> Option<bool> {
960    value.and_then(|value| value.as_bool())
961}
962
963fn stage_result_payload(stage: &RunStageRecord) -> Option<&serde_json::Value> {
964    stage
965        .artifacts
966        .iter()
967        .find_map(|artifact| artifact.data.as_ref())
968}
969
970fn task_ledger_summary_from_value(value: &serde_json::Value) -> Option<RunTaskLedgerSummaryRecord> {
971    let deliverables = value
972        .get("deliverables")
973        .and_then(|raw| raw.as_array())
974        .map(|items| {
975            items
976                .iter()
977                .map(|item| RunDeliverableSummaryRecord {
978                    id: item
979                        .get("id")
980                        .and_then(|value| value.as_str())
981                        .unwrap_or_default()
982                        .to_string(),
983                    text: item
984                        .get("text")
985                        .and_then(|value| value.as_str())
986                        .unwrap_or_default()
987                        .to_string(),
988                    status: item
989                        .get("status")
990                        .and_then(|value| value.as_str())
991                        .unwrap_or_default()
992                        .to_string(),
993                    note: item
994                        .get("note")
995                        .and_then(|value| value.as_str())
996                        .map(str::to_string),
997                })
998                .collect::<Vec<_>>()
999        })
1000        .unwrap_or_default();
1001    let observations = json_string_array(value.get("observations"));
1002    let root_task = value
1003        .get("root_task")
1004        .and_then(|value| value.as_str())
1005        .unwrap_or_default()
1006        .to_string();
1007    let rationale = value
1008        .get("rationale")
1009        .and_then(|value| value.as_str())
1010        .unwrap_or_default()
1011        .to_string();
1012    if root_task.is_empty()
1013        && rationale.is_empty()
1014        && deliverables.is_empty()
1015        && observations.is_empty()
1016    {
1017        return None;
1018    }
1019    let blocking_count = deliverables
1020        .iter()
1021        .filter(|deliverable| matches!(deliverable.status.as_str(), "open" | "blocked"))
1022        .count();
1023    Some(RunTaskLedgerSummaryRecord {
1024        root_task,
1025        rationale,
1026        deliverables,
1027        observations,
1028        blocking_count,
1029    })
1030}
1031
/// Extracts compaction events from an embedded LLM transcript value.
///
/// `stage_id`/`node_id` tag each event with its owning stage when the
/// transcript belongs to one. `location_prefix` is the JSON-path-like
/// location of the transcript inside the run record (e.g.
/// `run.transcript`); snapshot locations are rendered relative to it.
/// `persisted_path` is the run file's on-disk path, recorded as the
/// snapshot path when known.
fn compaction_events_from_transcript(
    transcript: &serde_json::Value,
    stage_id: Option<&str>,
    node_id: Option<&str>,
    location_prefix: &str,
    persisted_path: Option<&Path>,
) -> Vec<CompactionEventRecord> {
    let transcript_id = transcript
        .get("id")
        .and_then(|value| value.as_str())
        .map(str::to_string);
    // Ids of assets embedded in the transcript; used below to decide
    // whether a compaction's snapshot asset is actually present.
    let asset_ids = transcript
        .get("assets")
        .and_then(|value| value.as_array())
        .map(|assets| {
            assets
                .iter()
                .filter_map(|asset| {
                    asset
                        .get("id")
                        .and_then(|value| value.as_str())
                        .map(str::to_string)
                })
                .collect::<BTreeSet<_>>()
        })
        .unwrap_or_default();
    // Keep only "compaction" events and project each into a record,
    // defaulting missing metadata fields to empty strings / zero.
    transcript
        .get("events")
        .and_then(|value| value.as_array())
        .map(|events| {
            events
                .iter()
                .filter(|event| {
                    event.get("kind").and_then(|value| value.as_str()) == Some("compaction")
                })
                .map(|event| {
                    let metadata = event.get("metadata");
                    let snapshot_asset_id = metadata
                        .and_then(|value| value.get("snapshot_asset_id"))
                        .and_then(|value| value.as_str())
                        .map(str::to_string);
                    // Available only when the referenced asset exists in
                    // the transcript's asset list.
                    let available = snapshot_asset_id
                        .as_ref()
                        .is_some_and(|asset_id| asset_ids.contains(asset_id));
                    let snapshot_location = snapshot_asset_id
                        .as_ref()
                        .map(|asset_id| format!("{location_prefix}.assets[{asset_id}]"))
                        .unwrap_or_else(|| location_prefix.to_string());
                    CompactionEventRecord {
                        id: event
                            .get("id")
                            .and_then(|value| value.as_str())
                            .unwrap_or_default()
                            .to_string(),
                        transcript_id: transcript_id.clone(),
                        stage_id: stage_id.map(str::to_string),
                        node_id: node_id.map(str::to_string),
                        mode: metadata
                            .and_then(|value| value.get("mode"))
                            .and_then(|value| value.as_str())
                            .unwrap_or_default()
                            .to_string(),
                        strategy: metadata
                            .and_then(|value| value.get("strategy"))
                            .and_then(|value| value.as_str())
                            .unwrap_or_default()
                            .to_string(),
                        archived_messages: json_usize(
                            metadata.and_then(|value| value.get("archived_messages")),
                        ),
                        estimated_tokens_before: json_usize(
                            metadata.and_then(|value| value.get("estimated_tokens_before")),
                        ),
                        estimated_tokens_after: json_usize(
                            metadata.and_then(|value| value.get("estimated_tokens_after")),
                        ),
                        snapshot_asset_id,
                        snapshot_location,
                        snapshot_path: persisted_path
                            .map(|path| path.to_string_lossy().into_owned()),
                        available,
                    }
                })
                .collect()
        })
        .unwrap_or_default()
}
1119
1120fn daemon_events_from_sidecar(run_path: &Path) -> Vec<DaemonEventRecord> {
1121    let Some(sidecar_path) = llm_transcript_sidecar_path(run_path) else {
1122        return Vec::new();
1123    };
1124    let Ok(content) = std::fs::read_to_string(sidecar_path) else {
1125        return Vec::new();
1126    };
1127
1128    content
1129        .lines()
1130        .filter(|line| !line.trim().is_empty())
1131        .filter_map(|line| serde_json::from_str::<serde_json::Value>(line).ok())
1132        .filter(|event| event.get("type").and_then(|value| value.as_str()) == Some("daemon_event"))
1133        .filter_map(|event| serde_json::from_value::<DaemonEventRecord>(event).ok())
1134        .collect()
1135}
1136
/// Derives a run's observability record from its run record: the action
/// graph (run, trigger, stage, and worker nodes plus connecting edges),
/// planner-round summaries, verification outcomes, transcript pointers,
/// compaction events, daemon events, and worker lineage.
///
/// `persisted_path` is the on-disk location of the run file, when known;
/// it is used to locate the LLM transcript sidecar and to label snapshot
/// paths on compaction events.
pub fn derive_run_observability(
    run: &RunRecord,
    persisted_path: Option<&Path>,
) -> RunObservabilityRecord {
    let mut action_graph_nodes = Vec::new();
    let mut action_graph_edges = Vec::new();
    let mut verification_outcomes = Vec::new();
    let mut planner_rounds = Vec::new();
    let mut transcript_pointers = Vec::new();
    let mut compaction_events = Vec::new();
    let mut daemon_events = Vec::new();
    let mut research_fact_count = 0usize;

    // The run itself is the root node of the action graph; its trace id is
    // propagated onto every node derived from this run.
    let root_node_id = format!("run:{}", run.id);
    let trigger_event = trigger_event_from_run(run);
    let propagated_trace_id = run_trace_id(run, trigger_event.as_ref());
    append_action_graph_node(
        &mut action_graph_nodes,
        RunActionGraphNodeRecord {
            id: root_node_id.clone(),
            label: run
                .workflow_name
                .clone()
                .unwrap_or_else(|| run.workflow_id.clone()),
            kind: ACTION_GRAPH_NODE_KIND_RUN.to_string(),
            status: run.status.clone(),
            outcome: run.status.clone(),
            trace_id: propagated_trace_id.clone(),
            stage_id: None,
            node_id: None,
            worker_id: None,
            run_id: Some(run.id.clone()),
            run_path: run.persisted_path.clone(),
            metadata: BTreeMap::from([(
                "workflow_id".to_string(),
                serde_json::json!(run.workflow_id),
            )]),
        },
    );
    // Entry edges for stages hang off the trigger node when one exists,
    // otherwise directly off the run root.
    let mut entry_node_id = root_node_id.clone();
    if let Some(trigger_event) = trigger_event.as_ref() {
        // When this run replays an earlier event, add a historical node
        // for the original event and chain it to the current trigger.
        if let Some(replay_of_event_id) = replay_of_event_id_from_run(run) {
            let replay_source_node_id = format!("trigger:{replay_of_event_id}");
            append_action_graph_node(
                &mut action_graph_nodes,
                RunActionGraphNodeRecord {
                    id: replay_source_node_id.clone(),
                    label: format!(
                        "{}:{} (original {})",
                        trigger_event.provider.as_str(),
                        trigger_event.kind,
                        replay_of_event_id
                    ),
                    kind: ACTION_GRAPH_NODE_KIND_TRIGGER.to_string(),
                    status: "historical".to_string(),
                    outcome: "replayed_from".to_string(),
                    trace_id: Some(trigger_event.trace_id.0.clone()),
                    stage_id: None,
                    node_id: None,
                    worker_id: None,
                    run_id: Some(run.id.clone()),
                    run_path: run.persisted_path.clone(),
                    metadata: trigger_node_metadata(trigger_event),
                },
            );
            action_graph_edges.push(RunActionGraphEdgeRecord {
                from_id: replay_source_node_id,
                to_id: format!("trigger:{}", trigger_event.id.0),
                kind: ACTION_GRAPH_EDGE_KIND_REPLAY_CHAIN.to_string(),
                label: Some("replay chain".to_string()),
            });
        }
        // The trigger node proper, linked from the run root by an entry
        // edge labelled with the trigger event id.
        let trigger_node_id = format!("trigger:{}", trigger_event.id.0);
        append_action_graph_node(
            &mut action_graph_nodes,
            RunActionGraphNodeRecord {
                id: trigger_node_id.clone(),
                label: format!("{}:{}", trigger_event.provider.as_str(), trigger_event.kind),
                kind: ACTION_GRAPH_NODE_KIND_TRIGGER.to_string(),
                status: "received".to_string(),
                outcome: signature_status_label(&trigger_event.signature_status).to_string(),
                trace_id: Some(trigger_event.trace_id.0.clone()),
                stage_id: None,
                node_id: None,
                worker_id: None,
                run_id: Some(run.id.clone()),
                run_path: run.persisted_path.clone(),
                metadata: trigger_node_metadata(trigger_event),
            },
        );
        action_graph_edges.push(RunActionGraphEdgeRecord {
            from_id: root_node_id.clone(),
            to_id: trigger_node_id.clone(),
            kind: ACTION_GRAPH_EDGE_KIND_ENTRY.to_string(),
            label: Some(trigger_event.id.0.clone()),
        });
        entry_node_id = trigger_node_id;
    }

    // Lookup tables used while wiring edges: stage id -> graph node id,
    // stage id -> stage record, workflow node id -> graph node id.
    let stage_node_ids = run
        .stages
        .iter()
        .map(|stage| (stage.id.clone(), format!("stage:{}", stage.id)))
        .collect::<BTreeMap<_, _>>();
    let stage_by_id = run
        .stages
        .iter()
        .map(|stage| (stage.id.as_str(), stage))
        .collect::<BTreeMap<_, _>>();
    let stage_by_node_id = run
        .stages
        .iter()
        .map(|stage| (stage.node_id.clone(), format!("stage:{}", stage.id)))
        .collect::<BTreeMap<_, _>>();

    // Workflow nodes that are the target of some transition; stages whose
    // node is never a target are treated as entry stages below.
    let incoming_nodes = run
        .transitions
        .iter()
        .map(|transition| transition.to_node_id.clone())
        .collect::<BTreeSet<_>>();

    for stage in &run.stages {
        let graph_node_id = stage_node_ids
            .get(&stage.id)
            .cloned()
            .unwrap_or_else(|| format!("stage:{}", stage.id));
        append_action_graph_node(
            &mut action_graph_nodes,
            RunActionGraphNodeRecord {
                id: graph_node_id.clone(),
                label: stage.node_id.clone(),
                kind: action_graph_kind_for_stage(stage).to_string(),
                status: stage.status.clone(),
                outcome: stage.outcome.clone(),
                trace_id: propagated_trace_id.clone(),
                stage_id: Some(stage.id.clone()),
                node_id: Some(stage.node_id.clone()),
                worker_id: stage
                    .metadata
                    .get("worker_id")
                    .and_then(|value| value.as_str())
                    .map(str::to_string),
                run_id: None,
                run_path: None,
                metadata: stage_node_metadata(stage),
            },
        );
        // Entry stages (no incoming transition) connect from the entry
        // node: a trigger-dispatch edge when a trigger exists, otherwise a
        // plain entry edge from the run root.
        if !incoming_nodes.contains(&stage.node_id) {
            action_graph_edges.push(RunActionGraphEdgeRecord {
                from_id: entry_node_id.clone(),
                to_id: graph_node_id.clone(),
                kind: if trigger_event.is_some() {
                    ACTION_GRAPH_EDGE_KIND_TRIGGER_DISPATCH.to_string()
                } else {
                    ACTION_GRAPH_EDGE_KIND_ENTRY.to_string()
                },
                label: None,
            });
        }

        // Verification outcome: prefer an explicit "pass" flag, then
        // "success", then fall back to inferring from stage status/outcome.
        if stage.kind == "verify" || stage.verification.is_some() {
            let passed = json_bool(
                stage
                    .verification
                    .as_ref()
                    .and_then(|value| value.get("pass")),
            )
            .or_else(|| {
                json_bool(
                    stage
                        .verification
                        .as_ref()
                        .and_then(|value| value.get("success")),
                )
            })
            .or_else(|| {
                if stage.status == "completed" && stage.outcome == "success" {
                    Some(true)
                } else if stage.status == "failed" || stage.outcome == "failed" {
                    Some(false)
                } else {
                    None
                }
            });
            verification_outcomes.push(RunVerificationOutcomeRecord {
                stage_id: stage.id.clone(),
                node_id: stage.node_id.clone(),
                status: stage.status.clone(),
                passed,
                summary: stage
                    .verification
                    .as_ref()
                    .map(compact_json_value)
                    .or_else(|| {
                        stage
                            .visible_text
                            .as_ref()
                            .filter(|value| !value.trim().is_empty())
                            .cloned()
                    }),
            });
        }

        // Stage-embedded transcript: record a pointer plus any compaction
        // events found inside it.
        if stage.transcript.is_some() {
            transcript_pointers.push(RunTranscriptPointerRecord {
                id: format!("stage:{}:transcript", stage.id),
                label: format!("Stage {} transcript", stage.node_id),
                kind: "embedded_transcript".to_string(),
                location: format!("run.stages[{}].transcript", stage.node_id),
                path: run.persisted_path.clone(),
                available: true,
            });
            if let Some(transcript) = stage.transcript.as_ref() {
                compaction_events.extend(compaction_events_from_transcript(
                    transcript,
                    Some(&stage.id),
                    Some(&stage.node_id),
                    &format!("run.stages[{}].transcript", stage.node_id),
                    persisted_path,
                ));
            }
        }

        // Planner-round summary derived from the stage's result payload;
        // only kept when it carries actual agentic detail.
        if let Some(payload) = stage_result_payload(stage) {
            let trace = payload.get("trace");
            let task_ledger = payload
                .get("task_ledger")
                .and_then(task_ledger_summary_from_value);
            let research_facts = task_ledger
                .as_ref()
                .map(|ledger| ledger.observations.clone())
                .unwrap_or_default();
            research_fact_count += research_facts.len();
            let tools_payload = payload.get("tools");
            let tools_used = json_string_array(
                tools_payload
                    .and_then(|tools| tools.get("calls"))
                    .or_else(|| trace.and_then(|trace| trace.get("tools_used"))),
            );
            let successful_tools =
                json_string_array(tools_payload.and_then(|tools| tools.get("successful")));
            let planner_round = RunPlannerRoundRecord {
                stage_id: stage.id.clone(),
                node_id: stage.node_id.clone(),
                stage_kind: stage.kind.clone(),
                status: stage.status.clone(),
                outcome: stage.outcome.clone(),
                iteration_count: json_usize(trace.and_then(|trace| trace.get("iterations"))),
                llm_call_count: json_usize(trace.and_then(|trace| trace.get("llm_calls"))),
                tool_execution_count: json_usize(
                    trace.and_then(|trace| trace.get("tool_executions")),
                ),
                tool_rejection_count: json_usize(
                    trace.and_then(|trace| trace.get("tool_rejections")),
                ),
                intervention_count: json_usize(trace.and_then(|trace| trace.get("interventions"))),
                compaction_count: json_usize(trace.and_then(|trace| trace.get("compactions"))),
                native_text_tool_fallback_count: json_usize(
                    trace.and_then(|trace| trace.get("native_text_tool_fallbacks")),
                ),
                native_text_tool_fallback_rejection_count: json_usize(
                    trace.and_then(|trace| trace.get("native_text_tool_fallback_rejections")),
                ),
                empty_completion_retry_count: json_usize(
                    trace.and_then(|trace| trace.get("empty_completion_retries")),
                ),
                tools_used,
                successful_tools,
                ledger_done_rejections: json_usize(payload.get("ledger_done_rejections")),
                task_ledger,
                research_facts,
            };
            let has_agentic_detail = planner_round.iteration_count > 0
                || planner_round.llm_call_count > 0
                || planner_round.tool_execution_count > 0
                || planner_round.native_text_tool_fallback_count > 0
                || planner_round.native_text_tool_fallback_rejection_count > 0
                || planner_round.empty_completion_retry_count > 0
                || planner_round.ledger_done_rejections > 0
                || planner_round.task_ledger.is_some()
                || !planner_round.tools_used.is_empty()
                || !planner_round.successful_tools.is_empty();
            if has_agentic_detail {
                planner_rounds.push(planner_round);
            }
        }
    }

    // Edges for recorded stage transitions. A transition out of a
    // condition stage is a predicate gate; everything else is a plain
    // transition. Unresolvable sources fall back to the run root.
    for transition in &run.transitions {
        let Some(to_id) = stage_by_node_id.get(&transition.to_node_id).cloned() else {
            continue;
        };
        let from_stage = transition
            .from_stage_id
            .as_deref()
            .and_then(|stage_id| stage_by_id.get(stage_id).copied());
        let from_id = transition
            .from_stage_id
            .as_ref()
            .and_then(|stage_id| stage_node_ids.get(stage_id))
            .cloned()
            .or_else(|| {
                transition
                    .from_node_id
                    .as_ref()
                    .and_then(|node_id| stage_by_node_id.get(node_id))
                    .cloned()
            })
            .unwrap_or_else(|| root_node_id.clone());
        action_graph_edges.push(RunActionGraphEdgeRecord {
            from_id,
            to_id,
            kind: if from_stage.is_some_and(|stage| stage.kind == "condition") {
                ACTION_GRAPH_EDGE_KIND_PREDICATE_GATE.to_string()
            } else {
                ACTION_GRAPH_EDGE_KIND_TRANSITION.to_string()
            },
            label: transition.branch.clone(),
        });
    }

    // Child runs become worker nodes (linked from their parent stage when
    // known) and are also summarized as worker lineage records.
    let worker_lineage = run
        .child_runs
        .iter()
        .map(|child| {
            let worker_node_id = format!("worker:{}", child.worker_id);
            append_action_graph_node(
                &mut action_graph_nodes,
                RunActionGraphNodeRecord {
                    id: worker_node_id.clone(),
                    label: child.worker_name.clone(),
                    kind: ACTION_GRAPH_NODE_KIND_WORKER.to_string(),
                    status: child.status.clone(),
                    outcome: child.status.clone(),
                    trace_id: propagated_trace_id.clone(),
                    stage_id: child.parent_stage_id.clone(),
                    node_id: None,
                    worker_id: Some(child.worker_id.clone()),
                    run_id: child.run_id.clone(),
                    run_path: child.run_path.clone(),
                    metadata: BTreeMap::from([
                        (
                            "worker_name".to_string(),
                            serde_json::json!(child.worker_name),
                        ),
                        ("task".to_string(), serde_json::json!(child.task)),
                    ]),
                },
            );
            if let Some(parent_stage_id) = child.parent_stage_id.as_ref() {
                if let Some(stage_node_id) = stage_node_ids.get(parent_stage_id) {
                    action_graph_edges.push(RunActionGraphEdgeRecord {
                        from_id: stage_node_id.clone(),
                        to_id: worker_node_id,
                        kind: ACTION_GRAPH_EDGE_KIND_DELEGATES.to_string(),
                        label: Some(child.worker_name.clone()),
                    });
                }
            }
            RunWorkerLineageRecord {
                worker_id: child.worker_id.clone(),
                worker_name: child.worker_name.clone(),
                parent_stage_id: child.parent_stage_id.clone(),
                task: child.task.clone(),
                status: child.status.clone(),
                session_id: child.session_id.clone(),
                parent_session_id: child.parent_session_id.clone(),
                run_id: child.run_id.clone(),
                run_path: child.run_path.clone(),
                snapshot_path: child.snapshot_path.clone(),
            }
        })
        .collect::<Vec<_>>();

    // Run-level embedded transcript: pointer plus compaction events.
    if run.transcript.is_some() {
        transcript_pointers.push(RunTranscriptPointerRecord {
            id: "run:transcript".to_string(),
            label: "Run transcript".to_string(),
            kind: "embedded_transcript".to_string(),
            location: "run.transcript".to_string(),
            path: run.persisted_path.clone(),
            available: true,
        });
        if let Some(transcript) = run.transcript.as_ref() {
            compaction_events.extend(compaction_events_from_transcript(
                transcript,
                None,
                None,
                "run.transcript",
                persisted_path,
            ));
        }
    }

    // Sidecar files only exist for persisted runs: point at the LLM
    // transcript sidecar and collect any daemon events recorded in it.
    if let Some(path) = persisted_path {
        if let Some(sidecar_path) = llm_transcript_sidecar_path(path) {
            transcript_pointers.push(RunTranscriptPointerRecord {
                id: "run:llm_transcript".to_string(),
                label: "LLM transcript sidecar".to_string(),
                kind: "llm_jsonl".to_string(),
                location: "run sidecar".to_string(),
                path: Some(sidecar_path.to_string_lossy().into_owned()),
                available: sidecar_path.exists(),
            });
        }
        daemon_events.extend(daemon_events_from_sidecar(path));
    }

    RunObservabilityRecord {
        schema_version: 4,
        planner_rounds,
        research_fact_count,
        action_graph_nodes,
        action_graph_edges,
        worker_lineage,
        verification_outcomes,
        transcript_pointers,
        compaction_events,
        daemon_events,
    }
}
1558
1559fn refresh_run_observability(run: &mut RunRecord, persisted_path: Option<&Path>) {
1560    run.observability = Some(derive_run_observability(run, persisted_path));
1561}
1562
1563pub fn normalize_run_record(value: &VmValue) -> Result<RunRecord, VmError> {
1564    let mut run: RunRecord = parse_json_payload(vm_value_to_json(value), "run_record")?;
1565    if run.type_name.is_empty() {
1566        run.type_name = "run_record".to_string();
1567    }
1568    if run.id.is_empty() {
1569        run.id = new_id("run");
1570    }
1571    if run.started_at.is_empty() {
1572        run.started_at = now_rfc3339();
1573    }
1574    if run.status.is_empty() {
1575        run.status = "running".to_string();
1576    }
1577    if run.root_run_id.is_none() {
1578        run.root_run_id = Some(run.id.clone());
1579    }
1580    if run.replay_fixture.is_none() {
1581        run.replay_fixture = Some(replay_fixture_from_run(&run));
1582    }
1583    merge_hitl_questions_from_active_log(&mut run);
1584    sync_run_handoffs(&mut run);
1585    if run.observability.is_none() {
1586        let persisted_path = run.persisted_path.clone();
1587        let persisted = persisted_path.as_deref().map(Path::new);
1588        refresh_run_observability(&mut run, persisted);
1589    }
1590    Ok(run)
1591}
1592
1593pub fn normalize_eval_suite_manifest(value: &VmValue) -> Result<EvalSuiteManifest, VmError> {
1594    let mut manifest: EvalSuiteManifest = parse_json_value(value)?;
1595    if manifest.type_name.is_empty() {
1596        manifest.type_name = "eval_suite_manifest".to_string();
1597    }
1598    if manifest.id.is_empty() {
1599        manifest.id = new_id("eval_suite");
1600    }
1601    Ok(manifest)
1602}
1603
1604pub fn load_eval_suite_manifest(path: &Path) -> Result<EvalSuiteManifest, VmError> {
1605    let content = std::fs::read_to_string(path)
1606        .map_err(|e| VmError::Runtime(format!("failed to read eval suite manifest: {e}")))?;
1607    let mut manifest: EvalSuiteManifest = serde_json::from_str(&content)
1608        .map_err(|e| VmError::Runtime(format!("failed to parse eval suite manifest: {e}")))?;
1609    if manifest.base_dir.is_none() {
1610        manifest.base_dir = path.parent().map(|parent| parent.display().to_string());
1611    }
1612    Ok(manifest)
1613}
1614
1615pub fn load_eval_pack_manifest(path: &Path) -> Result<EvalPackManifest, VmError> {
1616    let content = std::fs::read_to_string(path)
1617        .map_err(|e| VmError::Runtime(format!("failed to read eval pack manifest: {e}")))?;
1618    let mut manifest: EvalPackManifest =
1619        if path.extension().and_then(|ext| ext.to_str()) == Some("json") {
1620            serde_json::from_str(&content)
1621                .map_err(|e| VmError::Runtime(format!("failed to parse eval pack JSON: {e}")))?
1622        } else {
1623            toml::from_str(&content)
1624                .map_err(|e| VmError::Runtime(format!("failed to parse eval pack TOML: {e}")))?
1625        };
1626    normalize_eval_pack_manifest(&mut manifest);
1627    if manifest.base_dir.is_none() {
1628        manifest.base_dir = path.parent().map(|parent| parent.display().to_string());
1629    }
1630    Ok(manifest)
1631}
1632
1633fn normalize_eval_pack_manifest(manifest: &mut EvalPackManifest) {
1634    if manifest.version == 0 {
1635        manifest.version = 1;
1636    }
1637    if manifest.id.is_empty() {
1638        manifest.id = manifest
1639            .name
1640            .clone()
1641            .filter(|name| !name.trim().is_empty())
1642            .unwrap_or_else(|| new_id("eval_pack"));
1643    }
1644}
1645
1646fn load_replay_fixture(path: &Path) -> Result<ReplayFixture, VmError> {
1647    let content = std::fs::read_to_string(path)
1648        .map_err(|e| VmError::Runtime(format!("failed to read replay fixture: {e}")))?;
1649    serde_json::from_str(&content)
1650        .map_err(|e| VmError::Runtime(format!("failed to parse replay fixture: {e}")))
1651}
1652
1653fn load_run_record_from_fixture_ref(
1654    fixture: &EvalPackFixtureRef,
1655    base_dir: Option<&Path>,
1656) -> Result<RunRecord, VmError> {
1657    if let Some(inline) = &fixture.inline {
1658        let run: RunRecord = serde_json::from_value(inline.clone())
1659            .map_err(|e| VmError::Runtime(format!("failed to parse inline run record: {e}")))?;
1660        return Ok(run);
1661    }
1662    let path = fixture.path.as_deref().ok_or_else(|| {
1663        VmError::Runtime(format!(
1664            "fixture '{}' is missing path or inline run",
1665            fixture.id
1666        ))
1667    })?;
1668    load_run_record(&resolve_manifest_path(base_dir, path))
1669}
1670
1671fn load_replay_fixture_from_ref(
1672    fixture: &EvalPackFixtureRef,
1673    base_dir: Option<&Path>,
1674) -> Result<ReplayFixture, VmError> {
1675    if let Some(inline) = &fixture.inline {
1676        return serde_json::from_value(inline.clone())
1677            .map_err(|e| VmError::Runtime(format!("failed to parse inline replay fixture: {e}")));
1678    }
1679    let path = fixture.path.as_deref().ok_or_else(|| {
1680        VmError::Runtime(format!(
1681            "fixture '{}' is missing path or inline replay fixture",
1682            fixture.id
1683        ))
1684    })?;
1685    load_replay_fixture(&resolve_manifest_path(base_dir, path))
1686}
1687
/// Resolve a manifest-relative path string: absolute paths pass through
/// untouched, relative paths are joined onto `base_dir` when one is given.
fn resolve_manifest_path(base_dir: Option<&Path>, path: &str) -> PathBuf {
    let candidate = PathBuf::from(path);
    match base_dir {
        Some(base) if candidate.is_relative() => base.join(candidate),
        _ => candidate,
    }
}
1698
/// Evaluate every case in a run-suite manifest and aggregate the results.
///
/// For each case the persisted run record is loaded and replayed against a
/// fixture resolved in priority order: explicit `fixture_path`, the fixture
/// embedded in the run record, or one synthesized from the run itself. When
/// `compare_to` names a baseline run, the two records are diffed and any
/// difference marks the case failed.
///
/// Returns `Err` only for I/O or parse failures while loading runs/fixtures;
/// evaluation failures are reported per case instead.
pub fn evaluate_run_suite_manifest(
    manifest: &EvalSuiteManifest,
) -> Result<ReplayEvalSuiteReport, VmError> {
    // Relative paths inside the manifest resolve against its recorded base dir.
    let base_dir = manifest.base_dir.as_deref().map(Path::new);
    let mut reports = Vec::new();
    for case in &manifest.cases {
        let run_path = resolve_manifest_path(base_dir, &case.run_path);
        let run = load_run_record(&run_path)?;
        // Fixture priority: explicit path > fixture stored on the run >
        // fixture synthesized from the run record.
        let fixture = match &case.fixture_path {
            Some(path) => load_replay_fixture(&resolve_manifest_path(base_dir, path))?,
            None => run
                .replay_fixture
                .clone()
                .unwrap_or_else(|| replay_fixture_from_run(&run)),
        };
        let eval = evaluate_run_against_fixture(&run, &fixture);
        let mut pass = eval.pass;
        let mut failures = eval.failures;
        // Optional regression gate: any drift from the baseline run fails the case.
        let comparison = match &case.compare_to {
            Some(path) => {
                let baseline_path = resolve_manifest_path(base_dir, path);
                let baseline = load_run_record(&baseline_path)?;
                let diff = diff_run_records(&baseline, &run);
                if !diff.identical {
                    pass = false;
                    failures.push(format!(
                        "run differs from baseline {} with {} stage changes",
                        baseline_path.display(),
                        diff.stage_diffs.len()
                    ));
                }
                Some(diff)
            }
            None => None,
        };
        reports.push(ReplayEvalCaseReport {
            run_id: run.id.clone(),
            workflow_id: run.workflow_id.clone(),
            label: case.label.clone(),
            pass,
            failures,
            stage_count: eval.stage_count,
            source_path: Some(run_path.display().to_string()),
            comparison,
        });
    }
    // The suite passes only when every case passed.
    let total = reports.len();
    let passed = reports.iter().filter(|report| report.pass).count();
    let failed = total.saturating_sub(passed);
    Ok(ReplayEvalSuiteReport {
        pass: failed == 0,
        total,
        passed,
        failed,
        cases: reports,
    })
}
1756
/// Evaluate every case in an eval pack manifest and aggregate the results.
///
/// Fixtures and rubrics are indexed by id up front so cases can reference
/// them by name. Friction cases (those with `friction_events`) are delegated
/// to `evaluate_eval_pack_friction_case`; all other cases load a run record,
/// replay it against its fixture, apply pack- and case-level thresholds,
/// optionally diff against a baseline (`compare_to`), and apply any
/// referenced rubrics.
///
/// A case's severity decides how its failures count: only "blocking" cases
/// can fail the pack; "warning"/"informational" failures are demoted into
/// the matching report bucket. Returns `Err` only for I/O or parse failures.
pub fn evaluate_eval_pack_manifest(manifest: &EvalPackManifest) -> Result<EvalPackReport, VmError> {
    let base_dir = manifest.base_dir.as_deref().map(Path::new);
    // Fixture references resolve against an optional dedicated fixture root,
    // falling back to the manifest's own base dir.
    let fixture_base_dir_buf = manifest
        .defaults
        .fixture_root
        .as_deref()
        .map(|root| resolve_manifest_path(base_dir, root));
    let fixture_base_dir = fixture_base_dir_buf.as_deref().or(base_dir);
    // Index fixtures/rubrics by id; entries with empty ids cannot be
    // referenced and are dropped.
    let fixtures_by_id: BTreeMap<&str, &EvalPackFixtureRef> = manifest
        .fixtures
        .iter()
        .filter(|fixture| !fixture.id.is_empty())
        .map(|fixture| (fixture.id.as_str(), fixture))
        .collect();
    let rubrics_by_id: BTreeMap<&str, &EvalPackRubric> = manifest
        .rubrics
        .iter()
        .filter(|rubric| !rubric.id.is_empty())
        .map(|rubric| (rubric.id.as_str(), rubric))
        .collect();

    let mut reports = Vec::new();
    for (index, case) in manifest.cases.iter().enumerate() {
        // Synthesize an id/label when the case doesn't carry usable ones.
        let case_id = case
            .id
            .clone()
            .filter(|id| !id.trim().is_empty())
            .unwrap_or_else(|| format!("case_{}", index + 1));
        let label = case
            .name
            .clone()
            .or_else(|| case.id.clone())
            .unwrap_or_else(|| case_id.clone());
        let severity = eval_pack_case_severity(manifest, case);
        let blocking = severity == "blocking";
        let mut failures = Vec::new();
        let mut warnings = Vec::new();
        let mut informational = Vec::new();

        // Friction cases follow a separate evaluation path (no run record).
        if case.friction_events.is_some() {
            let report = evaluate_eval_pack_friction_case(
                manifest,
                case,
                &case_id,
                &label,
                &severity,
                blocking,
                base_dir,
                fixture_base_dir,
                &fixtures_by_id,
                &rubrics_by_id,
            )?;
            reports.push(report);
            continue;
        }

        let run = load_eval_pack_case_run(case, base_dir, fixture_base_dir, &fixtures_by_id)?;
        let fixture =
            load_eval_pack_case_fixture(case, base_dir, fixture_base_dir, &fixtures_by_id, &run)?;
        let eval = evaluate_run_against_fixture(&run, &fixture);
        failures.extend(eval.failures);
        // Pack-level thresholds apply first, then case-level ones; both can
        // contribute failures independently.
        apply_eval_pack_thresholds(&run, &manifest.defaults.thresholds, &mut failures);
        apply_eval_pack_thresholds(&run, &case.thresholds, &mut failures);

        // Optional regression gate against a baseline run record.
        let comparison = match &case.compare_to {
            Some(path) => {
                let baseline_path = resolve_manifest_path(base_dir, path);
                let baseline = load_run_record(&baseline_path)?;
                let diff = diff_run_records(&baseline, &run);
                if !diff.identical {
                    failures.push(format!(
                        "run differs from baseline {} with {} stage changes",
                        baseline_path.display(),
                        diff.stage_diffs.len()
                    ));
                }
                Some(diff)
            }
            None => None,
        };

        // A dangling rubric reference is itself a failure.
        for rubric_id in &case.rubrics {
            let Some(rubric) = rubrics_by_id.get(rubric_id.as_str()) else {
                failures.push(format!("case references unknown rubric '{rubric_id}'"));
                continue;
            };
            apply_eval_pack_rubric(rubric, &run, &mut failures, &mut warnings);
        }

        // Non-blocking cases always "pass"; their failures are demoted to the
        // warnings or informational bucket depending on severity.
        let pass = failures.is_empty() || !blocking;
        if !failures.is_empty() && !blocking {
            if severity == "warning" {
                warnings.append(&mut failures);
            } else {
                informational.append(&mut failures);
            }
        }
        reports.push(EvalPackCaseReport {
            id: case_id,
            label,
            severity,
            pass,
            blocking,
            run_id: run.id.clone(),
            workflow_id: run.workflow_id.clone(),
            source_path: eval_pack_case_source_path(
                case,
                base_dir,
                fixture_base_dir,
                &fixtures_by_id,
            ),
            stage_count: eval.stage_count,
            failures,
            warnings,
            informational,
            comparison,
        });
    }

    // Only blocking failures can fail the pack; warning/informational counts
    // are surfaced for visibility.
    let total = reports.len();
    let blocking_failed = reports
        .iter()
        .filter(|report| report.blocking && !report.failures.is_empty())
        .count();
    let warning_failed = reports
        .iter()
        .filter(|report| !report.warnings.is_empty())
        .count();
    let informational_failed = reports
        .iter()
        .filter(|report| !report.informational.is_empty())
        .count();
    let passed = reports.iter().filter(|report| report.pass).count();
    Ok(EvalPackReport {
        pack_id: manifest.id.clone(),
        pass: blocking_failed == 0,
        total,
        passed,
        failed: total.saturating_sub(passed),
        blocking_failed,
        warning_failed,
        informational_failed,
        cases: reports,
    })
}
1902
/// Evaluate a friction-style eval pack case.
///
/// Loads the case's friction events, generates context-pack suggestions from
/// them, and applies any referenced rubrics. When the case has no rubrics at
/// all, producing zero suggestions counts as a failure. Non-blocking
/// failures are demoted to warnings or informational notes according to
/// `severity`, mirroring `evaluate_eval_pack_manifest`. The resulting report
/// uses the placeholder run id "friction_events" since no run record exists.
#[allow(clippy::too_many_arguments)]
fn evaluate_eval_pack_friction_case(
    manifest: &EvalPackManifest,
    case: &EvalPackCase,
    case_id: &str,
    label: &str,
    severity: &str,
    blocking: bool,
    base_dir: Option<&Path>,
    fixture_base_dir: Option<&Path>,
    fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
    rubrics_by_id: &BTreeMap<&str, &EvalPackRubric>,
) -> Result<EvalPackCaseReport, VmError> {
    let mut failures = Vec::new();
    let mut warnings = Vec::new();
    let mut informational = Vec::new();
    let events =
        load_eval_pack_case_friction_events(case, base_dir, fixture_base_dir, fixtures_by_id)?;
    let options = friction_suggestion_options(case, manifest);
    let suggestions = generate_context_pack_suggestions(&events, &options);

    // A dangling rubric reference is itself a failure.
    for rubric_id in &case.rubrics {
        let Some(rubric) = rubrics_by_id.get(rubric_id.as_str()) else {
            failures.push(format!("case references unknown rubric '{rubric_id}'"));
            continue;
        };
        apply_eval_pack_friction_rubric(rubric, &suggestions, &mut failures, &mut warnings);
    }

    // Without rubrics, the minimum bar is that the events yield at least one
    // suggestion.
    if case.rubrics.is_empty() && suggestions.is_empty() {
        failures.push("friction fixture produced no context-pack suggestions".to_string());
    }

    // Same severity-demotion rule as regular cases: non-blocking failures
    // become warnings or informational notes.
    let pass = failures.is_empty() || !blocking;
    if !failures.is_empty() && !blocking {
        if severity == "warning" {
            warnings.append(&mut failures);
        } else {
            informational.append(&mut failures);
        }
    }

    Ok(EvalPackCaseReport {
        id: case_id.to_string(),
        label: label.to_string(),
        severity: severity.to_string(),
        pass,
        blocking,
        run_id: "friction_events".to_string(),
        workflow_id: String::new(),
        source_path: eval_pack_case_friction_source_path(
            case,
            base_dir,
            fixture_base_dir,
            fixtures_by_id,
        ),
        // Report the event count in place of a stage count.
        stage_count: events.len(),
        failures,
        warnings,
        informational,
        comparison: None,
    })
}
1966
1967fn eval_pack_case_severity(manifest: &EvalPackManifest, case: &EvalPackCase) -> String {
1968    normalize_eval_pack_severity(
1969        case.severity
1970            .as_deref()
1971            .or(case.thresholds.severity.as_deref())
1972            .or(manifest.defaults.severity.as_deref())
1973            .or(manifest.defaults.thresholds.severity.as_deref())
1974            .unwrap_or("blocking"),
1975    )
1976}
1977
/// Canonicalize a severity string (case- and whitespace-insensitive) to one
/// of "warning", "informational", or "blocking"; anything unrecognized is
/// treated as "blocking".
fn normalize_eval_pack_severity(value: &str) -> String {
    let canonical = value.trim().to_ascii_lowercase();
    let label = if canonical == "warn" || canonical == "warning" {
        "warning"
    } else if canonical == "info" || canonical == "informational" {
        "informational"
    } else {
        "blocking"
    };
    label.to_string()
}
1985
1986fn load_eval_pack_case_run(
1987    case: &EvalPackCase,
1988    base_dir: Option<&Path>,
1989    fixture_base_dir: Option<&Path>,
1990    fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
1991) -> Result<RunRecord, VmError> {
1992    if let Some(run_ref) = case.run.as_deref().or(case.run_path.as_deref()) {
1993        if let Some(fixture) = fixtures_by_id.get(run_ref) {
1994            return load_run_record_from_fixture_ref(fixture, fixture_base_dir);
1995        }
1996        return load_run_record(&resolve_manifest_path(base_dir, run_ref));
1997    }
1998    Err(VmError::Runtime(
1999        "eval pack case is missing run or run_path".to_string(),
2000    ))
2001}
2002
2003fn load_eval_pack_case_fixture(
2004    case: &EvalPackCase,
2005    base_dir: Option<&Path>,
2006    fixture_base_dir: Option<&Path>,
2007    fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2008    run: &RunRecord,
2009) -> Result<ReplayFixture, VmError> {
2010    if let Some(fixture_ref) = case.fixture.as_deref().or(case.fixture_path.as_deref()) {
2011        if let Some(fixture) = fixtures_by_id.get(fixture_ref) {
2012            return load_replay_fixture_from_ref(fixture, fixture_base_dir);
2013        }
2014        return load_replay_fixture(&resolve_manifest_path(base_dir, fixture_ref));
2015    }
2016    Ok(run
2017        .replay_fixture
2018        .clone()
2019        .unwrap_or_else(|| replay_fixture_from_run(run)))
2020}
2021
2022fn eval_pack_case_source_path(
2023    case: &EvalPackCase,
2024    base_dir: Option<&Path>,
2025    fixture_base_dir: Option<&Path>,
2026    fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2027) -> Option<String> {
2028    let run_ref = case.run.as_deref().or(case.run_path.as_deref())?;
2029    if let Some(fixture) = fixtures_by_id.get(run_ref) {
2030        return fixture.path.as_ref().map(|path| {
2031            resolve_manifest_path(fixture_base_dir, path)
2032                .display()
2033                .to_string()
2034        });
2035    }
2036    Some(
2037        resolve_manifest_path(base_dir, run_ref)
2038            .display()
2039            .to_string(),
2040    )
2041}
2042
2043fn load_eval_pack_case_friction_events(
2044    case: &EvalPackCase,
2045    base_dir: Option<&Path>,
2046    fixture_base_dir: Option<&Path>,
2047    fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2048) -> Result<Vec<FrictionEvent>, VmError> {
2049    let event_ref = case.friction_events.as_deref().ok_or_else(|| {
2050        VmError::Runtime("eval pack friction case is missing friction_events".to_string())
2051    })?;
2052    if let Some(fixture) = fixtures_by_id.get(event_ref) {
2053        return load_friction_events_from_fixture_ref(fixture, fixture_base_dir);
2054    }
2055    load_friction_events_from_path(&resolve_manifest_path(base_dir, event_ref))
2056}
2057
2058fn load_friction_events_from_fixture_ref(
2059    fixture: &EvalPackFixtureRef,
2060    base_dir: Option<&Path>,
2061) -> Result<Vec<FrictionEvent>, VmError> {
2062    if let Some(inline) = &fixture.inline {
2063        return normalize_friction_events_json(inline.clone());
2064    }
2065    let path = fixture.path.as_deref().ok_or_else(|| {
2066        VmError::Runtime(format!(
2067            "fixture '{}' is missing path or inline friction events",
2068            fixture.id
2069        ))
2070    })?;
2071    load_friction_events_from_path(&resolve_manifest_path(base_dir, path))
2072}
2073
2074fn load_friction_events_from_path(path: &Path) -> Result<Vec<FrictionEvent>, VmError> {
2075    let content = std::fs::read_to_string(path)
2076        .map_err(|e| VmError::Runtime(format!("failed to read friction events fixture: {e}")))?;
2077    let value: serde_json::Value = serde_json::from_str(&content)
2078        .map_err(|e| VmError::Runtime(format!("failed to parse friction events fixture: {e}")))?;
2079    normalize_friction_events_json(value)
2080}
2081
2082fn eval_pack_case_friction_source_path(
2083    case: &EvalPackCase,
2084    base_dir: Option<&Path>,
2085    fixture_base_dir: Option<&Path>,
2086    fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2087) -> Option<String> {
2088    let event_ref = case.friction_events.as_deref()?;
2089    if let Some(fixture) = fixtures_by_id.get(event_ref) {
2090        return fixture.path.as_ref().map(|path| {
2091            resolve_manifest_path(fixture_base_dir, path)
2092                .display()
2093                .to_string()
2094        });
2095    }
2096    Some(
2097        resolve_manifest_path(base_dir, event_ref)
2098            .display()
2099            .to_string(),
2100    )
2101}
2102
2103fn friction_suggestion_options(
2104    case: &EvalPackCase,
2105    manifest: &EvalPackManifest,
2106) -> ContextPackSuggestionOptions {
2107    let min_occurrences = case
2108        .metadata
2109        .get("min_occurrences")
2110        .or_else(|| manifest.metadata.get("min_occurrences"))
2111        .and_then(|value| value.as_u64())
2112        .unwrap_or(2) as usize;
2113    let owner = case
2114        .metadata
2115        .get("owner")
2116        .or_else(|| manifest.metadata.get("owner"))
2117        .and_then(|value| value.as_str())
2118        .map(str::to_string)
2119        .or_else(|| {
2120            manifest
2121                .package
2122                .as_ref()
2123                .and_then(|package| package.name.clone())
2124        });
2125    ContextPackSuggestionOptions {
2126        min_occurrences,
2127        owner,
2128    }
2129}
2130
2131fn apply_eval_pack_thresholds(
2132    run: &RunRecord,
2133    thresholds: &EvalPackThresholds,
2134    failures: &mut Vec<String>,
2135) {
2136    if let Some(max_stage_count) = thresholds.max_stage_count {
2137        if run.stages.len() > max_stage_count {
2138            failures.push(format!(
2139                "stage count {} exceeds threshold {}",
2140                run.stages.len(),
2141                max_stage_count
2142            ));
2143        }
2144    }
2145    if let Some(max_latency_ms) = thresholds.max_latency_ms {
2146        let actual = run
2147            .usage
2148            .as_ref()
2149            .map(|usage| usage.total_duration_ms)
2150            .unwrap_or_default();
2151        if actual > max_latency_ms {
2152            failures.push(format!(
2153                "latency {actual}ms exceeds threshold {max_latency_ms}ms"
2154            ));
2155        }
2156    }
2157    if let Some(max_cost_usd) = thresholds.max_cost_usd {
2158        let actual = run
2159            .usage
2160            .as_ref()
2161            .map(|usage| usage.total_cost)
2162            .unwrap_or_default();
2163        if actual > max_cost_usd {
2164            failures.push(format!(
2165                "cost ${actual:.6} exceeds threshold ${max_cost_usd:.6}"
2166            ));
2167        }
2168    }
2169    if let Some(max_tokens) = thresholds.max_tokens {
2170        let actual = run
2171            .usage
2172            .as_ref()
2173            .map(|usage| usage.input_tokens + usage.output_tokens)
2174            .unwrap_or_default();
2175        if actual > max_tokens {
2176            failures.push(format!(
2177                "token count {actual} exceeds threshold {max_tokens}"
2178            ));
2179        }
2180    }
2181}
2182
2183fn apply_eval_pack_rubric(
2184    rubric: &EvalPackRubric,
2185    run: &RunRecord,
2186    failures: &mut Vec<String>,
2187    warnings: &mut Vec<String>,
2188) {
2189    match rubric.kind.as_str() {
2190        "" | "deterministic" | "replay" | "budget" | "hitl" | "side-effect" => {
2191            apply_eval_pack_thresholds(run, &rubric.thresholds, failures);
2192            for assertion in &rubric.assertions {
2193                apply_eval_pack_assertion(rubric, assertion, run, failures);
2194            }
2195        }
2196        "llm-judge" | "llm_as_judge" | "judge" => {
2197            let severity = normalize_eval_pack_severity(
2198                rubric.thresholds.severity.as_deref().unwrap_or("blocking"),
2199            );
2200            let message = format!(
2201                "rubric '{}' requires an external LLM judge and was not run locally",
2202                rubric.id
2203            );
2204            if severity == "blocking" {
2205                failures.push(message);
2206            } else {
2207                warnings.push(message);
2208            }
2209        }
2210        other => warnings.push(format!(
2211            "rubric '{}' has unknown kind '{}' and was not run locally",
2212            rubric.id, other
2213        )),
2214    }
2215}
2216
2217fn apply_eval_pack_friction_rubric(
2218    rubric: &EvalPackRubric,
2219    suggestions: &[super::ContextPackSuggestion],
2220    failures: &mut Vec<String>,
2221    warnings: &mut Vec<String>,
2222) {
2223    match rubric.kind.as_str() {
2224        "" | "deterministic" | "friction" | "context-pack-suggestion" => {
2225            let mut expectations = Vec::new();
2226            for assertion in &rubric.assertions {
2227                match assertion.kind.as_str() {
2228                    "context-pack-suggestion" | "context_pack_suggestion" | "suggestion" => {
2229                        let expectation = context_pack_expectation_from_assertion(assertion);
2230                        expectations.push(expectation);
2231                    }
2232                    other => failures.push(format!(
2233                        "rubric '{}' has unsupported friction assertion kind '{}'",
2234                        rubric.id, other
2235                    )),
2236                }
2237            }
2238            failures.extend(evaluate_context_pack_suggestion_expectations(
2239                suggestions,
2240                &expectations,
2241            ));
2242        }
2243        other => warnings.push(format!(
2244            "rubric '{}' has unknown friction kind '{}' and was not run locally",
2245            rubric.id, other
2246        )),
2247    }
2248}
2249
2250fn context_pack_expectation_from_assertion(
2251    assertion: &EvalPackAssertion,
2252) -> ContextPackSuggestionExpectation {
2253    let expected = assertion
2254        .expected
2255        .as_ref()
2256        .and_then(|value| value.as_object());
2257    let expected_string = assertion.expected.as_ref().and_then(|value| value.as_str());
2258    ContextPackSuggestionExpectation {
2259        min_suggestions: expected
2260            .and_then(|map| map.get("min_suggestions"))
2261            .and_then(|value| value.as_u64())
2262            .map(|value| value as usize),
2263        recommended_artifact: expected
2264            .and_then(|map| map.get("recommended_artifact"))
2265            .and_then(|value| value.as_str())
2266            .map(str::to_string)
2267            .or_else(|| expected_string.map(str::to_string)),
2268        title_contains: assertion.contains.clone().or_else(|| {
2269            expected
2270                .and_then(|map| map.get("title_contains"))
2271                .and_then(|value| value.as_str())
2272                .map(str::to_string)
2273        }),
2274        manifest_name_contains: expected
2275            .and_then(|map| map.get("manifest_name_contains"))
2276            .and_then(|value| value.as_str())
2277            .map(str::to_string),
2278        required_capability: expected
2279            .and_then(|map| map.get("required_capability"))
2280            .and_then(|value| value.as_str())
2281            .map(str::to_string),
2282        required_output_slot: expected
2283            .and_then(|map| map.get("required_output_slot"))
2284            .and_then(|value| value.as_str())
2285            .map(str::to_string),
2286    }
2287}
2288
/// Apply one deterministic assertion from `rubric` to `run`, appending a
/// message to `failures` on mismatch or on a malformed assertion.
/// Unrecognized assertion kinds fail; an empty kind is silently ignored.
fn apply_eval_pack_assertion(
    rubric: &EvalPackRubric,
    assertion: &EvalPackAssertion,
    run: &RunRecord,
    failures: &mut Vec<String>,
) {
    match assertion.kind.as_str() {
        // Overall run status must equal the expected string; a missing or
        // non-string `expected` makes this assertion a no-op.
        "run-status" | "run_status" | "status" => {
            let expected = assertion.expected.as_ref().and_then(|value| value.as_str());
            if let Some(expected) = expected {
                if run.status != expected {
                    failures.push(format!(
                        "rubric '{}' expected run status {}, got {}",
                        rubric.id, expected, run.status
                    ));
                }
            }
        }
        // A named stage must exist and carry the expected status; both the
        // stage id and the expected string are required.
        "stage-status" | "stage_status" => {
            let Some(stage_id) = assertion.stage.as_deref() else {
                failures.push(format!(
                    "rubric '{}' stage-status assertion missing stage",
                    rubric.id
                ));
                return;
            };
            let expected = assertion.expected.as_ref().and_then(|value| value.as_str());
            let Some(expected) = expected else {
                failures.push(format!(
                    "rubric '{}' stage-status assertion missing expected string",
                    rubric.id
                ));
                return;
            };
            match run.stages.iter().find(|stage| stage.node_id == stage_id) {
                Some(stage) if stage.status == expected => {}
                Some(stage) => failures.push(format!(
                    "rubric '{}' expected stage {} status {}, got {}",
                    rubric.id, stage_id, expected, stage.status
                )),
                None => failures.push(format!(
                    "rubric '{}' expected stage {} to exist",
                    rubric.id, stage_id
                )),
            }
        }
        // A substring must appear in the visible text of a specific stage
        // (when `stage` is set) or of any stage (when it isn't).
        "visible-text-contains" | "visible_text_contains" => {
            let Some(needle) = assertion.contains.as_deref() else {
                failures.push(format!(
                    "rubric '{}' visible-text assertion missing contains",
                    rubric.id
                ));
                return;
            };
            let matched = match assertion.stage.as_deref() {
                Some(stage_id) => run
                    .stages
                    .iter()
                    .find(|stage| stage.node_id == stage_id)
                    .and_then(|stage| stage.visible_text.as_deref())
                    .is_some_and(|text| text.contains(needle)),
                None => run
                    .stages
                    .iter()
                    .filter_map(|stage| stage.visible_text.as_deref())
                    .any(|text| text.contains(needle)),
            };
            if !matched {
                failures.push(format!(
                    "rubric '{}' expected visible text to contain {:?}",
                    rubric.id, needle
                ));
            }
        }
        // A substring must appear in at least one HITL question prompt.
        "hitl-question-contains" | "hitl_question_contains" => {
            let Some(needle) = assertion.contains.as_deref() else {
                failures.push(format!(
                    "rubric '{}' HITL assertion missing contains",
                    rubric.id
                ));
                return;
            };
            if !run
                .hitl_questions
                .iter()
                .any(|question| question.prompt.contains(needle))
            {
                failures.push(format!(
                    "rubric '{}' expected HITL question to contain {:?}",
                    rubric.id, needle
                ));
            }
        }
        // An empty kind is treated as "no assertion" rather than an error.
        "" => {}
        other => failures.push(format!(
            "rubric '{}' has unsupported assertion kind '{}'",
            rubric.id, other
        )),
    }
}
2389
/// Edit operation in a diff sequence. The paired index in
/// `(DiffOp, usize)` refers to the "before" sequence for `Equal`/`Delete`
/// and to the "after" sequence for `Insert` (see `render_unified_diff`).
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub(crate) enum DiffOp {
    /// Line present in both sequences.
    Equal,
    /// Line removed from the "before" sequence.
    Delete,
    /// Line added in the "after" sequence.
    Insert,
}
2397
/// Compute the shortest edit script using Myers' O(nd) algorithm.
/// Returns a sequence of (DiffOp, line_index_in_before_or_after):
/// `Equal`/`Delete` indices refer to `a`, `Insert` indices refer to `b`.
/// Time: O(nd) where d = edit distance. Space: O(d * n).
pub(crate) fn myers_diff(a: &[&str], b: &[&str]) -> Vec<(DiffOp, usize)> {
    let n = a.len() as isize;
    let m = b.len() as isize;
    // Trivial cases: nothing to diff, or one side empty (pure insert/delete).
    if n == 0 && m == 0 {
        return Vec::new();
    }
    if n == 0 {
        return (0..m as usize).map(|j| (DiffOp::Insert, j)).collect();
    }
    if m == 0 {
        return (0..n as usize).map(|i| (DiffOp::Delete, i)).collect();
    }

    // `v[k + offset]` holds the furthest x reached on diagonal k (k = x - y);
    // `offset` shifts k (which ranges over -max_d..=max_d) into array bounds.
    let max_d = (n + m) as usize;
    let offset = max_d as isize;
    let v_size = 2 * max_d + 1;
    let mut v = vec![0isize; v_size];
    // trace[d] holds the `v` snapshot BEFORE step d ran — required for backtrack.
    let mut trace: Vec<Vec<isize>> = Vec::new();

    // Forward phase: for each edit distance d, extend every reachable
    // diagonal of matching parity and follow diagonal "snakes" of equal lines.
    'outer: for d in 0..=max_d as isize {
        trace.push(v.clone());
        let mut new_v = v.clone();
        for k in (-d..=d).step_by(2) {
            let ki = (k + offset) as usize;
            // Step down from diagonal k+1 (an insert) or right from k-1 (a
            // delete), whichever frontier reaches further. The short-circuit
            // on k == -d / k == d also keeps ki-1 / ki+1 within bounds.
            let mut x = if k == -d || (k != d && v[ki - 1] < v[ki + 1]) {
                v[ki + 1]
            } else {
                v[ki - 1] + 1
            };
            let mut y = x - k;
            // Greedily consume the run of matching lines on this diagonal.
            while x < n && y < m && a[x as usize] == b[y as usize] {
                x += 1;
                y += 1;
            }
            new_v[ki] = x;
            if x >= n && y >= m {
                // Reached (n, m): done. The in-progress step-d frontier is
                // discarded — backtracking only reads pre-step snapshots.
                let _ = new_v;
                break 'outer;
            }
        }
        v = new_v;
    }

    // Backtrack phase: walk from (n, m) back toward (0, 0), recovering at
    // each step d which neighboring diagonal the path came from, emitting
    // the snake (Equal) and the single Insert/Delete move, in reverse order.
    let mut ops: Vec<(DiffOp, usize)> = Vec::new();
    let mut x = n;
    let mut y = m;
    for d in (1..trace.len() as isize).rev() {
        let k = x - y;
        let v_prev = &trace[d as usize];
        // Same tie-breaking rule as the forward phase, read from the
        // snapshot taken before step d (i.e. the state after step d-1).
        let prev_k = if k == -d
            || (k != d && v_prev[(k - 1 + offset) as usize] < v_prev[(k + 1 + offset) as usize])
        {
            k + 1
        } else {
            k - 1
        };
        let prev_x = v_prev[(prev_k + offset) as usize];
        let prev_y = prev_x - prev_k;

        // Retrace the diagonal snake that followed the move.
        while x > prev_x && y > prev_y {
            x -= 1;
            y -= 1;
            ops.push((DiffOp::Equal, x as usize));
        }
        // prev_k == k-1 means the move was rightward (delete from `a`);
        // prev_k == k+1 means downward (insert from `b`).
        if prev_k < k {
            x -= 1;
            ops.push((DiffOp::Delete, x as usize));
        } else {
            y -= 1;
            ops.push((DiffOp::Insert, y as usize));
        }
    }
    // Any leading snake at d == 0 (a common prefix) is emitted here; k is 0
    // at this point, so x and y reach 0 together.
    while x > 0 && y > 0 {
        x -= 1;
        y -= 1;
        ops.push((DiffOp::Equal, x as usize));
    }
    // Ops were collected back-to-front; restore forward order.
    ops.reverse();
    ops
}
2482
2483pub fn render_unified_diff(path: Option<&str>, before: &str, after: &str) -> String {
2484    let before_lines: Vec<&str> = before.lines().collect();
2485    let after_lines: Vec<&str> = after.lines().collect();
2486    let ops = myers_diff(&before_lines, &after_lines);
2487
2488    let mut diff = String::new();
2489    let file = path.unwrap_or("artifact");
2490    diff.push_str(&format!("--- a/{file}\n+++ b/{file}\n"));
2491    for &(op, idx) in &ops {
2492        match op {
2493            DiffOp::Equal => diff.push_str(&format!(" {}\n", before_lines[idx])),
2494            DiffOp::Delete => diff.push_str(&format!("-{}\n", before_lines[idx])),
2495            DiffOp::Insert => diff.push_str(&format!("+{}\n", after_lines[idx])),
2496        }
2497    }
2498    diff
2499}
2500
/// Persist `run` to disk as pretty-printed JSON, materializing derived state first.
///
/// When `path` is `None`, the record is written to
/// `default_run_dir()/<run id>.json`. The caller's record is not mutated: a
/// clone is enriched (HITL questions merged from the active event log, a
/// replay fixture synthesized if absent, `persisted_path` set, handoffs
/// synced, observability refreshed) and that clone is what gets serialized.
///
/// Returns the final path as a string, or `VmError::Runtime` describing the
/// first directory-creation, serialization, or write failure.
pub fn save_run_record(run: &RunRecord, path: Option<&str>) -> Result<String, VmError> {
    let path = path
        .map(PathBuf::from)
        .unwrap_or_else(|| default_run_dir().join(format!("{}.json", run.id)));
    // Work on a clone so the caller's in-memory record stays untouched.
    let mut materialized = run.clone();
    merge_hitl_questions_from_active_log(&mut materialized);
    if materialized.replay_fixture.is_none() {
        materialized.replay_fixture = Some(replay_fixture_from_run(&materialized));
    }
    materialized.persisted_path = Some(path.to_string_lossy().into_owned());
    sync_run_handoffs(&mut materialized);
    refresh_run_observability(&mut materialized, Some(&path));
    if let Some(parent) = path.parent() {
        std::fs::create_dir_all(parent)
            .map_err(|e| VmError::Runtime(format!("failed to create run directory: {e}")))?;
    }
    let json = serde_json::to_string_pretty(&materialized)
        .map_err(|e| VmError::Runtime(format!("failed to encode run record: {e}")))?;
    // Atomic write: .tmp then rename guards against partial writes on kill.
    let tmp_path = path.with_extension("json.tmp");
    std::fs::write(&tmp_path, &json)
        .map_err(|e| VmError::Runtime(format!("failed to persist run record: {e}")))?;
    std::fs::rename(&tmp_path, &path).map_err(|e| {
        // Cross-device renames fail on some filesystems; best-effort direct write.
        // NOTE(review): the fallback write's result is discarded and an Err is
        // still returned even if it succeeded — confirm this is intentional.
        let _ = std::fs::write(&path, &json);
        VmError::Runtime(format!("failed to finalize run record: {e}"))
    })?;
    // Publish the action-graph event only after the record is durably on disk.
    if let Some(observability) = materialized.observability.as_ref() {
        publish_action_graph_event(&materialized, observability, &path);
    }
    Ok(path.to_string_lossy().into_owned())
}
2533
2534pub fn load_run_record(path: &Path) -> Result<RunRecord, VmError> {
2535    let content = std::fs::read_to_string(path)
2536        .map_err(|e| VmError::Runtime(format!("failed to read run record: {e}")))?;
2537    let mut run: RunRecord = serde_json::from_str(&content)
2538        .map_err(|e| VmError::Runtime(format!("failed to parse run record: {e}")))?;
2539    if run.replay_fixture.is_none() {
2540        run.replay_fixture = Some(replay_fixture_from_run(&run));
2541    }
2542    run.persisted_path
2543        .get_or_insert_with(|| path.to_string_lossy().into_owned());
2544    sync_run_handoffs(&mut run);
2545    refresh_run_observability(&mut run, Some(path));
2546    Ok(run)
2547}
2548
2549pub fn replay_fixture_from_run(run: &RunRecord) -> ReplayFixture {
2550    ReplayFixture {
2551        type_name: "replay_fixture".to_string(),
2552        id: new_id("fixture"),
2553        source_run_id: run.id.clone(),
2554        workflow_id: run.workflow_id.clone(),
2555        workflow_name: run.workflow_name.clone(),
2556        created_at: now_rfc3339(),
2557        eval_kind: Some("replay".to_string()),
2558        clarifying_question: None,
2559        expected_status: run.status.clone(),
2560        stage_assertions: run
2561            .stages
2562            .iter()
2563            .map(|stage| ReplayStageAssertion {
2564                node_id: stage.node_id.clone(),
2565                expected_status: stage.status.clone(),
2566                expected_outcome: stage.outcome.clone(),
2567                expected_branch: stage.branch.clone(),
2568                required_artifact_kinds: stage
2569                    .artifacts
2570                    .iter()
2571                    .map(|artifact| artifact.kind.clone())
2572                    .collect(),
2573                visible_text_contains: stage
2574                    .visible_text
2575                    .as_ref()
2576                    .filter(|text| !text.is_empty())
2577                    .map(|text| text.chars().take(80).collect()),
2578            })
2579            .collect(),
2580    }
2581}
2582
2583pub fn evaluate_run_against_fixture(run: &RunRecord, fixture: &ReplayFixture) -> ReplayEvalReport {
2584    if fixture.eval_kind.as_deref() == Some("clarifying_question") {
2585        return evaluate_clarifying_question(run, fixture);
2586    }
2587    let mut failures = Vec::new();
2588    if run.status != fixture.expected_status {
2589        failures.push(format!(
2590            "run status mismatch: expected {}, got {}",
2591            fixture.expected_status, run.status
2592        ));
2593    }
2594    let stages_by_id: BTreeMap<&str, &RunStageRecord> =
2595        run.stages.iter().map(|s| (s.node_id.as_str(), s)).collect();
2596    for assertion in &fixture.stage_assertions {
2597        let Some(stage) = stages_by_id.get(assertion.node_id.as_str()) else {
2598            failures.push(format!("missing stage {}", assertion.node_id));
2599            continue;
2600        };
2601        if stage.status != assertion.expected_status {
2602            failures.push(format!(
2603                "stage {} status mismatch: expected {}, got {}",
2604                assertion.node_id, assertion.expected_status, stage.status
2605            ));
2606        }
2607        if stage.outcome != assertion.expected_outcome {
2608            failures.push(format!(
2609                "stage {} outcome mismatch: expected {}, got {}",
2610                assertion.node_id, assertion.expected_outcome, stage.outcome
2611            ));
2612        }
2613        if stage.branch != assertion.expected_branch {
2614            failures.push(format!(
2615                "stage {} branch mismatch: expected {:?}, got {:?}",
2616                assertion.node_id, assertion.expected_branch, stage.branch
2617            ));
2618        }
2619        for required_kind in &assertion.required_artifact_kinds {
2620            if !stage
2621                .artifacts
2622                .iter()
2623                .any(|artifact| &artifact.kind == required_kind)
2624            {
2625                failures.push(format!(
2626                    "stage {} missing artifact kind {}",
2627                    assertion.node_id, required_kind
2628                ));
2629            }
2630        }
2631        if let Some(snippet) = &assertion.visible_text_contains {
2632            let actual = stage.visible_text.clone().unwrap_or_default();
2633            if !actual.contains(snippet) {
2634                failures.push(format!(
2635                    "stage {} visible text does not contain expected snippet {:?}",
2636                    assertion.node_id, snippet
2637                ));
2638            }
2639        }
2640    }
2641
2642    ReplayEvalReport {
2643        pass: failures.is_empty(),
2644        failures,
2645        stage_count: run.stages.len(),
2646    }
2647}
2648
/// Evaluate a run against a fixture whose `eval_kind` is `"clarifying_question"`.
///
/// Checks three things: the run's overall status against the fixture's
/// expectation, the number of HITL questions against the spec's min/max
/// bounds, and — when the spec constrains question text at all — that at
/// least one asked question satisfies every textual constraint simultaneously.
fn evaluate_clarifying_question(run: &RunRecord, fixture: &ReplayFixture) -> ReplayEvalReport {
    let mut failures = Vec::new();
    // A missing spec falls back to its Default, so the bounds helpers below
    // always have something to work with.
    let spec = fixture.clarifying_question.clone().unwrap_or_default();
    let min_questions = clarifying_min_questions(&spec);
    let max_questions = clarifying_max_questions(&spec);
    let questions = &run.hitl_questions;

    if run.status != fixture.expected_status {
        failures.push(format!(
            "run status mismatch: expected {}, got {}",
            fixture.expected_status, run.status
        ));
    }
    if questions.len() < min_questions {
        failures.push(format!(
            "expected at least {min_questions} clarifying question(s), got {}",
            questions.len()
        ));
    }
    if questions.len() > max_questions {
        failures.push(format!(
            "expected at most {max_questions} clarifying question(s), got {}",
            questions.len()
        ));
    }

    // Normalize every textual constraint once up front so each question is
    // compared in the same canonical form.
    let normalized_expected = spec
        .expected_question
        .as_deref()
        .map(normalize_question_text);
    let normalized_accepted = spec
        .accepted_questions
        .iter()
        .map(|question| normalize_question_text(question))
        .collect::<Vec<_>>();
    let required_terms = spec
        .required_terms
        .iter()
        .map(|term| normalize_question_text(term))
        .collect::<Vec<_>>();
    let forbidden_terms = spec
        .forbidden_terms
        .iter()
        .map(|term| normalize_question_text(term))
        .collect::<Vec<_>>();

    // A question matches when ALL of the following hold: it equals the
    // expected text (vacuously true if none is set), it is one of the
    // accepted variants (vacuously true if the list is empty), it contains
    // every required term, and it contains no forbidden term. Note that when
    // both an expected question and an accepted list are set, the question
    // must satisfy both.
    let matched = questions.iter().any(|question| {
        let normalized = normalize_question_text(&question.prompt);
        let matches_expected = normalized_expected
            .as_ref()
            .is_none_or(|expected| &normalized == expected)
            && (normalized_accepted.is_empty()
                || normalized_accepted
                    .iter()
                    .any(|candidate| candidate == &normalized));
        let has_required_terms = required_terms
            .iter()
            .all(|term| normalized.contains(term.as_str()));
        let avoids_forbidden_terms = forbidden_terms
            .iter()
            .all(|term| !normalized.contains(term.as_str()));
        matches_expected && has_required_terms && avoids_forbidden_terms
    });

    // Only report a text-match failure when questions were actually asked AND
    // the spec constrains text in some way; the count bounds above already
    // cover the no-question case.
    if !questions.is_empty()
        && (!normalized_accepted.is_empty()
            || normalized_expected.is_some()
            || !required_terms.is_empty()
            || !forbidden_terms.is_empty())
        && !matched
    {
        failures.push(format!(
            "no clarifying question matched fixture; actual questions: {}",
            questions
                .iter()
                .map(|question| format!("{:?}", question.prompt))
                .collect::<Vec<_>>()
                .join(", ")
        ));
    }

    ReplayEvalReport {
        pass: failures.is_empty(),
        failures,
        stage_count: run.stages.len(),
    }
}
2736
2737pub fn evaluate_run_suite(
2738    cases: Vec<(RunRecord, ReplayFixture, Option<String>)>,
2739) -> ReplayEvalSuiteReport {
2740    let mut reports = Vec::new();
2741    for (run, fixture, source_path) in cases {
2742        let report = evaluate_run_against_fixture(&run, &fixture);
2743        reports.push(ReplayEvalCaseReport {
2744            run_id: run.id.clone(),
2745            workflow_id: run.workflow_id.clone(),
2746            label: None,
2747            pass: report.pass,
2748            failures: report.failures,
2749            stage_count: report.stage_count,
2750            source_path,
2751            comparison: None,
2752        });
2753    }
2754    let total = reports.len();
2755    let passed = reports.iter().filter(|report| report.pass).count();
2756    let failed = total.saturating_sub(passed);
2757    ReplayEvalSuiteReport {
2758        pass: failed == 0,
2759        total,
2760        passed,
2761        failed,
2762        cases: reports,
2763    }
2764}
2765
2766pub fn diff_run_records(left: &RunRecord, right: &RunRecord) -> RunDiffReport {
2767    let mut stage_diffs = Vec::new();
2768    let mut all_node_ids = BTreeSet::new();
2769    let left_by_id: BTreeMap<&str, &RunStageRecord> = left
2770        .stages
2771        .iter()
2772        .map(|s| (s.node_id.as_str(), s))
2773        .collect();
2774    let right_by_id: BTreeMap<&str, &RunStageRecord> = right
2775        .stages
2776        .iter()
2777        .map(|s| (s.node_id.as_str(), s))
2778        .collect();
2779    all_node_ids.extend(left_by_id.keys().copied());
2780    all_node_ids.extend(right_by_id.keys().copied());
2781
2782    for node_id in all_node_ids {
2783        let left_stage = left_by_id.get(node_id).copied();
2784        let right_stage = right_by_id.get(node_id).copied();
2785        match (left_stage, right_stage) {
2786            (Some(_), None) => stage_diffs.push(RunStageDiffRecord {
2787                node_id: node_id.to_string(),
2788                change: "removed".to_string(),
2789                details: vec!["stage missing from right run".to_string()],
2790            }),
2791            (None, Some(_)) => stage_diffs.push(RunStageDiffRecord {
2792                node_id: node_id.to_string(),
2793                change: "added".to_string(),
2794                details: vec!["stage missing from left run".to_string()],
2795            }),
2796            (Some(left_stage), Some(right_stage)) => {
2797                let mut details = Vec::new();
2798                if left_stage.status != right_stage.status {
2799                    details.push(format!(
2800                        "status: {} -> {}",
2801                        left_stage.status, right_stage.status
2802                    ));
2803                }
2804                if left_stage.outcome != right_stage.outcome {
2805                    details.push(format!(
2806                        "outcome: {} -> {}",
2807                        left_stage.outcome, right_stage.outcome
2808                    ));
2809                }
2810                if left_stage.branch != right_stage.branch {
2811                    details.push(format!(
2812                        "branch: {:?} -> {:?}",
2813                        left_stage.branch, right_stage.branch
2814                    ));
2815                }
2816                if left_stage.produced_artifact_ids.len() != right_stage.produced_artifact_ids.len()
2817                {
2818                    details.push(format!(
2819                        "produced_artifacts: {} -> {}",
2820                        left_stage.produced_artifact_ids.len(),
2821                        right_stage.produced_artifact_ids.len()
2822                    ));
2823                }
2824                if left_stage.artifacts.len() != right_stage.artifacts.len() {
2825                    details.push(format!(
2826                        "artifact_records: {} -> {}",
2827                        left_stage.artifacts.len(),
2828                        right_stage.artifacts.len()
2829                    ));
2830                }
2831                if !details.is_empty() {
2832                    stage_diffs.push(RunStageDiffRecord {
2833                        node_id: node_id.to_string(),
2834                        change: "changed".to_string(),
2835                        details,
2836                    });
2837                }
2838            }
2839            (None, None) => {}
2840        }
2841    }
2842
2843    let mut tool_diffs = Vec::new();
2844    let left_tools: std::collections::BTreeMap<(String, String), &ToolCallRecord> = left
2845        .tool_recordings
2846        .iter()
2847        .map(|r| ((r.tool_name.clone(), r.args_hash.clone()), r))
2848        .collect();
2849    let right_tools: std::collections::BTreeMap<(String, String), &ToolCallRecord> = right
2850        .tool_recordings
2851        .iter()
2852        .map(|r| ((r.tool_name.clone(), r.args_hash.clone()), r))
2853        .collect();
2854    let all_tool_keys: std::collections::BTreeSet<_> = left_tools
2855        .keys()
2856        .chain(right_tools.keys())
2857        .cloned()
2858        .collect();
2859    for key in &all_tool_keys {
2860        let l = left_tools.get(key);
2861        let r = right_tools.get(key);
2862        let result_changed = match (l, r) {
2863            (Some(a), Some(b)) => a.result != b.result,
2864            _ => true,
2865        };
2866        if result_changed {
2867            tool_diffs.push(ToolCallDiffRecord {
2868                tool_name: key.0.clone(),
2869                args_hash: key.1.clone(),
2870                result_changed,
2871                left_result: l.map(|t| t.result.clone()),
2872                right_result: r.map(|t| t.result.clone()),
2873            });
2874        }
2875    }
2876
2877    let left_observability = left.observability.clone().unwrap_or_else(|| {
2878        derive_run_observability(left, left.persisted_path.as_deref().map(Path::new))
2879    });
2880    let right_observability = right.observability.clone().unwrap_or_else(|| {
2881        derive_run_observability(right, right.persisted_path.as_deref().map(Path::new))
2882    });
2883    let mut observability_diffs = Vec::new();
2884
2885    let left_workers = left_observability
2886        .worker_lineage
2887        .iter()
2888        .map(|worker| {
2889            (
2890                worker.worker_id.clone(),
2891                (
2892                    worker.status.clone(),
2893                    worker.run_id.clone(),
2894                    worker.run_path.clone(),
2895                ),
2896            )
2897        })
2898        .collect::<BTreeMap<_, _>>();
2899    let right_workers = right_observability
2900        .worker_lineage
2901        .iter()
2902        .map(|worker| {
2903            (
2904                worker.worker_id.clone(),
2905                (
2906                    worker.status.clone(),
2907                    worker.run_id.clone(),
2908                    worker.run_path.clone(),
2909                ),
2910            )
2911        })
2912        .collect::<BTreeMap<_, _>>();
2913    let worker_ids = left_workers
2914        .keys()
2915        .chain(right_workers.keys())
2916        .cloned()
2917        .collect::<BTreeSet<_>>();
2918    for worker_id in worker_ids {
2919        match (left_workers.get(&worker_id), right_workers.get(&worker_id)) {
2920            (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
2921                section: "worker_lineage".to_string(),
2922                label: worker_id,
2923                details: vec!["worker missing from right run".to_string()],
2924            }),
2925            (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
2926                section: "worker_lineage".to_string(),
2927                label: worker_id,
2928                details: vec!["worker missing from left run".to_string()],
2929            }),
2930            (Some(left_worker), Some(right_worker)) if left_worker != right_worker => {
2931                let mut details = Vec::new();
2932                if left_worker.0 != right_worker.0 {
2933                    details.push(format!("status: {} -> {}", left_worker.0, right_worker.0));
2934                }
2935                if left_worker.1 != right_worker.1 {
2936                    details.push(format!(
2937                        "run_id: {:?} -> {:?}",
2938                        left_worker.1, right_worker.1
2939                    ));
2940                }
2941                if left_worker.2 != right_worker.2 {
2942                    details.push(format!(
2943                        "run_path: {:?} -> {:?}",
2944                        left_worker.2, right_worker.2
2945                    ));
2946                }
2947                observability_diffs.push(RunObservabilityDiffRecord {
2948                    section: "worker_lineage".to_string(),
2949                    label: worker_id,
2950                    details,
2951                });
2952            }
2953            _ => {}
2954        }
2955    }
2956
2957    let left_rounds = left_observability
2958        .planner_rounds
2959        .iter()
2960        .map(|round| (round.stage_id.clone(), round))
2961        .collect::<BTreeMap<_, _>>();
2962    let right_rounds = right_observability
2963        .planner_rounds
2964        .iter()
2965        .map(|round| (round.stage_id.clone(), round))
2966        .collect::<BTreeMap<_, _>>();
2967    let round_ids = left_rounds
2968        .keys()
2969        .chain(right_rounds.keys())
2970        .cloned()
2971        .collect::<BTreeSet<_>>();
2972    for stage_id in round_ids {
2973        match (left_rounds.get(&stage_id), right_rounds.get(&stage_id)) {
2974            (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
2975                section: "planner_rounds".to_string(),
2976                label: stage_id,
2977                details: vec!["planner summary missing from right run".to_string()],
2978            }),
2979            (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
2980                section: "planner_rounds".to_string(),
2981                label: stage_id,
2982                details: vec!["planner summary missing from left run".to_string()],
2983            }),
2984            (Some(left_round), Some(right_round)) => {
2985                let mut details = Vec::new();
2986                if left_round.iteration_count != right_round.iteration_count {
2987                    details.push(format!(
2988                        "iterations: {} -> {}",
2989                        left_round.iteration_count, right_round.iteration_count
2990                    ));
2991                }
2992                if left_round.tool_execution_count != right_round.tool_execution_count {
2993                    details.push(format!(
2994                        "tool_executions: {} -> {}",
2995                        left_round.tool_execution_count, right_round.tool_execution_count
2996                    ));
2997                }
2998                if left_round.native_text_tool_fallback_count
2999                    != right_round.native_text_tool_fallback_count
3000                {
3001                    details.push(format!(
3002                        "native_text_tool_fallbacks: {} -> {}",
3003                        left_round.native_text_tool_fallback_count,
3004                        right_round.native_text_tool_fallback_count
3005                    ));
3006                }
3007                if left_round.native_text_tool_fallback_rejection_count
3008                    != right_round.native_text_tool_fallback_rejection_count
3009                {
3010                    details.push(format!(
3011                        "native_text_tool_fallback_rejections: {} -> {}",
3012                        left_round.native_text_tool_fallback_rejection_count,
3013                        right_round.native_text_tool_fallback_rejection_count
3014                    ));
3015                }
3016                if left_round.empty_completion_retry_count
3017                    != right_round.empty_completion_retry_count
3018                {
3019                    details.push(format!(
3020                        "empty_completion_retries: {} -> {}",
3021                        left_round.empty_completion_retry_count,
3022                        right_round.empty_completion_retry_count
3023                    ));
3024                }
3025                if left_round.research_facts != right_round.research_facts {
3026                    details.push(format!(
3027                        "research_facts: {:?} -> {:?}",
3028                        left_round.research_facts, right_round.research_facts
3029                    ));
3030                }
3031                let left_deliverables = left_round
3032                    .task_ledger
3033                    .as_ref()
3034                    .map(|ledger| {
3035                        ledger
3036                            .deliverables
3037                            .iter()
3038                            .map(|item| format!("{}:{}", item.id, item.status))
3039                            .collect::<Vec<_>>()
3040                    })
3041                    .unwrap_or_default();
3042                let right_deliverables = right_round
3043                    .task_ledger
3044                    .as_ref()
3045                    .map(|ledger| {
3046                        ledger
3047                            .deliverables
3048                            .iter()
3049                            .map(|item| format!("{}:{}", item.id, item.status))
3050                            .collect::<Vec<_>>()
3051                    })
3052                    .unwrap_or_default();
3053                if left_deliverables != right_deliverables {
3054                    details.push(format!(
3055                        "deliverables: {:?} -> {:?}",
3056                        left_deliverables, right_deliverables
3057                    ));
3058                }
3059                if left_round.successful_tools != right_round.successful_tools {
3060                    details.push(format!(
3061                        "successful_tools: {:?} -> {:?}",
3062                        left_round.successful_tools, right_round.successful_tools
3063                    ));
3064                }
3065                if !details.is_empty() {
3066                    observability_diffs.push(RunObservabilityDiffRecord {
3067                        section: "planner_rounds".to_string(),
3068                        label: left_round.node_id.clone(),
3069                        details,
3070                    });
3071                }
3072            }
3073            _ => {}
3074        }
3075    }
3076
3077    let left_pointers = left_observability
3078        .transcript_pointers
3079        .iter()
3080        .map(|pointer| {
3081            (
3082                pointer.id.clone(),
3083                (
3084                    pointer.available,
3085                    pointer.path.clone(),
3086                    pointer.location.clone(),
3087                ),
3088            )
3089        })
3090        .collect::<BTreeMap<_, _>>();
3091    let right_pointers = right_observability
3092        .transcript_pointers
3093        .iter()
3094        .map(|pointer| {
3095            (
3096                pointer.id.clone(),
3097                (
3098                    pointer.available,
3099                    pointer.path.clone(),
3100                    pointer.location.clone(),
3101                ),
3102            )
3103        })
3104        .collect::<BTreeMap<_, _>>();
3105    let pointer_ids = left_pointers
3106        .keys()
3107        .chain(right_pointers.keys())
3108        .cloned()
3109        .collect::<BTreeSet<_>>();
3110    for pointer_id in pointer_ids {
3111        match (
3112            left_pointers.get(&pointer_id),
3113            right_pointers.get(&pointer_id),
3114        ) {
3115            (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3116                section: "transcript_pointers".to_string(),
3117                label: pointer_id,
3118                details: vec!["pointer missing from right run".to_string()],
3119            }),
3120            (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3121                section: "transcript_pointers".to_string(),
3122                label: pointer_id,
3123                details: vec!["pointer missing from left run".to_string()],
3124            }),
3125            (Some(left_pointer), Some(right_pointer)) if left_pointer != right_pointer => {
3126                observability_diffs.push(RunObservabilityDiffRecord {
3127                    section: "transcript_pointers".to_string(),
3128                    label: pointer_id,
3129                    details: vec![format!(
3130                        "pointer: {:?} -> {:?}",
3131                        left_pointer, right_pointer
3132                    )],
3133                });
3134            }
3135            _ => {}
3136        }
3137    }
3138
3139    let left_compactions = left_observability
3140        .compaction_events
3141        .iter()
3142        .map(|event| {
3143            (
3144                event.id.clone(),
3145                (
3146                    event.strategy.clone(),
3147                    event.archived_messages,
3148                    event.snapshot_asset_id.clone(),
3149                    event.available,
3150                ),
3151            )
3152        })
3153        .collect::<BTreeMap<_, _>>();
3154    let right_compactions = right_observability
3155        .compaction_events
3156        .iter()
3157        .map(|event| {
3158            (
3159                event.id.clone(),
3160                (
3161                    event.strategy.clone(),
3162                    event.archived_messages,
3163                    event.snapshot_asset_id.clone(),
3164                    event.available,
3165                ),
3166            )
3167        })
3168        .collect::<BTreeMap<_, _>>();
3169    let compaction_ids = left_compactions
3170        .keys()
3171        .chain(right_compactions.keys())
3172        .cloned()
3173        .collect::<BTreeSet<_>>();
3174    for compaction_id in compaction_ids {
3175        match (
3176            left_compactions.get(&compaction_id),
3177            right_compactions.get(&compaction_id),
3178        ) {
3179            (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3180                section: "compaction_events".to_string(),
3181                label: compaction_id,
3182                details: vec!["compaction event missing from right run".to_string()],
3183            }),
3184            (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3185                section: "compaction_events".to_string(),
3186                label: compaction_id,
3187                details: vec!["compaction event missing from left run".to_string()],
3188            }),
3189            (Some(left_event), Some(right_event)) if left_event != right_event => {
3190                observability_diffs.push(RunObservabilityDiffRecord {
3191                    section: "compaction_events".to_string(),
3192                    label: compaction_id,
3193                    details: vec![format!("event: {:?} -> {:?}", left_event, right_event)],
3194                });
3195            }
3196            _ => {}
3197        }
3198    }
3199
3200    let left_daemons = left_observability
3201        .daemon_events
3202        .iter()
3203        .map(|event| {
3204            (
3205                (event.daemon_id.clone(), event.kind, event.timestamp.clone()),
3206                (
3207                    event.name.clone(),
3208                    event.persist_path.clone(),
3209                    event.payload_summary.clone(),
3210                ),
3211            )
3212        })
3213        .collect::<BTreeMap<_, _>>();
3214    let right_daemons = right_observability
3215        .daemon_events
3216        .iter()
3217        .map(|event| {
3218            (
3219                (event.daemon_id.clone(), event.kind, event.timestamp.clone()),
3220                (
3221                    event.name.clone(),
3222                    event.persist_path.clone(),
3223                    event.payload_summary.clone(),
3224                ),
3225            )
3226        })
3227        .collect::<BTreeMap<_, _>>();
3228    let daemon_keys = left_daemons
3229        .keys()
3230        .chain(right_daemons.keys())
3231        .cloned()
3232        .collect::<BTreeSet<_>>();
3233    for daemon_key in daemon_keys {
3234        let label = format!("{}:{:?}:{}", daemon_key.0, daemon_key.1, daemon_key.2);
3235        match (
3236            left_daemons.get(&daemon_key),
3237            right_daemons.get(&daemon_key),
3238        ) {
3239            (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3240                section: "daemon_events".to_string(),
3241                label,
3242                details: vec!["daemon event missing from right run".to_string()],
3243            }),
3244            (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3245                section: "daemon_events".to_string(),
3246                label,
3247                details: vec!["daemon event missing from left run".to_string()],
3248            }),
3249            (Some(left_event), Some(right_event)) if left_event != right_event => {
3250                observability_diffs.push(RunObservabilityDiffRecord {
3251                    section: "daemon_events".to_string(),
3252                    label,
3253                    details: vec![format!("event: {:?} -> {:?}", left_event, right_event)],
3254                });
3255            }
3256            _ => {}
3257        }
3258    }
3259
3260    let left_verification = left_observability
3261        .verification_outcomes
3262        .iter()
3263        .map(|item| (item.stage_id.clone(), item))
3264        .collect::<BTreeMap<_, _>>();
3265    let right_verification = right_observability
3266        .verification_outcomes
3267        .iter()
3268        .map(|item| (item.stage_id.clone(), item))
3269        .collect::<BTreeMap<_, _>>();
3270    let verification_ids = left_verification
3271        .keys()
3272        .chain(right_verification.keys())
3273        .cloned()
3274        .collect::<BTreeSet<_>>();
3275    for stage_id in verification_ids {
3276        match (
3277            left_verification.get(&stage_id),
3278            right_verification.get(&stage_id),
3279        ) {
3280            (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3281                section: "verification".to_string(),
3282                label: stage_id,
3283                details: vec!["verification missing from right run".to_string()],
3284            }),
3285            (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3286                section: "verification".to_string(),
3287                label: stage_id,
3288                details: vec!["verification missing from left run".to_string()],
3289            }),
3290            (Some(left_item), Some(right_item)) if left_item != right_item => {
3291                let mut details = Vec::new();
3292                if left_item.passed != right_item.passed {
3293                    details.push(format!(
3294                        "passed: {:?} -> {:?}",
3295                        left_item.passed, right_item.passed
3296                    ));
3297                }
3298                if left_item.summary != right_item.summary {
3299                    details.push(format!(
3300                        "summary: {:?} -> {:?}",
3301                        left_item.summary, right_item.summary
3302                    ));
3303                }
3304                observability_diffs.push(RunObservabilityDiffRecord {
3305                    section: "verification".to_string(),
3306                    label: left_item.node_id.clone(),
3307                    details,
3308                });
3309            }
3310            _ => {}
3311        }
3312    }
3313
3314    let left_graph = (
3315        left_observability.action_graph_nodes.len(),
3316        left_observability.action_graph_edges.len(),
3317    );
3318    let right_graph = (
3319        right_observability.action_graph_nodes.len(),
3320        right_observability.action_graph_edges.len(),
3321    );
3322    if left_graph != right_graph {
3323        observability_diffs.push(RunObservabilityDiffRecord {
3324            section: "action_graph".to_string(),
3325            label: "shape".to_string(),
3326            details: vec![format!(
3327                "nodes/edges: {}/{} -> {}/{}",
3328                left_graph.0, left_graph.1, right_graph.0, right_graph.1
3329            )],
3330        });
3331    }
3332
3333    let status_changed = left.status != right.status;
3334    let identical = !status_changed
3335        && stage_diffs.is_empty()
3336        && tool_diffs.is_empty()
3337        && observability_diffs.is_empty()
3338        && left.transitions.len() == right.transitions.len()
3339        && left.artifacts.len() == right.artifacts.len()
3340        && left.checkpoints.len() == right.checkpoints.len();
3341
3342    RunDiffReport {
3343        left_run_id: left.id.clone(),
3344        right_run_id: right.id.clone(),
3345        identical,
3346        status_changed,
3347        left_status: left.status.clone(),
3348        right_status: right.status.clone(),
3349        stage_diffs,
3350        tool_diffs,
3351        observability_diffs,
3352        transition_count_delta: right.transitions.len() as isize - left.transitions.len() as isize,
3353        artifact_count_delta: right.artifacts.len() as isize - left.artifacts.len() as isize,
3354        checkpoint_count_delta: right.checkpoints.len() as isize - left.checkpoints.len() as isize,
3355    }
3356}
3357
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    /// Smallest `RunRecord` the eval-pack tests need: a status, a usage
    /// summary, and a replay fixture that expects a completed run.
    fn minimal_run(status: &str) -> RunRecord {
        let mut fixture = ReplayFixture::default();
        fixture.type_name = "replay_fixture".to_string();
        fixture.expected_status = "completed".to_string();

        let mut record = RunRecord::default();
        record.type_name = "workflow_run".to_string();
        record.id = "run_1".to_string();
        record.workflow_id = "workflow_1".to_string();
        record.status = status.to_string();
        record.usage = Some(LlmUsageRecord {
            total_duration_ms: 12,
            total_cost: 0.01,
            input_tokens: 3,
            output_tokens: 4,
            call_count: 1,
            models: vec!["mock".to_string()],
        });
        record.replay_fixture = Some(fixture);
        record
    }

    #[test]
    fn eval_pack_manifest_toml_runs_replay_case() {
        // Lay out a run record plus a TOML manifest with one deterministic rubric.
        let workspace = tempfile::tempdir().unwrap();
        let run_file = workspace.path().join("run.json");
        let run_json = serde_json::to_string(&minimal_run("completed")).unwrap();
        fs::write(&run_file, run_json).unwrap();
        let manifest_path = workspace.path().join("harn.eval.toml");
        let manifest_toml = r#"
version = 1
id = "connector-regressions"
name = "Connector regressions"

[[cases]]
id = "webhook"
name = "Webhook normalization"
run = "run.json"
rubrics = ["status"]

[[rubrics]]
id = "status"
kind = "deterministic"

[[rubrics.assertions]]
kind = "run-status"
expected = "completed"
"#;
        fs::write(&manifest_path, manifest_toml).unwrap();

        let pack = load_eval_pack_manifest(&manifest_path).unwrap();
        let outcome = evaluate_eval_pack_manifest(&pack).unwrap();

        // A completed run against a run-status rubric passes the single case.
        assert!(outcome.pass);
        assert_eq!(outcome.total, 1);
        assert_eq!(outcome.cases[0].label, "Webhook normalization");
    }

    #[test]
    fn eval_pack_warning_case_does_not_block() {
        // A warning-severity case that trips its latency budget should be
        // reported but must not fail the pack overall.
        let workspace = tempfile::tempdir().unwrap();
        let run_file = workspace.path().join("run.json");
        let run_json = serde_json::to_string(&minimal_run("completed")).unwrap();
        fs::write(&run_file, run_json).unwrap();
        let manifest_path = workspace.path().join("harn.eval.toml");
        let manifest_toml = r#"
version = 1
id = "budgets"

[[cases]]
id = "latency-budget"
run = "run.json"
severity = "warning"

[cases.thresholds]
max-latency-ms = 1
"#;
        fs::write(&manifest_path, manifest_toml).unwrap();

        let pack = load_eval_pack_manifest(&manifest_path).unwrap();
        let outcome = evaluate_eval_pack_manifest(&pack).unwrap();

        assert!(outcome.pass);
        assert_eq!(outcome.warning_failed, 1);
        assert!(outcome.cases[0].warnings[0].contains("latency"));
    }

    #[test]
    fn eval_pack_manifest_runs_friction_context_pack_case() {
        // Friction-event fixtures feed a context-pack suggestion rubric
        // instead of a replayed run.
        let workspace = tempfile::tempdir().unwrap();
        let friction_file = workspace.path().join("incident-friction.json");
        let friction_json = r#"
{
  "events": [
    {
      "kind": "repeated_query",
      "source": "incident-triage",
      "actor": "sre",
      "tool": "splunk",
      "provider": "splunk",
      "redacted_summary": "Checkout incidents need the same Splunk search",
      "recurrence_hints": ["checkout incident queries"],
      "estimated_time_ms": 300000,
      "metadata": {
        "query": "index=checkout service=api error",
        "capability": "splunk.search",
        "secret_ref": "SPLUNK_READ_TOKEN",
        "output_slot": "splunk_errors"
      }
    },
    {
      "kind": "repeated_query",
      "source": "incident-triage",
      "actor": "sre",
      "tool": "splunk",
      "provider": "splunk",
      "redacted_summary": "Checkout incident triage repeated the Splunk search",
      "recurrence_hints": ["checkout incident queries"],
      "estimated_time_ms": 240000,
      "metadata": {
        "query": "index=checkout service=api error",
        "capability": "splunk.search",
        "secret_ref": "SPLUNK_READ_TOKEN",
        "output_slot": "splunk_errors"
      }
    }
  ]
}
"#;
        fs::write(&friction_file, friction_json).unwrap();
        let manifest_path = workspace.path().join("harn.eval.toml");
        let manifest_toml = r#"
version = 1
id = "team-learning"
name = "Team learning evals"

[[fixtures]]
id = "incident-friction"
kind = "friction-events"
path = "incident-friction.json"

[[cases]]
id = "incident-context-pack"
name = "Incident context pack suggestion"
friction_events = "incident-friction"
rubrics = ["context-pack"]

[[rubrics]]
id = "context-pack"
kind = "friction"

[[rubrics.assertions]]
kind = "context-pack-suggestion"
contains = "incident"
expected = { min_suggestions = 1, recommended_artifact = "context_pack", required_capability = "splunk.search", required_output_slot = "splunk_errors" }
"#;
        fs::write(&manifest_path, manifest_toml).unwrap();

        let pack = load_eval_pack_manifest(&manifest_path).unwrap();
        let outcome = evaluate_eval_pack_manifest(&pack).unwrap();

        assert!(outcome.pass);
        assert_eq!(outcome.total, 1);
        assert_eq!(outcome.cases[0].run_id, "friction_events");
        assert_eq!(outcome.cases[0].stage_count, 2);
    }
}