// harn_vm/orchestration/records.rs
1//! Run records, replay fixtures, eval reports, and diff utilities.
2
3use std::collections::{BTreeMap, BTreeSet};
4use std::path::{Path, PathBuf};
5
6use serde::{Deserialize, Serialize};
7
8use super::{
9    default_run_dir, evaluate_context_pack_suggestion_expectations,
10    generate_context_pack_suggestions, new_id, normalize_friction_events_json, now_rfc3339,
11    parse_json_payload, parse_json_value, sync_run_handoffs, ArtifactRecord, CapabilityPolicy,
12    ContextPackSuggestionExpectation, ContextPackSuggestionOptions, FrictionEvent, HandoffArtifact,
13};
14use crate::event_log::{
15    active_event_log, AnyEventLog, EventLog, LogEvent as EventLogRecord, Topic,
16};
17use crate::llm::vm_value_to_json;
18use crate::triggers::{SignatureStatus, TriggerEvent};
19use crate::value::{VmError, VmValue};
20
// Node kinds emitted into a run's action graph (stored in
// `RunActionGraphNodeRecord.kind`).
pub const ACTION_GRAPH_NODE_KIND_RUN: &str = "run";
pub const ACTION_GRAPH_NODE_KIND_TRIGGER: &str = "trigger";
pub const ACTION_GRAPH_NODE_KIND_PREDICATE: &str = "predicate";
pub const ACTION_GRAPH_NODE_KIND_TRIGGER_PREDICATE: &str = "trigger_predicate";
pub const ACTION_GRAPH_NODE_KIND_STAGE: &str = "stage";
pub const ACTION_GRAPH_NODE_KIND_WORKER: &str = "worker";
pub const ACTION_GRAPH_NODE_KIND_DISPATCH: &str = "dispatch";
pub const ACTION_GRAPH_NODE_KIND_A2A_HOP: &str = "a2a_hop";
pub const ACTION_GRAPH_NODE_KIND_WORKER_ENQUEUE: &str = "worker_enqueue";
pub const ACTION_GRAPH_NODE_KIND_RETRY: &str = "retry";
pub const ACTION_GRAPH_NODE_KIND_DLQ: &str = "dlq";

// Edge kinds linking action-graph nodes (stored in
// `RunActionGraphEdgeRecord.kind`).
pub const ACTION_GRAPH_EDGE_KIND_ENTRY: &str = "entry";
pub const ACTION_GRAPH_EDGE_KIND_TRIGGER_DISPATCH: &str = "trigger_dispatch";
pub const ACTION_GRAPH_EDGE_KIND_A2A_DISPATCH: &str = "a2a_dispatch";
pub const ACTION_GRAPH_EDGE_KIND_PREDICATE_GATE: &str = "predicate_gate";
pub const ACTION_GRAPH_EDGE_KIND_REPLAY_CHAIN: &str = "replay_chain";
pub const ACTION_GRAPH_EDGE_KIND_TRANSITION: &str = "transition";
pub const ACTION_GRAPH_EDGE_KIND_DELEGATES: &str = "delegates";
pub const ACTION_GRAPH_EDGE_KIND_RETRY: &str = "retry";
pub const ACTION_GRAPH_EDGE_KIND_DLQ_MOVE: &str = "dlq_move";
42
/// Aggregated LLM usage (tokens, wall time, cost) across one or more calls.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct LlmUsageRecord {
    pub input_tokens: i64,
    pub output_tokens: i64,
    pub total_duration_ms: i64,
    pub call_count: i64,
    /// Accumulated cost — presumably in USD; confirm against the provider layer.
    pub total_cost: f64,
    /// Model identifiers that contributed to these totals.
    pub models: Vec<String>,
}
53
/// Persisted outcome of a single stage executed within a run.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunStageRecord {
    pub id: String,
    /// Workflow-graph node this stage executed.
    pub node_id: String,
    pub kind: String,
    pub status: String,
    pub outcome: String,
    /// Branch taken out of this stage, when the node branches.
    pub branch: Option<String>,
    pub started_at: String,
    pub finished_at: Option<String>,
    /// User-visible output text, if any (contrast `private_reasoning`).
    pub visible_text: Option<String>,
    pub private_reasoning: Option<String>,
    pub transcript: Option<serde_json::Value>,
    pub verification: Option<serde_json::Value>,
    pub usage: Option<LlmUsageRecord>,
    pub artifacts: Vec<ArtifactRecord>,
    pub consumed_artifact_ids: Vec<String>,
    pub produced_artifact_ids: Vec<String>,
    /// Per-attempt history when the stage was retried.
    pub attempts: Vec<RunStageAttemptRecord>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
76
/// One retry attempt of a stage, recorded inside `RunStageRecord.attempts`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunStageAttemptRecord {
    /// 0-or-1-based attempt index — ordinal not visible here; confirm at the writer.
    pub attempt: usize,
    pub status: String,
    pub outcome: String,
    pub branch: Option<String>,
    /// Error message when the attempt failed.
    pub error: Option<String>,
    pub verification: Option<serde_json::Value>,
    pub started_at: String,
    pub finished_at: Option<String>,
}
89
/// Edge traversal recorded as the run moved from one node to the next.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunTransitionRecord {
    pub id: String,
    /// Absent for the run's entry transition.
    pub from_stage_id: Option<String>,
    pub from_node_id: Option<String>,
    pub to_node_id: String,
    pub branch: Option<String>,
    pub timestamp: String,
    pub consumed_artifact_ids: Vec<String>,
    pub produced_artifact_ids: Vec<String>,
}
102
/// Snapshot of run progress sufficient to resume scheduling.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunCheckpointRecord {
    pub id: String,
    /// Nodes eligible to run at checkpoint time.
    pub ready_nodes: Vec<String>,
    pub completed_nodes: Vec<String>,
    pub last_stage_id: Option<String>,
    pub persisted_at: String,
    /// Why the checkpoint was taken.
    pub reason: String,
}
113
/// Serialized fixture describing how a recorded run should behave on replay.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayFixture {
    /// Type discriminator, serialized as `_type`.
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    /// Run the fixture was captured from.
    pub source_run_id: String,
    pub workflow_id: String,
    pub workflow_name: Option<String>,
    pub created_at: String,
    pub eval_kind: Option<String>,
    /// Extra spec applied when the eval is a clarifying-question eval.
    pub clarifying_question: Option<ClarifyingQuestionEvalSpec>,
    pub expected_status: String,
    pub stage_assertions: Vec<ReplayStageAssertion>,
}
129
/// Expectations for a clarifying-question eval (question text and count bounds).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ClarifyingQuestionEvalSpec {
    pub expected_question: Option<String>,
    /// Alternative acceptable question texts.
    pub accepted_questions: Vec<String>,
    pub required_terms: Vec<String>,
    pub forbidden_terms: Vec<String>,
    /// Lower bound on question count; clamped to at least 1 by `clarifying_min_questions`.
    pub min_questions: usize,
    /// Upper bound; `clarifying_max_questions` defaults this to 1 when unset.
    pub max_questions: Option<usize>,
}
140
/// Per-stage expectation checked when a fixture is replayed.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayStageAssertion {
    pub node_id: String,
    pub expected_status: String,
    pub expected_outcome: String,
    pub expected_branch: Option<String>,
    /// Artifact kinds the stage must have produced.
    pub required_artifact_kinds: Vec<String>,
    /// Substring that must appear in the stage's visible text.
    pub visible_text_contains: Option<String>,
}
151
/// Result of evaluating a single replay against its fixture.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayEvalReport {
    pub pass: bool,
    /// Human-readable failure descriptions; empty when `pass` is true.
    pub failures: Vec<String>,
    pub stage_count: usize,
}
159
/// One case's result inside a replay eval suite.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayEvalCaseReport {
    pub run_id: String,
    pub workflow_id: String,
    pub label: Option<String>,
    pub pass: bool,
    pub failures: Vec<String>,
    pub stage_count: usize,
    /// Where the case's run/fixture was loaded from, when known.
    pub source_path: Option<String>,
    /// Diff against a comparison run, when one was requested.
    pub comparison: Option<RunDiffReport>,
}
172
/// Aggregate result over all cases of a replay eval suite.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayEvalSuiteReport {
    pub pass: bool,
    pub total: usize,
    pub passed: usize,
    pub failed: usize,
    pub cases: Vec<ReplayEvalCaseReport>,
}
182
/// Summary of a single deliverable tracked by the task ledger.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunDeliverableSummaryRecord {
    pub id: String,
    pub text: String,
    pub status: String,
    pub note: Option<String>,
}
191
/// Condensed view of a planner's task ledger at the end of a round.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunTaskLedgerSummaryRecord {
    pub root_task: String,
    pub rationale: String,
    pub deliverables: Vec<RunDeliverableSummaryRecord>,
    pub observations: Vec<String>,
    /// Number of deliverables currently blocking completion.
    pub blocking_count: usize,
}
201
/// Observability counters for one planner round (a planning stage execution).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunPlannerRoundRecord {
    pub stage_id: String,
    pub node_id: String,
    pub stage_kind: String,
    pub status: String,
    pub outcome: String,
    pub iteration_count: usize,
    pub llm_call_count: usize,
    pub tool_execution_count: usize,
    pub tool_rejection_count: usize,
    /// Human/HITL interventions during the round.
    pub intervention_count: usize,
    /// Context compactions performed during the round.
    pub compaction_count: usize,
    pub native_text_tool_fallback_count: usize,
    pub native_text_tool_fallback_rejection_count: usize,
    pub empty_completion_retry_count: usize,
    pub tools_used: Vec<String>,
    pub successful_tools: Vec<String>,
    /// Times a "done" claim was rejected against the ledger.
    pub ledger_done_rejections: usize,
    pub task_ledger: Option<RunTaskLedgerSummaryRecord>,
    pub research_facts: Vec<String>,
}
225
/// Lineage entry tying a spawned worker back to its parent stage/session.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunWorkerLineageRecord {
    pub worker_id: String,
    pub worker_name: String,
    pub parent_stage_id: Option<String>,
    pub task: String,
    pub status: String,
    pub session_id: Option<String>,
    pub parent_session_id: Option<String>,
    /// Child run id/path, when the worker produced its own run record.
    pub run_id: Option<String>,
    pub run_path: Option<String>,
    pub snapshot_path: Option<String>,
}
240
/// Node of the rendered action graph; `kind` is one of the
/// `ACTION_GRAPH_NODE_KIND_*` constants.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunActionGraphNodeRecord {
    pub id: String,
    pub label: String,
    pub kind: String,
    pub status: String,
    pub outcome: String,
    pub trace_id: Option<String>,
    /// Backreferences to the stage/node/worker/run this graph node represents.
    pub stage_id: Option<String>,
    pub node_id: Option<String>,
    pub worker_id: Option<String>,
    pub run_id: Option<String>,
    pub run_path: Option<String>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
257
/// Directed edge of the action graph; `kind` is one of the
/// `ACTION_GRAPH_EDGE_KIND_*` constants.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunActionGraphEdgeRecord {
    pub from_id: String,
    pub to_id: String,
    pub kind: String,
    pub label: Option<String>,
}
266
/// Result of a verification step attached to a stage.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunVerificationOutcomeRecord {
    pub stage_id: String,
    pub node_id: String,
    pub status: String,
    /// `None` when verification produced no pass/fail verdict.
    pub passed: Option<bool>,
    pub summary: Option<String>,
}
276
/// Pointer to a transcript stored elsewhere (inline, file, asset store, …).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunTranscriptPointerRecord {
    pub id: String,
    pub label: String,
    pub kind: String,
    /// Storage backend descriptor; exact vocabulary defined by the writer.
    pub location: String,
    pub path: Option<String>,
    /// Whether the transcript could actually be resolved at record time.
    pub available: bool,
}
287
/// Record of a context-compaction event, including the pre/post token estimate
/// and a pointer to the snapshot of the archived messages.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct CompactionEventRecord {
    pub id: String,
    pub transcript_id: Option<String>,
    pub stage_id: Option<String>,
    pub node_id: Option<String>,
    pub mode: String,
    pub strategy: String,
    /// Number of messages moved out of the live context.
    pub archived_messages: usize,
    pub estimated_tokens_before: usize,
    pub estimated_tokens_after: usize,
    pub snapshot_asset_id: Option<String>,
    pub snapshot_location: String,
    pub snapshot_path: Option<String>,
    /// Whether the snapshot was resolvable at record time.
    pub available: bool,
}
305
/// Lifecycle event kinds for a daemon, serialized in `snake_case`
/// (e.g. `"spawned"`).
#[derive(Clone, Copy, Debug, Default, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
#[serde(rename_all = "snake_case")]
pub enum DaemonEventKindRecord {
    #[default]
    Spawned,
    Triggered,
    Snapshotted,
    Resumed,
    Stopped,
}
316
/// One daemon lifecycle event captured into run observability.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct DaemonEventRecord {
    pub daemon_id: String,
    pub name: String,
    pub kind: DaemonEventKindRecord,
    pub timestamp: String,
    /// Where the daemon's state is persisted.
    pub persist_path: String,
    pub payload_summary: Option<String>,
}
327
/// Bundle of all observability data captured for a run (planner rounds,
/// action graph, lineage, verification, transcripts, compactions, daemons).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunObservabilityRecord {
    /// Version of this record's layout, for forward-compatible readers.
    pub schema_version: usize,
    pub planner_rounds: Vec<RunPlannerRoundRecord>,
    pub research_fact_count: usize,
    pub action_graph_nodes: Vec<RunActionGraphNodeRecord>,
    pub action_graph_edges: Vec<RunActionGraphEdgeRecord>,
    pub worker_lineage: Vec<RunWorkerLineageRecord>,
    pub verification_outcomes: Vec<RunVerificationOutcomeRecord>,
    pub transcript_pointers: Vec<RunTranscriptPointerRecord>,
    pub compaction_events: Vec<CompactionEventRecord>,
    pub daemon_events: Vec<DaemonEventRecord>,
}
342
/// Difference found for one stage when comparing two runs.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunStageDiffRecord {
    pub node_id: String,
    /// Kind of change (vocabulary defined by the diffing code).
    pub change: String,
    pub details: Vec<String>,
}
350
/// Difference in a recorded tool call between two runs, matched by
/// tool name + argument hash.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ToolCallDiffRecord {
    pub tool_name: String,
    pub args_hash: String,
    pub result_changed: bool,
    pub left_result: Option<String>,
    pub right_result: Option<String>,
}
360
/// Difference found in a run's observability data, grouped by section.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunObservabilityDiffRecord {
    pub section: String,
    pub label: String,
    pub details: Vec<String>,
}
368
/// Full comparison of two runs: status, per-stage diffs, tool-call diffs,
/// observability diffs, and count deltas (right minus left, presumably —
/// confirm sign convention at the diff producer).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunDiffReport {
    pub left_run_id: String,
    pub right_run_id: String,
    pub identical: bool,
    pub status_changed: bool,
    pub left_status: String,
    pub right_status: String,
    pub stage_diffs: Vec<RunStageDiffRecord>,
    pub tool_diffs: Vec<ToolCallDiffRecord>,
    pub observability_diffs: Vec<RunObservabilityDiffRecord>,
    pub transition_count_delta: isize,
    pub artifact_count_delta: isize,
    pub checkpoint_count_delta: isize,
}
385
/// Manifest listing the cases of a replay eval suite.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalSuiteManifest {
    /// Type discriminator, serialized as `_type`.
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub name: Option<String>,
    /// Directory that relative case paths are resolved against.
    pub base_dir: Option<String>,
    pub cases: Vec<EvalSuiteCase>,
}
396
/// One case in an eval suite manifest: a run plus optional fixture/baseline.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalSuiteCase {
    pub label: Option<String>,
    pub run_path: String,
    pub fixture_path: Option<String>,
    /// Path of a baseline run to diff against.
    pub compare_to: Option<String>,
}
405
/// Top-level manifest of an eval pack: fixtures, rubrics, judge config and
/// the cases to evaluate.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackManifest {
    pub version: u32,
    pub id: String,
    pub name: Option<String>,
    pub description: Option<String>,
    /// Directory that relative paths in the pack are resolved against.
    pub base_dir: Option<String>,
    pub package: Option<EvalPackPackage>,
    /// Defaults applied to cases that don't override them.
    pub defaults: EvalPackDefaults,
    pub fixtures: Vec<EvalPackFixtureRef>,
    pub rubrics: Vec<EvalPackRubric>,
    /// Pack-wide judge configuration; rubrics/defaults may override.
    pub judge: Option<EvalPackJudgeConfig>,
    pub cases: Vec<EvalPackCase>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
422
/// Packaging info for an eval pack (name, version, source, templates).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalPackPackage {
    pub name: Option<String>,
    pub version: Option<String>,
    pub source: Option<String>,
    pub templates: Vec<String>,
}
431
/// Pack-level defaults inherited by cases and rubrics.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackDefaults {
    pub severity: Option<String>,
    /// Root directory for resolving fixture paths.
    pub fixture_root: Option<String>,
    pub thresholds: EvalPackThresholds,
    pub judge: Option<EvalPackJudgeConfig>,
}
440
/// Reference to a fixture used by an eval pack, either on disk (`path`) or
/// embedded (`inline`). Kebab-case aliases accept hand-written manifests.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackFixtureRef {
    pub id: String,
    pub kind: String,
    pub path: Option<String>,
    #[serde(default, alias = "trace-id")]
    pub trace_id: Option<String>,
    pub provider: Option<String>,
    #[serde(default, alias = "event-kind")]
    pub event_kind: Option<String>,
    /// Inline fixture payload, used instead of `path` when present.
    pub inline: Option<serde_json::Value>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
455
/// A rubric in an eval pack: either mechanical assertions or an LLM-judged
/// prompt, with optional calibration examples and thresholds.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackRubric {
    pub id: String,
    pub kind: String,
    pub description: Option<String>,
    /// Judge prompt, for LLM-evaluated rubrics.
    pub prompt: Option<String>,
    pub assertions: Vec<EvalPackAssertion>,
    /// Rubric-specific judge override.
    pub judge: Option<EvalPackJudgeConfig>,
    /// Golden examples used to calibrate the judge.
    pub calibration: Vec<EvalPackGoldenExample>,
    pub thresholds: EvalPackThresholds,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
469
/// Single mechanical assertion inside a rubric (path/op/expected-value form).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackAssertion {
    pub kind: String,
    /// Stage the assertion targets, when stage-scoped.
    pub stage: Option<String>,
    /// Path into the record being asserted on — syntax defined by the evaluator.
    pub path: Option<String>,
    pub op: Option<String>,
    pub expected: Option<serde_json::Value>,
    /// Substring check alternative to `op`/`expected`.
    pub contains: Option<String>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
481
/// Configuration of the LLM judge (model, prompt version, tie-breaking,
/// confidence floor). Kebab-case aliases accept hand-written manifests.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackJudgeConfig {
    pub model: Option<String>,
    #[serde(default, alias = "prompt-version")]
    pub prompt_version: Option<String>,
    #[serde(default, alias = "tie-break")]
    pub tie_break: Option<String>,
    /// Minimum judge confidence for a verdict to count.
    #[serde(default, alias = "confidence-min")]
    pub confidence_min: Option<f64>,
    pub temperature: Option<f64>,
    /// Rubric id this judge applies to, when bound to one.
    pub rubric: Option<String>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
496
/// Calibration example for a judged rubric: input/output pair with the
/// expected score and an optional explanation.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackGoldenExample {
    pub input: serde_json::Value,
    pub output: serde_json::Value,
    pub score: Option<f64>,
    pub explanation: Option<String>,
}
505
/// Pass/fail thresholds applicable at pack, rubric, or case level.
/// Kebab-case aliases accept hand-written manifests; unset fields mean
/// "no limit".
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackThresholds {
    pub severity: Option<String>,
    #[serde(default, alias = "min-score")]
    pub min_score: Option<f64>,
    #[serde(default, alias = "min-confidence")]
    pub min_confidence: Option<f64>,
    #[serde(default, alias = "max-cost-usd")]
    pub max_cost_usd: Option<f64>,
    #[serde(default, alias = "max-latency-ms")]
    pub max_latency_ms: Option<i64>,
    #[serde(default, alias = "max-tokens")]
    pub max_tokens: Option<i64>,
    #[serde(default, alias = "max-stage-count")]
    pub max_stage_count: Option<usize>,
}
523
/// One case in an eval pack: which run/fixture to evaluate, which rubrics to
/// apply, and case-level severity/threshold overrides. Field aliases accept
/// both kebab-case and snake_case manifests.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackCase {
    pub id: Option<String>,
    pub name: Option<String>,
    pub description: Option<String>,
    /// Run reference by id — presumably resolved against the run store; confirm.
    pub run: Option<String>,
    #[serde(default, alias = "run-path")]
    pub run_path: Option<String>,
    /// Path to a friction-events file for this case.
    #[serde(default, alias = "friction-events", alias = "friction_events")]
    pub friction_events: Option<String>,
    pub fixture: Option<String>,
    #[serde(default, alias = "fixture-path")]
    pub fixture_path: Option<String>,
    /// Baseline run to diff against.
    #[serde(default, alias = "compare-to")]
    pub compare_to: Option<String>,
    /// Rubric ids applied to this case.
    pub rubrics: Vec<String>,
    pub severity: Option<String>,
    pub thresholds: EvalPackThresholds,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
545
/// Aggregate report for a whole eval pack, with failures broken down by
/// severity class.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalPackReport {
    pub pack_id: String,
    pub pass: bool,
    pub total: usize,
    pub passed: usize,
    pub failed: usize,
    pub blocking_failed: usize,
    pub warning_failed: usize,
    pub informational_failed: usize,
    pub cases: Vec<EvalPackCaseReport>,
}
559
/// Per-case result of an eval pack run, with findings split into
/// failures / warnings / informational notes.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalPackCaseReport {
    pub id: String,
    pub label: String,
    pub severity: String,
    pub pass: bool,
    /// Whether a failure here fails the whole pack.
    pub blocking: bool,
    pub run_id: String,
    pub workflow_id: String,
    pub source_path: Option<String>,
    pub stage_count: usize,
    pub failures: Vec<String>,
    pub warnings: Vec<String>,
    pub informational: Vec<String>,
    /// Diff against the baseline run, when `compare_to` was set.
    pub comparison: Option<RunDiffReport>,
}
577
/// A human-in-the-loop question asked during a run (merged from the event
/// log by `merge_hitl_questions_from_active_log`).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunHitlQuestionRecord {
    /// Unique id of the question request; used as the dedup key.
    pub request_id: String,
    pub prompt: String,
    pub agent: String,
    pub trace_id: Option<String>,
    pub asked_at: String,
}
587
/// Complete persisted record of one workflow run: stages, transitions,
/// checkpoints, child runs, artifacts, policy, usage and observability.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunRecord {
    /// Type discriminator, serialized as `_type`.
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub workflow_id: String,
    pub workflow_name: Option<String>,
    pub task: String,
    pub status: String,
    pub started_at: String,
    pub finished_at: Option<String>,
    /// Set when this run was spawned by another run (worker delegation).
    pub parent_run_id: Option<String>,
    pub root_run_id: Option<String>,
    pub stages: Vec<RunStageRecord>,
    pub transitions: Vec<RunTransitionRecord>,
    pub checkpoints: Vec<RunCheckpointRecord>,
    pub pending_nodes: Vec<String>,
    pub completed_nodes: Vec<String>,
    pub child_runs: Vec<RunChildRecord>,
    pub artifacts: Vec<ArtifactRecord>,
    pub handoffs: Vec<HandoffArtifact>,
    pub policy: CapabilityPolicy,
    pub execution: Option<RunExecutionRecord>,
    pub transcript: Option<serde_json::Value>,
    pub usage: Option<LlmUsageRecord>,
    pub replay_fixture: Option<ReplayFixture>,
    pub observability: Option<RunObservabilityRecord>,
    pub trace_spans: Vec<RunTraceSpanRecord>,
    pub tool_recordings: Vec<ToolCallRecord>,
    pub hitl_questions: Vec<RunHitlQuestionRecord>,
    pub metadata: BTreeMap<String, serde_json::Value>,
    /// Where this record was written on disk, once persisted.
    pub persisted_path: Option<String>,
}
622
/// Recording of a single tool invocation, keyed for fixture lookup by
/// tool name + `args_hash` (see `tool_fixture_hash`).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ToolCallRecord {
    pub tool_name: String,
    pub tool_use_id: String,
    /// Hash of the canonical argument JSON (see `tool_fixture_hash`).
    pub args_hash: String,
    pub result: String,
    /// True when the call was rejected rather than executed.
    pub is_rejected: bool,
    pub duration_ms: u64,
    /// Planner iteration in which the call was made.
    pub iteration: usize,
    pub timestamp: String,
}
635
636/// Hash a tool invocation for fixture lookup (name + canonical args JSON).
637pub fn tool_fixture_hash(tool_name: &str, args: &serde_json::Value) -> String {
638    use std::hash::{Hash, Hasher};
639    let mut hasher = std::collections::hash_map::DefaultHasher::new();
640    tool_name.hash(&mut hasher);
641    let args_str = serde_json::to_string(args).unwrap_or_default();
642    args_str.hash(&mut hasher);
643    format!("{:016x}", hasher.finish())
644}
645
/// Flattened trace span captured for a run; parents are referenced by id.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunTraceSpanRecord {
    pub span_id: u64,
    /// `None` for root spans.
    pub parent_id: Option<u64>,
    pub kind: String,
    pub name: String,
    /// Start offset in milliseconds — epoch/run-relative not visible here; confirm.
    pub start_ms: u64,
    pub duration_ms: u64,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
657
/// Record of a child run spawned by a worker, built from worker metadata
/// (see `run_child_record_from_worker_metadata`).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunChildRecord {
    pub worker_id: String,
    pub worker_name: String,
    /// Stage on the parent run that spawned this worker.
    pub parent_stage_id: Option<String>,
    pub session_id: Option<String>,
    pub parent_session_id: Option<String>,
    pub mutation_scope: Option<String>,
    pub approval_policy: Option<super::ToolApprovalPolicy>,
    pub task: String,
    /// Raw request payload handed to the worker, when recorded.
    pub request: Option<serde_json::Value>,
    pub provenance: Option<serde_json::Value>,
    pub status: String,
    pub started_at: String,
    pub finished_at: Option<String>,
    /// The child run's own id/path, when it produced a run record.
    pub run_id: Option<String>,
    pub run_path: Option<String>,
    pub snapshot_path: Option<String>,
    pub execution: Option<RunExecutionRecord>,
}
679
680pub(crate) fn run_child_record_from_worker_metadata(
681    parent_stage_id: Option<String>,
682    worker: &serde_json::Value,
683) -> Option<RunChildRecord> {
684    let worker_id = worker.get("id").and_then(|value| value.as_str())?;
685    if worker_id.is_empty() {
686        return None;
687    }
688    Some(RunChildRecord {
689        worker_id: worker_id.to_string(),
690        worker_name: worker
691            .get("name")
692            .and_then(|value| value.as_str())
693            .unwrap_or("worker")
694            .to_string(),
695        parent_stage_id,
696        session_id: worker
697            .get("audit")
698            .and_then(|value| value.get("session_id"))
699            .and_then(|value| value.as_str())
700            .map(str::to_string),
701        parent_session_id: worker
702            .get("audit")
703            .and_then(|value| value.get("parent_session_id"))
704            .and_then(|value| value.as_str())
705            .map(str::to_string),
706        mutation_scope: worker
707            .get("audit")
708            .and_then(|value| value.get("mutation_scope"))
709            .and_then(|value| value.as_str())
710            .map(str::to_string),
711        approval_policy: worker
712            .get("audit")
713            .and_then(|value| value.get("approval_policy"))
714            .and_then(|value| {
715                serde_json::from_value::<super::ToolApprovalPolicy>(value.clone()).ok()
716            }),
717        task: worker
718            .get("task")
719            .and_then(|value| value.as_str())
720            .unwrap_or_default()
721            .to_string(),
722        request: worker.get("request").cloned(),
723        provenance: worker.get("provenance").cloned(),
724        status: worker
725            .get("status")
726            .and_then(|value| value.as_str())
727            .unwrap_or("completed")
728            .to_string(),
729        started_at: worker
730            .get("started_at")
731            .and_then(|value| value.as_str())
732            .unwrap_or_default()
733            .to_string(),
734        finished_at: worker
735            .get("finished_at")
736            .and_then(|value| value.as_str())
737            .map(str::to_string),
738        run_id: worker
739            .get("child_run_id")
740            .and_then(|value| value.as_str())
741            .map(str::to_string),
742        run_path: worker
743            .get("child_run_path")
744            .and_then(|value| value.as_str())
745            .map(str::to_string),
746        snapshot_path: worker
747            .get("snapshot_path")
748            .and_then(|value| value.as_str())
749            .map(str::to_string),
750        execution: worker
751            .get("execution")
752            .cloned()
753            .and_then(|value| serde_json::from_value(value).ok()),
754    })
755}
756
757fn run_child_from_stage_metadata(stage: &RunStageRecord) -> Option<RunChildRecord> {
758    let parent_stage_id = if stage.id.is_empty() {
759        None
760    } else {
761        Some(stage.id.clone())
762    };
763    run_child_record_from_worker_metadata(parent_stage_id, stage.metadata.get("worker")?)
764}
765
766fn fill_missing_child_run_fields(existing: &mut RunChildRecord, child: RunChildRecord) {
767    if existing.worker_name.is_empty() {
768        existing.worker_name = child.worker_name;
769    }
770    if existing.parent_stage_id.is_none() {
771        existing.parent_stage_id = child.parent_stage_id;
772    }
773    if existing.session_id.is_none() {
774        existing.session_id = child.session_id;
775    }
776    if existing.parent_session_id.is_none() {
777        existing.parent_session_id = child.parent_session_id;
778    }
779    if existing.mutation_scope.is_none() {
780        existing.mutation_scope = child.mutation_scope;
781    }
782    if existing.approval_policy.is_none() {
783        existing.approval_policy = child.approval_policy;
784    }
785    if existing.task.is_empty() {
786        existing.task = child.task;
787    }
788    if existing.request.is_none() {
789        existing.request = child.request;
790    }
791    if existing.provenance.is_none() {
792        existing.provenance = child.provenance;
793    }
794    if existing.status.is_empty() {
795        existing.status = child.status;
796    }
797    if existing.started_at.is_empty() {
798        existing.started_at = child.started_at;
799    }
800    if existing.finished_at.is_none() {
801        existing.finished_at = child.finished_at;
802    }
803    if existing.run_id.is_none() {
804        existing.run_id = child.run_id;
805    }
806    if existing.run_path.is_none() {
807        existing.run_path = child.run_path;
808    }
809    if existing.snapshot_path.is_none() {
810        existing.snapshot_path = child.snapshot_path;
811    }
812    if existing.execution.is_none() {
813        existing.execution = child.execution;
814    }
815}
816
817fn materialize_child_runs_from_stage_metadata(run: &mut RunRecord) {
818    for child in run
819        .stages
820        .iter()
821        .filter_map(run_child_from_stage_metadata)
822        .collect::<Vec<_>>()
823    {
824        match run
825            .child_runs
826            .iter_mut()
827            .find(|existing| existing.worker_id == child.worker_id)
828        {
829            Some(existing) => fill_missing_child_run_fields(existing, child),
830            None => run.child_runs.push(child),
831        }
832    }
833}
834
/// Execution environment a run (or child run) was launched in: working
/// directory, env, and git worktree details when an adapter used one.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunExecutionRecord {
    pub cwd: Option<String>,
    pub source_dir: Option<String>,
    pub env: BTreeMap<String, String>,
    pub adapter: Option<String>,
    pub repo_path: Option<String>,
    pub worktree_path: Option<String>,
    pub branch: Option<String>,
    pub base_ref: Option<String>,
    /// Cleanup policy for the worktree — vocabulary defined by the adapter.
    pub cleanup: Option<String>,
}
848
849fn compact_json_value(value: &serde_json::Value) -> String {
850    serde_json::to_string(value).unwrap_or_else(|_| value.to_string())
851}
852
/// Normalize question text for fuzzy comparison: lowercase ASCII
/// alphanumerics, turn every other non-whitespace character into a
/// separator, and collapse whitespace runs to single spaces.
fn normalize_question_text(text: &str) -> String {
    let mut scrubbed = String::with_capacity(text.len());
    for ch in text.chars() {
        if ch.is_ascii_alphanumeric() {
            scrubbed.push(ch.to_ascii_lowercase());
        } else if ch.is_whitespace() {
            scrubbed.push(ch);
        } else {
            // Punctuation and non-ASCII letters act as word separators.
            scrubbed.push(' ');
        }
    }
    scrubbed.split_whitespace().collect::<Vec<_>>().join(" ")
}
867
868fn clarifying_min_questions(spec: &ClarifyingQuestionEvalSpec) -> usize {
869    spec.min_questions.max(1)
870}
871
872fn clarifying_max_questions(spec: &ClarifyingQuestionEvalSpec) -> usize {
873    spec.max_questions.unwrap_or(1).max(1)
874}
875
/// Read every record currently stored under `topic`, paging through the log
/// in batches of up to 256 and blocking on each async read.
///
/// NOTE(review): termination assumes `read_range` treats `from` as "resume
/// after this event id"; if it were inclusive, a full repeated batch would
/// loop forever — confirm against the event-log contract.
fn read_topic_records(
    log: &AnyEventLog,
    topic: &Topic,
) -> Vec<(crate::event_log::EventId, EventLogRecord)> {
    let mut from = None;
    let mut records = Vec::new();
    loop {
        // Best-effort: a read error is treated like an empty batch and ends the scan.
        let batch =
            futures::executor::block_on(log.read_range(topic, from, 256)).unwrap_or_default();
        if batch.is_empty() {
            break;
        }
        // Advance the cursor to the last event id seen in this batch.
        from = batch.last().map(|(event_id, _)| *event_id);
        records.extend(batch);
    }
    records
}
893
/// Merge `hitl.question_asked` events from the active event log into
/// `run.hitl_questions`, deduplicating by `request_id` (log entries replace
/// pre-existing records with the same id) and sorting by
/// `(asked_at, request_id)` for deterministic output.
fn merge_hitl_questions_from_active_log(run: &mut RunRecord) {
    // Best-effort: without an active log, the run's own records stand as-is.
    let Some(log) = active_event_log() else {
        return;
    };
    let topic = Topic::new(crate::HITL_QUESTIONS_TOPIC)
        .expect("static hitl.questions topic should always be valid");
    // Index existing questions by request id so log entries can overwrite them.
    let mut merged = run
        .hitl_questions
        .iter()
        .cloned()
        .map(|question| (question.request_id.clone(), question))
        .collect::<BTreeMap<_, _>>();

    for (_, event) in read_topic_records(log.as_ref(), &topic) {
        if event.kind != "hitl.question_asked" {
            continue;
        }
        let payload = &event.payload;
        // The run id may arrive as an event header or inside the payload.
        let matches_run = event
            .headers
            .get("run_id")
            .is_some_and(|value| value == &run.id)
            || payload
                .get("run_id")
                .and_then(|value| value.as_str())
                .is_some_and(|value| value == run.id);
        if !matches_run {
            continue;
        }
        // request_id likewise may live in the payload or the headers.
        let request_id = payload
            .get("request_id")
            .and_then(|value| value.as_str())
            .or_else(|| event.headers.get("request_id").map(String::as_str))
            .unwrap_or_default();
        let prompt = payload
            .get("payload")
            .and_then(|value| value.get("prompt"))
            .and_then(|value| value.as_str())
            .unwrap_or_default();
        // Skip malformed events rather than recording empty questions.
        if request_id.is_empty() || prompt.is_empty() {
            continue;
        }
        merged.insert(
            request_id.to_string(),
            RunHitlQuestionRecord {
                request_id: request_id.to_string(),
                prompt: prompt.to_string(),
                agent: payload
                    .get("agent")
                    .and_then(|value| value.as_str())
                    .unwrap_or_default()
                    .to_string(),
                trace_id: payload
                    .get("trace_id")
                    .and_then(|value| value.as_str())
                    .map(str::to_string),
                asked_at: payload
                    .get("requested_at")
                    .and_then(|value| value.as_str())
                    .unwrap_or_default()
                    .to_string(),
            },
        );
    }

    // Deterministic ordering: oldest first, ties broken by request id.
    run.hitl_questions = merged.into_values().collect();
    run.hitl_questions.sort_by(|left, right| {
        (left.asked_at.as_str(), left.request_id.as_str())
            .cmp(&(right.asked_at.as_str(), right.request_id.as_str()))
    });
}
965
966fn signature_status_label(status: &SignatureStatus) -> &'static str {
967    match status {
968        SignatureStatus::Verified => "verified",
969        SignatureStatus::Unsigned => "unsigned",
970        SignatureStatus::Failed { .. } => "failed",
971    }
972}
973
974fn trigger_event_from_run(run: &RunRecord) -> Option<TriggerEvent> {
975    run.metadata
976        .get("trigger_event")
977        .cloned()
978        .and_then(|value| serde_json::from_value(value).ok())
979}
980
981fn run_trace_id(run: &RunRecord, trigger_event: Option<&TriggerEvent>) -> Option<String> {
982    trigger_event
983        .map(|event| event.trace_id.0.clone())
984        .or_else(|| {
985            run.metadata
986                .get("trace_id")
987                .and_then(|value| value.as_str())
988                .map(str::to_string)
989        })
990}
991
992fn replay_of_event_id_from_run(run: &RunRecord) -> Option<String> {
993    run.metadata
994        .get("replay_of_event_id")
995        .and_then(|value| value.as_str())
996        .map(str::to_string)
997}
998
999fn action_graph_kind_for_stage(stage: &RunStageRecord) -> &'static str {
1000    if stage.kind == "condition" {
1001        ACTION_GRAPH_NODE_KIND_PREDICATE
1002    } else {
1003        ACTION_GRAPH_NODE_KIND_STAGE
1004    }
1005}
1006
1007fn trigger_node_metadata(trigger_event: &TriggerEvent) -> BTreeMap<String, serde_json::Value> {
1008    let mut metadata = BTreeMap::new();
1009    metadata.insert(
1010        "provider".to_string(),
1011        serde_json::json!(trigger_event.provider.as_str()),
1012    );
1013    metadata.insert(
1014        "event_kind".to_string(),
1015        serde_json::json!(trigger_event.kind),
1016    );
1017    metadata.insert(
1018        "dedupe_key".to_string(),
1019        serde_json::json!(trigger_event.dedupe_key),
1020    );
1021    metadata.insert(
1022        "signature_status".to_string(),
1023        serde_json::json!(signature_status_label(&trigger_event.signature_status)),
1024    );
1025    metadata
1026}
1027
1028fn stage_node_metadata(stage: &RunStageRecord) -> BTreeMap<String, serde_json::Value> {
1029    let mut metadata = BTreeMap::new();
1030    metadata.insert("stage_kind".to_string(), serde_json::json!(stage.kind));
1031    if let Some(branch) = stage.branch.as_ref() {
1032        metadata.insert("branch".to_string(), serde_json::json!(branch));
1033    }
1034    if let Some(worker_id) = stage
1035        .metadata
1036        .get("worker_id")
1037        .and_then(|value| value.as_str())
1038    {
1039        metadata.insert("worker_id".to_string(), serde_json::json!(worker_id));
1040    }
1041    metadata
1042}
1043
1044fn append_action_graph_node(
1045    nodes: &mut Vec<RunActionGraphNodeRecord>,
1046    record: RunActionGraphNodeRecord,
1047) {
1048    nodes.push(record);
1049}
1050
1051pub async fn append_action_graph_update(
1052    headers: BTreeMap<String, String>,
1053    payload: serde_json::Value,
1054) -> Result<(), crate::event_log::LogError> {
1055    let Some(log) = active_event_log() else {
1056        return Ok(());
1057    };
1058    let topic = Topic::new("observability.action_graph")
1059        .expect("static observability.action_graph topic should always be valid");
1060    let record = EventLogRecord::new("action_graph_update", payload).with_headers(headers);
1061    log.append(&topic, record).await.map(|_| ())
1062}
1063
1064fn publish_action_graph_event(
1065    run: &RunRecord,
1066    observability: &RunObservabilityRecord,
1067    path: &Path,
1068) {
1069    let trigger_event = trigger_event_from_run(run);
1070    let mut headers = BTreeMap::new();
1071    headers.insert("run_id".to_string(), run.id.clone());
1072    headers.insert("workflow_id".to_string(), run.workflow_id.clone());
1073    if let Some(trace_id) = run_trace_id(run, trigger_event.as_ref()) {
1074        headers.insert("trace_id".to_string(), trace_id);
1075    }
1076    let payload = serde_json::json!({
1077        "run_id": run.id,
1078        "workflow_id": run.workflow_id,
1079        "persisted_path": path.to_string_lossy(),
1080        "status": run.status,
1081        "observability": observability,
1082    });
1083    if let Ok(handle) = tokio::runtime::Handle::try_current() {
1084        handle.spawn(async move {
1085            let _ = append_action_graph_update(headers, payload).await;
1086        });
1087    } else {
1088        let _ = futures::executor::block_on(append_action_graph_update(headers, payload));
1089    }
1090}
1091
/// Path of the JSONL LLM-transcript sidecar that accompanies a persisted run
/// file: `<dir>/<stem>-llm/llm_transcript.jsonl`.
///
/// Returns `None` when the run path has no UTF-8 file stem.
fn llm_transcript_sidecar_path(run_path: &Path) -> Option<PathBuf> {
    let stem = run_path.file_stem().and_then(|stem| stem.to_str())?;
    let directory = run_path.parent().unwrap_or_else(|| Path::new("."));
    Some(directory.join(format!("{stem}-llm/llm_transcript.jsonl")))
}
1097
1098fn json_string_array(value: Option<&serde_json::Value>) -> Vec<String> {
1099    value
1100        .and_then(|value| value.as_array())
1101        .map(|items| {
1102            items
1103                .iter()
1104                .filter_map(|item| item.as_str().map(str::to_string))
1105                .collect::<Vec<_>>()
1106        })
1107        .unwrap_or_default()
1108}
1109
1110fn json_usize(value: Option<&serde_json::Value>) -> usize {
1111    value.and_then(|value| value.as_u64()).unwrap_or_default() as usize
1112}
1113
1114fn json_bool(value: Option<&serde_json::Value>) -> Option<bool> {
1115    value.and_then(|value| value.as_bool())
1116}
1117
1118fn stage_result_payload(stage: &RunStageRecord) -> Option<&serde_json::Value> {
1119    stage
1120        .artifacts
1121        .iter()
1122        .find_map(|artifact| artifact.data.as_ref())
1123}
1124
1125fn task_ledger_summary_from_value(value: &serde_json::Value) -> Option<RunTaskLedgerSummaryRecord> {
1126    let deliverables = value
1127        .get("deliverables")
1128        .and_then(|raw| raw.as_array())
1129        .map(|items| {
1130            items
1131                .iter()
1132                .map(|item| RunDeliverableSummaryRecord {
1133                    id: item
1134                        .get("id")
1135                        .and_then(|value| value.as_str())
1136                        .unwrap_or_default()
1137                        .to_string(),
1138                    text: item
1139                        .get("text")
1140                        .and_then(|value| value.as_str())
1141                        .unwrap_or_default()
1142                        .to_string(),
1143                    status: item
1144                        .get("status")
1145                        .and_then(|value| value.as_str())
1146                        .unwrap_or_default()
1147                        .to_string(),
1148                    note: item
1149                        .get("note")
1150                        .and_then(|value| value.as_str())
1151                        .map(str::to_string),
1152                })
1153                .collect::<Vec<_>>()
1154        })
1155        .unwrap_or_default();
1156    let observations = json_string_array(value.get("observations"));
1157    let root_task = value
1158        .get("root_task")
1159        .and_then(|value| value.as_str())
1160        .unwrap_or_default()
1161        .to_string();
1162    let rationale = value
1163        .get("rationale")
1164        .and_then(|value| value.as_str())
1165        .unwrap_or_default()
1166        .to_string();
1167    if root_task.is_empty()
1168        && rationale.is_empty()
1169        && deliverables.is_empty()
1170        && observations.is_empty()
1171    {
1172        return None;
1173    }
1174    let blocking_count = deliverables
1175        .iter()
1176        .filter(|deliverable| matches!(deliverable.status.as_str(), "open" | "blocked"))
1177        .count();
1178    Some(RunTaskLedgerSummaryRecord {
1179        root_task,
1180        rationale,
1181        deliverables,
1182        observations,
1183        blocking_count,
1184    })
1185}
1186
/// Extract `kind == "compaction"` events from an embedded transcript JSON
/// value, resolving each event's snapshot asset against the transcript's
/// asset list.
///
/// `location_prefix` is a human-readable pointer into the run record (e.g.
/// `run.stages[node].transcript`) used to build each record's
/// `snapshot_location`; `stage_id`/`node_id` tag the records with their
/// originating stage when the transcript belongs to a stage rather than the
/// run; `persisted_path` becomes the record's `snapshot_path` when known.
fn compaction_events_from_transcript(
    transcript: &serde_json::Value,
    stage_id: Option<&str>,
    node_id: Option<&str>,
    location_prefix: &str,
    persisted_path: Option<&Path>,
) -> Vec<CompactionEventRecord> {
    let transcript_id = transcript
        .get("id")
        .and_then(|value| value.as_str())
        .map(str::to_string);
    // Set of asset ids present in this transcript; used below to decide
    // whether a compaction snapshot is actually retrievable (`available`).
    let asset_ids = transcript
        .get("assets")
        .and_then(|value| value.as_array())
        .map(|assets| {
            assets
                .iter()
                .filter_map(|asset| {
                    asset
                        .get("id")
                        .and_then(|value| value.as_str())
                        .map(str::to_string)
                })
                .collect::<BTreeSet<_>>()
        })
        .unwrap_or_default();
    transcript
        .get("events")
        .and_then(|value| value.as_array())
        .map(|events| {
            events
                .iter()
                .filter(|event| {
                    event.get("kind").and_then(|value| value.as_str()) == Some("compaction")
                })
                .map(|event| {
                    let metadata = event.get("metadata");
                    let snapshot_asset_id = metadata
                        .and_then(|value| value.get("snapshot_asset_id"))
                        .and_then(|value| value.as_str())
                        .map(str::to_string);
                    // Available only when the referenced asset actually
                    // exists in the transcript's asset list.
                    let available = snapshot_asset_id
                        .as_ref()
                        .is_some_and(|asset_id| asset_ids.contains(asset_id));
                    // Point at the specific asset when one is referenced,
                    // otherwise fall back to the transcript location itself.
                    let snapshot_location = snapshot_asset_id
                        .as_ref()
                        .map(|asset_id| format!("{location_prefix}.assets[{asset_id}]"))
                        .unwrap_or_else(|| location_prefix.to_string());
                    CompactionEventRecord {
                        id: event
                            .get("id")
                            .and_then(|value| value.as_str())
                            .unwrap_or_default()
                            .to_string(),
                        transcript_id: transcript_id.clone(),
                        stage_id: stage_id.map(str::to_string),
                        node_id: node_id.map(str::to_string),
                        mode: metadata
                            .and_then(|value| value.get("mode"))
                            .and_then(|value| value.as_str())
                            .unwrap_or_default()
                            .to_string(),
                        strategy: metadata
                            .and_then(|value| value.get("strategy"))
                            .and_then(|value| value.as_str())
                            .unwrap_or_default()
                            .to_string(),
                        archived_messages: json_usize(
                            metadata.and_then(|value| value.get("archived_messages")),
                        ),
                        estimated_tokens_before: json_usize(
                            metadata.and_then(|value| value.get("estimated_tokens_before")),
                        ),
                        estimated_tokens_after: json_usize(
                            metadata.and_then(|value| value.get("estimated_tokens_after")),
                        ),
                        snapshot_asset_id,
                        snapshot_location,
                        snapshot_path: persisted_path
                            .map(|path| path.to_string_lossy().into_owned()),
                        available,
                    }
                })
                .collect()
        })
        .unwrap_or_default()
}
1274
1275fn daemon_events_from_sidecar(run_path: &Path) -> Vec<DaemonEventRecord> {
1276    let Some(sidecar_path) = llm_transcript_sidecar_path(run_path) else {
1277        return Vec::new();
1278    };
1279    let Ok(content) = std::fs::read_to_string(sidecar_path) else {
1280        return Vec::new();
1281    };
1282
1283    content
1284        .lines()
1285        .filter(|line| !line.trim().is_empty())
1286        .filter_map(|line| serde_json::from_str::<serde_json::Value>(line).ok())
1287        .filter(|event| event.get("type").and_then(|value| value.as_str()) == Some("daemon_event"))
1288        .filter_map(|event| serde_json::from_value::<DaemonEventRecord>(event).ok())
1289        .collect()
1290}
1291
/// Derive the full observability record for a run: the action graph (nodes
/// and edges), planner-round metrics, verification outcomes, transcript
/// pointers, compaction events, daemon events, worker lineage, and the
/// research-fact count.
///
/// `persisted_path` is the on-disk location of the run file, used to locate
/// the LLM-transcript sidecar and to annotate compaction snapshot paths.
pub fn derive_run_observability(
    run: &RunRecord,
    persisted_path: Option<&Path>,
) -> RunObservabilityRecord {
    let mut action_graph_nodes = Vec::new();
    let mut action_graph_edges = Vec::new();
    let mut verification_outcomes = Vec::new();
    let mut planner_rounds = Vec::new();
    let mut transcript_pointers = Vec::new();
    let mut compaction_events = Vec::new();
    let mut daemon_events = Vec::new();
    let mut research_fact_count = 0usize;

    // Root node for the run itself; every other node hangs off this one.
    let root_node_id = format!("run:{}", run.id);
    let trigger_event = trigger_event_from_run(run);
    let propagated_trace_id = run_trace_id(run, trigger_event.as_ref());
    append_action_graph_node(
        &mut action_graph_nodes,
        RunActionGraphNodeRecord {
            id: root_node_id.clone(),
            label: run
                .workflow_name
                .clone()
                .unwrap_or_else(|| run.workflow_id.clone()),
            kind: ACTION_GRAPH_NODE_KIND_RUN.to_string(),
            status: run.status.clone(),
            outcome: run.status.clone(),
            trace_id: propagated_trace_id.clone(),
            stage_id: None,
            node_id: None,
            worker_id: None,
            run_id: Some(run.id.clone()),
            run_path: run.persisted_path.clone(),
            metadata: BTreeMap::from([(
                "workflow_id".to_string(),
                serde_json::json!(run.workflow_id),
            )]),
        },
    );
    // Node that entry stages attach to: the trigger when present, else the run root.
    let mut entry_node_id = root_node_id.clone();
    if let Some(trigger_event) = trigger_event.as_ref() {
        // If this run replays an earlier event, add a historical node for the
        // original trigger and chain it to the current trigger node.
        if let Some(replay_of_event_id) = replay_of_event_id_from_run(run) {
            let replay_source_node_id = format!("trigger:{replay_of_event_id}");
            append_action_graph_node(
                &mut action_graph_nodes,
                RunActionGraphNodeRecord {
                    id: replay_source_node_id.clone(),
                    label: format!(
                        "{}:{} (original {})",
                        trigger_event.provider.as_str(),
                        trigger_event.kind,
                        replay_of_event_id
                    ),
                    kind: ACTION_GRAPH_NODE_KIND_TRIGGER.to_string(),
                    status: "historical".to_string(),
                    outcome: "replayed_from".to_string(),
                    trace_id: Some(trigger_event.trace_id.0.clone()),
                    stage_id: None,
                    node_id: None,
                    worker_id: None,
                    run_id: Some(run.id.clone()),
                    run_path: run.persisted_path.clone(),
                    metadata: trigger_node_metadata(trigger_event),
                },
            );
            action_graph_edges.push(RunActionGraphEdgeRecord {
                from_id: replay_source_node_id,
                to_id: format!("trigger:{}", trigger_event.id.0),
                kind: ACTION_GRAPH_EDGE_KIND_REPLAY_CHAIN.to_string(),
                label: Some("replay chain".to_string()),
            });
        }
        // Node for the trigger that actually started this run, linked from the root.
        let trigger_node_id = format!("trigger:{}", trigger_event.id.0);
        append_action_graph_node(
            &mut action_graph_nodes,
            RunActionGraphNodeRecord {
                id: trigger_node_id.clone(),
                label: format!("{}:{}", trigger_event.provider.as_str(), trigger_event.kind),
                kind: ACTION_GRAPH_NODE_KIND_TRIGGER.to_string(),
                status: "received".to_string(),
                outcome: signature_status_label(&trigger_event.signature_status).to_string(),
                trace_id: Some(trigger_event.trace_id.0.clone()),
                stage_id: None,
                node_id: None,
                worker_id: None,
                run_id: Some(run.id.clone()),
                run_path: run.persisted_path.clone(),
                metadata: trigger_node_metadata(trigger_event),
            },
        );
        action_graph_edges.push(RunActionGraphEdgeRecord {
            from_id: root_node_id.clone(),
            to_id: trigger_node_id.clone(),
            kind: ACTION_GRAPH_EDGE_KIND_ENTRY.to_string(),
            label: Some(trigger_event.id.0.clone()),
        });
        entry_node_id = trigger_node_id;
    }

    // Lookup tables: stage id -> graph node id, stage id -> stage record,
    // and workflow node id -> graph node id.
    let stage_node_ids = run
        .stages
        .iter()
        .map(|stage| (stage.id.clone(), format!("stage:{}", stage.id)))
        .collect::<BTreeMap<_, _>>();
    let stage_by_id = run
        .stages
        .iter()
        .map(|stage| (stage.id.as_str(), stage))
        .collect::<BTreeMap<_, _>>();
    let stage_by_node_id = run
        .stages
        .iter()
        .map(|stage| (stage.node_id.clone(), format!("stage:{}", stage.id)))
        .collect::<BTreeMap<_, _>>();

    // Workflow node ids that appear as a transition target; stages without an
    // incoming transition are treated as entry stages below.
    let incoming_nodes = run
        .transitions
        .iter()
        .map(|transition| transition.to_node_id.clone())
        .collect::<BTreeSet<_>>();

    for stage in &run.stages {
        let graph_node_id = stage_node_ids
            .get(&stage.id)
            .cloned()
            .unwrap_or_else(|| format!("stage:{}", stage.id));
        append_action_graph_node(
            &mut action_graph_nodes,
            RunActionGraphNodeRecord {
                id: graph_node_id.clone(),
                label: stage.node_id.clone(),
                kind: action_graph_kind_for_stage(stage).to_string(),
                status: stage.status.clone(),
                outcome: stage.outcome.clone(),
                trace_id: propagated_trace_id.clone(),
                stage_id: Some(stage.id.clone()),
                node_id: Some(stage.node_id.clone()),
                worker_id: stage
                    .metadata
                    .get("worker_id")
                    .and_then(|value| value.as_str())
                    .map(str::to_string),
                run_id: None,
                run_path: None,
                metadata: stage_node_metadata(stage),
            },
        );
        // Entry stages (no incoming transition) get an edge from the trigger
        // (dispatch) or the run root (plain entry).
        if !incoming_nodes.contains(&stage.node_id) {
            action_graph_edges.push(RunActionGraphEdgeRecord {
                from_id: entry_node_id.clone(),
                to_id: graph_node_id.clone(),
                kind: if trigger_event.is_some() {
                    ACTION_GRAPH_EDGE_KIND_TRIGGER_DISPATCH.to_string()
                } else {
                    ACTION_GRAPH_EDGE_KIND_ENTRY.to_string()
                },
                label: None,
            });
        }

        // Verification stages (or any stage carrying a verification payload):
        // derive pass/fail from explicit "pass"/"success" flags, falling back
        // to the stage's status/outcome.
        if stage.kind == "verify" || stage.verification.is_some() {
            let passed = json_bool(
                stage
                    .verification
                    .as_ref()
                    .and_then(|value| value.get("pass")),
            )
            .or_else(|| {
                json_bool(
                    stage
                        .verification
                        .as_ref()
                        .and_then(|value| value.get("success")),
                )
            })
            .or_else(|| {
                if stage.status == "completed" && stage.outcome == "success" {
                    Some(true)
                } else if stage.status == "failed" || stage.outcome == "failed" {
                    Some(false)
                } else {
                    None
                }
            });
            verification_outcomes.push(RunVerificationOutcomeRecord {
                stage_id: stage.id.clone(),
                node_id: stage.node_id.clone(),
                status: stage.status.clone(),
                passed,
                summary: stage
                    .verification
                    .as_ref()
                    .map(compact_json_value)
                    .or_else(|| {
                        stage
                            .visible_text
                            .as_ref()
                            .filter(|value| !value.trim().is_empty())
                            .cloned()
                    }),
            });
        }

        // Embedded stage transcript: record a pointer and harvest any
        // compaction events the transcript contains.
        if stage.transcript.is_some() {
            transcript_pointers.push(RunTranscriptPointerRecord {
                id: format!("stage:{}:transcript", stage.id),
                label: format!("Stage {} transcript", stage.node_id),
                kind: "embedded_transcript".to_string(),
                location: format!("run.stages[{}].transcript", stage.node_id),
                path: run.persisted_path.clone(),
                available: true,
            });
            if let Some(transcript) = stage.transcript.as_ref() {
                compaction_events.extend(compaction_events_from_transcript(
                    transcript,
                    Some(&stage.id),
                    Some(&stage.node_id),
                    &format!("run.stages[{}].transcript", stage.node_id),
                    persisted_path,
                ));
            }
        }

        // Stage result payload: derive planner-round metrics from the agent
        // trace counters, tool usage, and task-ledger summary.
        if let Some(payload) = stage_result_payload(stage) {
            let trace = payload.get("trace");
            let task_ledger = payload
                .get("task_ledger")
                .and_then(task_ledger_summary_from_value);
            // Ledger observations double as "research facts" in the totals.
            let research_facts = task_ledger
                .as_ref()
                .map(|ledger| ledger.observations.clone())
                .unwrap_or_default();
            research_fact_count += research_facts.len();
            let tools_payload = payload.get("tools");
            // Tool call list: prefer the tools payload, fall back to the trace.
            let tools_used = json_string_array(
                tools_payload
                    .and_then(|tools| tools.get("calls"))
                    .or_else(|| trace.and_then(|trace| trace.get("tools_used"))),
            );
            let successful_tools =
                json_string_array(tools_payload.and_then(|tools| tools.get("successful")));
            let planner_round = RunPlannerRoundRecord {
                stage_id: stage.id.clone(),
                node_id: stage.node_id.clone(),
                stage_kind: stage.kind.clone(),
                status: stage.status.clone(),
                outcome: stage.outcome.clone(),
                iteration_count: json_usize(trace.and_then(|trace| trace.get("iterations"))),
                llm_call_count: json_usize(trace.and_then(|trace| trace.get("llm_calls"))),
                tool_execution_count: json_usize(
                    trace.and_then(|trace| trace.get("tool_executions")),
                ),
                tool_rejection_count: json_usize(
                    trace.and_then(|trace| trace.get("tool_rejections")),
                ),
                intervention_count: json_usize(trace.and_then(|trace| trace.get("interventions"))),
                compaction_count: json_usize(trace.and_then(|trace| trace.get("compactions"))),
                native_text_tool_fallback_count: json_usize(
                    trace.and_then(|trace| trace.get("native_text_tool_fallbacks")),
                ),
                native_text_tool_fallback_rejection_count: json_usize(
                    trace.and_then(|trace| trace.get("native_text_tool_fallback_rejections")),
                ),
                empty_completion_retry_count: json_usize(
                    trace.and_then(|trace| trace.get("empty_completion_retries")),
                ),
                tools_used,
                successful_tools,
                ledger_done_rejections: json_usize(payload.get("ledger_done_rejections")),
                task_ledger,
                research_facts,
            };
            // Only keep rounds that actually show agentic activity; all-zero
            // records would just be noise.
            let has_agentic_detail = planner_round.iteration_count > 0
                || planner_round.llm_call_count > 0
                || planner_round.tool_execution_count > 0
                || planner_round.native_text_tool_fallback_count > 0
                || planner_round.native_text_tool_fallback_rejection_count > 0
                || planner_round.empty_completion_retry_count > 0
                || planner_round.ledger_done_rejections > 0
                || planner_round.task_ledger.is_some()
                || !planner_round.tools_used.is_empty()
                || !planner_round.successful_tools.is_empty();
            if has_agentic_detail {
                planner_rounds.push(planner_round);
            }
        }
    }

    // Turn recorded transitions into graph edges. Transitions whose target
    // has no stage are skipped; sources resolve by stage id, then node id,
    // then fall back to the run root. Condition stages emit predicate-gate
    // edges.
    for transition in &run.transitions {
        let Some(to_id) = stage_by_node_id.get(&transition.to_node_id).cloned() else {
            continue;
        };
        let from_stage = transition
            .from_stage_id
            .as_deref()
            .and_then(|stage_id| stage_by_id.get(stage_id).copied());
        let from_id = transition
            .from_stage_id
            .as_ref()
            .and_then(|stage_id| stage_node_ids.get(stage_id))
            .cloned()
            .or_else(|| {
                transition
                    .from_node_id
                    .as_ref()
                    .and_then(|node_id| stage_by_node_id.get(node_id))
                    .cloned()
            })
            .unwrap_or_else(|| root_node_id.clone());
        action_graph_edges.push(RunActionGraphEdgeRecord {
            from_id,
            to_id,
            kind: if from_stage.is_some_and(|stage| stage.kind == "condition") {
                ACTION_GRAPH_EDGE_KIND_PREDICATE_GATE.to_string()
            } else {
                ACTION_GRAPH_EDGE_KIND_TRANSITION.to_string()
            },
            label: transition.branch.clone(),
        });
    }

    // Child worker runs: add a worker node each (linked from the spawning
    // stage when known) and collect the lineage records.
    let worker_lineage = run
        .child_runs
        .iter()
        .map(|child| {
            let worker_node_id = format!("worker:{}", child.worker_id);
            append_action_graph_node(
                &mut action_graph_nodes,
                RunActionGraphNodeRecord {
                    id: worker_node_id.clone(),
                    label: child.worker_name.clone(),
                    kind: ACTION_GRAPH_NODE_KIND_WORKER.to_string(),
                    status: child.status.clone(),
                    outcome: child.status.clone(),
                    trace_id: propagated_trace_id.clone(),
                    stage_id: child.parent_stage_id.clone(),
                    node_id: None,
                    worker_id: Some(child.worker_id.clone()),
                    run_id: child.run_id.clone(),
                    run_path: child.run_path.clone(),
                    metadata: BTreeMap::from([
                        (
                            "worker_name".to_string(),
                            serde_json::json!(child.worker_name),
                        ),
                        ("task".to_string(), serde_json::json!(child.task)),
                    ]),
                },
            );
            if let Some(parent_stage_id) = child.parent_stage_id.as_ref() {
                if let Some(stage_node_id) = stage_node_ids.get(parent_stage_id) {
                    action_graph_edges.push(RunActionGraphEdgeRecord {
                        from_id: stage_node_id.clone(),
                        to_id: worker_node_id,
                        kind: ACTION_GRAPH_EDGE_KIND_DELEGATES.to_string(),
                        label: Some(child.worker_name.clone()),
                    });
                }
            }
            RunWorkerLineageRecord {
                worker_id: child.worker_id.clone(),
                worker_name: child.worker_name.clone(),
                parent_stage_id: child.parent_stage_id.clone(),
                task: child.task.clone(),
                status: child.status.clone(),
                session_id: child.session_id.clone(),
                parent_session_id: child.parent_session_id.clone(),
                run_id: child.run_id.clone(),
                run_path: child.run_path.clone(),
                snapshot_path: child.snapshot_path.clone(),
            }
        })
        .collect::<Vec<_>>();

    // Run-level transcript pointer plus its compaction events.
    if run.transcript.is_some() {
        transcript_pointers.push(RunTranscriptPointerRecord {
            id: "run:transcript".to_string(),
            label: "Run transcript".to_string(),
            kind: "embedded_transcript".to_string(),
            location: "run.transcript".to_string(),
            path: run.persisted_path.clone(),
            available: true,
        });
        if let Some(transcript) = run.transcript.as_ref() {
            compaction_events.extend(compaction_events_from_transcript(
                transcript,
                None,
                None,
                "run.transcript",
                persisted_path,
            ));
        }
    }

    // LLM sidecar transcript pointer and daemon events — only resolvable
    // when the on-disk location of the run file is known.
    if let Some(path) = persisted_path {
        if let Some(sidecar_path) = llm_transcript_sidecar_path(path) {
            transcript_pointers.push(RunTranscriptPointerRecord {
                id: "run:llm_transcript".to_string(),
                label: "LLM transcript sidecar".to_string(),
                kind: "llm_jsonl".to_string(),
                location: "run sidecar".to_string(),
                path: Some(sidecar_path.to_string_lossy().into_owned()),
                available: sidecar_path.exists(),
            });
        }
        daemon_events.extend(daemon_events_from_sidecar(path));
    }

    RunObservabilityRecord {
        schema_version: 4,
        planner_rounds,
        research_fact_count,
        action_graph_nodes,
        action_graph_edges,
        worker_lineage,
        verification_outcomes,
        transcript_pointers,
        compaction_events,
        daemon_events,
    }
}
1713
1714fn refresh_run_observability(run: &mut RunRecord, persisted_path: Option<&Path>) {
1715    run.observability = Some(derive_run_observability(run, persisted_path));
1716}
1717
1718pub fn normalize_run_record(value: &VmValue) -> Result<RunRecord, VmError> {
1719    let mut run: RunRecord = parse_json_payload(vm_value_to_json(value), "run_record")?;
1720    if run.type_name.is_empty() {
1721        run.type_name = "run_record".to_string();
1722    }
1723    if run.id.is_empty() {
1724        run.id = new_id("run");
1725    }
1726    if run.started_at.is_empty() {
1727        run.started_at = now_rfc3339();
1728    }
1729    if run.status.is_empty() {
1730        run.status = "running".to_string();
1731    }
1732    if run.root_run_id.is_none() {
1733        run.root_run_id = Some(run.id.clone());
1734    }
1735    if run.replay_fixture.is_none() {
1736        run.replay_fixture = Some(replay_fixture_from_run(&run));
1737    }
1738    merge_hitl_questions_from_active_log(&mut run);
1739    materialize_child_runs_from_stage_metadata(&mut run);
1740    sync_run_handoffs(&mut run);
1741    if run.observability.is_none() {
1742        let persisted_path = run.persisted_path.clone();
1743        let persisted = persisted_path.as_deref().map(Path::new);
1744        refresh_run_observability(&mut run, persisted);
1745    }
1746    Ok(run)
1747}
1748
1749pub fn normalize_eval_suite_manifest(value: &VmValue) -> Result<EvalSuiteManifest, VmError> {
1750    let mut manifest: EvalSuiteManifest = parse_json_value(value)?;
1751    if manifest.type_name.is_empty() {
1752        manifest.type_name = "eval_suite_manifest".to_string();
1753    }
1754    if manifest.id.is_empty() {
1755        manifest.id = new_id("eval_suite");
1756    }
1757    Ok(manifest)
1758}
1759
1760pub fn load_eval_suite_manifest(path: &Path) -> Result<EvalSuiteManifest, VmError> {
1761    let content = std::fs::read_to_string(path)
1762        .map_err(|e| VmError::Runtime(format!("failed to read eval suite manifest: {e}")))?;
1763    let mut manifest: EvalSuiteManifest = serde_json::from_str(&content)
1764        .map_err(|e| VmError::Runtime(format!("failed to parse eval suite manifest: {e}")))?;
1765    if manifest.base_dir.is_none() {
1766        manifest.base_dir = path.parent().map(|parent| parent.display().to_string());
1767    }
1768    Ok(manifest)
1769}
1770
1771pub fn load_eval_pack_manifest(path: &Path) -> Result<EvalPackManifest, VmError> {
1772    let content = std::fs::read_to_string(path)
1773        .map_err(|e| VmError::Runtime(format!("failed to read eval pack manifest: {e}")))?;
1774    let mut manifest: EvalPackManifest =
1775        if path.extension().and_then(|ext| ext.to_str()) == Some("json") {
1776            serde_json::from_str(&content)
1777                .map_err(|e| VmError::Runtime(format!("failed to parse eval pack JSON: {e}")))?
1778        } else {
1779            toml::from_str(&content)
1780                .map_err(|e| VmError::Runtime(format!("failed to parse eval pack TOML: {e}")))?
1781        };
1782    normalize_eval_pack_manifest(&mut manifest);
1783    if manifest.base_dir.is_none() {
1784        manifest.base_dir = path.parent().map(|parent| parent.display().to_string());
1785    }
1786    Ok(manifest)
1787}
1788
1789fn normalize_eval_pack_manifest(manifest: &mut EvalPackManifest) {
1790    if manifest.version == 0 {
1791        manifest.version = 1;
1792    }
1793    if manifest.id.is_empty() {
1794        manifest.id = manifest
1795            .name
1796            .clone()
1797            .filter(|name| !name.trim().is_empty())
1798            .unwrap_or_else(|| new_id("eval_pack"));
1799    }
1800}
1801
1802fn load_replay_fixture(path: &Path) -> Result<ReplayFixture, VmError> {
1803    let content = std::fs::read_to_string(path)
1804        .map_err(|e| VmError::Runtime(format!("failed to read replay fixture: {e}")))?;
1805    serde_json::from_str(&content)
1806        .map_err(|e| VmError::Runtime(format!("failed to parse replay fixture: {e}")))
1807}
1808
1809fn load_run_record_from_fixture_ref(
1810    fixture: &EvalPackFixtureRef,
1811    base_dir: Option<&Path>,
1812) -> Result<RunRecord, VmError> {
1813    if let Some(inline) = &fixture.inline {
1814        let run: RunRecord = serde_json::from_value(inline.clone())
1815            .map_err(|e| VmError::Runtime(format!("failed to parse inline run record: {e}")))?;
1816        return Ok(run);
1817    }
1818    let path = fixture.path.as_deref().ok_or_else(|| {
1819        VmError::Runtime(format!(
1820            "fixture '{}' is missing path or inline run",
1821            fixture.id
1822        ))
1823    })?;
1824    load_run_record(&resolve_manifest_path(base_dir, path))
1825}
1826
1827fn load_replay_fixture_from_ref(
1828    fixture: &EvalPackFixtureRef,
1829    base_dir: Option<&Path>,
1830) -> Result<ReplayFixture, VmError> {
1831    if let Some(inline) = &fixture.inline {
1832        return serde_json::from_value(inline.clone())
1833            .map_err(|e| VmError::Runtime(format!("failed to parse inline replay fixture: {e}")));
1834    }
1835    let path = fixture.path.as_deref().ok_or_else(|| {
1836        VmError::Runtime(format!(
1837            "fixture '{}' is missing path or inline replay fixture",
1838            fixture.id
1839        ))
1840    })?;
1841    load_replay_fixture(&resolve_manifest_path(base_dir, path))
1842}
1843
/// Resolve a manifest-relative path string: absolute paths pass through
/// untouched, relative paths are joined onto `base_dir` when one is known.
fn resolve_manifest_path(base_dir: Option<&Path>, path: &str) -> PathBuf {
    let candidate = PathBuf::from(path);
    match base_dir {
        Some(base) if candidate.is_relative() => base.join(candidate),
        _ => candidate,
    }
}
1854
1855pub fn evaluate_run_suite_manifest(
1856    manifest: &EvalSuiteManifest,
1857) -> Result<ReplayEvalSuiteReport, VmError> {
1858    let base_dir = manifest.base_dir.as_deref().map(Path::new);
1859    let mut reports = Vec::new();
1860    for case in &manifest.cases {
1861        let run_path = resolve_manifest_path(base_dir, &case.run_path);
1862        let run = load_run_record(&run_path)?;
1863        let fixture = match &case.fixture_path {
1864            Some(path) => load_replay_fixture(&resolve_manifest_path(base_dir, path))?,
1865            None => run
1866                .replay_fixture
1867                .clone()
1868                .unwrap_or_else(|| replay_fixture_from_run(&run)),
1869        };
1870        let eval = evaluate_run_against_fixture(&run, &fixture);
1871        let mut pass = eval.pass;
1872        let mut failures = eval.failures;
1873        let comparison = match &case.compare_to {
1874            Some(path) => {
1875                let baseline_path = resolve_manifest_path(base_dir, path);
1876                let baseline = load_run_record(&baseline_path)?;
1877                let diff = diff_run_records(&baseline, &run);
1878                if !diff.identical {
1879                    pass = false;
1880                    failures.push(format!(
1881                        "run differs from baseline {} with {} stage changes",
1882                        baseline_path.display(),
1883                        diff.stage_diffs.len()
1884                    ));
1885                }
1886                Some(diff)
1887            }
1888            None => None,
1889        };
1890        reports.push(ReplayEvalCaseReport {
1891            run_id: run.id.clone(),
1892            workflow_id: run.workflow_id.clone(),
1893            label: case.label.clone(),
1894            pass,
1895            failures,
1896            stage_count: eval.stage_count,
1897            source_path: Some(run_path.display().to_string()),
1898            comparison,
1899        });
1900    }
1901    let total = reports.len();
1902    let passed = reports.iter().filter(|report| report.pass).count();
1903    let failed = total.saturating_sub(passed);
1904    Ok(ReplayEvalSuiteReport {
1905        pass: failed == 0,
1906        total,
1907        passed,
1908        failed,
1909        cases: reports,
1910    })
1911}
1912
/// Evaluate every case in an eval pack manifest and aggregate per-severity
/// results into an [`EvalPackReport`].
///
/// Fixtures and rubrics are indexed by id up front so cases can reference
/// them symbolically. Cases that declare `friction_events` are routed to the
/// dedicated friction evaluator; all other cases load a run record, check it
/// against a replay fixture, apply pack-default and per-case thresholds, an
/// optional baseline diff, and any referenced rubrics. Failures on
/// non-blocking cases are reclassified as warnings or informational notes,
/// and only blocking failures make the overall report fail.
pub fn evaluate_eval_pack_manifest(manifest: &EvalPackManifest) -> Result<EvalPackReport, VmError> {
    let base_dir = manifest.base_dir.as_deref().map(Path::new);
    // Fixture paths resolve against `fixture_root` when configured,
    // otherwise against the manifest's own directory.
    let fixture_base_dir_buf = manifest
        .defaults
        .fixture_root
        .as_deref()
        .map(|root| resolve_manifest_path(base_dir, root));
    let fixture_base_dir = fixture_base_dir_buf.as_deref().or(base_dir);
    // Fixtures/rubrics with empty ids cannot be referenced and are skipped.
    let fixtures_by_id: BTreeMap<&str, &EvalPackFixtureRef> = manifest
        .fixtures
        .iter()
        .filter(|fixture| !fixture.id.is_empty())
        .map(|fixture| (fixture.id.as_str(), fixture))
        .collect();
    let rubrics_by_id: BTreeMap<&str, &EvalPackRubric> = manifest
        .rubrics
        .iter()
        .filter(|rubric| !rubric.id.is_empty())
        .map(|rubric| (rubric.id.as_str(), rubric))
        .collect();

    let mut reports = Vec::new();
    for (index, case) in manifest.cases.iter().enumerate() {
        // Cases without an explicit id get a stable 1-based positional one.
        let case_id = case
            .id
            .clone()
            .filter(|id| !id.trim().is_empty())
            .unwrap_or_else(|| format!("case_{}", index + 1));
        let label = case
            .name
            .clone()
            .or_else(|| case.id.clone())
            .unwrap_or_else(|| case_id.clone());
        let severity = eval_pack_case_severity(manifest, case);
        let blocking = severity == "blocking";
        let mut failures = Vec::new();
        let mut warnings = Vec::new();
        let mut informational = Vec::new();

        // Friction cases evaluate generated context-pack suggestions instead
        // of a run/fixture pair; handled entirely by the helper.
        if case.friction_events.is_some() {
            let report = evaluate_eval_pack_friction_case(
                manifest,
                case,
                &case_id,
                &label,
                &severity,
                blocking,
                base_dir,
                fixture_base_dir,
                &fixtures_by_id,
                &rubrics_by_id,
            )?;
            reports.push(report);
            continue;
        }

        let run = load_eval_pack_case_run(case, base_dir, fixture_base_dir, &fixtures_by_id)?;
        let fixture =
            load_eval_pack_case_fixture(case, base_dir, fixture_base_dir, &fixtures_by_id, &run)?;
        let eval = evaluate_run_against_fixture(&run, &fixture);
        failures.extend(eval.failures);
        // Pack-wide thresholds apply first, then per-case ones.
        apply_eval_pack_thresholds(&run, &manifest.defaults.thresholds, &mut failures);
        apply_eval_pack_thresholds(&run, &case.thresholds, &mut failures);

        // Optional regression check against a baseline run record.
        let comparison = match &case.compare_to {
            Some(path) => {
                let baseline_path = resolve_manifest_path(base_dir, path);
                let baseline = load_run_record(&baseline_path)?;
                let diff = diff_run_records(&baseline, &run);
                if !diff.identical {
                    failures.push(format!(
                        "run differs from baseline {} with {} stage changes",
                        baseline_path.display(),
                        diff.stage_diffs.len()
                    ));
                }
                Some(diff)
            }
            None => None,
        };

        // Unknown rubric ids are themselves failures; known ones run locally.
        for rubric_id in &case.rubrics {
            let Some(rubric) = rubrics_by_id.get(rubric_id.as_str()) else {
                failures.push(format!("case references unknown rubric '{rubric_id}'"));
                continue;
            };
            apply_eval_pack_rubric(rubric, &run, &mut failures, &mut warnings);
        }

        // Non-blocking failures never fail the case; they are moved into the
        // warnings or informational bucket according to severity.
        let pass = failures.is_empty() || !blocking;
        if !failures.is_empty() && !blocking {
            if severity == "warning" {
                warnings.append(&mut failures);
            } else {
                informational.append(&mut failures);
            }
        }
        reports.push(EvalPackCaseReport {
            id: case_id,
            label,
            severity,
            pass,
            blocking,
            run_id: run.id.clone(),
            workflow_id: run.workflow_id.clone(),
            source_path: eval_pack_case_source_path(
                case,
                base_dir,
                fixture_base_dir,
                &fixtures_by_id,
            ),
            stage_count: eval.stage_count,
            failures,
            warnings,
            informational,
            comparison,
        });
    }

    // Aggregate the severity buckets; only blocking failures gate the pack.
    let total = reports.len();
    let blocking_failed = reports
        .iter()
        .filter(|report| report.blocking && !report.failures.is_empty())
        .count();
    let warning_failed = reports
        .iter()
        .filter(|report| !report.warnings.is_empty())
        .count();
    let informational_failed = reports
        .iter()
        .filter(|report| !report.informational.is_empty())
        .count();
    let passed = reports.iter().filter(|report| report.pass).count();
    Ok(EvalPackReport {
        pack_id: manifest.id.clone(),
        pass: blocking_failed == 0,
        total,
        passed,
        failed: total.saturating_sub(passed),
        blocking_failed,
        warning_failed,
        informational_failed,
        cases: reports,
    })
}
2058
2059#[allow(clippy::too_many_arguments)]
2060fn evaluate_eval_pack_friction_case(
2061    manifest: &EvalPackManifest,
2062    case: &EvalPackCase,
2063    case_id: &str,
2064    label: &str,
2065    severity: &str,
2066    blocking: bool,
2067    base_dir: Option<&Path>,
2068    fixture_base_dir: Option<&Path>,
2069    fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2070    rubrics_by_id: &BTreeMap<&str, &EvalPackRubric>,
2071) -> Result<EvalPackCaseReport, VmError> {
2072    let mut failures = Vec::new();
2073    let mut warnings = Vec::new();
2074    let mut informational = Vec::new();
2075    let events =
2076        load_eval_pack_case_friction_events(case, base_dir, fixture_base_dir, fixtures_by_id)?;
2077    let options = friction_suggestion_options(case, manifest);
2078    let suggestions = generate_context_pack_suggestions(&events, &options);
2079
2080    for rubric_id in &case.rubrics {
2081        let Some(rubric) = rubrics_by_id.get(rubric_id.as_str()) else {
2082            failures.push(format!("case references unknown rubric '{rubric_id}'"));
2083            continue;
2084        };
2085        apply_eval_pack_friction_rubric(rubric, &suggestions, &mut failures, &mut warnings);
2086    }
2087
2088    if case.rubrics.is_empty() && suggestions.is_empty() {
2089        failures.push("friction fixture produced no context-pack suggestions".to_string());
2090    }
2091
2092    let pass = failures.is_empty() || !blocking;
2093    if !failures.is_empty() && !blocking {
2094        if severity == "warning" {
2095            warnings.append(&mut failures);
2096        } else {
2097            informational.append(&mut failures);
2098        }
2099    }
2100
2101    Ok(EvalPackCaseReport {
2102        id: case_id.to_string(),
2103        label: label.to_string(),
2104        severity: severity.to_string(),
2105        pass,
2106        blocking,
2107        run_id: "friction_events".to_string(),
2108        workflow_id: String::new(),
2109        source_path: eval_pack_case_friction_source_path(
2110            case,
2111            base_dir,
2112            fixture_base_dir,
2113            fixtures_by_id,
2114        ),
2115        stage_count: events.len(),
2116        failures,
2117        warnings,
2118        informational,
2119        comparison: None,
2120    })
2121}
2122
2123fn eval_pack_case_severity(manifest: &EvalPackManifest, case: &EvalPackCase) -> String {
2124    normalize_eval_pack_severity(
2125        case.severity
2126            .as_deref()
2127            .or(case.thresholds.severity.as_deref())
2128            .or(manifest.defaults.severity.as_deref())
2129            .or(manifest.defaults.thresholds.severity.as_deref())
2130            .unwrap_or("blocking"),
2131    )
2132}
2133
/// Map a free-form severity string onto one of the three canonical labels:
/// "warning", "informational", or — for anything unrecognized — "blocking".
/// Matching ignores surrounding whitespace and ASCII case.
fn normalize_eval_pack_severity(value: &str) -> String {
    let canonical = value.trim().to_ascii_lowercase();
    let label = if canonical == "warn" || canonical == "warning" {
        "warning"
    } else if canonical == "info" || canonical == "informational" {
        "informational"
    } else {
        "blocking"
    };
    label.to_string()
}
2141
2142fn load_eval_pack_case_run(
2143    case: &EvalPackCase,
2144    base_dir: Option<&Path>,
2145    fixture_base_dir: Option<&Path>,
2146    fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2147) -> Result<RunRecord, VmError> {
2148    if let Some(run_ref) = case.run.as_deref().or(case.run_path.as_deref()) {
2149        if let Some(fixture) = fixtures_by_id.get(run_ref) {
2150            return load_run_record_from_fixture_ref(fixture, fixture_base_dir);
2151        }
2152        return load_run_record(&resolve_manifest_path(base_dir, run_ref));
2153    }
2154    Err(VmError::Runtime(
2155        "eval pack case is missing run or run_path".to_string(),
2156    ))
2157}
2158
2159fn load_eval_pack_case_fixture(
2160    case: &EvalPackCase,
2161    base_dir: Option<&Path>,
2162    fixture_base_dir: Option<&Path>,
2163    fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2164    run: &RunRecord,
2165) -> Result<ReplayFixture, VmError> {
2166    if let Some(fixture_ref) = case.fixture.as_deref().or(case.fixture_path.as_deref()) {
2167        if let Some(fixture) = fixtures_by_id.get(fixture_ref) {
2168            return load_replay_fixture_from_ref(fixture, fixture_base_dir);
2169        }
2170        return load_replay_fixture(&resolve_manifest_path(base_dir, fixture_ref));
2171    }
2172    Ok(run
2173        .replay_fixture
2174        .clone()
2175        .unwrap_or_else(|| replay_fixture_from_run(run)))
2176}
2177
2178fn eval_pack_case_source_path(
2179    case: &EvalPackCase,
2180    base_dir: Option<&Path>,
2181    fixture_base_dir: Option<&Path>,
2182    fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2183) -> Option<String> {
2184    let run_ref = case.run.as_deref().or(case.run_path.as_deref())?;
2185    if let Some(fixture) = fixtures_by_id.get(run_ref) {
2186        return fixture.path.as_ref().map(|path| {
2187            resolve_manifest_path(fixture_base_dir, path)
2188                .display()
2189                .to_string()
2190        });
2191    }
2192    Some(
2193        resolve_manifest_path(base_dir, run_ref)
2194            .display()
2195            .to_string(),
2196    )
2197}
2198
2199fn load_eval_pack_case_friction_events(
2200    case: &EvalPackCase,
2201    base_dir: Option<&Path>,
2202    fixture_base_dir: Option<&Path>,
2203    fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2204) -> Result<Vec<FrictionEvent>, VmError> {
2205    let event_ref = case.friction_events.as_deref().ok_or_else(|| {
2206        VmError::Runtime("eval pack friction case is missing friction_events".to_string())
2207    })?;
2208    if let Some(fixture) = fixtures_by_id.get(event_ref) {
2209        return load_friction_events_from_fixture_ref(fixture, fixture_base_dir);
2210    }
2211    load_friction_events_from_path(&resolve_manifest_path(base_dir, event_ref))
2212}
2213
2214fn load_friction_events_from_fixture_ref(
2215    fixture: &EvalPackFixtureRef,
2216    base_dir: Option<&Path>,
2217) -> Result<Vec<FrictionEvent>, VmError> {
2218    if let Some(inline) = &fixture.inline {
2219        return normalize_friction_events_json(inline.clone());
2220    }
2221    let path = fixture.path.as_deref().ok_or_else(|| {
2222        VmError::Runtime(format!(
2223            "fixture '{}' is missing path or inline friction events",
2224            fixture.id
2225        ))
2226    })?;
2227    load_friction_events_from_path(&resolve_manifest_path(base_dir, path))
2228}
2229
2230fn load_friction_events_from_path(path: &Path) -> Result<Vec<FrictionEvent>, VmError> {
2231    let content = std::fs::read_to_string(path)
2232        .map_err(|e| VmError::Runtime(format!("failed to read friction events fixture: {e}")))?;
2233    let value: serde_json::Value = serde_json::from_str(&content)
2234        .map_err(|e| VmError::Runtime(format!("failed to parse friction events fixture: {e}")))?;
2235    normalize_friction_events_json(value)
2236}
2237
2238fn eval_pack_case_friction_source_path(
2239    case: &EvalPackCase,
2240    base_dir: Option<&Path>,
2241    fixture_base_dir: Option<&Path>,
2242    fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2243) -> Option<String> {
2244    let event_ref = case.friction_events.as_deref()?;
2245    if let Some(fixture) = fixtures_by_id.get(event_ref) {
2246        return fixture.path.as_ref().map(|path| {
2247            resolve_manifest_path(fixture_base_dir, path)
2248                .display()
2249                .to_string()
2250        });
2251    }
2252    Some(
2253        resolve_manifest_path(base_dir, event_ref)
2254            .display()
2255            .to_string(),
2256    )
2257}
2258
2259fn friction_suggestion_options(
2260    case: &EvalPackCase,
2261    manifest: &EvalPackManifest,
2262) -> ContextPackSuggestionOptions {
2263    let min_occurrences = case
2264        .metadata
2265        .get("min_occurrences")
2266        .or_else(|| manifest.metadata.get("min_occurrences"))
2267        .and_then(|value| value.as_u64())
2268        .unwrap_or(2) as usize;
2269    let owner = case
2270        .metadata
2271        .get("owner")
2272        .or_else(|| manifest.metadata.get("owner"))
2273        .and_then(|value| value.as_str())
2274        .map(str::to_string)
2275        .or_else(|| {
2276            manifest
2277                .package
2278                .as_ref()
2279                .and_then(|package| package.name.clone())
2280        });
2281    ContextPackSuggestionOptions {
2282        min_occurrences,
2283        owner,
2284    }
2285}
2286
2287fn apply_eval_pack_thresholds(
2288    run: &RunRecord,
2289    thresholds: &EvalPackThresholds,
2290    failures: &mut Vec<String>,
2291) {
2292    if let Some(max_stage_count) = thresholds.max_stage_count {
2293        if run.stages.len() > max_stage_count {
2294            failures.push(format!(
2295                "stage count {} exceeds threshold {}",
2296                run.stages.len(),
2297                max_stage_count
2298            ));
2299        }
2300    }
2301    if let Some(max_latency_ms) = thresholds.max_latency_ms {
2302        let actual = run
2303            .usage
2304            .as_ref()
2305            .map(|usage| usage.total_duration_ms)
2306            .unwrap_or_default();
2307        if actual > max_latency_ms {
2308            failures.push(format!(
2309                "latency {actual}ms exceeds threshold {max_latency_ms}ms"
2310            ));
2311        }
2312    }
2313    if let Some(max_cost_usd) = thresholds.max_cost_usd {
2314        let actual = run
2315            .usage
2316            .as_ref()
2317            .map(|usage| usage.total_cost)
2318            .unwrap_or_default();
2319        if actual > max_cost_usd {
2320            failures.push(format!(
2321                "cost ${actual:.6} exceeds threshold ${max_cost_usd:.6}"
2322            ));
2323        }
2324    }
2325    if let Some(max_tokens) = thresholds.max_tokens {
2326        let actual = run
2327            .usage
2328            .as_ref()
2329            .map(|usage| usage.input_tokens + usage.output_tokens)
2330            .unwrap_or_default();
2331        if actual > max_tokens {
2332            failures.push(format!(
2333                "token count {actual} exceeds threshold {max_tokens}"
2334            ));
2335        }
2336    }
2337}
2338
2339fn apply_eval_pack_rubric(
2340    rubric: &EvalPackRubric,
2341    run: &RunRecord,
2342    failures: &mut Vec<String>,
2343    warnings: &mut Vec<String>,
2344) {
2345    match rubric.kind.as_str() {
2346        "" | "deterministic" | "replay" | "budget" | "hitl" | "side-effect" => {
2347            apply_eval_pack_thresholds(run, &rubric.thresholds, failures);
2348            for assertion in &rubric.assertions {
2349                apply_eval_pack_assertion(rubric, assertion, run, failures);
2350            }
2351        }
2352        "llm-judge" | "llm_as_judge" | "judge" => {
2353            let severity = normalize_eval_pack_severity(
2354                rubric.thresholds.severity.as_deref().unwrap_or("blocking"),
2355            );
2356            let message = format!(
2357                "rubric '{}' requires an external LLM judge and was not run locally",
2358                rubric.id
2359            );
2360            if severity == "blocking" {
2361                failures.push(message);
2362            } else {
2363                warnings.push(message);
2364            }
2365        }
2366        other => warnings.push(format!(
2367            "rubric '{}' has unknown kind '{}' and was not run locally",
2368            rubric.id, other
2369        )),
2370    }
2371}
2372
2373fn apply_eval_pack_friction_rubric(
2374    rubric: &EvalPackRubric,
2375    suggestions: &[super::ContextPackSuggestion],
2376    failures: &mut Vec<String>,
2377    warnings: &mut Vec<String>,
2378) {
2379    match rubric.kind.as_str() {
2380        "" | "deterministic" | "friction" | "context-pack-suggestion" => {
2381            let mut expectations = Vec::new();
2382            for assertion in &rubric.assertions {
2383                match assertion.kind.as_str() {
2384                    "context-pack-suggestion" | "context_pack_suggestion" | "suggestion" => {
2385                        let expectation = context_pack_expectation_from_assertion(assertion);
2386                        expectations.push(expectation);
2387                    }
2388                    other => failures.push(format!(
2389                        "rubric '{}' has unsupported friction assertion kind '{}'",
2390                        rubric.id, other
2391                    )),
2392                }
2393            }
2394            failures.extend(evaluate_context_pack_suggestion_expectations(
2395                suggestions,
2396                &expectations,
2397            ));
2398        }
2399        other => warnings.push(format!(
2400            "rubric '{}' has unknown friction kind '{}' and was not run locally",
2401            rubric.id, other
2402        )),
2403    }
2404}
2405
2406fn context_pack_expectation_from_assertion(
2407    assertion: &EvalPackAssertion,
2408) -> ContextPackSuggestionExpectation {
2409    let expected = assertion
2410        .expected
2411        .as_ref()
2412        .and_then(|value| value.as_object());
2413    let expected_string = assertion.expected.as_ref().and_then(|value| value.as_str());
2414    ContextPackSuggestionExpectation {
2415        min_suggestions: expected
2416            .and_then(|map| map.get("min_suggestions"))
2417            .and_then(|value| value.as_u64())
2418            .map(|value| value as usize),
2419        recommended_artifact: expected
2420            .and_then(|map| map.get("recommended_artifact"))
2421            .and_then(|value| value.as_str())
2422            .map(str::to_string)
2423            .or_else(|| expected_string.map(str::to_string)),
2424        title_contains: assertion.contains.clone().or_else(|| {
2425            expected
2426                .and_then(|map| map.get("title_contains"))
2427                .and_then(|value| value.as_str())
2428                .map(str::to_string)
2429        }),
2430        manifest_name_contains: expected
2431            .and_then(|map| map.get("manifest_name_contains"))
2432            .and_then(|value| value.as_str())
2433            .map(str::to_string),
2434        required_capability: expected
2435            .and_then(|map| map.get("required_capability"))
2436            .and_then(|value| value.as_str())
2437            .map(str::to_string),
2438        required_output_slot: expected
2439            .and_then(|map| map.get("required_output_slot"))
2440            .and_then(|value| value.as_str())
2441            .map(str::to_string),
2442    }
2443}
2444
2445fn apply_eval_pack_assertion(
2446    rubric: &EvalPackRubric,
2447    assertion: &EvalPackAssertion,
2448    run: &RunRecord,
2449    failures: &mut Vec<String>,
2450) {
2451    match assertion.kind.as_str() {
2452        "run-status" | "run_status" | "status" => {
2453            let expected = assertion.expected.as_ref().and_then(|value| value.as_str());
2454            if let Some(expected) = expected {
2455                if run.status != expected {
2456                    failures.push(format!(
2457                        "rubric '{}' expected run status {}, got {}",
2458                        rubric.id, expected, run.status
2459                    ));
2460                }
2461            }
2462        }
2463        "stage-status" | "stage_status" => {
2464            let Some(stage_id) = assertion.stage.as_deref() else {
2465                failures.push(format!(
2466                    "rubric '{}' stage-status assertion missing stage",
2467                    rubric.id
2468                ));
2469                return;
2470            };
2471            let expected = assertion.expected.as_ref().and_then(|value| value.as_str());
2472            let Some(expected) = expected else {
2473                failures.push(format!(
2474                    "rubric '{}' stage-status assertion missing expected string",
2475                    rubric.id
2476                ));
2477                return;
2478            };
2479            match run.stages.iter().find(|stage| stage.node_id == stage_id) {
2480                Some(stage) if stage.status == expected => {}
2481                Some(stage) => failures.push(format!(
2482                    "rubric '{}' expected stage {} status {}, got {}",
2483                    rubric.id, stage_id, expected, stage.status
2484                )),
2485                None => failures.push(format!(
2486                    "rubric '{}' expected stage {} to exist",
2487                    rubric.id, stage_id
2488                )),
2489            }
2490        }
2491        "visible-text-contains" | "visible_text_contains" => {
2492            let Some(needle) = assertion.contains.as_deref() else {
2493                failures.push(format!(
2494                    "rubric '{}' visible-text assertion missing contains",
2495                    rubric.id
2496                ));
2497                return;
2498            };
2499            let matched = match assertion.stage.as_deref() {
2500                Some(stage_id) => run
2501                    .stages
2502                    .iter()
2503                    .find(|stage| stage.node_id == stage_id)
2504                    .and_then(|stage| stage.visible_text.as_deref())
2505                    .is_some_and(|text| text.contains(needle)),
2506                None => run
2507                    .stages
2508                    .iter()
2509                    .filter_map(|stage| stage.visible_text.as_deref())
2510                    .any(|text| text.contains(needle)),
2511            };
2512            if !matched {
2513                failures.push(format!(
2514                    "rubric '{}' expected visible text to contain {:?}",
2515                    rubric.id, needle
2516                ));
2517            }
2518        }
2519        "hitl-question-contains" | "hitl_question_contains" => {
2520            let Some(needle) = assertion.contains.as_deref() else {
2521                failures.push(format!(
2522                    "rubric '{}' HITL assertion missing contains",
2523                    rubric.id
2524                ));
2525                return;
2526            };
2527            if !run
2528                .hitl_questions
2529                .iter()
2530                .any(|question| question.prompt.contains(needle))
2531            {
2532                failures.push(format!(
2533                    "rubric '{}' expected HITL question to contain {:?}",
2534                    rubric.id, needle
2535                ));
2536            }
2537        }
2538        "" => {}
2539        other => failures.push(format!(
2540            "rubric '{}' has unsupported assertion kind '{}'",
2541            rubric.id, other
2542        )),
2543    }
2544}
2545
/// Edit operation in a diff sequence.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub(crate) enum DiffOp {
    /// Line unchanged; the paired index addresses the `before` sequence.
    Equal,
    /// Line removed; the paired index addresses the `before` sequence.
    Delete,
    /// Line added; the paired index addresses the `after` sequence.
    Insert,
}

/// Compute the shortest edit script using Myers' O(nd) algorithm.
/// Returns a sequence of (DiffOp, line_index_in_before_or_after):
/// `Equal`/`Delete` indices address `a`, `Insert` indices address `b`.
/// Time: O(nd) where d = edit distance. Space: O(d * n).
pub(crate) fn myers_diff(a: &[&str], b: &[&str]) -> Vec<(DiffOp, usize)> {
    let n = a.len() as isize;
    let m = b.len() as isize;
    // Degenerate inputs short-circuit: nothing, pure insert, or pure delete.
    if n == 0 && m == 0 {
        return Vec::new();
    }
    if n == 0 {
        return (0..m as usize).map(|j| (DiffOp::Insert, j)).collect();
    }
    if m == 0 {
        return (0..n as usize).map(|i| (DiffOp::Delete, i)).collect();
    }

    let max_d = (n + m) as usize;
    let offset = max_d as isize;
    let v_size = 2 * max_d + 1;
    let mut v = vec![0isize; v_size];
    // trace[d] holds the `v` snapshot BEFORE step d ran — required for backtrack.
    let mut trace: Vec<Vec<isize>> = Vec::new();

    'outer: for d in 0..=max_d as isize {
        trace.push(v.clone());
        let mut new_v = v.clone();
        for k in (-d..=d).step_by(2) {
            let ki = (k + offset) as usize;
            // Furthest-reaching x on diagonal k: step down (from k+1) or
            // right (from k-1), then follow the snake of matching lines.
            // Reads touch only the opposite k-parity, so `v` is safe to read
            // while this round writes into `new_v`.
            let mut x = if k == -d || (k != d && v[ki - 1] < v[ki + 1]) {
                v[ki + 1]
            } else {
                v[ki - 1] + 1
            };
            let mut y = x - k;
            while x < n && y < m && a[x as usize] == b[y as usize] {
                x += 1;
                y += 1;
            }
            new_v[ki] = x;
            if x >= n && y >= m {
                // Reached (n, m); the snapshots in `trace` suffice to backtrack,
                // so the partially-filled `new_v` can simply be dropped.
                break 'outer;
            }
        }
        v = new_v;
    }

    // Backtrack from (n, m) to (0, 0) through the per-step snapshots.
    let mut ops: Vec<(DiffOp, usize)> = Vec::new();
    let mut x = n;
    let mut y = m;
    for d in (1..trace.len() as isize).rev() {
        let k = x - y;
        let v_prev = &trace[d as usize];
        // Recover whether step d arrived via a downward (k+1) or rightward
        // (k-1) move, mirroring the forward pass's tie-breaking rule.
        let prev_k = if k == -d
            || (k != d && v_prev[(k - 1 + offset) as usize] < v_prev[(k + 1 + offset) as usize])
        {
            k + 1
        } else {
            k - 1
        };
        let prev_x = v_prev[(prev_k + offset) as usize];
        let prev_y = prev_x - prev_k;

        // Emit the snake (matching lines) traversed after the move.
        while x > prev_x && y > prev_y {
            x -= 1;
            y -= 1;
            ops.push((DiffOp::Equal, x as usize));
        }
        if prev_k < k {
            x -= 1;
            ops.push((DiffOp::Delete, x as usize));
        } else {
            y -= 1;
            ops.push((DiffOp::Insert, y as usize));
        }
    }
    // Any remaining prefix is a pure diagonal matched during step d = 0.
    while x > 0 && y > 0 {
        x -= 1;
        y -= 1;
        ops.push((DiffOp::Equal, x as usize));
    }
    ops.reverse();
    ops
}
2638
2639pub fn render_unified_diff(path: Option<&str>, before: &str, after: &str) -> String {
2640    let before_lines: Vec<&str> = before.lines().collect();
2641    let after_lines: Vec<&str> = after.lines().collect();
2642    let ops = myers_diff(&before_lines, &after_lines);
2643
2644    let mut diff = String::new();
2645    let file = path.unwrap_or("artifact");
2646    diff.push_str(&format!("--- a/{file}\n+++ b/{file}\n"));
2647    for &(op, idx) in &ops {
2648        match op {
2649            DiffOp::Equal => diff.push_str(&format!(" {}\n", before_lines[idx])),
2650            DiffOp::Delete => diff.push_str(&format!("-{}\n", before_lines[idx])),
2651            DiffOp::Insert => diff.push_str(&format!("+{}\n", after_lines[idx])),
2652        }
2653    }
2654    diff
2655}
2656
2657pub fn save_run_record(run: &RunRecord, path: Option<&str>) -> Result<String, VmError> {
2658    let path = path
2659        .map(PathBuf::from)
2660        .unwrap_or_else(|| default_run_dir().join(format!("{}.json", run.id)));
2661    let mut materialized = run.clone();
2662    merge_hitl_questions_from_active_log(&mut materialized);
2663    materialize_child_runs_from_stage_metadata(&mut materialized);
2664    if materialized.replay_fixture.is_none() {
2665        materialized.replay_fixture = Some(replay_fixture_from_run(&materialized));
2666    }
2667    materialized.persisted_path = Some(path.to_string_lossy().into_owned());
2668    sync_run_handoffs(&mut materialized);
2669    refresh_run_observability(&mut materialized, Some(&path));
2670    if let Some(parent) = path.parent() {
2671        std::fs::create_dir_all(parent)
2672            .map_err(|e| VmError::Runtime(format!("failed to create run directory: {e}")))?;
2673    }
2674    let json = serde_json::to_string_pretty(&materialized)
2675        .map_err(|e| VmError::Runtime(format!("failed to encode run record: {e}")))?;
2676    // Atomic write: .tmp then rename guards against partial writes on kill.
2677    let tmp_path = path.with_extension("json.tmp");
2678    std::fs::write(&tmp_path, &json)
2679        .map_err(|e| VmError::Runtime(format!("failed to persist run record: {e}")))?;
2680    std::fs::rename(&tmp_path, &path).map_err(|e| {
2681        // Cross-device renames fail on some filesystems; best-effort direct write.
2682        let _ = std::fs::write(&path, &json);
2683        VmError::Runtime(format!("failed to finalize run record: {e}"))
2684    })?;
2685    if let Some(observability) = materialized.observability.as_ref() {
2686        publish_action_graph_event(&materialized, observability, &path);
2687    }
2688    Ok(path.to_string_lossy().into_owned())
2689}
2690
2691pub fn load_run_record(path: &Path) -> Result<RunRecord, VmError> {
2692    let content = std::fs::read_to_string(path)
2693        .map_err(|e| VmError::Runtime(format!("failed to read run record: {e}")))?;
2694    let mut run: RunRecord = serde_json::from_str(&content)
2695        .map_err(|e| VmError::Runtime(format!("failed to parse run record: {e}")))?;
2696    materialize_child_runs_from_stage_metadata(&mut run);
2697    if run.replay_fixture.is_none() {
2698        run.replay_fixture = Some(replay_fixture_from_run(&run));
2699    }
2700    run.persisted_path
2701        .get_or_insert_with(|| path.to_string_lossy().into_owned());
2702    sync_run_handoffs(&mut run);
2703    refresh_run_observability(&mut run, Some(path));
2704    Ok(run)
2705}
2706
2707pub fn replay_fixture_from_run(run: &RunRecord) -> ReplayFixture {
2708    ReplayFixture {
2709        type_name: "replay_fixture".to_string(),
2710        id: new_id("fixture"),
2711        source_run_id: run.id.clone(),
2712        workflow_id: run.workflow_id.clone(),
2713        workflow_name: run.workflow_name.clone(),
2714        created_at: now_rfc3339(),
2715        eval_kind: Some("replay".to_string()),
2716        clarifying_question: None,
2717        expected_status: run.status.clone(),
2718        stage_assertions: run
2719            .stages
2720            .iter()
2721            .map(|stage| ReplayStageAssertion {
2722                node_id: stage.node_id.clone(),
2723                expected_status: stage.status.clone(),
2724                expected_outcome: stage.outcome.clone(),
2725                expected_branch: stage.branch.clone(),
2726                required_artifact_kinds: stage
2727                    .artifacts
2728                    .iter()
2729                    .map(|artifact| artifact.kind.clone())
2730                    .collect(),
2731                visible_text_contains: stage
2732                    .visible_text
2733                    .as_ref()
2734                    .filter(|text| !text.is_empty())
2735                    .map(|text| text.chars().take(80).collect()),
2736            })
2737            .collect(),
2738    }
2739}
2740
2741pub fn evaluate_run_against_fixture(run: &RunRecord, fixture: &ReplayFixture) -> ReplayEvalReport {
2742    if fixture.eval_kind.as_deref() == Some("clarifying_question") {
2743        return evaluate_clarifying_question(run, fixture);
2744    }
2745    let mut failures = Vec::new();
2746    if run.status != fixture.expected_status {
2747        failures.push(format!(
2748            "run status mismatch: expected {}, got {}",
2749            fixture.expected_status, run.status
2750        ));
2751    }
2752    let stages_by_id: BTreeMap<&str, &RunStageRecord> =
2753        run.stages.iter().map(|s| (s.node_id.as_str(), s)).collect();
2754    for assertion in &fixture.stage_assertions {
2755        let Some(stage) = stages_by_id.get(assertion.node_id.as_str()) else {
2756            failures.push(format!("missing stage {}", assertion.node_id));
2757            continue;
2758        };
2759        if stage.status != assertion.expected_status {
2760            failures.push(format!(
2761                "stage {} status mismatch: expected {}, got {}",
2762                assertion.node_id, assertion.expected_status, stage.status
2763            ));
2764        }
2765        if stage.outcome != assertion.expected_outcome {
2766            failures.push(format!(
2767                "stage {} outcome mismatch: expected {}, got {}",
2768                assertion.node_id, assertion.expected_outcome, stage.outcome
2769            ));
2770        }
2771        if stage.branch != assertion.expected_branch {
2772            failures.push(format!(
2773                "stage {} branch mismatch: expected {:?}, got {:?}",
2774                assertion.node_id, assertion.expected_branch, stage.branch
2775            ));
2776        }
2777        for required_kind in &assertion.required_artifact_kinds {
2778            if !stage
2779                .artifacts
2780                .iter()
2781                .any(|artifact| &artifact.kind == required_kind)
2782            {
2783                failures.push(format!(
2784                    "stage {} missing artifact kind {}",
2785                    assertion.node_id, required_kind
2786                ));
2787            }
2788        }
2789        if let Some(snippet) = &assertion.visible_text_contains {
2790            let actual = stage.visible_text.clone().unwrap_or_default();
2791            if !actual.contains(snippet) {
2792                failures.push(format!(
2793                    "stage {} visible text does not contain expected snippet {:?}",
2794                    assertion.node_id, snippet
2795                ));
2796            }
2797        }
2798    }
2799
2800    ReplayEvalReport {
2801        pass: failures.is_empty(),
2802        failures,
2803        stage_count: run.stages.len(),
2804    }
2805}
2806
/// Evaluate a run against a "clarifying_question" fixture.
///
/// Three independent checks feed `failures`:
/// 1. the run status must match `fixture.expected_status`;
/// 2. the number of HITL questions asked must fall within the spec's
///    min/max bounds;
/// 3. when any text constraint is configured (expected question, accepted
///    question list, required terms, forbidden terms), at least one asked
///    question must satisfy all of them after normalization.
fn evaluate_clarifying_question(run: &RunRecord, fixture: &ReplayFixture) -> ReplayEvalReport {
    let mut failures = Vec::new();
    // Missing spec falls back to the spec type's Default.
    let spec = fixture.clarifying_question.clone().unwrap_or_default();
    let min_questions = clarifying_min_questions(&spec);
    let max_questions = clarifying_max_questions(&spec);
    let questions = &run.hitl_questions;

    if run.status != fixture.expected_status {
        failures.push(format!(
            "run status mismatch: expected {}, got {}",
            fixture.expected_status, run.status
        ));
    }
    // Question-count bounds are enforced regardless of text constraints.
    if questions.len() < min_questions {
        failures.push(format!(
            "expected at least {min_questions} clarifying question(s), got {}",
            questions.len()
        ));
    }
    if questions.len() > max_questions {
        failures.push(format!(
            "expected at most {max_questions} clarifying question(s), got {}",
            questions.len()
        ));
    }

    // Normalize every comparison input once via normalize_question_text
    // (defined elsewhere in this module) so matching below compares
    // like-for-like.
    let normalized_expected = spec
        .expected_question
        .as_deref()
        .map(normalize_question_text);
    let normalized_accepted = spec
        .accepted_questions
        .iter()
        .map(|question| normalize_question_text(question))
        .collect::<Vec<_>>();
    let required_terms = spec
        .required_terms
        .iter()
        .map(|term| normalize_question_text(term))
        .collect::<Vec<_>>();
    let forbidden_terms = spec
        .forbidden_terms
        .iter()
        .map(|term| normalize_question_text(term))
        .collect::<Vec<_>>();

    // A question matches when it equals the expected text (if one is set),
    // appears in the accepted list (if non-empty), contains every required
    // term, and contains no forbidden term.
    let matched = questions.iter().any(|question| {
        let normalized = normalize_question_text(&question.prompt);
        let matches_expected = normalized_expected
            .as_ref()
            .is_none_or(|expected| &normalized == expected)
            && (normalized_accepted.is_empty()
                || normalized_accepted
                    .iter()
                    .any(|candidate| candidate == &normalized));
        let has_required_terms = required_terms
            .iter()
            .all(|term| normalized.contains(term.as_str()));
        let avoids_forbidden_terms = forbidden_terms
            .iter()
            .all(|term| !normalized.contains(term.as_str()));
        matches_expected && has_required_terms && avoids_forbidden_terms
    });

    // Only report a text-match failure when questions were actually asked AND
    // at least one text constraint is configured; otherwise the count checks
    // above are the only signal.
    if !questions.is_empty()
        && (!normalized_accepted.is_empty()
            || normalized_expected.is_some()
            || !required_terms.is_empty()
            || !forbidden_terms.is_empty())
        && !matched
    {
        failures.push(format!(
            "no clarifying question matched fixture; actual questions: {}",
            questions
                .iter()
                .map(|question| format!("{:?}", question.prompt))
                .collect::<Vec<_>>()
                .join(", ")
        ));
    }

    ReplayEvalReport {
        pass: failures.is_empty(),
        failures,
        stage_count: run.stages.len(),
    }
}
2894
2895pub fn evaluate_run_suite(
2896    cases: Vec<(RunRecord, ReplayFixture, Option<String>)>,
2897) -> ReplayEvalSuiteReport {
2898    let mut reports = Vec::new();
2899    for (run, fixture, source_path) in cases {
2900        let report = evaluate_run_against_fixture(&run, &fixture);
2901        reports.push(ReplayEvalCaseReport {
2902            run_id: run.id.clone(),
2903            workflow_id: run.workflow_id.clone(),
2904            label: None,
2905            pass: report.pass,
2906            failures: report.failures,
2907            stage_count: report.stage_count,
2908            source_path,
2909            comparison: None,
2910        });
2911    }
2912    let total = reports.len();
2913    let passed = reports.iter().filter(|report| report.pass).count();
2914    let failed = total.saturating_sub(passed);
2915    ReplayEvalSuiteReport {
2916        pass: failed == 0,
2917        total,
2918        passed,
2919        failed,
2920        cases: reports,
2921    }
2922}
2923
2924pub fn diff_run_records(left: &RunRecord, right: &RunRecord) -> RunDiffReport {
2925    let mut stage_diffs = Vec::new();
2926    let mut all_node_ids = BTreeSet::new();
2927    let left_by_id: BTreeMap<&str, &RunStageRecord> = left
2928        .stages
2929        .iter()
2930        .map(|s| (s.node_id.as_str(), s))
2931        .collect();
2932    let right_by_id: BTreeMap<&str, &RunStageRecord> = right
2933        .stages
2934        .iter()
2935        .map(|s| (s.node_id.as_str(), s))
2936        .collect();
2937    all_node_ids.extend(left_by_id.keys().copied());
2938    all_node_ids.extend(right_by_id.keys().copied());
2939
2940    for node_id in all_node_ids {
2941        let left_stage = left_by_id.get(node_id).copied();
2942        let right_stage = right_by_id.get(node_id).copied();
2943        match (left_stage, right_stage) {
2944            (Some(_), None) => stage_diffs.push(RunStageDiffRecord {
2945                node_id: node_id.to_string(),
2946                change: "removed".to_string(),
2947                details: vec!["stage missing from right run".to_string()],
2948            }),
2949            (None, Some(_)) => stage_diffs.push(RunStageDiffRecord {
2950                node_id: node_id.to_string(),
2951                change: "added".to_string(),
2952                details: vec!["stage missing from left run".to_string()],
2953            }),
2954            (Some(left_stage), Some(right_stage)) => {
2955                let mut details = Vec::new();
2956                if left_stage.status != right_stage.status {
2957                    details.push(format!(
2958                        "status: {} -> {}",
2959                        left_stage.status, right_stage.status
2960                    ));
2961                }
2962                if left_stage.outcome != right_stage.outcome {
2963                    details.push(format!(
2964                        "outcome: {} -> {}",
2965                        left_stage.outcome, right_stage.outcome
2966                    ));
2967                }
2968                if left_stage.branch != right_stage.branch {
2969                    details.push(format!(
2970                        "branch: {:?} -> {:?}",
2971                        left_stage.branch, right_stage.branch
2972                    ));
2973                }
2974                if left_stage.produced_artifact_ids.len() != right_stage.produced_artifact_ids.len()
2975                {
2976                    details.push(format!(
2977                        "produced_artifacts: {} -> {}",
2978                        left_stage.produced_artifact_ids.len(),
2979                        right_stage.produced_artifact_ids.len()
2980                    ));
2981                }
2982                if left_stage.artifacts.len() != right_stage.artifacts.len() {
2983                    details.push(format!(
2984                        "artifact_records: {} -> {}",
2985                        left_stage.artifacts.len(),
2986                        right_stage.artifacts.len()
2987                    ));
2988                }
2989                if !details.is_empty() {
2990                    stage_diffs.push(RunStageDiffRecord {
2991                        node_id: node_id.to_string(),
2992                        change: "changed".to_string(),
2993                        details,
2994                    });
2995                }
2996            }
2997            (None, None) => {}
2998        }
2999    }
3000
3001    let mut tool_diffs = Vec::new();
3002    let left_tools: std::collections::BTreeMap<(String, String), &ToolCallRecord> = left
3003        .tool_recordings
3004        .iter()
3005        .map(|r| ((r.tool_name.clone(), r.args_hash.clone()), r))
3006        .collect();
3007    let right_tools: std::collections::BTreeMap<(String, String), &ToolCallRecord> = right
3008        .tool_recordings
3009        .iter()
3010        .map(|r| ((r.tool_name.clone(), r.args_hash.clone()), r))
3011        .collect();
3012    let all_tool_keys: std::collections::BTreeSet<_> = left_tools
3013        .keys()
3014        .chain(right_tools.keys())
3015        .cloned()
3016        .collect();
3017    for key in &all_tool_keys {
3018        let l = left_tools.get(key);
3019        let r = right_tools.get(key);
3020        let result_changed = match (l, r) {
3021            (Some(a), Some(b)) => a.result != b.result,
3022            _ => true,
3023        };
3024        if result_changed {
3025            tool_diffs.push(ToolCallDiffRecord {
3026                tool_name: key.0.clone(),
3027                args_hash: key.1.clone(),
3028                result_changed,
3029                left_result: l.map(|t| t.result.clone()),
3030                right_result: r.map(|t| t.result.clone()),
3031            });
3032        }
3033    }
3034
3035    let left_observability = left.observability.clone().unwrap_or_else(|| {
3036        derive_run_observability(left, left.persisted_path.as_deref().map(Path::new))
3037    });
3038    let right_observability = right.observability.clone().unwrap_or_else(|| {
3039        derive_run_observability(right, right.persisted_path.as_deref().map(Path::new))
3040    });
3041    let mut observability_diffs = Vec::new();
3042
3043    let left_workers = left_observability
3044        .worker_lineage
3045        .iter()
3046        .map(|worker| {
3047            (
3048                worker.worker_id.clone(),
3049                (
3050                    worker.status.clone(),
3051                    worker.run_id.clone(),
3052                    worker.run_path.clone(),
3053                ),
3054            )
3055        })
3056        .collect::<BTreeMap<_, _>>();
3057    let right_workers = right_observability
3058        .worker_lineage
3059        .iter()
3060        .map(|worker| {
3061            (
3062                worker.worker_id.clone(),
3063                (
3064                    worker.status.clone(),
3065                    worker.run_id.clone(),
3066                    worker.run_path.clone(),
3067                ),
3068            )
3069        })
3070        .collect::<BTreeMap<_, _>>();
3071    let worker_ids = left_workers
3072        .keys()
3073        .chain(right_workers.keys())
3074        .cloned()
3075        .collect::<BTreeSet<_>>();
3076    for worker_id in worker_ids {
3077        match (left_workers.get(&worker_id), right_workers.get(&worker_id)) {
3078            (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3079                section: "worker_lineage".to_string(),
3080                label: worker_id,
3081                details: vec!["worker missing from right run".to_string()],
3082            }),
3083            (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3084                section: "worker_lineage".to_string(),
3085                label: worker_id,
3086                details: vec!["worker missing from left run".to_string()],
3087            }),
3088            (Some(left_worker), Some(right_worker)) if left_worker != right_worker => {
3089                let mut details = Vec::new();
3090                if left_worker.0 != right_worker.0 {
3091                    details.push(format!("status: {} -> {}", left_worker.0, right_worker.0));
3092                }
3093                if left_worker.1 != right_worker.1 {
3094                    details.push(format!(
3095                        "run_id: {:?} -> {:?}",
3096                        left_worker.1, right_worker.1
3097                    ));
3098                }
3099                if left_worker.2 != right_worker.2 {
3100                    details.push(format!(
3101                        "run_path: {:?} -> {:?}",
3102                        left_worker.2, right_worker.2
3103                    ));
3104                }
3105                observability_diffs.push(RunObservabilityDiffRecord {
3106                    section: "worker_lineage".to_string(),
3107                    label: worker_id,
3108                    details,
3109                });
3110            }
3111            _ => {}
3112        }
3113    }
3114
3115    let left_rounds = left_observability
3116        .planner_rounds
3117        .iter()
3118        .map(|round| (round.stage_id.clone(), round))
3119        .collect::<BTreeMap<_, _>>();
3120    let right_rounds = right_observability
3121        .planner_rounds
3122        .iter()
3123        .map(|round| (round.stage_id.clone(), round))
3124        .collect::<BTreeMap<_, _>>();
3125    let round_ids = left_rounds
3126        .keys()
3127        .chain(right_rounds.keys())
3128        .cloned()
3129        .collect::<BTreeSet<_>>();
3130    for stage_id in round_ids {
3131        match (left_rounds.get(&stage_id), right_rounds.get(&stage_id)) {
3132            (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3133                section: "planner_rounds".to_string(),
3134                label: stage_id,
3135                details: vec!["planner summary missing from right run".to_string()],
3136            }),
3137            (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3138                section: "planner_rounds".to_string(),
3139                label: stage_id,
3140                details: vec!["planner summary missing from left run".to_string()],
3141            }),
3142            (Some(left_round), Some(right_round)) => {
3143                let mut details = Vec::new();
3144                if left_round.iteration_count != right_round.iteration_count {
3145                    details.push(format!(
3146                        "iterations: {} -> {}",
3147                        left_round.iteration_count, right_round.iteration_count
3148                    ));
3149                }
3150                if left_round.tool_execution_count != right_round.tool_execution_count {
3151                    details.push(format!(
3152                        "tool_executions: {} -> {}",
3153                        left_round.tool_execution_count, right_round.tool_execution_count
3154                    ));
3155                }
3156                if left_round.native_text_tool_fallback_count
3157                    != right_round.native_text_tool_fallback_count
3158                {
3159                    details.push(format!(
3160                        "native_text_tool_fallbacks: {} -> {}",
3161                        left_round.native_text_tool_fallback_count,
3162                        right_round.native_text_tool_fallback_count
3163                    ));
3164                }
3165                if left_round.native_text_tool_fallback_rejection_count
3166                    != right_round.native_text_tool_fallback_rejection_count
3167                {
3168                    details.push(format!(
3169                        "native_text_tool_fallback_rejections: {} -> {}",
3170                        left_round.native_text_tool_fallback_rejection_count,
3171                        right_round.native_text_tool_fallback_rejection_count
3172                    ));
3173                }
3174                if left_round.empty_completion_retry_count
3175                    != right_round.empty_completion_retry_count
3176                {
3177                    details.push(format!(
3178                        "empty_completion_retries: {} -> {}",
3179                        left_round.empty_completion_retry_count,
3180                        right_round.empty_completion_retry_count
3181                    ));
3182                }
3183                if left_round.research_facts != right_round.research_facts {
3184                    details.push(format!(
3185                        "research_facts: {:?} -> {:?}",
3186                        left_round.research_facts, right_round.research_facts
3187                    ));
3188                }
3189                let left_deliverables = left_round
3190                    .task_ledger
3191                    .as_ref()
3192                    .map(|ledger| {
3193                        ledger
3194                            .deliverables
3195                            .iter()
3196                            .map(|item| format!("{}:{}", item.id, item.status))
3197                            .collect::<Vec<_>>()
3198                    })
3199                    .unwrap_or_default();
3200                let right_deliverables = right_round
3201                    .task_ledger
3202                    .as_ref()
3203                    .map(|ledger| {
3204                        ledger
3205                            .deliverables
3206                            .iter()
3207                            .map(|item| format!("{}:{}", item.id, item.status))
3208                            .collect::<Vec<_>>()
3209                    })
3210                    .unwrap_or_default();
3211                if left_deliverables != right_deliverables {
3212                    details.push(format!(
3213                        "deliverables: {:?} -> {:?}",
3214                        left_deliverables, right_deliverables
3215                    ));
3216                }
3217                if left_round.successful_tools != right_round.successful_tools {
3218                    details.push(format!(
3219                        "successful_tools: {:?} -> {:?}",
3220                        left_round.successful_tools, right_round.successful_tools
3221                    ));
3222                }
3223                if !details.is_empty() {
3224                    observability_diffs.push(RunObservabilityDiffRecord {
3225                        section: "planner_rounds".to_string(),
3226                        label: left_round.node_id.clone(),
3227                        details,
3228                    });
3229                }
3230            }
3231            _ => {}
3232        }
3233    }
3234
3235    let left_pointers = left_observability
3236        .transcript_pointers
3237        .iter()
3238        .map(|pointer| {
3239            (
3240                pointer.id.clone(),
3241                (
3242                    pointer.available,
3243                    pointer.path.clone(),
3244                    pointer.location.clone(),
3245                ),
3246            )
3247        })
3248        .collect::<BTreeMap<_, _>>();
3249    let right_pointers = right_observability
3250        .transcript_pointers
3251        .iter()
3252        .map(|pointer| {
3253            (
3254                pointer.id.clone(),
3255                (
3256                    pointer.available,
3257                    pointer.path.clone(),
3258                    pointer.location.clone(),
3259                ),
3260            )
3261        })
3262        .collect::<BTreeMap<_, _>>();
3263    let pointer_ids = left_pointers
3264        .keys()
3265        .chain(right_pointers.keys())
3266        .cloned()
3267        .collect::<BTreeSet<_>>();
3268    for pointer_id in pointer_ids {
3269        match (
3270            left_pointers.get(&pointer_id),
3271            right_pointers.get(&pointer_id),
3272        ) {
3273            (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3274                section: "transcript_pointers".to_string(),
3275                label: pointer_id,
3276                details: vec!["pointer missing from right run".to_string()],
3277            }),
3278            (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3279                section: "transcript_pointers".to_string(),
3280                label: pointer_id,
3281                details: vec!["pointer missing from left run".to_string()],
3282            }),
3283            (Some(left_pointer), Some(right_pointer)) if left_pointer != right_pointer => {
3284                observability_diffs.push(RunObservabilityDiffRecord {
3285                    section: "transcript_pointers".to_string(),
3286                    label: pointer_id,
3287                    details: vec![format!(
3288                        "pointer: {:?} -> {:?}",
3289                        left_pointer, right_pointer
3290                    )],
3291                });
3292            }
3293            _ => {}
3294        }
3295    }
3296
3297    let left_compactions = left_observability
3298        .compaction_events
3299        .iter()
3300        .map(|event| {
3301            (
3302                event.id.clone(),
3303                (
3304                    event.strategy.clone(),
3305                    event.archived_messages,
3306                    event.snapshot_asset_id.clone(),
3307                    event.available,
3308                ),
3309            )
3310        })
3311        .collect::<BTreeMap<_, _>>();
3312    let right_compactions = right_observability
3313        .compaction_events
3314        .iter()
3315        .map(|event| {
3316            (
3317                event.id.clone(),
3318                (
3319                    event.strategy.clone(),
3320                    event.archived_messages,
3321                    event.snapshot_asset_id.clone(),
3322                    event.available,
3323                ),
3324            )
3325        })
3326        .collect::<BTreeMap<_, _>>();
3327    let compaction_ids = left_compactions
3328        .keys()
3329        .chain(right_compactions.keys())
3330        .cloned()
3331        .collect::<BTreeSet<_>>();
3332    for compaction_id in compaction_ids {
3333        match (
3334            left_compactions.get(&compaction_id),
3335            right_compactions.get(&compaction_id),
3336        ) {
3337            (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3338                section: "compaction_events".to_string(),
3339                label: compaction_id,
3340                details: vec!["compaction event missing from right run".to_string()],
3341            }),
3342            (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3343                section: "compaction_events".to_string(),
3344                label: compaction_id,
3345                details: vec!["compaction event missing from left run".to_string()],
3346            }),
3347            (Some(left_event), Some(right_event)) if left_event != right_event => {
3348                observability_diffs.push(RunObservabilityDiffRecord {
3349                    section: "compaction_events".to_string(),
3350                    label: compaction_id,
3351                    details: vec![format!("event: {:?} -> {:?}", left_event, right_event)],
3352                });
3353            }
3354            _ => {}
3355        }
3356    }
3357
3358    let left_daemons = left_observability
3359        .daemon_events
3360        .iter()
3361        .map(|event| {
3362            (
3363                (event.daemon_id.clone(), event.kind, event.timestamp.clone()),
3364                (
3365                    event.name.clone(),
3366                    event.persist_path.clone(),
3367                    event.payload_summary.clone(),
3368                ),
3369            )
3370        })
3371        .collect::<BTreeMap<_, _>>();
3372    let right_daemons = right_observability
3373        .daemon_events
3374        .iter()
3375        .map(|event| {
3376            (
3377                (event.daemon_id.clone(), event.kind, event.timestamp.clone()),
3378                (
3379                    event.name.clone(),
3380                    event.persist_path.clone(),
3381                    event.payload_summary.clone(),
3382                ),
3383            )
3384        })
3385        .collect::<BTreeMap<_, _>>();
3386    let daemon_keys = left_daemons
3387        .keys()
3388        .chain(right_daemons.keys())
3389        .cloned()
3390        .collect::<BTreeSet<_>>();
3391    for daemon_key in daemon_keys {
3392        let label = format!("{}:{:?}:{}", daemon_key.0, daemon_key.1, daemon_key.2);
3393        match (
3394            left_daemons.get(&daemon_key),
3395            right_daemons.get(&daemon_key),
3396        ) {
3397            (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3398                section: "daemon_events".to_string(),
3399                label,
3400                details: vec!["daemon event missing from right run".to_string()],
3401            }),
3402            (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3403                section: "daemon_events".to_string(),
3404                label,
3405                details: vec!["daemon event missing from left run".to_string()],
3406            }),
3407            (Some(left_event), Some(right_event)) if left_event != right_event => {
3408                observability_diffs.push(RunObservabilityDiffRecord {
3409                    section: "daemon_events".to_string(),
3410                    label,
3411                    details: vec![format!("event: {:?} -> {:?}", left_event, right_event)],
3412                });
3413            }
3414            _ => {}
3415        }
3416    }
3417
3418    let left_verification = left_observability
3419        .verification_outcomes
3420        .iter()
3421        .map(|item| (item.stage_id.clone(), item))
3422        .collect::<BTreeMap<_, _>>();
3423    let right_verification = right_observability
3424        .verification_outcomes
3425        .iter()
3426        .map(|item| (item.stage_id.clone(), item))
3427        .collect::<BTreeMap<_, _>>();
3428    let verification_ids = left_verification
3429        .keys()
3430        .chain(right_verification.keys())
3431        .cloned()
3432        .collect::<BTreeSet<_>>();
3433    for stage_id in verification_ids {
3434        match (
3435            left_verification.get(&stage_id),
3436            right_verification.get(&stage_id),
3437        ) {
3438            (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3439                section: "verification".to_string(),
3440                label: stage_id,
3441                details: vec!["verification missing from right run".to_string()],
3442            }),
3443            (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3444                section: "verification".to_string(),
3445                label: stage_id,
3446                details: vec!["verification missing from left run".to_string()],
3447            }),
3448            (Some(left_item), Some(right_item)) if left_item != right_item => {
3449                let mut details = Vec::new();
3450                if left_item.passed != right_item.passed {
3451                    details.push(format!(
3452                        "passed: {:?} -> {:?}",
3453                        left_item.passed, right_item.passed
3454                    ));
3455                }
3456                if left_item.summary != right_item.summary {
3457                    details.push(format!(
3458                        "summary: {:?} -> {:?}",
3459                        left_item.summary, right_item.summary
3460                    ));
3461                }
3462                observability_diffs.push(RunObservabilityDiffRecord {
3463                    section: "verification".to_string(),
3464                    label: left_item.node_id.clone(),
3465                    details,
3466                });
3467            }
3468            _ => {}
3469        }
3470    }
3471
3472    let left_graph = (
3473        left_observability.action_graph_nodes.len(),
3474        left_observability.action_graph_edges.len(),
3475    );
3476    let right_graph = (
3477        right_observability.action_graph_nodes.len(),
3478        right_observability.action_graph_edges.len(),
3479    );
3480    if left_graph != right_graph {
3481        observability_diffs.push(RunObservabilityDiffRecord {
3482            section: "action_graph".to_string(),
3483            label: "shape".to_string(),
3484            details: vec![format!(
3485                "nodes/edges: {}/{} -> {}/{}",
3486                left_graph.0, left_graph.1, right_graph.0, right_graph.1
3487            )],
3488        });
3489    }
3490
3491    let status_changed = left.status != right.status;
3492    let identical = !status_changed
3493        && stage_diffs.is_empty()
3494        && tool_diffs.is_empty()
3495        && observability_diffs.is_empty()
3496        && left.transitions.len() == right.transitions.len()
3497        && left.artifacts.len() == right.artifacts.len()
3498        && left.checkpoints.len() == right.checkpoints.len();
3499
3500    RunDiffReport {
3501        left_run_id: left.id.clone(),
3502        right_run_id: right.id.clone(),
3503        identical,
3504        status_changed,
3505        left_status: left.status.clone(),
3506        right_status: right.status.clone(),
3507        stage_diffs,
3508        tool_diffs,
3509        observability_diffs,
3510        transition_count_delta: right.transitions.len() as isize - left.transitions.len() as isize,
3511        artifact_count_delta: right.artifacts.len() as isize - left.artifacts.len() as isize,
3512        checkpoint_count_delta: right.checkpoints.len() as isize - left.checkpoints.len() as isize,
3513    }
3514}
3515
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::path::Path;

    /// Builds the smallest `RunRecord` that still carries LLM usage data and a
    /// replay fixture, with the caller-supplied terminal `status`. Everything
    /// else falls back to `RunRecord::default()`.
    fn minimal_run(status: &str) -> RunRecord {
        RunRecord {
            type_name: "workflow_run".to_string(),
            id: "run_1".to_string(),
            workflow_id: "workflow_1".to_string(),
            status: status.to_string(),
            usage: Some(LlmUsageRecord {
                total_duration_ms: 12,
                total_cost: 0.01,
                input_tokens: 3,
                output_tokens: 4,
                call_count: 1,
                models: vec!["mock".to_string()],
            }),
            replay_fixture: Some(ReplayFixture {
                type_name: "replay_fixture".to_string(),
                expected_status: "completed".to_string(),
                ..ReplayFixture::default()
            }),
            ..RunRecord::default()
        }
    }

    /// Serializes `minimal_run(status)` as `run.json` inside `dir` so eval-pack
    /// manifest cases can reference it by relative path. Shared helper to avoid
    /// duplicating the write boilerplate across tests.
    fn write_minimal_run(dir: &Path, status: &str) {
        fs::write(
            dir.join("run.json"),
            serde_json::to_string(&minimal_run(status)).unwrap(),
        )
        .unwrap();
    }

    /// A deterministic replay case declared in a TOML eval-pack manifest loads,
    /// evaluates against a completed run, and reports a passing case with the
    /// manifest-supplied label.
    #[test]
    fn eval_pack_manifest_toml_runs_replay_case() {
        let temp = tempfile::tempdir().unwrap();
        write_minimal_run(temp.path(), "completed");
        let pack_path = temp.path().join("harn.eval.toml");
        fs::write(
            &pack_path,
            r#"
version = 1
id = "connector-regressions"
name = "Connector regressions"

[[cases]]
id = "webhook"
name = "Webhook normalization"
run = "run.json"
rubrics = ["status"]

[[rubrics]]
id = "status"
kind = "deterministic"

[[rubrics.assertions]]
kind = "run-status"
expected = "completed"
"#,
        )
        .unwrap();

        let manifest = load_eval_pack_manifest(&pack_path).unwrap();
        let report = evaluate_eval_pack_manifest(&manifest).unwrap();

        assert!(report.pass);
        assert_eq!(report.total, 1);
        assert_eq!(report.cases[0].label, "Webhook normalization");
    }

    /// A case with `severity = "warning"` that exceeds its latency threshold is
    /// counted in `warning_failed` and surfaces a warning message, but does not
    /// flip the overall report to failing.
    #[test]
    fn eval_pack_warning_case_does_not_block() {
        let temp = tempfile::tempdir().unwrap();
        write_minimal_run(temp.path(), "completed");
        let pack_path = temp.path().join("harn.eval.toml");
        fs::write(
            &pack_path,
            r#"
version = 1
id = "budgets"

[[cases]]
id = "latency-budget"
run = "run.json"
severity = "warning"

[cases.thresholds]
max-latency-ms = 1
"#,
        )
        .unwrap();

        let manifest = load_eval_pack_manifest(&pack_path).unwrap();
        let report = evaluate_eval_pack_manifest(&manifest).unwrap();

        assert!(report.pass);
        assert_eq!(report.warning_failed, 1);
        assert!(report.cases[0].warnings[0].contains("latency"));
    }

    /// A friction-events fixture feeds a context-pack-suggestion rubric: two
    /// recurring Splunk queries should yield at least one suggestion matching
    /// the expected capability/output-slot, reported as a passing case whose
    /// run id is the synthetic "friction_events" marker.
    #[test]
    fn eval_pack_manifest_runs_friction_context_pack_case() {
        let temp = tempfile::tempdir().unwrap();
        let events_path = temp.path().join("incident-friction.json");
        fs::write(
            &events_path,
            r#"
{
  "events": [
    {
      "kind": "repeated_query",
      "source": "incident-triage",
      "actor": "sre",
      "tool": "splunk",
      "provider": "splunk",
      "redacted_summary": "Checkout incidents need the same Splunk search",
      "recurrence_hints": ["checkout incident queries"],
      "estimated_time_ms": 300000,
      "metadata": {
        "query": "index=checkout service=api error",
        "capability": "splunk.search",
        "secret_ref": "SPLUNK_READ_TOKEN",
        "output_slot": "splunk_errors"
      }
    },
    {
      "kind": "repeated_query",
      "source": "incident-triage",
      "actor": "sre",
      "tool": "splunk",
      "provider": "splunk",
      "redacted_summary": "Checkout incident triage repeated the Splunk search",
      "recurrence_hints": ["checkout incident queries"],
      "estimated_time_ms": 240000,
      "metadata": {
        "query": "index=checkout service=api error",
        "capability": "splunk.search",
        "secret_ref": "SPLUNK_READ_TOKEN",
        "output_slot": "splunk_errors"
      }
    }
  ]
}
"#,
        )
        .unwrap();
        let pack_path = temp.path().join("harn.eval.toml");
        fs::write(
            &pack_path,
            r#"
version = 1
id = "team-learning"
name = "Team learning evals"

[[fixtures]]
id = "incident-friction"
kind = "friction-events"
path = "incident-friction.json"

[[cases]]
id = "incident-context-pack"
name = "Incident context pack suggestion"
friction_events = "incident-friction"
rubrics = ["context-pack"]

[[rubrics]]
id = "context-pack"
kind = "friction"

[[rubrics.assertions]]
kind = "context-pack-suggestion"
contains = "incident"
expected = { min_suggestions = 1, recommended_artifact = "context_pack", required_capability = "splunk.search", required_output_slot = "splunk_errors" }
"#,
        )
        .unwrap();

        let manifest = load_eval_pack_manifest(&pack_path).unwrap();
        let report = evaluate_eval_pack_manifest(&manifest).unwrap();

        assert!(report.pass);
        assert_eq!(report.total, 1);
        assert_eq!(report.cases[0].run_id, "friction_events");
        assert_eq!(report.cases[0].stage_count, 2);
    }
}