// harn_vm/orchestration/records.rs
//! Run records, replay fixtures, eval reports, and diff utilities.
2
3use std::collections::{BTreeMap, BTreeSet};
4use std::path::{Path, PathBuf};
5
6use serde::{Deserialize, Serialize};
7
8use super::{
9    default_run_dir, evaluate_context_pack_suggestion_expectations,
10    generate_context_pack_suggestions, new_id, normalize_friction_events_json, now_rfc3339,
11    parse_json_payload, parse_json_value, run_persona_eval_ladder, sync_run_handoffs,
12    ArtifactRecord, CapabilityPolicy, ContextPackSuggestionExpectation,
13    ContextPackSuggestionOptions, FrictionEvent, HandoffArtifact, PersonaEvalLadderManifest,
14    PersonaEvalLadderReport,
15};
16use crate::agent_events::AgentEvent;
17use crate::event_log::{
18    active_event_log, sanitize_topic_component, AnyEventLog, EventId, EventLog,
19    LogEvent as EventLogRecord, Topic,
20};
21use crate::llm::vm_value_to_json;
22use crate::personas::{
23    PersonaAssignmentStatus, PersonaBudgetStatus, PersonaHandoffInboxItem, PersonaQueuedWork,
24    PersonaStatus, PersonaValueReceipt,
25};
26use crate::triggers::{SignatureStatus, TriggerEvent};
27use crate::value::{VmError, VmValue};
28
// Kind tags stored in `RunActionGraphNodeRecord::kind`.
pub const ACTION_GRAPH_NODE_KIND_RUN: &str = "run";
pub const ACTION_GRAPH_NODE_KIND_TRIGGER: &str = "trigger";
pub const ACTION_GRAPH_NODE_KIND_PREDICATE: &str = "predicate";
pub const ACTION_GRAPH_NODE_KIND_TRIGGER_PREDICATE: &str = "trigger_predicate";
pub const ACTION_GRAPH_NODE_KIND_STAGE: &str = "stage";
pub const ACTION_GRAPH_NODE_KIND_WORKER: &str = "worker";
pub const ACTION_GRAPH_NODE_KIND_DISPATCH: &str = "dispatch";
pub const ACTION_GRAPH_NODE_KIND_A2A_HOP: &str = "a2a_hop";
pub const ACTION_GRAPH_NODE_KIND_WORKER_ENQUEUE: &str = "worker_enqueue";
pub const ACTION_GRAPH_NODE_KIND_RETRY: &str = "retry";
pub const ACTION_GRAPH_NODE_KIND_DLQ: &str = "dlq";
40
// Kind tags stored in `RunActionGraphEdgeRecord::kind`.
pub const ACTION_GRAPH_EDGE_KIND_ENTRY: &str = "entry";
pub const ACTION_GRAPH_EDGE_KIND_TRIGGER_DISPATCH: &str = "trigger_dispatch";
pub const ACTION_GRAPH_EDGE_KIND_A2A_DISPATCH: &str = "a2a_dispatch";
pub const ACTION_GRAPH_EDGE_KIND_PREDICATE_GATE: &str = "predicate_gate";
pub const ACTION_GRAPH_EDGE_KIND_REPLAY_CHAIN: &str = "replay_chain";
pub const ACTION_GRAPH_EDGE_KIND_TRANSITION: &str = "transition";
pub const ACTION_GRAPH_EDGE_KIND_DELEGATES: &str = "delegates";
pub const ACTION_GRAPH_EDGE_KIND_RETRY: &str = "retry";
pub const ACTION_GRAPH_EDGE_KIND_DLQ_MOVE: &str = "dlq_move";
50
/// Aggregated LLM usage for a run or stage.
///
/// `#[serde(default)]` lets previously persisted records that predate a
/// newly added field still deserialize.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct LlmUsageRecord {
    pub input_tokens: i64,
    pub output_tokens: i64,
    /// Total wall-clock duration of the calls, in milliseconds.
    pub total_duration_ms: i64,
    /// Number of LLM calls folded into this record.
    pub call_count: i64,
    /// Accumulated cost — presumably USD; currency not stated here (TODO confirm).
    pub total_cost: f64,
    /// Model identifiers observed across the calls.
    pub models: Vec<String>,
}
61
/// Persisted record of one executed stage within a run.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunStageRecord {
    /// Unique id of this stage execution.
    pub id: String,
    /// Workflow node this stage execution belongs to.
    pub node_id: String,
    pub kind: String,
    pub status: String,
    pub outcome: String,
    /// Branch taken out of this stage, when the workflow branched.
    pub branch: Option<String>,
    pub started_at: String,
    /// `None` while the stage is still running.
    pub finished_at: Option<String>,
    /// User-visible output text, if any.
    pub visible_text: Option<String>,
    /// Model reasoning withheld from user-visible output.
    pub private_reasoning: Option<String>,
    /// Free-form transcript payload; schema not fixed here.
    pub transcript: Option<serde_json::Value>,
    /// Free-form verification payload; schema not fixed here.
    pub verification: Option<serde_json::Value>,
    pub usage: Option<LlmUsageRecord>,
    pub artifacts: Vec<ArtifactRecord>,
    pub consumed_artifact_ids: Vec<String>,
    pub produced_artifact_ids: Vec<String>,
    /// Per-attempt history, including failed attempts.
    pub attempts: Vec<RunStageAttemptRecord>,
    /// Open-ended metadata; e.g. a `"worker"` entry is mined by
    /// `run_child_record_from_worker_metadata`.
    pub metadata: BTreeMap<String, serde_json::Value>,
}
84
/// One attempt (including retries) at executing a stage.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunStageAttemptRecord {
    /// Attempt ordinal within the stage.
    pub attempt: usize,
    pub status: String,
    pub outcome: String,
    pub branch: Option<String>,
    /// Error message when this attempt failed.
    pub error: Option<String>,
    pub verification: Option<serde_json::Value>,
    pub started_at: String,
    pub finished_at: Option<String>,
}
97
/// Edge traversal between workflow nodes recorded during a run.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunTransitionRecord {
    pub id: String,
    /// Source stage/node; `None` for the run's entry transition.
    pub from_stage_id: Option<String>,
    pub from_node_id: Option<String>,
    pub to_node_id: String,
    /// Branch label that selected this transition, if any.
    pub branch: Option<String>,
    pub timestamp: String,
    pub consumed_artifact_ids: Vec<String>,
    pub produced_artifact_ids: Vec<String>,
}
110
/// Snapshot of scheduler state persisted mid-run so execution can resume.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunCheckpointRecord {
    pub id: String,
    /// Nodes eligible to run at checkpoint time.
    pub ready_nodes: Vec<String>,
    pub completed_nodes: Vec<String>,
    pub last_stage_id: Option<String>,
    pub persisted_at: String,
    /// Why the checkpoint was taken.
    pub reason: String,
}
121
/// Replayable fixture captured from a source run, with expectations to
/// assert when the run is replayed.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayFixture {
    /// Type discriminator persisted as `_type` in the JSON form.
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    /// Run this fixture was captured from.
    pub source_run_id: String,
    pub workflow_id: String,
    pub workflow_name: Option<String>,
    pub created_at: String,
    /// Optional eval flavor (e.g. selects clarifying-question scoring).
    pub eval_kind: Option<String>,
    pub clarifying_question: Option<ClarifyingQuestionEvalSpec>,
    /// Overall run status the replay must reproduce.
    pub expected_status: String,
    pub stage_assertions: Vec<ReplayStageAssertion>,
}
137
/// Expectations for a clarifying-question eval.
///
/// See `clarifying_min_questions`/`clarifying_max_questions` for how the
/// count bounds are interpreted (both are clamped to at least 1).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ClarifyingQuestionEvalSpec {
    /// Single expected question text, if an exact match is wanted.
    pub expected_question: Option<String>,
    /// Alternative acceptable question texts.
    pub accepted_questions: Vec<String>,
    pub required_terms: Vec<String>,
    pub forbidden_terms: Vec<String>,
    pub min_questions: usize,
    /// `None` means "use the default upper bound" (one question).
    pub max_questions: Option<usize>,
}
148
/// Per-stage expectation inside a [`ReplayFixture`].
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayStageAssertion {
    /// Workflow node the assertion applies to.
    pub node_id: String,
    pub expected_status: String,
    pub expected_outcome: String,
    pub expected_branch: Option<String>,
    /// Artifact kinds that must be present among the stage's artifacts.
    pub required_artifact_kinds: Vec<String>,
    /// Substring that must appear in the stage's visible text.
    pub visible_text_contains: Option<String>,
}
159
/// Outcome of evaluating a single replay against its fixture.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayEvalReport {
    pub pass: bool,
    /// Human-readable failure descriptions; empty when `pass` is true.
    pub failures: Vec<String>,
    pub stage_count: usize,
}
167
/// Result for one case within a replay eval suite.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayEvalCaseReport {
    pub run_id: String,
    pub workflow_id: String,
    pub label: Option<String>,
    pub pass: bool,
    pub failures: Vec<String>,
    pub stage_count: usize,
    /// Where the case's run record was loaded from, if known.
    pub source_path: Option<String>,
    /// Diff against a comparison run, when one was requested.
    pub comparison: Option<RunDiffReport>,
}
180
/// Aggregate result over all cases of a replay eval suite.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayEvalSuiteReport {
    pub pass: bool,
    pub total: usize,
    pub passed: usize,
    pub failed: usize,
    pub cases: Vec<ReplayEvalCaseReport>,
}
190
/// Compact summary of one deliverable tracked in the task ledger.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunDeliverableSummaryRecord {
    pub id: String,
    /// Deliverable description.
    pub text: String,
    pub status: String,
    pub note: Option<String>,
}
199
/// Snapshot of the planner's task ledger at the end of a round.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunTaskLedgerSummaryRecord {
    pub root_task: String,
    pub rationale: String,
    pub deliverables: Vec<RunDeliverableSummaryRecord>,
    pub observations: Vec<String>,
    /// Number of deliverables currently considered blocking.
    pub blocking_count: usize,
}
209
/// Observability counters for one planner round (stage execution).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunPlannerRoundRecord {
    pub stage_id: String,
    pub node_id: String,
    pub stage_kind: String,
    pub status: String,
    pub outcome: String,
    /// Planner loop iterations in this round.
    pub iteration_count: usize,
    pub llm_call_count: usize,
    pub tool_execution_count: usize,
    /// Tool calls rejected (e.g. by policy or approval).
    pub tool_rejection_count: usize,
    /// Human/system interventions during the round.
    pub intervention_count: usize,
    /// Context compactions performed during the round.
    pub compaction_count: usize,
    /// Times the model fell back to text-emitted tool calls.
    pub native_text_tool_fallback_count: usize,
    pub native_text_tool_fallback_rejection_count: usize,
    /// Retries triggered by empty LLM completions.
    pub empty_completion_retry_count: usize,
    pub tools_used: Vec<String>,
    pub successful_tools: Vec<String>,
    /// Times a "done" claim was rejected against the ledger.
    pub ledger_done_rejections: usize,
    pub task_ledger: Option<RunTaskLedgerSummaryRecord>,
    pub research_facts: Vec<String>,
}
233
/// Lineage entry linking a spawned worker back to its parent stage/session.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunWorkerLineageRecord {
    pub worker_id: String,
    pub worker_name: String,
    /// Stage that spawned the worker, if known.
    pub parent_stage_id: Option<String>,
    pub task: String,
    pub status: String,
    pub session_id: Option<String>,
    pub parent_session_id: Option<String>,
    /// Child run spawned by the worker, if it produced one.
    pub run_id: Option<String>,
    pub run_path: Option<String>,
    pub snapshot_path: Option<String>,
}
248
/// Node in the persisted action graph; `kind` is one of the
/// `ACTION_GRAPH_NODE_KIND_*` constants.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunActionGraphNodeRecord {
    pub id: String,
    /// Display label for graph rendering.
    pub label: String,
    pub kind: String,
    pub status: String,
    pub outcome: String,
    pub trace_id: Option<String>,
    /// Back-references into the run record, populated per node kind.
    pub stage_id: Option<String>,
    pub node_id: Option<String>,
    pub worker_id: Option<String>,
    pub run_id: Option<String>,
    pub run_path: Option<String>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
265
/// Directed edge in the persisted action graph; `kind` is one of the
/// `ACTION_GRAPH_EDGE_KIND_*` constants.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunActionGraphEdgeRecord {
    pub from_id: String,
    pub to_id: String,
    pub kind: String,
    pub label: Option<String>,
}
274
/// Verification result recorded for a stage.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunVerificationOutcomeRecord {
    pub stage_id: String,
    pub node_id: String,
    pub status: String,
    /// `None` when verification produced no pass/fail verdict.
    pub passed: Option<bool>,
    pub summary: Option<String>,
}
284
/// Pointer to a transcript stored elsewhere (inline, on disk, etc.).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunTranscriptPointerRecord {
    pub id: String,
    pub label: String,
    pub kind: String,
    /// Storage location descriptor; exact vocabulary not defined here.
    pub location: String,
    pub path: Option<String>,
    /// Whether the transcript could actually be resolved.
    pub available: bool,
}
295
/// Record of a context-compaction event, with a pointer to the archived
/// pre-compaction snapshot.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct CompactionEventRecord {
    pub id: String,
    pub transcript_id: Option<String>,
    pub stage_id: Option<String>,
    pub node_id: Option<String>,
    pub mode: String,
    pub strategy: String,
    /// Messages moved out of the live context.
    pub archived_messages: usize,
    /// Token estimates around the compaction (estimates, not exact counts).
    pub estimated_tokens_before: usize,
    pub estimated_tokens_after: usize,
    pub snapshot_asset_id: Option<String>,
    pub snapshot_location: String,
    pub snapshot_path: Option<String>,
    /// Whether the snapshot could actually be resolved.
    pub available: bool,
}
313
/// Lifecycle event kinds for daemons; serialized in `snake_case`
/// (e.g. `"spawned"`). `Spawned` is the serde default.
#[derive(Clone, Copy, Debug, Default, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
#[serde(rename_all = "snake_case")]
pub enum DaemonEventKindRecord {
    #[default]
    Spawned,
    Triggered,
    Snapshotted,
    Resumed,
    Stopped,
}
324
/// One daemon lifecycle event captured into run observability.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct DaemonEventRecord {
    pub daemon_id: String,
    pub name: String,
    pub kind: DaemonEventKindRecord,
    pub timestamp: String,
    /// Where the daemon's state is persisted.
    pub persist_path: String,
    pub payload_summary: Option<String>,
}
335
/// Bundle of all observability data persisted with a run.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunObservabilityRecord {
    /// Version of this record's schema, for forward migration.
    pub schema_version: usize,
    pub planner_rounds: Vec<RunPlannerRoundRecord>,
    pub research_fact_count: usize,
    pub action_graph_nodes: Vec<RunActionGraphNodeRecord>,
    pub action_graph_edges: Vec<RunActionGraphEdgeRecord>,
    pub worker_lineage: Vec<RunWorkerLineageRecord>,
    pub verification_outcomes: Vec<RunVerificationOutcomeRecord>,
    pub transcript_pointers: Vec<RunTranscriptPointerRecord>,
    pub compaction_events: Vec<CompactionEventRecord>,
    pub daemon_events: Vec<DaemonEventRecord>,
}
350
/// Difference found for one stage when diffing two runs.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunStageDiffRecord {
    pub node_id: String,
    /// Change classification (vocabulary defined by the diff producer).
    pub change: String,
    pub details: Vec<String>,
}
358
/// Difference in a recorded tool call between two runs, matched by
/// tool name + args hash.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ToolCallDiffRecord {
    pub tool_name: String,
    /// Hash of the call arguments (see `tool_fixture_hash`).
    pub args_hash: String,
    pub result_changed: bool,
    pub left_result: Option<String>,
    pub right_result: Option<String>,
}
368
/// Difference within a named observability section between two runs.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunObservabilityDiffRecord {
    pub section: String,
    pub label: String,
    pub details: Vec<String>,
}
376
/// Full diff between two runs ("left" vs "right").
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunDiffReport {
    pub left_run_id: String,
    pub right_run_id: String,
    /// True when no differences were found at all.
    pub identical: bool,
    pub status_changed: bool,
    pub left_status: String,
    pub right_status: String,
    pub stage_diffs: Vec<RunStageDiffRecord>,
    pub tool_diffs: Vec<ToolCallDiffRecord>,
    pub observability_diffs: Vec<RunObservabilityDiffRecord>,
    /// Count deltas are right minus left (signed).
    pub transition_count_delta: isize,
    pub artifact_count_delta: isize,
    pub checkpoint_count_delta: isize,
}
393
/// Manifest describing a suite of replay eval cases.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalSuiteManifest {
    /// Type discriminator persisted as `_type` in the JSON form.
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub name: Option<String>,
    /// Directory that relative case paths are resolved against.
    pub base_dir: Option<String>,
    pub cases: Vec<EvalSuiteCase>,
}
404
/// One case in an [`EvalSuiteManifest`].
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalSuiteCase {
    pub label: Option<String>,
    /// Path to the run record to evaluate.
    pub run_path: String,
    /// Fixture to assert against; defaults elsewhere when omitted.
    pub fixture_path: Option<String>,
    /// Optional second run to diff the case run against.
    pub compare_to: Option<String>,
}
413
/// Top-level manifest of an eval pack: fixtures, rubrics, judge config,
/// cases, and persona eval ladders.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackManifest {
    /// Manifest format version.
    pub version: u32,
    pub id: String,
    pub name: Option<String>,
    pub description: Option<String>,
    /// Directory that relative paths in the pack are resolved against.
    pub base_dir: Option<String>,
    /// Baseline reference — presumably a run or pack id; not defined here.
    pub baseline: Option<String>,
    pub package: Option<EvalPackPackage>,
    /// Defaults applied to cases that do not override them.
    pub defaults: EvalPackDefaults,
    pub fixtures: Vec<EvalPackFixtureRef>,
    pub rubrics: Vec<EvalPackRubric>,
    /// Pack-wide judge config; rubrics/cases may override.
    pub judge: Option<EvalPackJudgeConfig>,
    pub cases: Vec<EvalPackCase>,
    pub ladders: Vec<PersonaEvalLadderManifest>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
432
/// Packaging info for distributing an eval pack.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalPackPackage {
    pub name: Option<String>,
    pub version: Option<String>,
    pub source: Option<String>,
    pub templates: Vec<String>,
}
441
/// Pack-level defaults inherited by cases and rubrics.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackDefaults {
    /// Default failure severity for cases that do not set one.
    pub severity: Option<String>,
    /// Root directory for resolving fixture paths.
    pub fixture_root: Option<String>,
    pub thresholds: EvalPackThresholds,
    pub judge: Option<EvalPackJudgeConfig>,
}
450
/// Reference to a fixture used by an eval pack; the payload may live at a
/// path or be embedded inline. Kebab-case aliases accept hand-written
/// manifests (`trace-id`, `event-kind`).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackFixtureRef {
    pub id: String,
    pub kind: String,
    pub path: Option<String>,
    #[serde(default, alias = "trace-id")]
    pub trace_id: Option<String>,
    pub provider: Option<String>,
    #[serde(default, alias = "event-kind")]
    pub event_kind: Option<String>,
    /// Inline fixture payload, used instead of `path` when present.
    pub inline: Option<serde_json::Value>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
465
/// Scoring rubric: either mechanical assertions, an LLM judge, or both.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackRubric {
    pub id: String,
    pub kind: String,
    pub description: Option<String>,
    /// Prompt for LLM-judged rubrics.
    pub prompt: Option<String>,
    pub assertions: Vec<EvalPackAssertion>,
    /// Rubric-level judge config, overriding pack defaults.
    pub judge: Option<EvalPackJudgeConfig>,
    /// Golden examples used to calibrate the judge.
    pub calibration: Vec<EvalPackGoldenExample>,
    pub thresholds: EvalPackThresholds,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
479
/// One mechanical assertion within a rubric.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackAssertion {
    pub kind: String,
    /// Stage the assertion targets, when stage-scoped.
    pub stage: Option<String>,
    /// Path into the evaluated payload — presumably JSON-pointer-like;
    /// syntax defined by the evaluator, not here.
    pub path: Option<String>,
    /// Comparison operator name, when the kind requires one.
    pub op: Option<String>,
    pub expected: Option<serde_json::Value>,
    /// Substring check, for contains-style assertions.
    pub contains: Option<String>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
491
/// Configuration for an LLM judge. Kebab-case aliases accept hand-written
/// manifests (`prompt-version`, `tie-break`, `confidence-min`).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackJudgeConfig {
    pub model: Option<String>,
    #[serde(default, alias = "prompt-version")]
    pub prompt_version: Option<String>,
    /// How ties are resolved; vocabulary defined by the judge runner.
    #[serde(default, alias = "tie-break")]
    pub tie_break: Option<String>,
    /// Minimum confidence for a verdict to count.
    #[serde(default, alias = "confidence-min")]
    pub confidence_min: Option<f64>,
    pub temperature: Option<f64>,
    /// Rubric id this judge config is bound to, if any.
    pub rubric: Option<String>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
506
/// Calibration example for an LLM judge: input, expected output, and an
/// optional reference score with explanation.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackGoldenExample {
    pub input: serde_json::Value,
    pub output: serde_json::Value,
    pub score: Option<f64>,
    pub explanation: Option<String>,
}
515
/// Pass/fail thresholds; `None` means "no limit". Kebab-case aliases
/// accept hand-written manifests.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackThresholds {
    pub severity: Option<String>,
    #[serde(default, alias = "min-score")]
    pub min_score: Option<f64>,
    #[serde(default, alias = "min-confidence")]
    pub min_confidence: Option<f64>,
    #[serde(default, alias = "max-cost-usd")]
    pub max_cost_usd: Option<f64>,
    #[serde(default, alias = "max-latency-ms")]
    pub max_latency_ms: Option<i64>,
    #[serde(default, alias = "max-tokens")]
    pub max_tokens: Option<i64>,
    #[serde(default, alias = "max-stage-count")]
    pub max_stage_count: Option<usize>,
}
533
/// One case in an eval pack. Run/fixture sources may be given either as
/// ids (`run`, `fixture`) or as paths (`run_path`, `fixture_path`);
/// kebab-case aliases accept hand-written manifests, and
/// `friction_events` additionally accepts its snake_case alias so both
/// spellings deserialize.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackCase {
    pub id: Option<String>,
    pub name: Option<String>,
    pub description: Option<String>,
    pub run: Option<String>,
    #[serde(default, alias = "run-path")]
    pub run_path: Option<String>,
    #[serde(default, alias = "friction-events", alias = "friction_events")]
    pub friction_events: Option<String>,
    pub fixture: Option<String>,
    #[serde(default, alias = "fixture-path")]
    pub fixture_path: Option<String>,
    /// Optional second run to diff against.
    #[serde(default, alias = "compare-to")]
    pub compare_to: Option<String>,
    /// Rubric ids applied to this case.
    pub rubrics: Vec<String>,
    /// Failure severity; falls back to pack defaults when `None`.
    pub severity: Option<String>,
    pub thresholds: EvalPackThresholds,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
555
/// Aggregate report for an eval pack run, with failures broken down by
/// severity class.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackReport {
    pub pack_id: String,
    pub pass: bool,
    pub total: usize,
    pub passed: usize,
    pub failed: usize,
    pub blocking_failed: usize,
    pub warning_failed: usize,
    pub informational_failed: usize,
    pub cases: Vec<EvalPackCaseReport>,
    pub ladders: Vec<PersonaEvalLadderReport>,
}
570
/// Per-case result within an [`EvalPackReport`].
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalPackCaseReport {
    pub id: String,
    pub label: String,
    pub severity: String,
    pub pass: bool,
    /// Whether a failure of this case fails the whole pack.
    pub blocking: bool,
    pub run_id: String,
    pub workflow_id: String,
    pub source_path: Option<String>,
    pub stage_count: usize,
    /// Findings bucketed by severity class.
    pub failures: Vec<String>,
    pub warnings: Vec<String>,
    pub informational: Vec<String>,
    pub comparison: Option<RunDiffReport>,
}
588
/// Human-in-the-loop question asked during a run.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunHitlQuestionRecord {
    /// Id correlating the question with its eventual answer.
    pub request_id: String,
    pub prompt: String,
    /// Agent that asked the question.
    pub agent: String,
    pub trace_id: Option<String>,
    pub asked_at: String,
}
598
/// Serializable snapshot of a persona's runtime state, captured from a
/// live [`PersonaStatus`] via the `From` impl below.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunPersonaRuntimeRecord {
    pub name: String,
    pub role: String,
    pub template_ref: Option<String>,
    /// Stringified persona state (`PersonaStatus::state.as_str()`).
    pub state: String,
    pub entry_workflow: String,
    pub current_assignment: Option<PersonaAssignmentStatus>,
    pub queued_work: Vec<PersonaQueuedWork>,
    pub handoff_inbox: Vec<PersonaHandoffInboxItem>,
    pub budget: PersonaBudgetStatus,
    pub value_receipts: Vec<PersonaValueReceipt>,
    pub last_run: Option<String>,
    pub last_error: Option<String>,
}
615
616impl From<&PersonaStatus> for RunPersonaRuntimeRecord {
617    fn from(status: &PersonaStatus) -> Self {
618        Self {
619            name: status.name.clone(),
620            role: status.role.clone(),
621            template_ref: status.template_ref.clone(),
622            state: status.state.as_str().to_string(),
623            entry_workflow: status.entry_workflow.clone(),
624            current_assignment: status.current_assignment.clone(),
625            queued_work: status.queued_work.clone(),
626            handoff_inbox: status.handoff_inbox.clone(),
627            budget: status.budget.clone(),
628            value_receipts: status.value_receipts.clone(),
629            last_run: status.last_run.clone(),
630            last_error: status.last_error.clone(),
631        }
632    }
633}
634
/// The top-level persisted record of a workflow run.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunRecord {
    /// Type discriminator persisted as `_type` in the JSON form.
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub workflow_id: String,
    pub workflow_name: Option<String>,
    /// Task text the run was started with.
    pub task: String,
    pub status: String,
    pub started_at: String,
    /// `None` while the run is still in progress.
    pub finished_at: Option<String>,
    /// Parent/root links for nested (worker-spawned) runs.
    pub parent_run_id: Option<String>,
    pub root_run_id: Option<String>,
    pub stages: Vec<RunStageRecord>,
    pub transitions: Vec<RunTransitionRecord>,
    pub checkpoints: Vec<RunCheckpointRecord>,
    pub pending_nodes: Vec<String>,
    pub completed_nodes: Vec<String>,
    /// Child runs spawned by workers; also derived from stage metadata by
    /// `materialize_child_runs_from_stage_metadata`.
    pub child_runs: Vec<RunChildRecord>,
    pub artifacts: Vec<ArtifactRecord>,
    pub handoffs: Vec<HandoffArtifact>,
    /// Capability policy the run executed under.
    pub policy: CapabilityPolicy,
    pub execution: Option<RunExecutionRecord>,
    pub transcript: Option<serde_json::Value>,
    pub usage: Option<LlmUsageRecord>,
    /// Fixture captured from this run for later replay, if any.
    pub replay_fixture: Option<ReplayFixture>,
    pub observability: Option<RunObservabilityRecord>,
    pub trace_spans: Vec<RunTraceSpanRecord>,
    /// Recorded tool calls, usable as replay fixtures.
    pub tool_recordings: Vec<ToolCallRecord>,
    pub hitl_questions: Vec<RunHitlQuestionRecord>,
    pub persona_runtime: Vec<RunPersonaRuntimeRecord>,
    pub metadata: BTreeMap<String, serde_json::Value>,
    /// Where this record was written, when it has been persisted.
    pub persisted_path: Option<String>,
}
670
/// One recorded tool invocation, keyed for replay by name + args hash.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ToolCallRecord {
    pub tool_name: String,
    pub tool_use_id: String,
    /// Hash produced by `tool_fixture_hash`.
    pub args_hash: String,
    /// Stringified tool result.
    pub result: String,
    /// Whether the call was rejected rather than executed.
    pub is_rejected: bool,
    pub duration_ms: u64,
    /// Planner iteration the call happened in.
    pub iteration: usize,
    pub timestamp: String,
}
683
684/// Hash a tool invocation for fixture lookup (name + canonical args JSON).
685pub fn tool_fixture_hash(tool_name: &str, args: &serde_json::Value) -> String {
686    use std::hash::{Hash, Hasher};
687    let mut hasher = std::collections::hash_map::DefaultHasher::new();
688    tool_name.hash(&mut hasher);
689    let args_str = serde_json::to_string(args).unwrap_or_default();
690    args_str.hash(&mut hasher);
691    format!("{:016x}", hasher.finish())
692}
693
/// Trace span captured during a run, forming a tree via `parent_id`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunTraceSpanRecord {
    pub span_id: u64,
    /// `None` for root spans.
    pub parent_id: Option<u64>,
    pub kind: String,
    pub name: String,
    /// Start offset in milliseconds — presumably relative to run start;
    /// origin not stated here (TODO confirm).
    pub start_ms: u64,
    pub duration_ms: u64,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
705
/// Record of a child run spawned by a worker, built either directly or
/// from the `"worker"` entry in stage metadata
/// (see `run_child_record_from_worker_metadata`).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunChildRecord {
    pub worker_id: String,
    pub worker_name: String,
    /// Stage that spawned the worker, if known.
    pub parent_stage_id: Option<String>,
    pub session_id: Option<String>,
    pub parent_session_id: Option<String>,
    /// Mutation scope granted to the worker, from its audit metadata.
    pub mutation_scope: Option<String>,
    pub approval_policy: Option<super::ToolApprovalPolicy>,
    pub task: String,
    /// Raw request/provenance payloads, preserved as free-form JSON.
    pub request: Option<serde_json::Value>,
    pub provenance: Option<serde_json::Value>,
    pub status: String,
    pub started_at: String,
    pub finished_at: Option<String>,
    /// The child's own run record, when it produced one.
    pub run_id: Option<String>,
    pub run_path: Option<String>,
    pub snapshot_path: Option<String>,
    pub execution: Option<RunExecutionRecord>,
}
727
728pub(crate) fn run_child_record_from_worker_metadata(
729    parent_stage_id: Option<String>,
730    worker: &serde_json::Value,
731) -> Option<RunChildRecord> {
732    let worker_id = worker.get("id").and_then(|value| value.as_str())?;
733    if worker_id.is_empty() {
734        return None;
735    }
736    Some(RunChildRecord {
737        worker_id: worker_id.to_string(),
738        worker_name: worker
739            .get("name")
740            .and_then(|value| value.as_str())
741            .unwrap_or("worker")
742            .to_string(),
743        parent_stage_id,
744        session_id: worker
745            .get("audit")
746            .and_then(|value| value.get("session_id"))
747            .and_then(|value| value.as_str())
748            .map(str::to_string),
749        parent_session_id: worker
750            .get("audit")
751            .and_then(|value| value.get("parent_session_id"))
752            .and_then(|value| value.as_str())
753            .map(str::to_string),
754        mutation_scope: worker
755            .get("audit")
756            .and_then(|value| value.get("mutation_scope"))
757            .and_then(|value| value.as_str())
758            .map(str::to_string),
759        approval_policy: worker
760            .get("audit")
761            .and_then(|value| value.get("approval_policy"))
762            .and_then(|value| {
763                serde_json::from_value::<super::ToolApprovalPolicy>(value.clone()).ok()
764            }),
765        task: worker
766            .get("task")
767            .and_then(|value| value.as_str())
768            .unwrap_or_default()
769            .to_string(),
770        request: worker.get("request").cloned(),
771        provenance: worker.get("provenance").cloned(),
772        status: worker
773            .get("status")
774            .and_then(|value| value.as_str())
775            .unwrap_or("completed")
776            .to_string(),
777        started_at: worker
778            .get("started_at")
779            .and_then(|value| value.as_str())
780            .unwrap_or_default()
781            .to_string(),
782        finished_at: worker
783            .get("finished_at")
784            .and_then(|value| value.as_str())
785            .map(str::to_string),
786        run_id: worker
787            .get("child_run_id")
788            .and_then(|value| value.as_str())
789            .map(str::to_string),
790        run_path: worker
791            .get("child_run_path")
792            .and_then(|value| value.as_str())
793            .map(str::to_string),
794        snapshot_path: worker
795            .get("snapshot_path")
796            .and_then(|value| value.as_str())
797            .map(str::to_string),
798        execution: worker
799            .get("execution")
800            .cloned()
801            .and_then(|value| serde_json::from_value(value).ok()),
802    })
803}
804
805fn run_child_from_stage_metadata(stage: &RunStageRecord) -> Option<RunChildRecord> {
806    let parent_stage_id = if stage.id.is_empty() {
807        None
808    } else {
809        Some(stage.id.clone())
810    };
811    run_child_record_from_worker_metadata(parent_stage_id, stage.metadata.get("worker")?)
812}
813
814fn fill_missing_child_run_fields(existing: &mut RunChildRecord, child: RunChildRecord) {
815    if existing.worker_name.is_empty() {
816        existing.worker_name = child.worker_name;
817    }
818    if existing.parent_stage_id.is_none() {
819        existing.parent_stage_id = child.parent_stage_id;
820    }
821    if existing.session_id.is_none() {
822        existing.session_id = child.session_id;
823    }
824    if existing.parent_session_id.is_none() {
825        existing.parent_session_id = child.parent_session_id;
826    }
827    if existing.mutation_scope.is_none() {
828        existing.mutation_scope = child.mutation_scope;
829    }
830    if existing.approval_policy.is_none() {
831        existing.approval_policy = child.approval_policy;
832    }
833    if existing.task.is_empty() {
834        existing.task = child.task;
835    }
836    if existing.request.is_none() {
837        existing.request = child.request;
838    }
839    if existing.provenance.is_none() {
840        existing.provenance = child.provenance;
841    }
842    if existing.status.is_empty() {
843        existing.status = child.status;
844    }
845    if existing.started_at.is_empty() {
846        existing.started_at = child.started_at;
847    }
848    if existing.finished_at.is_none() {
849        existing.finished_at = child.finished_at;
850    }
851    if existing.run_id.is_none() {
852        existing.run_id = child.run_id;
853    }
854    if existing.run_path.is_none() {
855        existing.run_path = child.run_path;
856    }
857    if existing.snapshot_path.is_none() {
858        existing.snapshot_path = child.snapshot_path;
859    }
860    if existing.execution.is_none() {
861        existing.execution = child.execution;
862    }
863}
864
865fn materialize_child_runs_from_stage_metadata(run: &mut RunRecord) {
866    for child in run
867        .stages
868        .iter()
869        .filter_map(run_child_from_stage_metadata)
870        .collect::<Vec<_>>()
871    {
872        match run
873            .child_runs
874            .iter_mut()
875            .find(|existing| existing.worker_id == child.worker_id)
876        {
877            Some(existing) => fill_missing_child_run_fields(existing, child),
878            None => run.child_runs.push(child),
879        }
880    }
881}
882
/// Execution environment a run (or worker) ran in.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunExecutionRecord {
    pub cwd: Option<String>,
    pub source_dir: Option<String>,
    /// Environment variables set for the execution.
    pub env: BTreeMap<String, String>,
    /// Execution adapter name, when one was used.
    pub adapter: Option<String>,
    /// Git workspace details, when the run executed against a repo.
    pub repo_path: Option<String>,
    pub worktree_path: Option<String>,
    pub branch: Option<String>,
    pub base_ref: Option<String>,
    /// Cleanup policy for the workspace; vocabulary not defined here.
    pub cleanup: Option<String>,
}
896
897fn compact_json_value(value: &serde_json::Value) -> String {
898    serde_json::to_string(value).unwrap_or_else(|_| value.to_string())
899}
900
/// Canonicalize question text for lenient comparison: ASCII alphanumerics
/// are lowercased, every other character becomes a word separator, and
/// runs of whitespace collapse to single spaces.
fn normalize_question_text(text: &str) -> String {
    let mut scrubbed = String::with_capacity(text.len());
    for ch in text.chars() {
        if ch.is_ascii_alphanumeric() || ch.is_whitespace() {
            // `to_ascii_lowercase` is a no-op on whitespace.
            scrubbed.push(ch.to_ascii_lowercase());
        } else {
            scrubbed.push(' ');
        }
    }
    scrubbed.split_whitespace().collect::<Vec<_>>().join(" ")
}
915
916fn clarifying_min_questions(spec: &ClarifyingQuestionEvalSpec) -> usize {
917    spec.min_questions.max(1)
918}
919
920fn clarifying_max_questions(spec: &ClarifyingQuestionEvalSpec) -> usize {
921    spec.max_questions.unwrap_or(1).max(1)
922}
923
924fn read_topic_records(
925    log: &AnyEventLog,
926    topic: &Topic,
927) -> Vec<(crate::event_log::EventId, EventLogRecord)> {
928    let mut from = None;
929    let mut records = Vec::new();
930    loop {
931        let batch =
932            futures::executor::block_on(log.read_range(topic, from, 256)).unwrap_or_default();
933        if batch.is_empty() {
934            break;
935        }
936        from = batch.last().map(|(event_id, _)| *event_id);
937        records.extend(batch);
938    }
939    records
940}
941
/// One agent event recovered from the replay topic, paired with the
/// event-log id it was read at.
#[derive(Clone, Debug)]
pub struct AgentSessionReplayEvent {
    // Position of the record in the event log.
    pub event_id: EventId,
    // The decoded agent event payload.
    pub event: AgentEvent,
}
947
/// Loads every decodable agent event for `session_id` from the per-session
/// replay topic of the active event log.
///
/// Returns an empty list when no event log is active. Records are matched
/// twice: first by the `session_id` header, then by the decoded event's own
/// `session_id()`.
///
/// # Errors
/// Fails when the topic name cannot be built, a page read fails, or a
/// header-matched record does not deserialize as an [`AgentEvent`].
pub async fn load_agent_session_replay_events(
    session_id: &str,
) -> Result<Vec<AgentSessionReplayEvent>, VmError> {
    let Some(log) = active_event_log() else {
        return Ok(Vec::new());
    };
    let topic = Topic::new(format!(
        "observability.agent_events.{}",
        sanitize_topic_component(session_id)
    ))
    .map_err(|error| VmError::Runtime(format!("failed to build agent event topic: {error}")))?;

    let mut events = Vec::new();
    let mut from = None;
    loop {
        let batch = log.read_range(&topic, from, 1024).await.map_err(|error| {
            VmError::Runtime(format!(
                "failed to read agent event replay topic {}: {error}",
                topic.as_str()
            ))
        })?;
        let batch_len = batch.len();
        for (event_id, record) in batch {
            // Advance the cursor on every record — including ones filtered
            // out below — so the next page starts after this whole batch.
            from = Some(event_id);
            if record.headers.get("session_id").map(String::as_str) != Some(session_id) {
                continue;
            }
            let Some(event_value) = record.payload.get("event").cloned() else {
                continue;
            };
            let event = serde_json::from_value::<AgentEvent>(event_value).map_err(|error| {
                VmError::Runtime(format!(
                    "failed to decode agent event replay record {event_id}: {error}"
                ))
            })?;
            if event.session_id() == session_id {
                events.push(AgentSessionReplayEvent { event_id, event });
            }
        }
        // A short page means the topic is exhausted.
        if batch_len < 1024 {
            break;
        }
    }
    Ok(events)
}
993
/// Merges HITL questions recorded on the `hitl.questions` topic of the
/// active event log into `run.hitl_questions`, deduplicating by request id.
///
/// Questions already on the run seed the merge map; log entries with the
/// same `request_id` overwrite them. Only `hitl.question_asked` events
/// whose run id (header or payload) matches this run contribute, and
/// entries missing a request id or prompt are skipped. The merged list is
/// sorted by `(asked_at, request_id)`. No-op when no event log is active.
fn merge_hitl_questions_from_active_log(run: &mut RunRecord) {
    let Some(log) = active_event_log() else {
        return;
    };
    let topic = Topic::new(crate::HITL_QUESTIONS_TOPIC)
        .expect("static hitl.questions topic should always be valid");
    // Seed with the run's current questions, keyed by request id so log
    // entries can replace them.
    let mut merged = run
        .hitl_questions
        .iter()
        .cloned()
        .map(|question| (question.request_id.clone(), question))
        .collect::<BTreeMap<_, _>>();

    for (_, event) in read_topic_records(log.as_ref(), &topic) {
        if event.kind != "hitl.question_asked" {
            continue;
        }
        let payload = &event.payload;
        // The run id may arrive as a header or as a payload field.
        let matches_run = event
            .headers
            .get("run_id")
            .is_some_and(|value| value == &run.id)
            || payload
                .get("run_id")
                .and_then(|value| value.as_str())
                .is_some_and(|value| value == run.id);
        if !matches_run {
            continue;
        }
        // Request id: payload first, header as fallback.
        let request_id = payload
            .get("request_id")
            .and_then(|value| value.as_str())
            .or_else(|| event.headers.get("request_id").map(String::as_str))
            .unwrap_or_default();
        // The prompt lives in the nested `payload.prompt` field.
        let prompt = payload
            .get("payload")
            .and_then(|value| value.get("prompt"))
            .and_then(|value| value.as_str())
            .unwrap_or_default();
        if request_id.is_empty() || prompt.is_empty() {
            continue;
        }
        merged.insert(
            request_id.to_string(),
            RunHitlQuestionRecord {
                request_id: request_id.to_string(),
                prompt: prompt.to_string(),
                agent: payload
                    .get("agent")
                    .and_then(|value| value.as_str())
                    .unwrap_or_default()
                    .to_string(),
                trace_id: payload
                    .get("trace_id")
                    .and_then(|value| value.as_str())
                    .map(str::to_string),
                asked_at: payload
                    .get("requested_at")
                    .and_then(|value| value.as_str())
                    .unwrap_or_default()
                    .to_string(),
            },
        );
    }

    // Deterministic ordering: time asked, then request id as tie-breaker.
    run.hitl_questions = merged.into_values().collect();
    run.hitl_questions.sort_by(|left, right| {
        (left.asked_at.as_str(), left.request_id.as_str())
            .cmp(&(right.asked_at.as_str(), right.request_id.as_str()))
    });
}
1065
1066fn signature_status_label(status: &SignatureStatus) -> &'static str {
1067    match status {
1068        SignatureStatus::Verified => "verified",
1069        SignatureStatus::Unsigned => "unsigned",
1070        SignatureStatus::Failed { .. } => "failed",
1071    }
1072}
1073
1074fn trigger_event_from_run(run: &RunRecord) -> Option<TriggerEvent> {
1075    run.metadata
1076        .get("trigger_event")
1077        .cloned()
1078        .and_then(|value| serde_json::from_value(value).ok())
1079}
1080
1081fn run_trace_id(run: &RunRecord, trigger_event: Option<&TriggerEvent>) -> Option<String> {
1082    trigger_event
1083        .map(|event| event.trace_id.0.clone())
1084        .or_else(|| {
1085            run.metadata
1086                .get("trace_id")
1087                .and_then(|value| value.as_str())
1088                .map(str::to_string)
1089        })
1090}
1091
1092fn replay_of_event_id_from_run(run: &RunRecord) -> Option<String> {
1093    run.metadata
1094        .get("replay_of_event_id")
1095        .and_then(|value| value.as_str())
1096        .map(str::to_string)
1097}
1098
1099fn action_graph_kind_for_stage(stage: &RunStageRecord) -> &'static str {
1100    if stage.kind == "condition" {
1101        ACTION_GRAPH_NODE_KIND_PREDICATE
1102    } else {
1103        ACTION_GRAPH_NODE_KIND_STAGE
1104    }
1105}
1106
1107fn trigger_node_metadata(trigger_event: &TriggerEvent) -> BTreeMap<String, serde_json::Value> {
1108    let mut metadata = BTreeMap::new();
1109    metadata.insert(
1110        "provider".to_string(),
1111        serde_json::json!(trigger_event.provider.as_str()),
1112    );
1113    metadata.insert(
1114        "event_kind".to_string(),
1115        serde_json::json!(trigger_event.kind),
1116    );
1117    metadata.insert(
1118        "dedupe_key".to_string(),
1119        serde_json::json!(trigger_event.dedupe_key),
1120    );
1121    metadata.insert(
1122        "signature_status".to_string(),
1123        serde_json::json!(signature_status_label(&trigger_event.signature_status)),
1124    );
1125    metadata
1126}
1127
1128fn stage_node_metadata(stage: &RunStageRecord) -> BTreeMap<String, serde_json::Value> {
1129    let mut metadata = BTreeMap::new();
1130    metadata.insert("stage_kind".to_string(), serde_json::json!(stage.kind));
1131    if let Some(branch) = stage.branch.as_ref() {
1132        metadata.insert("branch".to_string(), serde_json::json!(branch));
1133    }
1134    if let Some(worker_id) = stage
1135        .metadata
1136        .get("worker_id")
1137        .and_then(|value| value.as_str())
1138    {
1139        metadata.insert("worker_id".to_string(), serde_json::json!(worker_id));
1140    }
1141    metadata
1142}
1143
/// Records a node in the action graph being assembled. Currently a plain
/// push; kept as a helper so node insertion has a single seam.
fn append_action_graph_node(
    nodes: &mut Vec<RunActionGraphNodeRecord>,
    record: RunActionGraphNodeRecord,
) {
    nodes.push(record);
}
1150
1151pub async fn append_action_graph_update(
1152    headers: BTreeMap<String, String>,
1153    payload: serde_json::Value,
1154) -> Result<(), crate::event_log::LogError> {
1155    let Some(log) = active_event_log() else {
1156        return Ok(());
1157    };
1158    let topic = Topic::new("observability.action_graph")
1159        .expect("static observability.action_graph topic should always be valid");
1160    let record = EventLogRecord::new("action_graph_update", payload).with_headers(headers);
1161    log.append(&topic, record).await.map(|_| ())
1162}
1163
/// Publishes an `action_graph_update` event describing a freshly persisted
/// run observability record.
///
/// Headers carry `run_id`, `workflow_id`, and the trace id when one can be
/// resolved. Inside a Tokio runtime the append is spawned so this sync
/// function does not block the runtime thread; otherwise it is driven to
/// completion with `block_on`. Append failures are ignored (best effort).
fn publish_action_graph_event(
    run: &RunRecord,
    observability: &RunObservabilityRecord,
    path: &Path,
) {
    let trigger_event = trigger_event_from_run(run);
    let mut headers = BTreeMap::new();
    headers.insert("run_id".to_string(), run.id.clone());
    headers.insert("workflow_id".to_string(), run.workflow_id.clone());
    if let Some(trace_id) = run_trace_id(run, trigger_event.as_ref()) {
        headers.insert("trace_id".to_string(), trace_id);
    }
    let payload = serde_json::json!({
        "run_id": run.id,
        "workflow_id": run.workflow_id,
        "persisted_path": path.to_string_lossy(),
        "status": run.status,
        "observability": observability,
    });
    // Fire-and-forget on an existing runtime; synchronous fallback when
    // called from non-async contexts.
    if let Ok(handle) = tokio::runtime::Handle::try_current() {
        handle.spawn(async move {
            let _ = append_action_graph_update(headers, payload).await;
        });
    } else {
        let _ = futures::executor::block_on(append_action_graph_update(headers, payload));
    }
}
1191
/// Derives the path of the JSONL LLM-transcript sidecar that accompanies a
/// persisted run file: `<run-stem>-llm/llm_transcript.jsonl` next to it.
///
/// Returns `None` when the run path has no UTF-8 file stem.
fn llm_transcript_sidecar_path(run_path: &Path) -> Option<PathBuf> {
    let stem = run_path.file_stem().and_then(|stem| stem.to_str())?;
    let parent = match run_path.parent() {
        Some(parent) => parent,
        None => Path::new("."),
    };
    Some(parent.join(format!("{stem}-llm/llm_transcript.jsonl")))
}
1197
1198fn json_string_array(value: Option<&serde_json::Value>) -> Vec<String> {
1199    value
1200        .and_then(|value| value.as_array())
1201        .map(|items| {
1202            items
1203                .iter()
1204                .filter_map(|item| item.as_str().map(str::to_string))
1205                .collect::<Vec<_>>()
1206        })
1207        .unwrap_or_default()
1208}
1209
1210fn json_usize(value: Option<&serde_json::Value>) -> usize {
1211    value.and_then(|value| value.as_u64()).unwrap_or_default() as usize
1212}
1213
1214fn json_bool(value: Option<&serde_json::Value>) -> Option<bool> {
1215    value.and_then(|value| value.as_bool())
1216}
1217
1218fn stage_result_payload(stage: &RunStageRecord) -> Option<&serde_json::Value> {
1219    stage
1220        .artifacts
1221        .iter()
1222        .find_map(|artifact| artifact.data.as_ref())
1223}
1224
1225fn task_ledger_summary_from_value(value: &serde_json::Value) -> Option<RunTaskLedgerSummaryRecord> {
1226    let deliverables = value
1227        .get("deliverables")
1228        .and_then(|raw| raw.as_array())
1229        .map(|items| {
1230            items
1231                .iter()
1232                .map(|item| RunDeliverableSummaryRecord {
1233                    id: item
1234                        .get("id")
1235                        .and_then(|value| value.as_str())
1236                        .unwrap_or_default()
1237                        .to_string(),
1238                    text: item
1239                        .get("text")
1240                        .and_then(|value| value.as_str())
1241                        .unwrap_or_default()
1242                        .to_string(),
1243                    status: item
1244                        .get("status")
1245                        .and_then(|value| value.as_str())
1246                        .unwrap_or_default()
1247                        .to_string(),
1248                    note: item
1249                        .get("note")
1250                        .and_then(|value| value.as_str())
1251                        .map(str::to_string),
1252                })
1253                .collect::<Vec<_>>()
1254        })
1255        .unwrap_or_default();
1256    let observations = json_string_array(value.get("observations"));
1257    let root_task = value
1258        .get("root_task")
1259        .and_then(|value| value.as_str())
1260        .unwrap_or_default()
1261        .to_string();
1262    let rationale = value
1263        .get("rationale")
1264        .and_then(|value| value.as_str())
1265        .unwrap_or_default()
1266        .to_string();
1267    if root_task.is_empty()
1268        && rationale.is_empty()
1269        && deliverables.is_empty()
1270        && observations.is_empty()
1271    {
1272        return None;
1273    }
1274    let blocking_count = deliverables
1275        .iter()
1276        .filter(|deliverable| matches!(deliverable.status.as_str(), "open" | "blocked"))
1277        .count();
1278    Some(RunTaskLedgerSummaryRecord {
1279        root_task,
1280        rationale,
1281        deliverables,
1282        observations,
1283        blocking_count,
1284    })
1285}
1286
/// Extracts the `kind == "compaction"` events from an embedded transcript
/// JSON value into `CompactionEventRecord`s.
///
/// `location_prefix` names where the transcript lives inside the run record
/// (e.g. `run.stages[node].transcript`) and is used to build each record's
/// `snapshot_location`; `persisted_path` (stringified) becomes
/// `snapshot_path`. An event is `available` only when its
/// `snapshot_asset_id` matches an asset actually present in the
/// transcript's `assets` list. Missing numeric/string fields default to
/// zero / empty via `json_usize` and `unwrap_or_default`.
fn compaction_events_from_transcript(
    transcript: &serde_json::Value,
    stage_id: Option<&str>,
    node_id: Option<&str>,
    location_prefix: &str,
    persisted_path: Option<&Path>,
) -> Vec<CompactionEventRecord> {
    let transcript_id = transcript
        .get("id")
        .and_then(|value| value.as_str())
        .map(str::to_string);
    // Ids of assets embedded in this transcript, used below to decide
    // whether a compaction snapshot is actually retrievable.
    let asset_ids = transcript
        .get("assets")
        .and_then(|value| value.as_array())
        .map(|assets| {
            assets
                .iter()
                .filter_map(|asset| {
                    asset
                        .get("id")
                        .and_then(|value| value.as_str())
                        .map(str::to_string)
                })
                .collect::<BTreeSet<_>>()
        })
        .unwrap_or_default();
    transcript
        .get("events")
        .and_then(|value| value.as_array())
        .map(|events| {
            events
                .iter()
                .filter(|event| {
                    event.get("kind").and_then(|value| value.as_str()) == Some("compaction")
                })
                .map(|event| {
                    let metadata = event.get("metadata");
                    let snapshot_asset_id = metadata
                        .and_then(|value| value.get("snapshot_asset_id"))
                        .and_then(|value| value.as_str())
                        .map(str::to_string);
                    let available = snapshot_asset_id
                        .as_ref()
                        .is_some_and(|asset_id| asset_ids.contains(asset_id));
                    // Point at the specific asset when one is referenced;
                    // otherwise fall back to the transcript location itself.
                    let snapshot_location = snapshot_asset_id
                        .as_ref()
                        .map(|asset_id| format!("{location_prefix}.assets[{asset_id}]"))
                        .unwrap_or_else(|| location_prefix.to_string());
                    CompactionEventRecord {
                        id: event
                            .get("id")
                            .and_then(|value| value.as_str())
                            .unwrap_or_default()
                            .to_string(),
                        transcript_id: transcript_id.clone(),
                        stage_id: stage_id.map(str::to_string),
                        node_id: node_id.map(str::to_string),
                        mode: metadata
                            .and_then(|value| value.get("mode"))
                            .and_then(|value| value.as_str())
                            .unwrap_or_default()
                            .to_string(),
                        strategy: metadata
                            .and_then(|value| value.get("strategy"))
                            .and_then(|value| value.as_str())
                            .unwrap_or_default()
                            .to_string(),
                        archived_messages: json_usize(
                            metadata.and_then(|value| value.get("archived_messages")),
                        ),
                        estimated_tokens_before: json_usize(
                            metadata.and_then(|value| value.get("estimated_tokens_before")),
                        ),
                        estimated_tokens_after: json_usize(
                            metadata.and_then(|value| value.get("estimated_tokens_after")),
                        ),
                        snapshot_asset_id,
                        snapshot_location,
                        snapshot_path: persisted_path
                            .map(|path| path.to_string_lossy().into_owned()),
                        available,
                    }
                })
                .collect()
        })
        .unwrap_or_default()
}
1374
1375fn daemon_events_from_sidecar(run_path: &Path) -> Vec<DaemonEventRecord> {
1376    let Some(sidecar_path) = llm_transcript_sidecar_path(run_path) else {
1377        return Vec::new();
1378    };
1379    let Ok(content) = std::fs::read_to_string(sidecar_path) else {
1380        return Vec::new();
1381    };
1382
1383    content
1384        .lines()
1385        .filter(|line| !line.trim().is_empty())
1386        .filter_map(|line| serde_json::from_str::<serde_json::Value>(line).ok())
1387        .filter(|event| event.get("type").and_then(|value| value.as_str()) == Some("daemon_event"))
1388        .filter_map(|event| serde_json::from_value::<DaemonEventRecord>(event).ok())
1389        .collect()
1390}
1391
1392pub fn derive_run_observability(
1393    run: &RunRecord,
1394    persisted_path: Option<&Path>,
1395) -> RunObservabilityRecord {
1396    let mut action_graph_nodes = Vec::new();
1397    let mut action_graph_edges = Vec::new();
1398    let mut verification_outcomes = Vec::new();
1399    let mut planner_rounds = Vec::new();
1400    let mut transcript_pointers = Vec::new();
1401    let mut compaction_events = Vec::new();
1402    let mut daemon_events = Vec::new();
1403    let mut research_fact_count = 0usize;
1404
1405    let root_node_id = format!("run:{}", run.id);
1406    let trigger_event = trigger_event_from_run(run);
1407    let propagated_trace_id = run_trace_id(run, trigger_event.as_ref());
1408    append_action_graph_node(
1409        &mut action_graph_nodes,
1410        RunActionGraphNodeRecord {
1411            id: root_node_id.clone(),
1412            label: run
1413                .workflow_name
1414                .clone()
1415                .unwrap_or_else(|| run.workflow_id.clone()),
1416            kind: ACTION_GRAPH_NODE_KIND_RUN.to_string(),
1417            status: run.status.clone(),
1418            outcome: run.status.clone(),
1419            trace_id: propagated_trace_id.clone(),
1420            stage_id: None,
1421            node_id: None,
1422            worker_id: None,
1423            run_id: Some(run.id.clone()),
1424            run_path: run.persisted_path.clone(),
1425            metadata: BTreeMap::from([(
1426                "workflow_id".to_string(),
1427                serde_json::json!(run.workflow_id),
1428            )]),
1429        },
1430    );
1431    let mut entry_node_id = root_node_id.clone();
1432    if let Some(trigger_event) = trigger_event.as_ref() {
1433        if let Some(replay_of_event_id) = replay_of_event_id_from_run(run) {
1434            let replay_source_node_id = format!("trigger:{replay_of_event_id}");
1435            append_action_graph_node(
1436                &mut action_graph_nodes,
1437                RunActionGraphNodeRecord {
1438                    id: replay_source_node_id.clone(),
1439                    label: format!(
1440                        "{}:{} (original {})",
1441                        trigger_event.provider.as_str(),
1442                        trigger_event.kind,
1443                        replay_of_event_id
1444                    ),
1445                    kind: ACTION_GRAPH_NODE_KIND_TRIGGER.to_string(),
1446                    status: "historical".to_string(),
1447                    outcome: "replayed_from".to_string(),
1448                    trace_id: Some(trigger_event.trace_id.0.clone()),
1449                    stage_id: None,
1450                    node_id: None,
1451                    worker_id: None,
1452                    run_id: Some(run.id.clone()),
1453                    run_path: run.persisted_path.clone(),
1454                    metadata: trigger_node_metadata(trigger_event),
1455                },
1456            );
1457            action_graph_edges.push(RunActionGraphEdgeRecord {
1458                from_id: replay_source_node_id,
1459                to_id: format!("trigger:{}", trigger_event.id.0),
1460                kind: ACTION_GRAPH_EDGE_KIND_REPLAY_CHAIN.to_string(),
1461                label: Some("replay chain".to_string()),
1462            });
1463        }
1464        let trigger_node_id = format!("trigger:{}", trigger_event.id.0);
1465        append_action_graph_node(
1466            &mut action_graph_nodes,
1467            RunActionGraphNodeRecord {
1468                id: trigger_node_id.clone(),
1469                label: format!("{}:{}", trigger_event.provider.as_str(), trigger_event.kind),
1470                kind: ACTION_GRAPH_NODE_KIND_TRIGGER.to_string(),
1471                status: "received".to_string(),
1472                outcome: signature_status_label(&trigger_event.signature_status).to_string(),
1473                trace_id: Some(trigger_event.trace_id.0.clone()),
1474                stage_id: None,
1475                node_id: None,
1476                worker_id: None,
1477                run_id: Some(run.id.clone()),
1478                run_path: run.persisted_path.clone(),
1479                metadata: trigger_node_metadata(trigger_event),
1480            },
1481        );
1482        action_graph_edges.push(RunActionGraphEdgeRecord {
1483            from_id: root_node_id.clone(),
1484            to_id: trigger_node_id.clone(),
1485            kind: ACTION_GRAPH_EDGE_KIND_ENTRY.to_string(),
1486            label: Some(trigger_event.id.0.clone()),
1487        });
1488        entry_node_id = trigger_node_id;
1489    }
1490
1491    let stage_node_ids = run
1492        .stages
1493        .iter()
1494        .map(|stage| (stage.id.clone(), format!("stage:{}", stage.id)))
1495        .collect::<BTreeMap<_, _>>();
1496    let stage_by_id = run
1497        .stages
1498        .iter()
1499        .map(|stage| (stage.id.as_str(), stage))
1500        .collect::<BTreeMap<_, _>>();
1501    let stage_by_node_id = run
1502        .stages
1503        .iter()
1504        .map(|stage| (stage.node_id.clone(), format!("stage:{}", stage.id)))
1505        .collect::<BTreeMap<_, _>>();
1506
1507    let incoming_nodes = run
1508        .transitions
1509        .iter()
1510        .map(|transition| transition.to_node_id.clone())
1511        .collect::<BTreeSet<_>>();
1512
1513    for stage in &run.stages {
1514        let graph_node_id = stage_node_ids
1515            .get(&stage.id)
1516            .cloned()
1517            .unwrap_or_else(|| format!("stage:{}", stage.id));
1518        append_action_graph_node(
1519            &mut action_graph_nodes,
1520            RunActionGraphNodeRecord {
1521                id: graph_node_id.clone(),
1522                label: stage.node_id.clone(),
1523                kind: action_graph_kind_for_stage(stage).to_string(),
1524                status: stage.status.clone(),
1525                outcome: stage.outcome.clone(),
1526                trace_id: propagated_trace_id.clone(),
1527                stage_id: Some(stage.id.clone()),
1528                node_id: Some(stage.node_id.clone()),
1529                worker_id: stage
1530                    .metadata
1531                    .get("worker_id")
1532                    .and_then(|value| value.as_str())
1533                    .map(str::to_string),
1534                run_id: None,
1535                run_path: None,
1536                metadata: stage_node_metadata(stage),
1537            },
1538        );
1539        if !incoming_nodes.contains(&stage.node_id) {
1540            action_graph_edges.push(RunActionGraphEdgeRecord {
1541                from_id: entry_node_id.clone(),
1542                to_id: graph_node_id.clone(),
1543                kind: if trigger_event.is_some() {
1544                    ACTION_GRAPH_EDGE_KIND_TRIGGER_DISPATCH.to_string()
1545                } else {
1546                    ACTION_GRAPH_EDGE_KIND_ENTRY.to_string()
1547                },
1548                label: None,
1549            });
1550        }
1551
1552        if stage.kind == "verify" || stage.verification.is_some() {
1553            let passed = json_bool(
1554                stage
1555                    .verification
1556                    .as_ref()
1557                    .and_then(|value| value.get("pass")),
1558            )
1559            .or_else(|| {
1560                json_bool(
1561                    stage
1562                        .verification
1563                        .as_ref()
1564                        .and_then(|value| value.get("success")),
1565                )
1566            })
1567            .or_else(|| {
1568                if stage.status == "completed" && stage.outcome == "success" {
1569                    Some(true)
1570                } else if stage.status == "failed" || stage.outcome == "failed" {
1571                    Some(false)
1572                } else {
1573                    None
1574                }
1575            });
1576            verification_outcomes.push(RunVerificationOutcomeRecord {
1577                stage_id: stage.id.clone(),
1578                node_id: stage.node_id.clone(),
1579                status: stage.status.clone(),
1580                passed,
1581                summary: stage
1582                    .verification
1583                    .as_ref()
1584                    .map(compact_json_value)
1585                    .or_else(|| {
1586                        stage
1587                            .visible_text
1588                            .as_ref()
1589                            .filter(|value| !value.trim().is_empty())
1590                            .cloned()
1591                    }),
1592            });
1593        }
1594
1595        if stage.transcript.is_some() {
1596            transcript_pointers.push(RunTranscriptPointerRecord {
1597                id: format!("stage:{}:transcript", stage.id),
1598                label: format!("Stage {} transcript", stage.node_id),
1599                kind: "embedded_transcript".to_string(),
1600                location: format!("run.stages[{}].transcript", stage.node_id),
1601                path: run.persisted_path.clone(),
1602                available: true,
1603            });
1604            if let Some(transcript) = stage.transcript.as_ref() {
1605                compaction_events.extend(compaction_events_from_transcript(
1606                    transcript,
1607                    Some(&stage.id),
1608                    Some(&stage.node_id),
1609                    &format!("run.stages[{}].transcript", stage.node_id),
1610                    persisted_path,
1611                ));
1612            }
1613        }
1614
1615        if let Some(payload) = stage_result_payload(stage) {
1616            let trace = payload.get("trace");
1617            let task_ledger = payload
1618                .get("task_ledger")
1619                .and_then(task_ledger_summary_from_value);
1620            let research_facts = task_ledger
1621                .as_ref()
1622                .map(|ledger| ledger.observations.clone())
1623                .unwrap_or_default();
1624            research_fact_count += research_facts.len();
1625            let tools_payload = payload.get("tools");
1626            let tools_used = json_string_array(
1627                tools_payload
1628                    .and_then(|tools| tools.get("calls"))
1629                    .or_else(|| trace.and_then(|trace| trace.get("tools_used"))),
1630            );
1631            let successful_tools =
1632                json_string_array(tools_payload.and_then(|tools| tools.get("successful")));
1633            let planner_round = RunPlannerRoundRecord {
1634                stage_id: stage.id.clone(),
1635                node_id: stage.node_id.clone(),
1636                stage_kind: stage.kind.clone(),
1637                status: stage.status.clone(),
1638                outcome: stage.outcome.clone(),
1639                iteration_count: json_usize(trace.and_then(|trace| trace.get("iterations"))),
1640                llm_call_count: json_usize(trace.and_then(|trace| trace.get("llm_calls"))),
1641                tool_execution_count: json_usize(
1642                    trace.and_then(|trace| trace.get("tool_executions")),
1643                ),
1644                tool_rejection_count: json_usize(
1645                    trace.and_then(|trace| trace.get("tool_rejections")),
1646                ),
1647                intervention_count: json_usize(trace.and_then(|trace| trace.get("interventions"))),
1648                compaction_count: json_usize(trace.and_then(|trace| trace.get("compactions"))),
1649                native_text_tool_fallback_count: json_usize(
1650                    trace.and_then(|trace| trace.get("native_text_tool_fallbacks")),
1651                ),
1652                native_text_tool_fallback_rejection_count: json_usize(
1653                    trace.and_then(|trace| trace.get("native_text_tool_fallback_rejections")),
1654                ),
1655                empty_completion_retry_count: json_usize(
1656                    trace.and_then(|trace| trace.get("empty_completion_retries")),
1657                ),
1658                tools_used,
1659                successful_tools,
1660                ledger_done_rejections: json_usize(payload.get("ledger_done_rejections")),
1661                task_ledger,
1662                research_facts,
1663            };
1664            let has_agentic_detail = planner_round.iteration_count > 0
1665                || planner_round.llm_call_count > 0
1666                || planner_round.tool_execution_count > 0
1667                || planner_round.native_text_tool_fallback_count > 0
1668                || planner_round.native_text_tool_fallback_rejection_count > 0
1669                || planner_round.empty_completion_retry_count > 0
1670                || planner_round.ledger_done_rejections > 0
1671                || planner_round.task_ledger.is_some()
1672                || !planner_round.tools_used.is_empty()
1673                || !planner_round.successful_tools.is_empty();
1674            if has_agentic_detail {
1675                planner_rounds.push(planner_round);
1676            }
1677        }
1678    }
1679
1680    for transition in &run.transitions {
1681        let Some(to_id) = stage_by_node_id.get(&transition.to_node_id).cloned() else {
1682            continue;
1683        };
1684        let from_stage = transition
1685            .from_stage_id
1686            .as_deref()
1687            .and_then(|stage_id| stage_by_id.get(stage_id).copied());
1688        let from_id = transition
1689            .from_stage_id
1690            .as_ref()
1691            .and_then(|stage_id| stage_node_ids.get(stage_id))
1692            .cloned()
1693            .or_else(|| {
1694                transition
1695                    .from_node_id
1696                    .as_ref()
1697                    .and_then(|node_id| stage_by_node_id.get(node_id))
1698                    .cloned()
1699            })
1700            .unwrap_or_else(|| root_node_id.clone());
1701        action_graph_edges.push(RunActionGraphEdgeRecord {
1702            from_id,
1703            to_id,
1704            kind: if from_stage.is_some_and(|stage| stage.kind == "condition") {
1705                ACTION_GRAPH_EDGE_KIND_PREDICATE_GATE.to_string()
1706            } else {
1707                ACTION_GRAPH_EDGE_KIND_TRANSITION.to_string()
1708            },
1709            label: transition.branch.clone(),
1710        });
1711    }
1712
1713    let worker_lineage = run
1714        .child_runs
1715        .iter()
1716        .map(|child| {
1717            let worker_node_id = format!("worker:{}", child.worker_id);
1718            append_action_graph_node(
1719                &mut action_graph_nodes,
1720                RunActionGraphNodeRecord {
1721                    id: worker_node_id.clone(),
1722                    label: child.worker_name.clone(),
1723                    kind: ACTION_GRAPH_NODE_KIND_WORKER.to_string(),
1724                    status: child.status.clone(),
1725                    outcome: child.status.clone(),
1726                    trace_id: propagated_trace_id.clone(),
1727                    stage_id: child.parent_stage_id.clone(),
1728                    node_id: None,
1729                    worker_id: Some(child.worker_id.clone()),
1730                    run_id: child.run_id.clone(),
1731                    run_path: child.run_path.clone(),
1732                    metadata: BTreeMap::from([
1733                        (
1734                            "worker_name".to_string(),
1735                            serde_json::json!(child.worker_name),
1736                        ),
1737                        ("task".to_string(), serde_json::json!(child.task)),
1738                    ]),
1739                },
1740            );
1741            if let Some(parent_stage_id) = child.parent_stage_id.as_ref() {
1742                if let Some(stage_node_id) = stage_node_ids.get(parent_stage_id) {
1743                    action_graph_edges.push(RunActionGraphEdgeRecord {
1744                        from_id: stage_node_id.clone(),
1745                        to_id: worker_node_id,
1746                        kind: ACTION_GRAPH_EDGE_KIND_DELEGATES.to_string(),
1747                        label: Some(child.worker_name.clone()),
1748                    });
1749                }
1750            }
1751            RunWorkerLineageRecord {
1752                worker_id: child.worker_id.clone(),
1753                worker_name: child.worker_name.clone(),
1754                parent_stage_id: child.parent_stage_id.clone(),
1755                task: child.task.clone(),
1756                status: child.status.clone(),
1757                session_id: child.session_id.clone(),
1758                parent_session_id: child.parent_session_id.clone(),
1759                run_id: child.run_id.clone(),
1760                run_path: child.run_path.clone(),
1761                snapshot_path: child.snapshot_path.clone(),
1762            }
1763        })
1764        .collect::<Vec<_>>();
1765
1766    if run.transcript.is_some() {
1767        transcript_pointers.push(RunTranscriptPointerRecord {
1768            id: "run:transcript".to_string(),
1769            label: "Run transcript".to_string(),
1770            kind: "embedded_transcript".to_string(),
1771            location: "run.transcript".to_string(),
1772            path: run.persisted_path.clone(),
1773            available: true,
1774        });
1775        if let Some(transcript) = run.transcript.as_ref() {
1776            compaction_events.extend(compaction_events_from_transcript(
1777                transcript,
1778                None,
1779                None,
1780                "run.transcript",
1781                persisted_path,
1782            ));
1783        }
1784    }
1785
1786    if let Some(path) = persisted_path {
1787        if let Some(sidecar_path) = llm_transcript_sidecar_path(path) {
1788            transcript_pointers.push(RunTranscriptPointerRecord {
1789                id: "run:llm_transcript".to_string(),
1790                label: "LLM transcript sidecar".to_string(),
1791                kind: "llm_jsonl".to_string(),
1792                location: "run sidecar".to_string(),
1793                path: Some(sidecar_path.to_string_lossy().into_owned()),
1794                available: sidecar_path.exists(),
1795            });
1796        }
1797        daemon_events.extend(daemon_events_from_sidecar(path));
1798    }
1799
1800    RunObservabilityRecord {
1801        schema_version: 4,
1802        planner_rounds,
1803        research_fact_count,
1804        action_graph_nodes,
1805        action_graph_edges,
1806        worker_lineage,
1807        verification_outcomes,
1808        transcript_pointers,
1809        compaction_events,
1810        daemon_events,
1811    }
1812}
1813
1814fn refresh_run_observability(run: &mut RunRecord, persisted_path: Option<&Path>) {
1815    run.observability = Some(derive_run_observability(run, persisted_path));
1816}
1817
1818pub fn normalize_run_record(value: &VmValue) -> Result<RunRecord, VmError> {
1819    let mut run: RunRecord = parse_json_payload(vm_value_to_json(value), "run_record")?;
1820    if run.type_name.is_empty() {
1821        run.type_name = "run_record".to_string();
1822    }
1823    if run.id.is_empty() {
1824        run.id = new_id("run");
1825    }
1826    if run.started_at.is_empty() {
1827        run.started_at = now_rfc3339();
1828    }
1829    if run.status.is_empty() {
1830        run.status = "running".to_string();
1831    }
1832    if run.root_run_id.is_none() {
1833        run.root_run_id = Some(run.id.clone());
1834    }
1835    if run.replay_fixture.is_none() {
1836        run.replay_fixture = Some(replay_fixture_from_run(&run));
1837    }
1838    merge_hitl_questions_from_active_log(&mut run);
1839    materialize_child_runs_from_stage_metadata(&mut run);
1840    sync_run_handoffs(&mut run);
1841    if run.observability.is_none() {
1842        let persisted_path = run.persisted_path.clone();
1843        let persisted = persisted_path.as_deref().map(Path::new);
1844        refresh_run_observability(&mut run, persisted);
1845    }
1846    Ok(run)
1847}
1848
1849pub fn normalize_eval_suite_manifest(value: &VmValue) -> Result<EvalSuiteManifest, VmError> {
1850    let mut manifest: EvalSuiteManifest = parse_json_value(value)?;
1851    if manifest.type_name.is_empty() {
1852        manifest.type_name = "eval_suite_manifest".to_string();
1853    }
1854    if manifest.id.is_empty() {
1855        manifest.id = new_id("eval_suite");
1856    }
1857    Ok(manifest)
1858}
1859
1860pub fn load_eval_suite_manifest(path: &Path) -> Result<EvalSuiteManifest, VmError> {
1861    let content = std::fs::read_to_string(path)
1862        .map_err(|e| VmError::Runtime(format!("failed to read eval suite manifest: {e}")))?;
1863    let mut manifest: EvalSuiteManifest = serde_json::from_str(&content)
1864        .map_err(|e| VmError::Runtime(format!("failed to parse eval suite manifest: {e}")))?;
1865    if manifest.base_dir.is_none() {
1866        manifest.base_dir = path.parent().map(|parent| parent.display().to_string());
1867    }
1868    Ok(manifest)
1869}
1870
1871pub fn load_eval_pack_manifest(path: &Path) -> Result<EvalPackManifest, VmError> {
1872    let content = std::fs::read_to_string(path)
1873        .map_err(|e| VmError::Runtime(format!("failed to read eval pack manifest: {e}")))?;
1874    let mut manifest: EvalPackManifest =
1875        if path.extension().and_then(|ext| ext.to_str()) == Some("json") {
1876            serde_json::from_str(&content)
1877                .map_err(|e| VmError::Runtime(format!("failed to parse eval pack JSON: {e}")))?
1878        } else {
1879            toml::from_str(&content)
1880                .map_err(|e| VmError::Runtime(format!("failed to parse eval pack TOML: {e}")))?
1881        };
1882    normalize_eval_pack_manifest(&mut manifest);
1883    if manifest.base_dir.is_none() {
1884        manifest.base_dir = path.parent().map(|parent| parent.display().to_string());
1885    }
1886    Ok(manifest)
1887}
1888
1889pub fn normalize_eval_pack_manifest_value(value: &VmValue) -> Result<EvalPackManifest, VmError> {
1890    let mut manifest: EvalPackManifest = parse_json_value(value)?;
1891    normalize_eval_pack_manifest(&mut manifest);
1892    Ok(manifest)
1893}
1894
1895fn normalize_eval_pack_manifest(manifest: &mut EvalPackManifest) {
1896    if manifest.version == 0 {
1897        manifest.version = 1;
1898    }
1899    if manifest.id.is_empty() {
1900        manifest.id = manifest
1901            .name
1902            .clone()
1903            .filter(|name| !name.trim().is_empty())
1904            .unwrap_or_else(|| new_id("eval_pack"));
1905    }
1906    for ladder in &mut manifest.ladders {
1907        super::normalize_persona_eval_ladder_manifest(ladder);
1908    }
1909}
1910
1911fn load_replay_fixture(path: &Path) -> Result<ReplayFixture, VmError> {
1912    let content = std::fs::read_to_string(path)
1913        .map_err(|e| VmError::Runtime(format!("failed to read replay fixture: {e}")))?;
1914    serde_json::from_str(&content)
1915        .map_err(|e| VmError::Runtime(format!("failed to parse replay fixture: {e}")))
1916}
1917
1918fn load_run_record_from_fixture_ref(
1919    fixture: &EvalPackFixtureRef,
1920    base_dir: Option<&Path>,
1921) -> Result<RunRecord, VmError> {
1922    if let Some(inline) = &fixture.inline {
1923        let run: RunRecord = serde_json::from_value(inline.clone())
1924            .map_err(|e| VmError::Runtime(format!("failed to parse inline run record: {e}")))?;
1925        return Ok(run);
1926    }
1927    let path = fixture.path.as_deref().ok_or_else(|| {
1928        VmError::Runtime(format!(
1929            "fixture '{}' is missing path or inline run",
1930            fixture.id
1931        ))
1932    })?;
1933    load_run_record(&resolve_manifest_path(base_dir, path))
1934}
1935
1936fn load_replay_fixture_from_ref(
1937    fixture: &EvalPackFixtureRef,
1938    base_dir: Option<&Path>,
1939) -> Result<ReplayFixture, VmError> {
1940    if let Some(inline) = &fixture.inline {
1941        return serde_json::from_value(inline.clone())
1942            .map_err(|e| VmError::Runtime(format!("failed to parse inline replay fixture: {e}")));
1943    }
1944    let path = fixture.path.as_deref().ok_or_else(|| {
1945        VmError::Runtime(format!(
1946            "fixture '{}' is missing path or inline replay fixture",
1947            fixture.id
1948        ))
1949    })?;
1950    load_replay_fixture(&resolve_manifest_path(base_dir, path))
1951}
1952
/// Resolve a manifest-relative path string: absolute paths pass through
/// unchanged, relative paths are joined onto `base_dir` when one is
/// available, and otherwise returned as-is.
fn resolve_manifest_path(base_dir: Option<&Path>, path: &str) -> PathBuf {
    let candidate = PathBuf::from(path);
    match base_dir {
        Some(base) if candidate.is_relative() => base.join(candidate),
        _ => candidate,
    }
}
1963
1964pub fn evaluate_run_suite_manifest(
1965    manifest: &EvalSuiteManifest,
1966) -> Result<ReplayEvalSuiteReport, VmError> {
1967    let base_dir = manifest.base_dir.as_deref().map(Path::new);
1968    let mut reports = Vec::new();
1969    for case in &manifest.cases {
1970        let run_path = resolve_manifest_path(base_dir, &case.run_path);
1971        let run = load_run_record(&run_path)?;
1972        let fixture = match &case.fixture_path {
1973            Some(path) => load_replay_fixture(&resolve_manifest_path(base_dir, path))?,
1974            None => run
1975                .replay_fixture
1976                .clone()
1977                .unwrap_or_else(|| replay_fixture_from_run(&run)),
1978        };
1979        let eval = evaluate_run_against_fixture(&run, &fixture);
1980        let mut pass = eval.pass;
1981        let mut failures = eval.failures;
1982        let comparison = match &case.compare_to {
1983            Some(path) => {
1984                let baseline_path = resolve_manifest_path(base_dir, path);
1985                let baseline = load_run_record(&baseline_path)?;
1986                let diff = diff_run_records(&baseline, &run);
1987                if !diff.identical {
1988                    pass = false;
1989                    failures.push(format!(
1990                        "run differs from baseline {} with {} stage changes",
1991                        baseline_path.display(),
1992                        diff.stage_diffs.len()
1993                    ));
1994                }
1995                Some(diff)
1996            }
1997            None => None,
1998        };
1999        reports.push(ReplayEvalCaseReport {
2000            run_id: run.id.clone(),
2001            workflow_id: run.workflow_id.clone(),
2002            label: case.label.clone(),
2003            pass,
2004            failures,
2005            stage_count: eval.stage_count,
2006            source_path: Some(run_path.display().to_string()),
2007            comparison,
2008        });
2009    }
2010    let total = reports.len();
2011    let passed = reports.iter().filter(|report| report.pass).count();
2012    let failed = total.saturating_sub(passed);
2013    Ok(ReplayEvalSuiteReport {
2014        pass: failed == 0,
2015        total,
2016        passed,
2017        failed,
2018        cases: reports,
2019    })
2020}
2021
/// Evaluate every case and ladder in an eval pack manifest and roll the
/// results up into a single `EvalPackReport`.
///
/// Regular cases are checked against their replay fixtures, pack- and
/// case-level thresholds, an optional baseline diff, and referenced
/// rubrics. Cases with `friction_events` take a separate friction path.
/// Ladders run through `run_persona_eval_ladder`. Only blocking
/// failures fail the pack as a whole.
///
/// # Errors
/// Propagates any error loading a case's run, fixture, friction
/// events, baseline, or running a ladder.
pub fn evaluate_eval_pack_manifest(manifest: &EvalPackManifest) -> Result<EvalPackReport, VmError> {
    let base_dir = manifest.base_dir.as_deref().map(Path::new);
    // Fixtures resolve against `defaults.fixture_root` (itself resolved
    // relative to the manifest) when set, otherwise the manifest dir.
    let fixture_base_dir_buf = manifest
        .defaults
        .fixture_root
        .as_deref()
        .map(|root| resolve_manifest_path(base_dir, root));
    let fixture_base_dir = fixture_base_dir_buf.as_deref().or(base_dir);
    // Index fixtures and rubrics by id for lookup; blank ids are skipped
    // because they cannot be referenced.
    let fixtures_by_id: BTreeMap<&str, &EvalPackFixtureRef> = manifest
        .fixtures
        .iter()
        .filter(|fixture| !fixture.id.is_empty())
        .map(|fixture| (fixture.id.as_str(), fixture))
        .collect();
    let rubrics_by_id: BTreeMap<&str, &EvalPackRubric> = manifest
        .rubrics
        .iter()
        .filter(|rubric| !rubric.id.is_empty())
        .map(|rubric| (rubric.id.as_str(), rubric))
        .collect();

    let mut reports = Vec::new();
    for (index, case) in manifest.cases.iter().enumerate() {
        // Synthesize a stable id/label when the case omits them.
        let case_id = case
            .id
            .clone()
            .filter(|id| !id.trim().is_empty())
            .unwrap_or_else(|| format!("case_{}", index + 1));
        let label = case
            .name
            .clone()
            .or_else(|| case.id.clone())
            .unwrap_or_else(|| case_id.clone());
        let severity = eval_pack_case_severity(manifest, case);
        let blocking = severity == "blocking";
        let mut failures = Vec::new();
        let mut warnings = Vec::new();
        let mut informational = Vec::new();

        // Friction cases evaluate recorded friction events instead of a
        // run record; they are reported through the dedicated helper.
        if case.friction_events.is_some() {
            let report = evaluate_eval_pack_friction_case(
                manifest,
                case,
                &case_id,
                &label,
                &severity,
                blocking,
                base_dir,
                fixture_base_dir,
                &fixtures_by_id,
                &rubrics_by_id,
            )?;
            reports.push(report);
            continue;
        }

        let run = load_eval_pack_case_run(case, base_dir, fixture_base_dir, &fixtures_by_id)?;
        let fixture =
            load_eval_pack_case_fixture(case, base_dir, fixture_base_dir, &fixtures_by_id, &run)?;
        let eval = evaluate_run_against_fixture(&run, &fixture);
        failures.extend(eval.failures);
        // Pack-level thresholds apply first, then the case's own.
        apply_eval_pack_thresholds(&run, &manifest.defaults.thresholds, &mut failures);
        apply_eval_pack_thresholds(&run, &case.thresholds, &mut failures);

        // Baseline: the case's `compare_to` overrides the pack baseline.
        // Any structural diff is recorded as a failure.
        let comparison = match case.compare_to.as_ref().or(manifest.baseline.as_ref()) {
            Some(path) => {
                let baseline_path = resolve_manifest_path(base_dir, path);
                let baseline = load_run_record(&baseline_path)?;
                let diff = diff_run_records(&baseline, &run);
                if !diff.identical {
                    failures.push(format!(
                        "run differs from baseline {} with {} stage changes",
                        baseline_path.display(),
                        diff.stage_diffs.len()
                    ));
                }
                Some(diff)
            }
            None => None,
        };

        for rubric_id in &case.rubrics {
            let Some(rubric) = rubrics_by_id.get(rubric_id.as_str()) else {
                failures.push(format!("case references unknown rubric '{rubric_id}'"));
                continue;
            };
            apply_eval_pack_rubric(rubric, &run, &mut failures, &mut warnings);
        }

        // Non-blocking cases still "pass"; their failures are demoted to
        // warnings (severity "warning") or informational notes.
        let pass = failures.is_empty() || !blocking;
        if !failures.is_empty() && !blocking {
            if severity == "warning" {
                warnings.append(&mut failures);
            } else {
                informational.append(&mut failures);
            }
        }
        reports.push(EvalPackCaseReport {
            id: case_id,
            label,
            severity,
            pass,
            blocking,
            run_id: run.id.clone(),
            workflow_id: run.workflow_id.clone(),
            source_path: eval_pack_case_source_path(
                case,
                base_dir,
                fixture_base_dir,
                &fixtures_by_id,
            ),
            stage_count: eval.stage_count,
            failures,
            warnings,
            informational,
            comparison,
        });
    }

    // Ladders inherit the pack's base_dir when they don't set their own.
    let mut ladder_reports = Vec::new();
    for ladder in &manifest.ladders {
        let mut ladder = ladder.clone();
        if ladder.base_dir.is_none() {
            ladder.base_dir = manifest.base_dir.clone();
        }
        ladder_reports.push(run_persona_eval_ladder(&ladder)?);
    }

    // Roll-up: cases and ladders are counted together, bucketed by the
    // severity of what failed. Only blocking failures fail the pack.
    let case_total = reports.len();
    let ladder_total = ladder_reports.len();
    let total = case_total + ladder_total;
    let case_blocking_failed = reports
        .iter()
        .filter(|report| report.blocking && !report.failures.is_empty())
        .count();
    let ladder_blocking_failed = ladder_reports
        .iter()
        .filter(|report| report.blocking && !report.pass)
        .count();
    let blocking_failed = case_blocking_failed + ladder_blocking_failed;
    let warning_failed = reports
        .iter()
        .filter(|report| !report.warnings.is_empty())
        .count()
        + ladder_reports
            .iter()
            .filter(|report| !report.pass && report.severity == "warning")
            .count();
    let informational_failed = reports
        .iter()
        .filter(|report| !report.informational.is_empty())
        .count()
        + ladder_reports
            .iter()
            .filter(|report| !report.pass && report.severity == "informational")
            .count();
    let passed = reports.iter().filter(|report| report.pass).count()
        + ladder_reports.iter().filter(|report| report.pass).count();
    Ok(EvalPackReport {
        pack_id: manifest.id.clone(),
        pass: blocking_failed == 0,
        total,
        passed,
        failed: total.saturating_sub(passed),
        blocking_failed,
        warning_failed,
        informational_failed,
        cases: reports,
        ladders: ladder_reports,
    })
}
2193
2194#[allow(clippy::too_many_arguments)]
2195fn evaluate_eval_pack_friction_case(
2196    manifest: &EvalPackManifest,
2197    case: &EvalPackCase,
2198    case_id: &str,
2199    label: &str,
2200    severity: &str,
2201    blocking: bool,
2202    base_dir: Option<&Path>,
2203    fixture_base_dir: Option<&Path>,
2204    fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2205    rubrics_by_id: &BTreeMap<&str, &EvalPackRubric>,
2206) -> Result<EvalPackCaseReport, VmError> {
2207    let mut failures = Vec::new();
2208    let mut warnings = Vec::new();
2209    let mut informational = Vec::new();
2210    let events =
2211        load_eval_pack_case_friction_events(case, base_dir, fixture_base_dir, fixtures_by_id)?;
2212    let options = friction_suggestion_options(case, manifest);
2213    let suggestions = generate_context_pack_suggestions(&events, &options);
2214
2215    for rubric_id in &case.rubrics {
2216        let Some(rubric) = rubrics_by_id.get(rubric_id.as_str()) else {
2217            failures.push(format!("case references unknown rubric '{rubric_id}'"));
2218            continue;
2219        };
2220        apply_eval_pack_friction_rubric(rubric, &suggestions, &mut failures, &mut warnings);
2221    }
2222
2223    if case.rubrics.is_empty() && suggestions.is_empty() {
2224        failures.push("friction fixture produced no context-pack suggestions".to_string());
2225    }
2226
2227    let pass = failures.is_empty() || !blocking;
2228    if !failures.is_empty() && !blocking {
2229        if severity == "warning" {
2230            warnings.append(&mut failures);
2231        } else {
2232            informational.append(&mut failures);
2233        }
2234    }
2235
2236    Ok(EvalPackCaseReport {
2237        id: case_id.to_string(),
2238        label: label.to_string(),
2239        severity: severity.to_string(),
2240        pass,
2241        blocking,
2242        run_id: "friction_events".to_string(),
2243        workflow_id: String::new(),
2244        source_path: eval_pack_case_friction_source_path(
2245            case,
2246            base_dir,
2247            fixture_base_dir,
2248            fixtures_by_id,
2249        ),
2250        stage_count: events.len(),
2251        failures,
2252        warnings,
2253        informational,
2254        comparison: None,
2255    })
2256}
2257
2258fn eval_pack_case_severity(manifest: &EvalPackManifest, case: &EvalPackCase) -> String {
2259    normalize_eval_pack_severity(
2260        case.severity
2261            .as_deref()
2262            .or(case.thresholds.severity.as_deref())
2263            .or(manifest.defaults.severity.as_deref())
2264            .or(manifest.defaults.thresholds.severity.as_deref())
2265            .unwrap_or("blocking"),
2266    )
2267}
2268
/// Canonicalize a severity label, ignoring case and surrounding
/// whitespace: "warn"/"warning" map to "warning", "info"/
/// "informational" to "informational", and anything else to "blocking".
fn normalize_eval_pack_severity(value: &str) -> String {
    let normalized = value.trim().to_ascii_lowercase();
    if normalized == "warn" || normalized == "warning" {
        "warning".to_string()
    } else if normalized == "info" || normalized == "informational" {
        "informational".to_string()
    } else {
        "blocking".to_string()
    }
}
2276
2277fn load_eval_pack_case_run(
2278    case: &EvalPackCase,
2279    base_dir: Option<&Path>,
2280    fixture_base_dir: Option<&Path>,
2281    fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2282) -> Result<RunRecord, VmError> {
2283    if let Some(run_ref) = case.run.as_deref().or(case.run_path.as_deref()) {
2284        if let Some(fixture) = fixtures_by_id.get(run_ref) {
2285            return load_run_record_from_fixture_ref(fixture, fixture_base_dir);
2286        }
2287        return load_run_record(&resolve_manifest_path(base_dir, run_ref));
2288    }
2289    Err(VmError::Runtime(
2290        "eval pack case is missing run or run_path".to_string(),
2291    ))
2292}
2293
2294fn load_eval_pack_case_fixture(
2295    case: &EvalPackCase,
2296    base_dir: Option<&Path>,
2297    fixture_base_dir: Option<&Path>,
2298    fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2299    run: &RunRecord,
2300) -> Result<ReplayFixture, VmError> {
2301    if let Some(fixture_ref) = case.fixture.as_deref().or(case.fixture_path.as_deref()) {
2302        if let Some(fixture) = fixtures_by_id.get(fixture_ref) {
2303            return load_replay_fixture_from_ref(fixture, fixture_base_dir);
2304        }
2305        return load_replay_fixture(&resolve_manifest_path(base_dir, fixture_ref));
2306    }
2307    Ok(run
2308        .replay_fixture
2309        .clone()
2310        .unwrap_or_else(|| replay_fixture_from_run(run)))
2311}
2312
2313fn eval_pack_case_source_path(
2314    case: &EvalPackCase,
2315    base_dir: Option<&Path>,
2316    fixture_base_dir: Option<&Path>,
2317    fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2318) -> Option<String> {
2319    let run_ref = case.run.as_deref().or(case.run_path.as_deref())?;
2320    if let Some(fixture) = fixtures_by_id.get(run_ref) {
2321        return fixture.path.as_ref().map(|path| {
2322            resolve_manifest_path(fixture_base_dir, path)
2323                .display()
2324                .to_string()
2325        });
2326    }
2327    Some(
2328        resolve_manifest_path(base_dir, run_ref)
2329            .display()
2330            .to_string(),
2331    )
2332}
2333
2334fn load_eval_pack_case_friction_events(
2335    case: &EvalPackCase,
2336    base_dir: Option<&Path>,
2337    fixture_base_dir: Option<&Path>,
2338    fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2339) -> Result<Vec<FrictionEvent>, VmError> {
2340    let event_ref = case.friction_events.as_deref().ok_or_else(|| {
2341        VmError::Runtime("eval pack friction case is missing friction_events".to_string())
2342    })?;
2343    if let Some(fixture) = fixtures_by_id.get(event_ref) {
2344        return load_friction_events_from_fixture_ref(fixture, fixture_base_dir);
2345    }
2346    load_friction_events_from_path(&resolve_manifest_path(base_dir, event_ref))
2347}
2348
2349fn load_friction_events_from_fixture_ref(
2350    fixture: &EvalPackFixtureRef,
2351    base_dir: Option<&Path>,
2352) -> Result<Vec<FrictionEvent>, VmError> {
2353    if let Some(inline) = &fixture.inline {
2354        return normalize_friction_events_json(inline.clone());
2355    }
2356    let path = fixture.path.as_deref().ok_or_else(|| {
2357        VmError::Runtime(format!(
2358            "fixture '{}' is missing path or inline friction events",
2359            fixture.id
2360        ))
2361    })?;
2362    load_friction_events_from_path(&resolve_manifest_path(base_dir, path))
2363}
2364
2365fn load_friction_events_from_path(path: &Path) -> Result<Vec<FrictionEvent>, VmError> {
2366    let content = std::fs::read_to_string(path)
2367        .map_err(|e| VmError::Runtime(format!("failed to read friction events fixture: {e}")))?;
2368    let value: serde_json::Value = serde_json::from_str(&content)
2369        .map_err(|e| VmError::Runtime(format!("failed to parse friction events fixture: {e}")))?;
2370    normalize_friction_events_json(value)
2371}
2372
2373fn eval_pack_case_friction_source_path(
2374    case: &EvalPackCase,
2375    base_dir: Option<&Path>,
2376    fixture_base_dir: Option<&Path>,
2377    fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2378) -> Option<String> {
2379    let event_ref = case.friction_events.as_deref()?;
2380    if let Some(fixture) = fixtures_by_id.get(event_ref) {
2381        return fixture.path.as_ref().map(|path| {
2382            resolve_manifest_path(fixture_base_dir, path)
2383                .display()
2384                .to_string()
2385        });
2386    }
2387    Some(
2388        resolve_manifest_path(base_dir, event_ref)
2389            .display()
2390            .to_string(),
2391    )
2392}
2393
2394fn friction_suggestion_options(
2395    case: &EvalPackCase,
2396    manifest: &EvalPackManifest,
2397) -> ContextPackSuggestionOptions {
2398    let min_occurrences = case
2399        .metadata
2400        .get("min_occurrences")
2401        .or_else(|| manifest.metadata.get("min_occurrences"))
2402        .and_then(|value| value.as_u64())
2403        .unwrap_or(2) as usize;
2404    let owner = case
2405        .metadata
2406        .get("owner")
2407        .or_else(|| manifest.metadata.get("owner"))
2408        .and_then(|value| value.as_str())
2409        .map(str::to_string)
2410        .or_else(|| {
2411            manifest
2412                .package
2413                .as_ref()
2414                .and_then(|package| package.name.clone())
2415        });
2416    ContextPackSuggestionOptions {
2417        min_occurrences,
2418        owner,
2419    }
2420}
2421
2422fn apply_eval_pack_thresholds(
2423    run: &RunRecord,
2424    thresholds: &EvalPackThresholds,
2425    failures: &mut Vec<String>,
2426) {
2427    if let Some(max_stage_count) = thresholds.max_stage_count {
2428        if run.stages.len() > max_stage_count {
2429            failures.push(format!(
2430                "stage count {} exceeds threshold {}",
2431                run.stages.len(),
2432                max_stage_count
2433            ));
2434        }
2435    }
2436    if let Some(max_latency_ms) = thresholds.max_latency_ms {
2437        let actual = run
2438            .usage
2439            .as_ref()
2440            .map(|usage| usage.total_duration_ms)
2441            .unwrap_or_default();
2442        if actual > max_latency_ms {
2443            failures.push(format!(
2444                "latency {actual}ms exceeds threshold {max_latency_ms}ms"
2445            ));
2446        }
2447    }
2448    if let Some(max_cost_usd) = thresholds.max_cost_usd {
2449        let actual = run
2450            .usage
2451            .as_ref()
2452            .map(|usage| usage.total_cost)
2453            .unwrap_or_default();
2454        if actual > max_cost_usd {
2455            failures.push(format!(
2456                "cost ${actual:.6} exceeds threshold ${max_cost_usd:.6}"
2457            ));
2458        }
2459    }
2460    if let Some(max_tokens) = thresholds.max_tokens {
2461        let actual = run
2462            .usage
2463            .as_ref()
2464            .map(|usage| usage.input_tokens + usage.output_tokens)
2465            .unwrap_or_default();
2466        if actual > max_tokens {
2467            failures.push(format!(
2468                "token count {actual} exceeds threshold {max_tokens}"
2469            ));
2470        }
2471    }
2472}
2473
2474fn apply_eval_pack_rubric(
2475    rubric: &EvalPackRubric,
2476    run: &RunRecord,
2477    failures: &mut Vec<String>,
2478    warnings: &mut Vec<String>,
2479) {
2480    match rubric.kind.as_str() {
2481        "" | "deterministic" | "replay" | "budget" | "hitl" | "side-effect" => {
2482            apply_eval_pack_thresholds(run, &rubric.thresholds, failures);
2483            for assertion in &rubric.assertions {
2484                apply_eval_pack_assertion(rubric, assertion, run, failures);
2485            }
2486        }
2487        "llm-judge" | "llm_as_judge" | "judge" => {
2488            let severity = normalize_eval_pack_severity(
2489                rubric.thresholds.severity.as_deref().unwrap_or("blocking"),
2490            );
2491            let message = format!(
2492                "rubric '{}' requires an external LLM judge and was not run locally",
2493                rubric.id
2494            );
2495            if severity == "blocking" {
2496                failures.push(message);
2497            } else {
2498                warnings.push(message);
2499            }
2500        }
2501        other => warnings.push(format!(
2502            "rubric '{}' has unknown kind '{}' and was not run locally",
2503            rubric.id, other
2504        )),
2505    }
2506}
2507
2508fn apply_eval_pack_friction_rubric(
2509    rubric: &EvalPackRubric,
2510    suggestions: &[super::ContextPackSuggestion],
2511    failures: &mut Vec<String>,
2512    warnings: &mut Vec<String>,
2513) {
2514    match rubric.kind.as_str() {
2515        "" | "deterministic" | "friction" | "context-pack-suggestion" => {
2516            let mut expectations = Vec::new();
2517            for assertion in &rubric.assertions {
2518                match assertion.kind.as_str() {
2519                    "context-pack-suggestion" | "context_pack_suggestion" | "suggestion" => {
2520                        let expectation = context_pack_expectation_from_assertion(assertion);
2521                        expectations.push(expectation);
2522                    }
2523                    other => failures.push(format!(
2524                        "rubric '{}' has unsupported friction assertion kind '{}'",
2525                        rubric.id, other
2526                    )),
2527                }
2528            }
2529            failures.extend(evaluate_context_pack_suggestion_expectations(
2530                suggestions,
2531                &expectations,
2532            ));
2533        }
2534        other => warnings.push(format!(
2535            "rubric '{}' has unknown friction kind '{}' and was not run locally",
2536            rubric.id, other
2537        )),
2538    }
2539}
2540
2541fn context_pack_expectation_from_assertion(
2542    assertion: &EvalPackAssertion,
2543) -> ContextPackSuggestionExpectation {
2544    let expected = assertion
2545        .expected
2546        .as_ref()
2547        .and_then(|value| value.as_object());
2548    let expected_string = assertion.expected.as_ref().and_then(|value| value.as_str());
2549    ContextPackSuggestionExpectation {
2550        min_suggestions: expected
2551            .and_then(|map| map.get("min_suggestions"))
2552            .and_then(|value| value.as_u64())
2553            .map(|value| value as usize),
2554        recommended_artifact: expected
2555            .and_then(|map| map.get("recommended_artifact"))
2556            .and_then(|value| value.as_str())
2557            .map(str::to_string)
2558            .or_else(|| expected_string.map(str::to_string)),
2559        title_contains: assertion.contains.clone().or_else(|| {
2560            expected
2561                .and_then(|map| map.get("title_contains"))
2562                .and_then(|value| value.as_str())
2563                .map(str::to_string)
2564        }),
2565        manifest_name_contains: expected
2566            .and_then(|map| map.get("manifest_name_contains"))
2567            .and_then(|value| value.as_str())
2568            .map(str::to_string),
2569        required_capability: expected
2570            .and_then(|map| map.get("required_capability"))
2571            .and_then(|value| value.as_str())
2572            .map(str::to_string),
2573        required_output_slot: expected
2574            .and_then(|map| map.get("required_output_slot"))
2575            .and_then(|value| value.as_str())
2576            .map(str::to_string),
2577    }
2578}
2579
/// Check one deterministic eval-pack assertion against a run, appending a
/// human-readable message to `failures` for each violated expectation.
///
/// Supported assertion kinds (hyphen/underscore aliases accepted):
/// - run-status: `run.status` must equal the expected string.
/// - stage-status: the named stage must exist and carry the expected status.
/// - visible-text-contains: a stage's visible text (a specific stage when
///   `stage` is set, otherwise any stage) must contain the `contains` needle.
/// - hitl-question-contains: some HITL question prompt must contain the needle.
/// - "" (empty): treated as a no-op.
/// Any other kind is recorded as unsupported.
fn apply_eval_pack_assertion(
    rubric: &EvalPackRubric,
    assertion: &EvalPackAssertion,
    run: &RunRecord,
    failures: &mut Vec<String>,
) {
    match assertion.kind.as_str() {
        "run-status" | "run_status" | "status" => {
            // A non-string (or absent) expected value is silently skipped.
            let expected = assertion.expected.as_ref().and_then(|value| value.as_str());
            if let Some(expected) = expected {
                if run.status != expected {
                    failures.push(format!(
                        "rubric '{}' expected run status {}, got {}",
                        rubric.id, expected, run.status
                    ));
                }
            }
        }
        "stage-status" | "stage_status" => {
            // Both the stage id and an expected string are mandatory here;
            // either missing is itself a failure.
            let Some(stage_id) = assertion.stage.as_deref() else {
                failures.push(format!(
                    "rubric '{}' stage-status assertion missing stage",
                    rubric.id
                ));
                return;
            };
            let expected = assertion.expected.as_ref().and_then(|value| value.as_str());
            let Some(expected) = expected else {
                failures.push(format!(
                    "rubric '{}' stage-status assertion missing expected string",
                    rubric.id
                ));
                return;
            };
            match run.stages.iter().find(|stage| stage.node_id == stage_id) {
                Some(stage) if stage.status == expected => {}
                Some(stage) => failures.push(format!(
                    "rubric '{}' expected stage {} status {}, got {}",
                    rubric.id, stage_id, expected, stage.status
                )),
                None => failures.push(format!(
                    "rubric '{}' expected stage {} to exist",
                    rubric.id, stage_id
                )),
            }
        }
        "visible-text-contains" | "visible_text_contains" => {
            let Some(needle) = assertion.contains.as_deref() else {
                failures.push(format!(
                    "rubric '{}' visible-text assertion missing contains",
                    rubric.id
                ));
                return;
            };
            // Scoped: only the named stage's text counts. Unscoped: any stage
            // with visible text may satisfy the assertion.
            let matched = match assertion.stage.as_deref() {
                Some(stage_id) => run
                    .stages
                    .iter()
                    .find(|stage| stage.node_id == stage_id)
                    .and_then(|stage| stage.visible_text.as_deref())
                    .is_some_and(|text| text.contains(needle)),
                None => run
                    .stages
                    .iter()
                    .filter_map(|stage| stage.visible_text.as_deref())
                    .any(|text| text.contains(needle)),
            };
            if !matched {
                failures.push(format!(
                    "rubric '{}' expected visible text to contain {:?}",
                    rubric.id, needle
                ));
            }
        }
        "hitl-question-contains" | "hitl_question_contains" => {
            let Some(needle) = assertion.contains.as_deref() else {
                failures.push(format!(
                    "rubric '{}' HITL assertion missing contains",
                    rubric.id
                ));
                return;
            };
            if !run
                .hitl_questions
                .iter()
                .any(|question| question.prompt.contains(needle))
            {
                failures.push(format!(
                    "rubric '{}' expected HITL question to contain {:?}",
                    rubric.id, needle
                ));
            }
        }
        "" => {}
        other => failures.push(format!(
            "rubric '{}' has unsupported assertion kind '{}'",
            rubric.id, other
        )),
    }
}
2680
/// Edit operation in a diff sequence.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub(crate) enum DiffOp {
    Equal,
    Delete,
    Insert,
}

/// Compute the shortest edit script using Myers' O(nd) algorithm.
/// Returns a sequence of (DiffOp, line_index_in_before_or_after):
/// `Equal`/`Delete` indices refer to `a`, `Insert` indices refer to `b`.
/// Time: O(nd) where d = edit distance. Space: O(d * n).
pub(crate) fn myers_diff(a: &[&str], b: &[&str]) -> Vec<(DiffOp, usize)> {
    let n = a.len() as isize;
    let m = b.len() as isize;
    // Degenerate inputs: empty-vs-empty, pure insert, pure delete.
    if n == 0 && m == 0 {
        return Vec::new();
    }
    if n == 0 {
        return (0..m as usize).map(|j| (DiffOp::Insert, j)).collect();
    }
    if m == 0 {
        return (0..n as usize).map(|i| (DiffOp::Delete, i)).collect();
    }

    let max_d = (n + m) as usize;
    let offset = max_d as isize;
    let v_size = 2 * max_d + 1;
    let mut v = vec![0isize; v_size];
    // trace[d] holds the `v` snapshot BEFORE step d ran — required for backtrack.
    let mut trace: Vec<Vec<isize>> = Vec::new();

    'outer: for d in 0..=max_d as isize {
        trace.push(v.clone());
        for k in (-d..=d).step_by(2) {
            let ki = (k + offset) as usize;
            // In-place update is safe: step d writes only parity-d indices,
            // while the ki-1 / ki+1 reads touch parity d-1 entries, which are
            // never written during this step. No scratch copy of `v` needed.
            let mut x = if k == -d || (k != d && v[ki - 1] < v[ki + 1]) {
                v[ki + 1]
            } else {
                v[ki - 1] + 1
            };
            let mut y = x - k;
            // Extend along the diagonal ("snake") of matching lines.
            while x < n && y < m && a[x as usize] == b[y as usize] {
                x += 1;
                y += 1;
            }
            v[ki] = x;
            if x >= n && y >= m {
                break 'outer;
            }
        }
    }

    // Backtrack from (n, m) through the snapshots, emitting ops in reverse.
    let mut ops: Vec<(DiffOp, usize)> = Vec::new();
    let mut x = n;
    let mut y = m;
    for d in (1..trace.len() as isize).rev() {
        let k = x - y;
        // trace[d] is the state after step d-1, which is what chose this move.
        let v_prev = &trace[d as usize];
        let prev_k = if k == -d
            || (k != d && v_prev[(k - 1 + offset) as usize] < v_prev[(k + 1 + offset) as usize])
        {
            k + 1
        } else {
            k - 1
        };
        let prev_x = v_prev[(prev_k + offset) as usize];
        let prev_y = prev_x - prev_k;

        // Snake: equal lines between the edit at depth d and our position.
        while x > prev_x && y > prev_y {
            x -= 1;
            y -= 1;
            ops.push((DiffOp::Equal, x as usize));
        }
        if prev_k < k {
            x -= 1;
            ops.push((DiffOp::Delete, x as usize));
        } else {
            y -= 1;
            ops.push((DiffOp::Insert, y as usize));
        }
    }
    // Leading snake at depth 0: lines equal before the first edit.
    while x > 0 && y > 0 {
        x -= 1;
        y -= 1;
        ops.push((DiffOp::Equal, x as usize));
    }
    ops.reverse();
    ops
}
2773
2774pub fn render_unified_diff(path: Option<&str>, before: &str, after: &str) -> String {
2775    let before_lines: Vec<&str> = before.lines().collect();
2776    let after_lines: Vec<&str> = after.lines().collect();
2777    let ops = myers_diff(&before_lines, &after_lines);
2778
2779    let mut diff = String::new();
2780    let file = path.unwrap_or("artifact");
2781    diff.push_str(&format!("--- a/{file}\n+++ b/{file}\n"));
2782    for &(op, idx) in &ops {
2783        match op {
2784            DiffOp::Equal => diff.push_str(&format!(" {}\n", before_lines[idx])),
2785            DiffOp::Delete => diff.push_str(&format!("-{}\n", before_lines[idx])),
2786            DiffOp::Insert => diff.push_str(&format!("+{}\n", after_lines[idx])),
2787        }
2788    }
2789    diff
2790}
2791
/// Persist a run record to disk as pretty-printed JSON, materializing derived
/// state on a clone so the caller's record is left untouched.
///
/// `path` defaults to `<default_run_dir>/<run id>.json`. Before writing, the
/// clone gets HITL questions merged from the active event log, child runs
/// materialized from stage metadata, a replay fixture synthesized when one is
/// absent, handoffs synced, and observability refreshed. The write is atomic,
/// and when observability is present an action-graph event is published.
/// Returns the path the record was written to, or `VmError::Runtime` on any
/// directory/encode/write failure.
pub fn save_run_record(run: &RunRecord, path: Option<&str>) -> Result<String, VmError> {
    let path = path
        .map(PathBuf::from)
        .unwrap_or_else(|| default_run_dir().join(format!("{}.json", run.id)));
    // Work on a clone: every materialization step below mutates the record.
    let mut materialized = run.clone();
    merge_hitl_questions_from_active_log(&mut materialized);
    materialize_child_runs_from_stage_metadata(&mut materialized);
    if materialized.replay_fixture.is_none() {
        materialized.replay_fixture = Some(replay_fixture_from_run(&materialized));
    }
    // Record where this copy lives so later loads/diffs can reference it.
    materialized.persisted_path = Some(path.to_string_lossy().into_owned());
    sync_run_handoffs(&mut materialized);
    refresh_run_observability(&mut materialized, Some(&path));
    if let Some(parent) = path.parent() {
        std::fs::create_dir_all(parent)
            .map_err(|e| VmError::Runtime(format!("failed to create run directory: {e}")))?;
    }
    let json = serde_json::to_string_pretty(&materialized)
        .map_err(|e| VmError::Runtime(format!("failed to encode run record: {e}")))?;
    // Atomic write so concurrent readers never observe a partial record.
    crate::atomic_io::atomic_write(&path, json.as_bytes())
        .map_err(|e| VmError::Runtime(format!("failed to persist run record: {e}")))?;
    if let Some(observability) = materialized.observability.as_ref() {
        publish_action_graph_event(&materialized, observability, &path);
    }
    Ok(path.to_string_lossy().into_owned())
}
2818
2819pub fn load_run_record(path: &Path) -> Result<RunRecord, VmError> {
2820    let content = std::fs::read_to_string(path)
2821        .map_err(|e| VmError::Runtime(format!("failed to read run record: {e}")))?;
2822    let mut run: RunRecord = serde_json::from_str(&content)
2823        .map_err(|e| VmError::Runtime(format!("failed to parse run record: {e}")))?;
2824    materialize_child_runs_from_stage_metadata(&mut run);
2825    if run.replay_fixture.is_none() {
2826        run.replay_fixture = Some(replay_fixture_from_run(&run));
2827    }
2828    run.persisted_path
2829        .get_or_insert_with(|| path.to_string_lossy().into_owned());
2830    sync_run_handoffs(&mut run);
2831    refresh_run_observability(&mut run, Some(path));
2832    Ok(run)
2833}
2834
2835pub fn replay_fixture_from_run(run: &RunRecord) -> ReplayFixture {
2836    ReplayFixture {
2837        type_name: "replay_fixture".to_string(),
2838        id: new_id("fixture"),
2839        source_run_id: run.id.clone(),
2840        workflow_id: run.workflow_id.clone(),
2841        workflow_name: run.workflow_name.clone(),
2842        created_at: now_rfc3339(),
2843        eval_kind: Some("replay".to_string()),
2844        clarifying_question: None,
2845        expected_status: run.status.clone(),
2846        stage_assertions: run
2847            .stages
2848            .iter()
2849            .map(|stage| ReplayStageAssertion {
2850                node_id: stage.node_id.clone(),
2851                expected_status: stage.status.clone(),
2852                expected_outcome: stage.outcome.clone(),
2853                expected_branch: stage.branch.clone(),
2854                required_artifact_kinds: stage
2855                    .artifacts
2856                    .iter()
2857                    .map(|artifact| artifact.kind.clone())
2858                    .collect(),
2859                visible_text_contains: stage
2860                    .visible_text
2861                    .as_ref()
2862                    .filter(|text| !text.is_empty())
2863                    .map(|text| text.chars().take(80).collect()),
2864            })
2865            .collect(),
2866    }
2867}
2868
/// Compare a run against a replay fixture, producing a pass/fail report.
///
/// Fixtures with `eval_kind == "clarifying_question"` are delegated to the
/// dedicated clarifying-question matcher. Otherwise the run status plus every
/// per-stage assertion (status, outcome, branch, required artifact kinds,
/// visible-text snippet) is checked, with each mismatch appended as a
/// human-readable failure message.
pub fn evaluate_run_against_fixture(run: &RunRecord, fixture: &ReplayFixture) -> ReplayEvalReport {
    if fixture.eval_kind.as_deref() == Some("clarifying_question") {
        return evaluate_clarifying_question(run, fixture);
    }
    let mut failures = Vec::new();
    if run.status != fixture.expected_status {
        failures.push(format!(
            "run status mismatch: expected {}, got {}",
            fixture.expected_status, run.status
        ));
    }
    // Index stages by node id so each assertion is a map lookup, not a scan.
    let stages_by_id: BTreeMap<&str, &RunStageRecord> =
        run.stages.iter().map(|s| (s.node_id.as_str(), s)).collect();
    for assertion in &fixture.stage_assertions {
        let Some(stage) = stages_by_id.get(assertion.node_id.as_str()) else {
            failures.push(format!("missing stage {}", assertion.node_id));
            continue;
        };
        if stage.status != assertion.expected_status {
            failures.push(format!(
                "stage {} status mismatch: expected {}, got {}",
                assertion.node_id, assertion.expected_status, stage.status
            ));
        }
        if stage.outcome != assertion.expected_outcome {
            failures.push(format!(
                "stage {} outcome mismatch: expected {}, got {}",
                assertion.node_id, assertion.expected_outcome, stage.outcome
            ));
        }
        if stage.branch != assertion.expected_branch {
            failures.push(format!(
                "stage {} branch mismatch: expected {:?}, got {:?}",
                assertion.node_id, assertion.expected_branch, stage.branch
            ));
        }
        // Every required artifact kind must appear among the stage's artifacts.
        for required_kind in &assertion.required_artifact_kinds {
            if !stage
                .artifacts
                .iter()
                .any(|artifact| &artifact.kind == required_kind)
            {
                failures.push(format!(
                    "stage {} missing artifact kind {}",
                    assertion.node_id, required_kind
                ));
            }
        }
        if let Some(snippet) = &assertion.visible_text_contains {
            // Absent visible text is treated as empty, so it fails containment.
            let actual = stage.visible_text.clone().unwrap_or_default();
            if !actual.contains(snippet) {
                failures.push(format!(
                    "stage {} visible text does not contain expected snippet {:?}",
                    assertion.node_id, snippet
                ));
            }
        }
    }

    ReplayEvalReport {
        pass: failures.is_empty(),
        failures,
        stage_count: run.stages.len(),
    }
}
2934
/// Evaluate a clarifying-question fixture against a run's HITL questions.
///
/// Checks, in order: run status vs. the fixture's expected status; that the
/// number of HITL questions falls within the spec's min/max bounds; and, when
/// the spec constrains question text at all, that at least one question
/// matches the expected/accepted wording, contains every required term, and
/// avoids every forbidden term. All text comparisons use normalized form via
/// `normalize_question_text`.
fn evaluate_clarifying_question(run: &RunRecord, fixture: &ReplayFixture) -> ReplayEvalReport {
    let mut failures = Vec::new();
    // A fixture without a spec falls back to the spec type's Default.
    let spec = fixture.clarifying_question.clone().unwrap_or_default();
    let min_questions = clarifying_min_questions(&spec);
    let max_questions = clarifying_max_questions(&spec);
    let questions = &run.hitl_questions;

    if run.status != fixture.expected_status {
        failures.push(format!(
            "run status mismatch: expected {}, got {}",
            fixture.expected_status, run.status
        ));
    }
    if questions.len() < min_questions {
        failures.push(format!(
            "expected at least {min_questions} clarifying question(s), got {}",
            questions.len()
        ));
    }
    if questions.len() > max_questions {
        failures.push(format!(
            "expected at most {max_questions} clarifying question(s), got {}",
            questions.len()
        ));
    }

    // Pre-normalize every spec constraint once, outside the per-question loop.
    let normalized_expected = spec
        .expected_question
        .as_deref()
        .map(normalize_question_text);
    let normalized_accepted = spec
        .accepted_questions
        .iter()
        .map(|question| normalize_question_text(question))
        .collect::<Vec<_>>();
    let required_terms = spec
        .required_terms
        .iter()
        .map(|term| normalize_question_text(term))
        .collect::<Vec<_>>();
    let forbidden_terms = spec
        .forbidden_terms
        .iter()
        .map(|term| normalize_question_text(term))
        .collect::<Vec<_>>();

    // A question matches when it equals the expected wording (if any), is in
    // the accepted set (if non-empty), includes all required terms, and avoids
    // all forbidden terms.
    let matched = questions.iter().any(|question| {
        let normalized = normalize_question_text(&question.prompt);
        let matches_expected = normalized_expected
            .as_ref()
            .is_none_or(|expected| &normalized == expected)
            && (normalized_accepted.is_empty()
                || normalized_accepted
                    .iter()
                    .any(|candidate| candidate == &normalized));
        let has_required_terms = required_terms
            .iter()
            .all(|term| normalized.contains(term.as_str()));
        let avoids_forbidden_terms = forbidden_terms
            .iter()
            .all(|term| !normalized.contains(term.as_str()));
        matches_expected && has_required_terms && avoids_forbidden_terms
    });

    // Only flag a non-match when questions exist AND the spec actually
    // constrains text; an unconstrained spec never fails on wording.
    if !questions.is_empty()
        && (!normalized_accepted.is_empty()
            || normalized_expected.is_some()
            || !required_terms.is_empty()
            || !forbidden_terms.is_empty())
        && !matched
    {
        failures.push(format!(
            "no clarifying question matched fixture; actual questions: {}",
            questions
                .iter()
                .map(|question| format!("{:?}", question.prompt))
                .collect::<Vec<_>>()
                .join(", ")
        ));
    }

    ReplayEvalReport {
        pass: failures.is_empty(),
        failures,
        stage_count: run.stages.len(),
    }
}
3022
3023pub fn evaluate_run_suite(
3024    cases: Vec<(RunRecord, ReplayFixture, Option<String>)>,
3025) -> ReplayEvalSuiteReport {
3026    let mut reports = Vec::new();
3027    for (run, fixture, source_path) in cases {
3028        let report = evaluate_run_against_fixture(&run, &fixture);
3029        reports.push(ReplayEvalCaseReport {
3030            run_id: run.id.clone(),
3031            workflow_id: run.workflow_id.clone(),
3032            label: None,
3033            pass: report.pass,
3034            failures: report.failures,
3035            stage_count: report.stage_count,
3036            source_path,
3037            comparison: None,
3038        });
3039    }
3040    let total = reports.len();
3041    let passed = reports.iter().filter(|report| report.pass).count();
3042    let failed = total.saturating_sub(passed);
3043    ReplayEvalSuiteReport {
3044        pass: failed == 0,
3045        total,
3046        passed,
3047        failed,
3048        cases: reports,
3049    }
3050}
3051
3052pub fn diff_run_records(left: &RunRecord, right: &RunRecord) -> RunDiffReport {
3053    let mut stage_diffs = Vec::new();
3054    let mut all_node_ids = BTreeSet::new();
3055    let left_by_id: BTreeMap<&str, &RunStageRecord> = left
3056        .stages
3057        .iter()
3058        .map(|s| (s.node_id.as_str(), s))
3059        .collect();
3060    let right_by_id: BTreeMap<&str, &RunStageRecord> = right
3061        .stages
3062        .iter()
3063        .map(|s| (s.node_id.as_str(), s))
3064        .collect();
3065    all_node_ids.extend(left_by_id.keys().copied());
3066    all_node_ids.extend(right_by_id.keys().copied());
3067
3068    for node_id in all_node_ids {
3069        let left_stage = left_by_id.get(node_id).copied();
3070        let right_stage = right_by_id.get(node_id).copied();
3071        match (left_stage, right_stage) {
3072            (Some(_), None) => stage_diffs.push(RunStageDiffRecord {
3073                node_id: node_id.to_string(),
3074                change: "removed".to_string(),
3075                details: vec!["stage missing from right run".to_string()],
3076            }),
3077            (None, Some(_)) => stage_diffs.push(RunStageDiffRecord {
3078                node_id: node_id.to_string(),
3079                change: "added".to_string(),
3080                details: vec!["stage missing from left run".to_string()],
3081            }),
3082            (Some(left_stage), Some(right_stage)) => {
3083                let mut details = Vec::new();
3084                if left_stage.status != right_stage.status {
3085                    details.push(format!(
3086                        "status: {} -> {}",
3087                        left_stage.status, right_stage.status
3088                    ));
3089                }
3090                if left_stage.outcome != right_stage.outcome {
3091                    details.push(format!(
3092                        "outcome: {} -> {}",
3093                        left_stage.outcome, right_stage.outcome
3094                    ));
3095                }
3096                if left_stage.branch != right_stage.branch {
3097                    details.push(format!(
3098                        "branch: {:?} -> {:?}",
3099                        left_stage.branch, right_stage.branch
3100                    ));
3101                }
3102                if left_stage.produced_artifact_ids.len() != right_stage.produced_artifact_ids.len()
3103                {
3104                    details.push(format!(
3105                        "produced_artifacts: {} -> {}",
3106                        left_stage.produced_artifact_ids.len(),
3107                        right_stage.produced_artifact_ids.len()
3108                    ));
3109                }
3110                if left_stage.artifacts.len() != right_stage.artifacts.len() {
3111                    details.push(format!(
3112                        "artifact_records: {} -> {}",
3113                        left_stage.artifacts.len(),
3114                        right_stage.artifacts.len()
3115                    ));
3116                }
3117                if !details.is_empty() {
3118                    stage_diffs.push(RunStageDiffRecord {
3119                        node_id: node_id.to_string(),
3120                        change: "changed".to_string(),
3121                        details,
3122                    });
3123                }
3124            }
3125            (None, None) => {}
3126        }
3127    }
3128
3129    let mut tool_diffs = Vec::new();
3130    let left_tools: std::collections::BTreeMap<(String, String), &ToolCallRecord> = left
3131        .tool_recordings
3132        .iter()
3133        .map(|r| ((r.tool_name.clone(), r.args_hash.clone()), r))
3134        .collect();
3135    let right_tools: std::collections::BTreeMap<(String, String), &ToolCallRecord> = right
3136        .tool_recordings
3137        .iter()
3138        .map(|r| ((r.tool_name.clone(), r.args_hash.clone()), r))
3139        .collect();
3140    let all_tool_keys: std::collections::BTreeSet<_> = left_tools
3141        .keys()
3142        .chain(right_tools.keys())
3143        .cloned()
3144        .collect();
3145    for key in &all_tool_keys {
3146        let l = left_tools.get(key);
3147        let r = right_tools.get(key);
3148        let result_changed = match (l, r) {
3149            (Some(a), Some(b)) => a.result != b.result,
3150            _ => true,
3151        };
3152        if result_changed {
3153            tool_diffs.push(ToolCallDiffRecord {
3154                tool_name: key.0.clone(),
3155                args_hash: key.1.clone(),
3156                result_changed,
3157                left_result: l.map(|t| t.result.clone()),
3158                right_result: r.map(|t| t.result.clone()),
3159            });
3160        }
3161    }
3162
3163    let left_observability = left.observability.clone().unwrap_or_else(|| {
3164        derive_run_observability(left, left.persisted_path.as_deref().map(Path::new))
3165    });
3166    let right_observability = right.observability.clone().unwrap_or_else(|| {
3167        derive_run_observability(right, right.persisted_path.as_deref().map(Path::new))
3168    });
3169    let mut observability_diffs = Vec::new();
3170
3171    let left_workers = left_observability
3172        .worker_lineage
3173        .iter()
3174        .map(|worker| {
3175            (
3176                worker.worker_id.clone(),
3177                (
3178                    worker.status.clone(),
3179                    worker.run_id.clone(),
3180                    worker.run_path.clone(),
3181                ),
3182            )
3183        })
3184        .collect::<BTreeMap<_, _>>();
3185    let right_workers = right_observability
3186        .worker_lineage
3187        .iter()
3188        .map(|worker| {
3189            (
3190                worker.worker_id.clone(),
3191                (
3192                    worker.status.clone(),
3193                    worker.run_id.clone(),
3194                    worker.run_path.clone(),
3195                ),
3196            )
3197        })
3198        .collect::<BTreeMap<_, _>>();
3199    let worker_ids = left_workers
3200        .keys()
3201        .chain(right_workers.keys())
3202        .cloned()
3203        .collect::<BTreeSet<_>>();
3204    for worker_id in worker_ids {
3205        match (left_workers.get(&worker_id), right_workers.get(&worker_id)) {
3206            (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3207                section: "worker_lineage".to_string(),
3208                label: worker_id,
3209                details: vec!["worker missing from right run".to_string()],
3210            }),
3211            (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3212                section: "worker_lineage".to_string(),
3213                label: worker_id,
3214                details: vec!["worker missing from left run".to_string()],
3215            }),
3216            (Some(left_worker), Some(right_worker)) if left_worker != right_worker => {
3217                let mut details = Vec::new();
3218                if left_worker.0 != right_worker.0 {
3219                    details.push(format!("status: {} -> {}", left_worker.0, right_worker.0));
3220                }
3221                if left_worker.1 != right_worker.1 {
3222                    details.push(format!(
3223                        "run_id: {:?} -> {:?}",
3224                        left_worker.1, right_worker.1
3225                    ));
3226                }
3227                if left_worker.2 != right_worker.2 {
3228                    details.push(format!(
3229                        "run_path: {:?} -> {:?}",
3230                        left_worker.2, right_worker.2
3231                    ));
3232                }
3233                observability_diffs.push(RunObservabilityDiffRecord {
3234                    section: "worker_lineage".to_string(),
3235                    label: worker_id,
3236                    details,
3237                });
3238            }
3239            _ => {}
3240        }
3241    }
3242
3243    let left_rounds = left_observability
3244        .planner_rounds
3245        .iter()
3246        .map(|round| (round.stage_id.clone(), round))
3247        .collect::<BTreeMap<_, _>>();
3248    let right_rounds = right_observability
3249        .planner_rounds
3250        .iter()
3251        .map(|round| (round.stage_id.clone(), round))
3252        .collect::<BTreeMap<_, _>>();
3253    let round_ids = left_rounds
3254        .keys()
3255        .chain(right_rounds.keys())
3256        .cloned()
3257        .collect::<BTreeSet<_>>();
3258    for stage_id in round_ids {
3259        match (left_rounds.get(&stage_id), right_rounds.get(&stage_id)) {
3260            (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3261                section: "planner_rounds".to_string(),
3262                label: stage_id,
3263                details: vec!["planner summary missing from right run".to_string()],
3264            }),
3265            (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3266                section: "planner_rounds".to_string(),
3267                label: stage_id,
3268                details: vec!["planner summary missing from left run".to_string()],
3269            }),
3270            (Some(left_round), Some(right_round)) => {
3271                let mut details = Vec::new();
3272                if left_round.iteration_count != right_round.iteration_count {
3273                    details.push(format!(
3274                        "iterations: {} -> {}",
3275                        left_round.iteration_count, right_round.iteration_count
3276                    ));
3277                }
3278                if left_round.tool_execution_count != right_round.tool_execution_count {
3279                    details.push(format!(
3280                        "tool_executions: {} -> {}",
3281                        left_round.tool_execution_count, right_round.tool_execution_count
3282                    ));
3283                }
3284                if left_round.native_text_tool_fallback_count
3285                    != right_round.native_text_tool_fallback_count
3286                {
3287                    details.push(format!(
3288                        "native_text_tool_fallbacks: {} -> {}",
3289                        left_round.native_text_tool_fallback_count,
3290                        right_round.native_text_tool_fallback_count
3291                    ));
3292                }
3293                if left_round.native_text_tool_fallback_rejection_count
3294                    != right_round.native_text_tool_fallback_rejection_count
3295                {
3296                    details.push(format!(
3297                        "native_text_tool_fallback_rejections: {} -> {}",
3298                        left_round.native_text_tool_fallback_rejection_count,
3299                        right_round.native_text_tool_fallback_rejection_count
3300                    ));
3301                }
3302                if left_round.empty_completion_retry_count
3303                    != right_round.empty_completion_retry_count
3304                {
3305                    details.push(format!(
3306                        "empty_completion_retries: {} -> {}",
3307                        left_round.empty_completion_retry_count,
3308                        right_round.empty_completion_retry_count
3309                    ));
3310                }
3311                if left_round.research_facts != right_round.research_facts {
3312                    details.push(format!(
3313                        "research_facts: {:?} -> {:?}",
3314                        left_round.research_facts, right_round.research_facts
3315                    ));
3316                }
3317                let left_deliverables = left_round
3318                    .task_ledger
3319                    .as_ref()
3320                    .map(|ledger| {
3321                        ledger
3322                            .deliverables
3323                            .iter()
3324                            .map(|item| format!("{}:{}", item.id, item.status))
3325                            .collect::<Vec<_>>()
3326                    })
3327                    .unwrap_or_default();
3328                let right_deliverables = right_round
3329                    .task_ledger
3330                    .as_ref()
3331                    .map(|ledger| {
3332                        ledger
3333                            .deliverables
3334                            .iter()
3335                            .map(|item| format!("{}:{}", item.id, item.status))
3336                            .collect::<Vec<_>>()
3337                    })
3338                    .unwrap_or_default();
3339                if left_deliverables != right_deliverables {
3340                    details.push(format!(
3341                        "deliverables: {:?} -> {:?}",
3342                        left_deliverables, right_deliverables
3343                    ));
3344                }
3345                if left_round.successful_tools != right_round.successful_tools {
3346                    details.push(format!(
3347                        "successful_tools: {:?} -> {:?}",
3348                        left_round.successful_tools, right_round.successful_tools
3349                    ));
3350                }
3351                if !details.is_empty() {
3352                    observability_diffs.push(RunObservabilityDiffRecord {
3353                        section: "planner_rounds".to_string(),
3354                        label: left_round.node_id.clone(),
3355                        details,
3356                    });
3357                }
3358            }
3359            _ => {}
3360        }
3361    }
3362
3363    let left_pointers = left_observability
3364        .transcript_pointers
3365        .iter()
3366        .map(|pointer| {
3367            (
3368                pointer.id.clone(),
3369                (
3370                    pointer.available,
3371                    pointer.path.clone(),
3372                    pointer.location.clone(),
3373                ),
3374            )
3375        })
3376        .collect::<BTreeMap<_, _>>();
3377    let right_pointers = right_observability
3378        .transcript_pointers
3379        .iter()
3380        .map(|pointer| {
3381            (
3382                pointer.id.clone(),
3383                (
3384                    pointer.available,
3385                    pointer.path.clone(),
3386                    pointer.location.clone(),
3387                ),
3388            )
3389        })
3390        .collect::<BTreeMap<_, _>>();
3391    let pointer_ids = left_pointers
3392        .keys()
3393        .chain(right_pointers.keys())
3394        .cloned()
3395        .collect::<BTreeSet<_>>();
3396    for pointer_id in pointer_ids {
3397        match (
3398            left_pointers.get(&pointer_id),
3399            right_pointers.get(&pointer_id),
3400        ) {
3401            (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3402                section: "transcript_pointers".to_string(),
3403                label: pointer_id,
3404                details: vec!["pointer missing from right run".to_string()],
3405            }),
3406            (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3407                section: "transcript_pointers".to_string(),
3408                label: pointer_id,
3409                details: vec!["pointer missing from left run".to_string()],
3410            }),
3411            (Some(left_pointer), Some(right_pointer)) if left_pointer != right_pointer => {
3412                observability_diffs.push(RunObservabilityDiffRecord {
3413                    section: "transcript_pointers".to_string(),
3414                    label: pointer_id,
3415                    details: vec![format!(
3416                        "pointer: {:?} -> {:?}",
3417                        left_pointer, right_pointer
3418                    )],
3419                });
3420            }
3421            _ => {}
3422        }
3423    }
3424
3425    let left_compactions = left_observability
3426        .compaction_events
3427        .iter()
3428        .map(|event| {
3429            (
3430                event.id.clone(),
3431                (
3432                    event.strategy.clone(),
3433                    event.archived_messages,
3434                    event.snapshot_asset_id.clone(),
3435                    event.available,
3436                ),
3437            )
3438        })
3439        .collect::<BTreeMap<_, _>>();
3440    let right_compactions = right_observability
3441        .compaction_events
3442        .iter()
3443        .map(|event| {
3444            (
3445                event.id.clone(),
3446                (
3447                    event.strategy.clone(),
3448                    event.archived_messages,
3449                    event.snapshot_asset_id.clone(),
3450                    event.available,
3451                ),
3452            )
3453        })
3454        .collect::<BTreeMap<_, _>>();
3455    let compaction_ids = left_compactions
3456        .keys()
3457        .chain(right_compactions.keys())
3458        .cloned()
3459        .collect::<BTreeSet<_>>();
3460    for compaction_id in compaction_ids {
3461        match (
3462            left_compactions.get(&compaction_id),
3463            right_compactions.get(&compaction_id),
3464        ) {
3465            (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3466                section: "compaction_events".to_string(),
3467                label: compaction_id,
3468                details: vec!["compaction event missing from right run".to_string()],
3469            }),
3470            (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3471                section: "compaction_events".to_string(),
3472                label: compaction_id,
3473                details: vec!["compaction event missing from left run".to_string()],
3474            }),
3475            (Some(left_event), Some(right_event)) if left_event != right_event => {
3476                observability_diffs.push(RunObservabilityDiffRecord {
3477                    section: "compaction_events".to_string(),
3478                    label: compaction_id,
3479                    details: vec![format!("event: {:?} -> {:?}", left_event, right_event)],
3480                });
3481            }
3482            _ => {}
3483        }
3484    }
3485
3486    let left_daemons = left_observability
3487        .daemon_events
3488        .iter()
3489        .map(|event| {
3490            (
3491                (event.daemon_id.clone(), event.kind, event.timestamp.clone()),
3492                (
3493                    event.name.clone(),
3494                    event.persist_path.clone(),
3495                    event.payload_summary.clone(),
3496                ),
3497            )
3498        })
3499        .collect::<BTreeMap<_, _>>();
3500    let right_daemons = right_observability
3501        .daemon_events
3502        .iter()
3503        .map(|event| {
3504            (
3505                (event.daemon_id.clone(), event.kind, event.timestamp.clone()),
3506                (
3507                    event.name.clone(),
3508                    event.persist_path.clone(),
3509                    event.payload_summary.clone(),
3510                ),
3511            )
3512        })
3513        .collect::<BTreeMap<_, _>>();
3514    let daemon_keys = left_daemons
3515        .keys()
3516        .chain(right_daemons.keys())
3517        .cloned()
3518        .collect::<BTreeSet<_>>();
3519    for daemon_key in daemon_keys {
3520        let label = format!("{}:{:?}:{}", daemon_key.0, daemon_key.1, daemon_key.2);
3521        match (
3522            left_daemons.get(&daemon_key),
3523            right_daemons.get(&daemon_key),
3524        ) {
3525            (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3526                section: "daemon_events".to_string(),
3527                label,
3528                details: vec!["daemon event missing from right run".to_string()],
3529            }),
3530            (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3531                section: "daemon_events".to_string(),
3532                label,
3533                details: vec!["daemon event missing from left run".to_string()],
3534            }),
3535            (Some(left_event), Some(right_event)) if left_event != right_event => {
3536                observability_diffs.push(RunObservabilityDiffRecord {
3537                    section: "daemon_events".to_string(),
3538                    label,
3539                    details: vec![format!("event: {:?} -> {:?}", left_event, right_event)],
3540                });
3541            }
3542            _ => {}
3543        }
3544    }
3545
3546    let left_verification = left_observability
3547        .verification_outcomes
3548        .iter()
3549        .map(|item| (item.stage_id.clone(), item))
3550        .collect::<BTreeMap<_, _>>();
3551    let right_verification = right_observability
3552        .verification_outcomes
3553        .iter()
3554        .map(|item| (item.stage_id.clone(), item))
3555        .collect::<BTreeMap<_, _>>();
3556    let verification_ids = left_verification
3557        .keys()
3558        .chain(right_verification.keys())
3559        .cloned()
3560        .collect::<BTreeSet<_>>();
3561    for stage_id in verification_ids {
3562        match (
3563            left_verification.get(&stage_id),
3564            right_verification.get(&stage_id),
3565        ) {
3566            (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3567                section: "verification".to_string(),
3568                label: stage_id,
3569                details: vec!["verification missing from right run".to_string()],
3570            }),
3571            (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3572                section: "verification".to_string(),
3573                label: stage_id,
3574                details: vec!["verification missing from left run".to_string()],
3575            }),
3576            (Some(left_item), Some(right_item)) if left_item != right_item => {
3577                let mut details = Vec::new();
3578                if left_item.passed != right_item.passed {
3579                    details.push(format!(
3580                        "passed: {:?} -> {:?}",
3581                        left_item.passed, right_item.passed
3582                    ));
3583                }
3584                if left_item.summary != right_item.summary {
3585                    details.push(format!(
3586                        "summary: {:?} -> {:?}",
3587                        left_item.summary, right_item.summary
3588                    ));
3589                }
3590                observability_diffs.push(RunObservabilityDiffRecord {
3591                    section: "verification".to_string(),
3592                    label: left_item.node_id.clone(),
3593                    details,
3594                });
3595            }
3596            _ => {}
3597        }
3598    }
3599
3600    let left_graph = (
3601        left_observability.action_graph_nodes.len(),
3602        left_observability.action_graph_edges.len(),
3603    );
3604    let right_graph = (
3605        right_observability.action_graph_nodes.len(),
3606        right_observability.action_graph_edges.len(),
3607    );
3608    if left_graph != right_graph {
3609        observability_diffs.push(RunObservabilityDiffRecord {
3610            section: "action_graph".to_string(),
3611            label: "shape".to_string(),
3612            details: vec![format!(
3613                "nodes/edges: {}/{} -> {}/{}",
3614                left_graph.0, left_graph.1, right_graph.0, right_graph.1
3615            )],
3616        });
3617    }
3618
3619    let status_changed = left.status != right.status;
3620    let identical = !status_changed
3621        && stage_diffs.is_empty()
3622        && tool_diffs.is_empty()
3623        && observability_diffs.is_empty()
3624        && left.transitions.len() == right.transitions.len()
3625        && left.artifacts.len() == right.artifacts.len()
3626        && left.checkpoints.len() == right.checkpoints.len();
3627
3628    RunDiffReport {
3629        left_run_id: left.id.clone(),
3630        right_run_id: right.id.clone(),
3631        identical,
3632        status_changed,
3633        left_status: left.status.clone(),
3634        right_status: right.status.clone(),
3635        stage_diffs,
3636        tool_diffs,
3637        observability_diffs,
3638        transition_count_delta: right.transitions.len() as isize - left.transitions.len() as isize,
3639        artifact_count_delta: right.artifacts.len() as isize - left.artifacts.len() as isize,
3640        checkpoint_count_delta: right.checkpoints.len() as isize - left.checkpoints.len() as isize,
3641    }
3642}
3643
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    /// Repository root: two directory levels above this crate's manifest.
    fn repo_root() -> PathBuf {
        Path::new(env!("CARGO_MANIFEST_DIR"))
            .ancestors()
            .nth(2)
            .unwrap()
            .to_path_buf()
    }

    /// Smallest run record the eval-pack tests need: the requested `status`
    /// plus a stub usage summary and a replay fixture expecting completion.
    fn minimal_run(status: &str) -> RunRecord {
        let usage = LlmUsageRecord {
            total_duration_ms: 12,
            total_cost: 0.01,
            input_tokens: 3,
            output_tokens: 4,
            call_count: 1,
            models: vec!["mock".to_string()],
        };
        let fixture = ReplayFixture {
            type_name: "replay_fixture".to_string(),
            expected_status: "completed".to_string(),
            ..ReplayFixture::default()
        };
        RunRecord {
            type_name: "workflow_run".to_string(),
            id: "run_1".to_string(),
            workflow_id: "workflow_1".to_string(),
            status: status.to_string(),
            usage: Some(usage),
            replay_fixture: Some(fixture),
            ..RunRecord::default()
        }
    }

    #[test]
    fn eval_pack_manifest_toml_runs_replay_case() {
        let workspace = tempfile::tempdir().unwrap();
        let run_json = serde_json::to_string(&minimal_run("completed")).unwrap();
        fs::write(workspace.path().join("run.json"), run_json).unwrap();
        let pack = r#"
version = 1
id = "connector-regressions"
name = "Connector regressions"

[[cases]]
id = "webhook"
name = "Webhook normalization"
run = "run.json"
rubrics = ["status"]

[[rubrics]]
id = "status"
kind = "deterministic"

[[rubrics.assertions]]
kind = "run-status"
expected = "completed"
"#;
        let pack_path = workspace.path().join("harn.eval.toml");
        fs::write(&pack_path, pack).unwrap();

        let manifest = load_eval_pack_manifest(&pack_path).unwrap();
        let report = evaluate_eval_pack_manifest(&manifest).unwrap();

        assert!(report.pass);
        assert_eq!(report.total, 1);
        assert_eq!(report.cases[0].label, "Webhook normalization");
    }

    #[test]
    fn eval_pack_warning_case_does_not_block() {
        let workspace = tempfile::tempdir().unwrap();
        let run_json = serde_json::to_string(&minimal_run("completed")).unwrap();
        fs::write(workspace.path().join("run.json"), run_json).unwrap();
        // A warning-severity case may fail its threshold without failing the pack.
        let pack = r#"
version = 1
id = "budgets"

[[cases]]
id = "latency-budget"
run = "run.json"
severity = "warning"

[cases.thresholds]
max-latency-ms = 1
"#;
        let pack_path = workspace.path().join("harn.eval.toml");
        fs::write(&pack_path, pack).unwrap();

        let manifest = load_eval_pack_manifest(&pack_path).unwrap();
        let report = evaluate_eval_pack_manifest(&manifest).unwrap();

        assert!(report.pass);
        assert_eq!(report.warning_failed, 1);
        assert!(report.cases[0].warnings[0].contains("latency"));
    }

    #[test]
    fn eval_pack_manifest_runs_persona_ladder() {
        let workspace = tempfile::tempdir().unwrap();
        let pack_path = workspace.path().join("harn.eval.toml");
        // Debug-format the paths so they land in the manifest as quoted,
        // escape-safe TOML strings (backslashes included).
        let base_dir = format!("{:?}", repo_root().display().to_string());
        let artifact_dir = workspace.path().join("artifacts").display().to_string();
        let artifact_root = format!("{:?}", artifact_dir);
        let pack = format!(
            r#"
version = 1
id = "merge-captain-ladders"
base_dir = {}

[[ladders]]
id = "merge-captain-timeout"
persona = "merge_captain"
artifact-root = {}

[ladders.backend]
kind = "replay"
path = "examples/personas/merge_captain/transcripts/green_pr.jsonl"

[[ladders.model-routes]]
id = "gemma-value"
route = "local/gemma-value"
provider = "llama.cpp"
model = "gemma"
profile = "value"

[[ladders.timeout-tiers]]
id = "tiny"
max-tool-calls = 1

[[ladders.timeout-tiers]]
id = "balanced"
max-tool-calls = 4
max-model-calls = 1
"#,
            base_dir, artifact_root
        );
        fs::write(&pack_path, pack).unwrap();

        let manifest = load_eval_pack_manifest(&pack_path).unwrap();
        let report = evaluate_eval_pack_manifest(&manifest).unwrap();

        assert!(report.pass);
        assert_eq!(report.total, 1);
        assert_eq!(report.ladders.len(), 1);
        // The tiny tier is too constrained; only "balanced" should succeed.
        let ladder = &report.ladders[0];
        assert_eq!(ladder.first_correct_tier.as_deref(), Some("balanced"));
        assert_eq!(ladder.tiers[0].outcome, "degraded");
        assert_eq!(ladder.tiers[1].outcome, "correct");
    }

    #[test]
    fn eval_pack_manifest_runs_friction_context_pack_case() {
        let workspace = tempfile::tempdir().unwrap();
        // Two recurrences of the same redacted Splunk query should be enough
        // to trigger a context-pack suggestion.
        let events = r#"
{
  "events": [
    {
      "kind": "repeated_query",
      "source": "incident-triage",
      "actor": "sre",
      "tool": "splunk",
      "provider": "splunk",
      "redacted_summary": "Checkout incidents need the same Splunk search",
      "recurrence_hints": ["checkout incident queries"],
      "estimated_time_ms": 300000,
      "metadata": {
        "query": "index=checkout service=api error",
        "capability": "splunk.search",
        "secret_ref": "SPLUNK_READ_TOKEN",
        "output_slot": "splunk_errors"
      }
    },
    {
      "kind": "repeated_query",
      "source": "incident-triage",
      "actor": "sre",
      "tool": "splunk",
      "provider": "splunk",
      "redacted_summary": "Checkout incident triage repeated the Splunk search",
      "recurrence_hints": ["checkout incident queries"],
      "estimated_time_ms": 240000,
      "metadata": {
        "query": "index=checkout service=api error",
        "capability": "splunk.search",
        "secret_ref": "SPLUNK_READ_TOKEN",
        "output_slot": "splunk_errors"
      }
    }
  ]
}
"#;
        fs::write(workspace.path().join("incident-friction.json"), events).unwrap();
        let pack = r#"
version = 1
id = "team-learning"
name = "Team learning evals"

[[fixtures]]
id = "incident-friction"
kind = "friction-events"
path = "incident-friction.json"

[[cases]]
id = "incident-context-pack"
name = "Incident context pack suggestion"
friction_events = "incident-friction"
rubrics = ["context-pack"]

[[rubrics]]
id = "context-pack"
kind = "friction"

[[rubrics.assertions]]
kind = "context-pack-suggestion"
contains = "incident"
expected = { min_suggestions = 1, recommended_artifact = "context_pack", required_capability = "splunk.search", required_output_slot = "splunk_errors" }
"#;
        let pack_path = workspace.path().join("harn.eval.toml");
        fs::write(&pack_path, pack).unwrap();

        let manifest = load_eval_pack_manifest(&pack_path).unwrap();
        let report = evaluate_eval_pack_manifest(&manifest).unwrap();

        assert!(report.pass);
        assert_eq!(report.total, 1);
        assert_eq!(report.cases[0].run_id, "friction_events");
        assert_eq!(report.cases[0].stage_count, 2);
    }
}
3901}