// harn_vm/orchestration/records.rs
//! Run records, replay fixtures, eval reports, and diff utilities.

3use std::collections::{BTreeMap, BTreeSet};
4use std::path::{Path, PathBuf};
5
6use serde::{Deserialize, Serialize};
7
8use super::{
9    default_run_dir, evaluate_context_pack_suggestion_expectations,
10    generate_context_pack_suggestions, new_id, normalize_friction_events_json, now_rfc3339,
11    parse_json_payload, parse_json_value, run_persona_eval_ladder, sync_run_handoffs,
12    ArtifactRecord, CapabilityPolicy, ContextPackSuggestionExpectation,
13    ContextPackSuggestionOptions, FrictionEvent, HandoffArtifact, PersonaEvalLadderManifest,
14    PersonaEvalLadderReport,
15};
16use crate::agent_events::AgentEvent;
17use crate::event_log::{
18    active_event_log, sanitize_topic_component, AnyEventLog, EventId, EventLog,
19    LogEvent as EventLogRecord, Topic,
20};
21use crate::llm::vm_value_to_json;
22use crate::triggers::{SignatureStatus, TriggerEvent};
23use crate::value::{VmError, VmValue};
24
25pub const ACTION_GRAPH_NODE_KIND_RUN: &str = "run";
26pub const ACTION_GRAPH_NODE_KIND_TRIGGER: &str = "trigger";
27pub const ACTION_GRAPH_NODE_KIND_PREDICATE: &str = "predicate";
28pub const ACTION_GRAPH_NODE_KIND_TRIGGER_PREDICATE: &str = "trigger_predicate";
29pub const ACTION_GRAPH_NODE_KIND_STAGE: &str = "stage";
30pub const ACTION_GRAPH_NODE_KIND_WORKER: &str = "worker";
31pub const ACTION_GRAPH_NODE_KIND_DISPATCH: &str = "dispatch";
32pub const ACTION_GRAPH_NODE_KIND_A2A_HOP: &str = "a2a_hop";
33pub const ACTION_GRAPH_NODE_KIND_WORKER_ENQUEUE: &str = "worker_enqueue";
34pub const ACTION_GRAPH_NODE_KIND_RETRY: &str = "retry";
35pub const ACTION_GRAPH_NODE_KIND_DLQ: &str = "dlq";
36
37pub const ACTION_GRAPH_EDGE_KIND_ENTRY: &str = "entry";
38pub const ACTION_GRAPH_EDGE_KIND_TRIGGER_DISPATCH: &str = "trigger_dispatch";
39pub const ACTION_GRAPH_EDGE_KIND_A2A_DISPATCH: &str = "a2a_dispatch";
40pub const ACTION_GRAPH_EDGE_KIND_PREDICATE_GATE: &str = "predicate_gate";
41pub const ACTION_GRAPH_EDGE_KIND_REPLAY_CHAIN: &str = "replay_chain";
42pub const ACTION_GRAPH_EDGE_KIND_TRANSITION: &str = "transition";
43pub const ACTION_GRAPH_EDGE_KIND_DELEGATES: &str = "delegates";
44pub const ACTION_GRAPH_EDGE_KIND_RETRY: &str = "retry";
45pub const ACTION_GRAPH_EDGE_KIND_DLQ_MOVE: &str = "dlq_move";
46
47#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
48#[serde(default)]
49pub struct LlmUsageRecord {
50    pub input_tokens: i64,
51    pub output_tokens: i64,
52    pub total_duration_ms: i64,
53    pub call_count: i64,
54    pub total_cost: f64,
55    pub models: Vec<String>,
56}
57
58#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
59#[serde(default)]
60pub struct RunStageRecord {
61    pub id: String,
62    pub node_id: String,
63    pub kind: String,
64    pub status: String,
65    pub outcome: String,
66    pub branch: Option<String>,
67    pub started_at: String,
68    pub finished_at: Option<String>,
69    pub visible_text: Option<String>,
70    pub private_reasoning: Option<String>,
71    pub transcript: Option<serde_json::Value>,
72    pub verification: Option<serde_json::Value>,
73    pub usage: Option<LlmUsageRecord>,
74    pub artifacts: Vec<ArtifactRecord>,
75    pub consumed_artifact_ids: Vec<String>,
76    pub produced_artifact_ids: Vec<String>,
77    pub attempts: Vec<RunStageAttemptRecord>,
78    pub metadata: BTreeMap<String, serde_json::Value>,
79}
80
81#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
82#[serde(default)]
83pub struct RunStageAttemptRecord {
84    pub attempt: usize,
85    pub status: String,
86    pub outcome: String,
87    pub branch: Option<String>,
88    pub error: Option<String>,
89    pub verification: Option<serde_json::Value>,
90    pub started_at: String,
91    pub finished_at: Option<String>,
92}
93
94#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
95#[serde(default)]
96pub struct RunTransitionRecord {
97    pub id: String,
98    pub from_stage_id: Option<String>,
99    pub from_node_id: Option<String>,
100    pub to_node_id: String,
101    pub branch: Option<String>,
102    pub timestamp: String,
103    pub consumed_artifact_ids: Vec<String>,
104    pub produced_artifact_ids: Vec<String>,
105}
106
107#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
108#[serde(default)]
109pub struct RunCheckpointRecord {
110    pub id: String,
111    pub ready_nodes: Vec<String>,
112    pub completed_nodes: Vec<String>,
113    pub last_stage_id: Option<String>,
114    pub persisted_at: String,
115    pub reason: String,
116}
117
118#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
119#[serde(default)]
120pub struct ReplayFixture {
121    #[serde(rename = "_type")]
122    pub type_name: String,
123    pub id: String,
124    pub source_run_id: String,
125    pub workflow_id: String,
126    pub workflow_name: Option<String>,
127    pub created_at: String,
128    pub eval_kind: Option<String>,
129    pub clarifying_question: Option<ClarifyingQuestionEvalSpec>,
130    pub expected_status: String,
131    pub stage_assertions: Vec<ReplayStageAssertion>,
132}
133
134#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
135#[serde(default)]
136pub struct ClarifyingQuestionEvalSpec {
137    pub expected_question: Option<String>,
138    pub accepted_questions: Vec<String>,
139    pub required_terms: Vec<String>,
140    pub forbidden_terms: Vec<String>,
141    pub min_questions: usize,
142    pub max_questions: Option<usize>,
143}
144
145#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
146#[serde(default)]
147pub struct ReplayStageAssertion {
148    pub node_id: String,
149    pub expected_status: String,
150    pub expected_outcome: String,
151    pub expected_branch: Option<String>,
152    pub required_artifact_kinds: Vec<String>,
153    pub visible_text_contains: Option<String>,
154}
155
156#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
157#[serde(default)]
158pub struct ReplayEvalReport {
159    pub pass: bool,
160    pub failures: Vec<String>,
161    pub stage_count: usize,
162}
163
164#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
165#[serde(default)]
166pub struct ReplayEvalCaseReport {
167    pub run_id: String,
168    pub workflow_id: String,
169    pub label: Option<String>,
170    pub pass: bool,
171    pub failures: Vec<String>,
172    pub stage_count: usize,
173    pub source_path: Option<String>,
174    pub comparison: Option<RunDiffReport>,
175}
176
177#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
178#[serde(default)]
179pub struct ReplayEvalSuiteReport {
180    pub pass: bool,
181    pub total: usize,
182    pub passed: usize,
183    pub failed: usize,
184    pub cases: Vec<ReplayEvalCaseReport>,
185}
186
187#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
188#[serde(default)]
189pub struct RunDeliverableSummaryRecord {
190    pub id: String,
191    pub text: String,
192    pub status: String,
193    pub note: Option<String>,
194}
195
196#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
197#[serde(default)]
198pub struct RunTaskLedgerSummaryRecord {
199    pub root_task: String,
200    pub rationale: String,
201    pub deliverables: Vec<RunDeliverableSummaryRecord>,
202    pub observations: Vec<String>,
203    pub blocking_count: usize,
204}
205
206#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
207#[serde(default)]
208pub struct RunPlannerRoundRecord {
209    pub stage_id: String,
210    pub node_id: String,
211    pub stage_kind: String,
212    pub status: String,
213    pub outcome: String,
214    pub iteration_count: usize,
215    pub llm_call_count: usize,
216    pub tool_execution_count: usize,
217    pub tool_rejection_count: usize,
218    pub intervention_count: usize,
219    pub compaction_count: usize,
220    pub native_text_tool_fallback_count: usize,
221    pub native_text_tool_fallback_rejection_count: usize,
222    pub empty_completion_retry_count: usize,
223    pub tools_used: Vec<String>,
224    pub successful_tools: Vec<String>,
225    pub ledger_done_rejections: usize,
226    pub task_ledger: Option<RunTaskLedgerSummaryRecord>,
227    pub research_facts: Vec<String>,
228}
229
230#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
231#[serde(default)]
232pub struct RunWorkerLineageRecord {
233    pub worker_id: String,
234    pub worker_name: String,
235    pub parent_stage_id: Option<String>,
236    pub task: String,
237    pub status: String,
238    pub session_id: Option<String>,
239    pub parent_session_id: Option<String>,
240    pub run_id: Option<String>,
241    pub run_path: Option<String>,
242    pub snapshot_path: Option<String>,
243}
244
245#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
246#[serde(default)]
247pub struct RunActionGraphNodeRecord {
248    pub id: String,
249    pub label: String,
250    pub kind: String,
251    pub status: String,
252    pub outcome: String,
253    pub trace_id: Option<String>,
254    pub stage_id: Option<String>,
255    pub node_id: Option<String>,
256    pub worker_id: Option<String>,
257    pub run_id: Option<String>,
258    pub run_path: Option<String>,
259    pub metadata: BTreeMap<String, serde_json::Value>,
260}
261
262#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
263#[serde(default)]
264pub struct RunActionGraphEdgeRecord {
265    pub from_id: String,
266    pub to_id: String,
267    pub kind: String,
268    pub label: Option<String>,
269}
270
271#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
272#[serde(default)]
273pub struct RunVerificationOutcomeRecord {
274    pub stage_id: String,
275    pub node_id: String,
276    pub status: String,
277    pub passed: Option<bool>,
278    pub summary: Option<String>,
279}
280
281#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
282#[serde(default)]
283pub struct RunTranscriptPointerRecord {
284    pub id: String,
285    pub label: String,
286    pub kind: String,
287    pub location: String,
288    pub path: Option<String>,
289    pub available: bool,
290}
291
292#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
293#[serde(default)]
294pub struct CompactionEventRecord {
295    pub id: String,
296    pub transcript_id: Option<String>,
297    pub stage_id: Option<String>,
298    pub node_id: Option<String>,
299    pub mode: String,
300    pub strategy: String,
301    pub archived_messages: usize,
302    pub estimated_tokens_before: usize,
303    pub estimated_tokens_after: usize,
304    pub snapshot_asset_id: Option<String>,
305    pub snapshot_location: String,
306    pub snapshot_path: Option<String>,
307    pub available: bool,
308}
309
310#[derive(Clone, Copy, Debug, Default, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
311#[serde(rename_all = "snake_case")]
312pub enum DaemonEventKindRecord {
313    #[default]
314    Spawned,
315    Triggered,
316    Snapshotted,
317    Resumed,
318    Stopped,
319}
320
321#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
322#[serde(default)]
323pub struct DaemonEventRecord {
324    pub daemon_id: String,
325    pub name: String,
326    pub kind: DaemonEventKindRecord,
327    pub timestamp: String,
328    pub persist_path: String,
329    pub payload_summary: Option<String>,
330}
331
332#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
333#[serde(default)]
334pub struct RunObservabilityRecord {
335    pub schema_version: usize,
336    pub planner_rounds: Vec<RunPlannerRoundRecord>,
337    pub research_fact_count: usize,
338    pub action_graph_nodes: Vec<RunActionGraphNodeRecord>,
339    pub action_graph_edges: Vec<RunActionGraphEdgeRecord>,
340    pub worker_lineage: Vec<RunWorkerLineageRecord>,
341    pub verification_outcomes: Vec<RunVerificationOutcomeRecord>,
342    pub transcript_pointers: Vec<RunTranscriptPointerRecord>,
343    pub compaction_events: Vec<CompactionEventRecord>,
344    pub daemon_events: Vec<DaemonEventRecord>,
345}
346
347#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
348#[serde(default)]
349pub struct RunStageDiffRecord {
350    pub node_id: String,
351    pub change: String,
352    pub details: Vec<String>,
353}
354
355#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
356#[serde(default)]
357pub struct ToolCallDiffRecord {
358    pub tool_name: String,
359    pub args_hash: String,
360    pub result_changed: bool,
361    pub left_result: Option<String>,
362    pub right_result: Option<String>,
363}
364
365#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
366#[serde(default)]
367pub struct RunObservabilityDiffRecord {
368    pub section: String,
369    pub label: String,
370    pub details: Vec<String>,
371}
372
373#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
374#[serde(default)]
375pub struct RunDiffReport {
376    pub left_run_id: String,
377    pub right_run_id: String,
378    pub identical: bool,
379    pub status_changed: bool,
380    pub left_status: String,
381    pub right_status: String,
382    pub stage_diffs: Vec<RunStageDiffRecord>,
383    pub tool_diffs: Vec<ToolCallDiffRecord>,
384    pub observability_diffs: Vec<RunObservabilityDiffRecord>,
385    pub transition_count_delta: isize,
386    pub artifact_count_delta: isize,
387    pub checkpoint_count_delta: isize,
388}
389
390#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
391#[serde(default)]
392pub struct EvalSuiteManifest {
393    #[serde(rename = "_type")]
394    pub type_name: String,
395    pub id: String,
396    pub name: Option<String>,
397    pub base_dir: Option<String>,
398    pub cases: Vec<EvalSuiteCase>,
399}
400
401#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
402#[serde(default)]
403pub struct EvalSuiteCase {
404    pub label: Option<String>,
405    pub run_path: String,
406    pub fixture_path: Option<String>,
407    pub compare_to: Option<String>,
408}
409
410#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
411#[serde(default)]
412pub struct EvalPackManifest {
413    pub version: u32,
414    pub id: String,
415    pub name: Option<String>,
416    pub description: Option<String>,
417    pub base_dir: Option<String>,
418    pub baseline: Option<String>,
419    pub package: Option<EvalPackPackage>,
420    pub defaults: EvalPackDefaults,
421    pub fixtures: Vec<EvalPackFixtureRef>,
422    pub rubrics: Vec<EvalPackRubric>,
423    pub judge: Option<EvalPackJudgeConfig>,
424    pub cases: Vec<EvalPackCase>,
425    pub ladders: Vec<PersonaEvalLadderManifest>,
426    pub metadata: BTreeMap<String, serde_json::Value>,
427}
428
429#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
430#[serde(default)]
431pub struct EvalPackPackage {
432    pub name: Option<String>,
433    pub version: Option<String>,
434    pub source: Option<String>,
435    pub templates: Vec<String>,
436}
437
438#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
439#[serde(default)]
440pub struct EvalPackDefaults {
441    pub severity: Option<String>,
442    pub fixture_root: Option<String>,
443    pub thresholds: EvalPackThresholds,
444    pub judge: Option<EvalPackJudgeConfig>,
445}
446
447#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
448#[serde(default)]
449pub struct EvalPackFixtureRef {
450    pub id: String,
451    pub kind: String,
452    pub path: Option<String>,
453    #[serde(default, alias = "trace-id")]
454    pub trace_id: Option<String>,
455    pub provider: Option<String>,
456    #[serde(default, alias = "event-kind")]
457    pub event_kind: Option<String>,
458    pub inline: Option<serde_json::Value>,
459    pub metadata: BTreeMap<String, serde_json::Value>,
460}
461
462#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
463#[serde(default)]
464pub struct EvalPackRubric {
465    pub id: String,
466    pub kind: String,
467    pub description: Option<String>,
468    pub prompt: Option<String>,
469    pub assertions: Vec<EvalPackAssertion>,
470    pub judge: Option<EvalPackJudgeConfig>,
471    pub calibration: Vec<EvalPackGoldenExample>,
472    pub thresholds: EvalPackThresholds,
473    pub metadata: BTreeMap<String, serde_json::Value>,
474}
475
476#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
477#[serde(default)]
478pub struct EvalPackAssertion {
479    pub kind: String,
480    pub stage: Option<String>,
481    pub path: Option<String>,
482    pub op: Option<String>,
483    pub expected: Option<serde_json::Value>,
484    pub contains: Option<String>,
485    pub metadata: BTreeMap<String, serde_json::Value>,
486}
487
488#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
489#[serde(default)]
490pub struct EvalPackJudgeConfig {
491    pub model: Option<String>,
492    #[serde(default, alias = "prompt-version")]
493    pub prompt_version: Option<String>,
494    #[serde(default, alias = "tie-break")]
495    pub tie_break: Option<String>,
496    #[serde(default, alias = "confidence-min")]
497    pub confidence_min: Option<f64>,
498    pub temperature: Option<f64>,
499    pub rubric: Option<String>,
500    pub metadata: BTreeMap<String, serde_json::Value>,
501}
502
503#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
504#[serde(default)]
505pub struct EvalPackGoldenExample {
506    pub input: serde_json::Value,
507    pub output: serde_json::Value,
508    pub score: Option<f64>,
509    pub explanation: Option<String>,
510}
511
512#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
513#[serde(default)]
514pub struct EvalPackThresholds {
515    pub severity: Option<String>,
516    #[serde(default, alias = "min-score")]
517    pub min_score: Option<f64>,
518    #[serde(default, alias = "min-confidence")]
519    pub min_confidence: Option<f64>,
520    #[serde(default, alias = "max-cost-usd")]
521    pub max_cost_usd: Option<f64>,
522    #[serde(default, alias = "max-latency-ms")]
523    pub max_latency_ms: Option<i64>,
524    #[serde(default, alias = "max-tokens")]
525    pub max_tokens: Option<i64>,
526    #[serde(default, alias = "max-stage-count")]
527    pub max_stage_count: Option<usize>,
528}
529
530#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
531#[serde(default)]
532pub struct EvalPackCase {
533    pub id: Option<String>,
534    pub name: Option<String>,
535    pub description: Option<String>,
536    pub run: Option<String>,
537    #[serde(default, alias = "run-path")]
538    pub run_path: Option<String>,
539    #[serde(default, alias = "friction-events", alias = "friction_events")]
540    pub friction_events: Option<String>,
541    pub fixture: Option<String>,
542    #[serde(default, alias = "fixture-path")]
543    pub fixture_path: Option<String>,
544    #[serde(default, alias = "compare-to")]
545    pub compare_to: Option<String>,
546    pub rubrics: Vec<String>,
547    pub severity: Option<String>,
548    pub thresholds: EvalPackThresholds,
549    pub metadata: BTreeMap<String, serde_json::Value>,
550}
551
552#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
553#[serde(default)]
554pub struct EvalPackReport {
555    pub pack_id: String,
556    pub pass: bool,
557    pub total: usize,
558    pub passed: usize,
559    pub failed: usize,
560    pub blocking_failed: usize,
561    pub warning_failed: usize,
562    pub informational_failed: usize,
563    pub cases: Vec<EvalPackCaseReport>,
564    pub ladders: Vec<PersonaEvalLadderReport>,
565}
566
567#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
568#[serde(default)]
569pub struct EvalPackCaseReport {
570    pub id: String,
571    pub label: String,
572    pub severity: String,
573    pub pass: bool,
574    pub blocking: bool,
575    pub run_id: String,
576    pub workflow_id: String,
577    pub source_path: Option<String>,
578    pub stage_count: usize,
579    pub failures: Vec<String>,
580    pub warnings: Vec<String>,
581    pub informational: Vec<String>,
582    pub comparison: Option<RunDiffReport>,
583}
584
585#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
586#[serde(default)]
587pub struct RunHitlQuestionRecord {
588    pub request_id: String,
589    pub prompt: String,
590    pub agent: String,
591    pub trace_id: Option<String>,
592    pub asked_at: String,
593}
594
595#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
596#[serde(default)]
597pub struct RunRecord {
598    #[serde(rename = "_type")]
599    pub type_name: String,
600    pub id: String,
601    pub workflow_id: String,
602    pub workflow_name: Option<String>,
603    pub task: String,
604    pub status: String,
605    pub started_at: String,
606    pub finished_at: Option<String>,
607    pub parent_run_id: Option<String>,
608    pub root_run_id: Option<String>,
609    pub stages: Vec<RunStageRecord>,
610    pub transitions: Vec<RunTransitionRecord>,
611    pub checkpoints: Vec<RunCheckpointRecord>,
612    pub pending_nodes: Vec<String>,
613    pub completed_nodes: Vec<String>,
614    pub child_runs: Vec<RunChildRecord>,
615    pub artifacts: Vec<ArtifactRecord>,
616    pub handoffs: Vec<HandoffArtifact>,
617    pub policy: CapabilityPolicy,
618    pub execution: Option<RunExecutionRecord>,
619    pub transcript: Option<serde_json::Value>,
620    pub usage: Option<LlmUsageRecord>,
621    pub replay_fixture: Option<ReplayFixture>,
622    pub observability: Option<RunObservabilityRecord>,
623    pub trace_spans: Vec<RunTraceSpanRecord>,
624    pub tool_recordings: Vec<ToolCallRecord>,
625    pub hitl_questions: Vec<RunHitlQuestionRecord>,
626    pub metadata: BTreeMap<String, serde_json::Value>,
627    pub persisted_path: Option<String>,
628}
629
630#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
631#[serde(default)]
632pub struct ToolCallRecord {
633    pub tool_name: String,
634    pub tool_use_id: String,
635    pub args_hash: String,
636    pub result: String,
637    pub is_rejected: bool,
638    pub duration_ms: u64,
639    pub iteration: usize,
640    pub timestamp: String,
641}
642
643/// Hash a tool invocation for fixture lookup (name + canonical args JSON).
644pub fn tool_fixture_hash(tool_name: &str, args: &serde_json::Value) -> String {
645    use std::hash::{Hash, Hasher};
646    let mut hasher = std::collections::hash_map::DefaultHasher::new();
647    tool_name.hash(&mut hasher);
648    let args_str = serde_json::to_string(args).unwrap_or_default();
649    args_str.hash(&mut hasher);
650    format!("{:016x}", hasher.finish())
651}
652
653#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
654#[serde(default)]
655pub struct RunTraceSpanRecord {
656    pub span_id: u64,
657    pub parent_id: Option<u64>,
658    pub kind: String,
659    pub name: String,
660    pub start_ms: u64,
661    pub duration_ms: u64,
662    pub metadata: BTreeMap<String, serde_json::Value>,
663}
664
665#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
666#[serde(default)]
667pub struct RunChildRecord {
668    pub worker_id: String,
669    pub worker_name: String,
670    pub parent_stage_id: Option<String>,
671    pub session_id: Option<String>,
672    pub parent_session_id: Option<String>,
673    pub mutation_scope: Option<String>,
674    pub approval_policy: Option<super::ToolApprovalPolicy>,
675    pub task: String,
676    pub request: Option<serde_json::Value>,
677    pub provenance: Option<serde_json::Value>,
678    pub status: String,
679    pub started_at: String,
680    pub finished_at: Option<String>,
681    pub run_id: Option<String>,
682    pub run_path: Option<String>,
683    pub snapshot_path: Option<String>,
684    pub execution: Option<RunExecutionRecord>,
685}
686
687pub(crate) fn run_child_record_from_worker_metadata(
688    parent_stage_id: Option<String>,
689    worker: &serde_json::Value,
690) -> Option<RunChildRecord> {
691    let worker_id = worker.get("id").and_then(|value| value.as_str())?;
692    if worker_id.is_empty() {
693        return None;
694    }
695    Some(RunChildRecord {
696        worker_id: worker_id.to_string(),
697        worker_name: worker
698            .get("name")
699            .and_then(|value| value.as_str())
700            .unwrap_or("worker")
701            .to_string(),
702        parent_stage_id,
703        session_id: worker
704            .get("audit")
705            .and_then(|value| value.get("session_id"))
706            .and_then(|value| value.as_str())
707            .map(str::to_string),
708        parent_session_id: worker
709            .get("audit")
710            .and_then(|value| value.get("parent_session_id"))
711            .and_then(|value| value.as_str())
712            .map(str::to_string),
713        mutation_scope: worker
714            .get("audit")
715            .and_then(|value| value.get("mutation_scope"))
716            .and_then(|value| value.as_str())
717            .map(str::to_string),
718        approval_policy: worker
719            .get("audit")
720            .and_then(|value| value.get("approval_policy"))
721            .and_then(|value| {
722                serde_json::from_value::<super::ToolApprovalPolicy>(value.clone()).ok()
723            }),
724        task: worker
725            .get("task")
726            .and_then(|value| value.as_str())
727            .unwrap_or_default()
728            .to_string(),
729        request: worker.get("request").cloned(),
730        provenance: worker.get("provenance").cloned(),
731        status: worker
732            .get("status")
733            .and_then(|value| value.as_str())
734            .unwrap_or("completed")
735            .to_string(),
736        started_at: worker
737            .get("started_at")
738            .and_then(|value| value.as_str())
739            .unwrap_or_default()
740            .to_string(),
741        finished_at: worker
742            .get("finished_at")
743            .and_then(|value| value.as_str())
744            .map(str::to_string),
745        run_id: worker
746            .get("child_run_id")
747            .and_then(|value| value.as_str())
748            .map(str::to_string),
749        run_path: worker
750            .get("child_run_path")
751            .and_then(|value| value.as_str())
752            .map(str::to_string),
753        snapshot_path: worker
754            .get("snapshot_path")
755            .and_then(|value| value.as_str())
756            .map(str::to_string),
757        execution: worker
758            .get("execution")
759            .cloned()
760            .and_then(|value| serde_json::from_value(value).ok()),
761    })
762}
763
764fn run_child_from_stage_metadata(stage: &RunStageRecord) -> Option<RunChildRecord> {
765    let parent_stage_id = if stage.id.is_empty() {
766        None
767    } else {
768        Some(stage.id.clone())
769    };
770    run_child_record_from_worker_metadata(parent_stage_id, stage.metadata.get("worker")?)
771}
772
773fn fill_missing_child_run_fields(existing: &mut RunChildRecord, child: RunChildRecord) {
774    if existing.worker_name.is_empty() {
775        existing.worker_name = child.worker_name;
776    }
777    if existing.parent_stage_id.is_none() {
778        existing.parent_stage_id = child.parent_stage_id;
779    }
780    if existing.session_id.is_none() {
781        existing.session_id = child.session_id;
782    }
783    if existing.parent_session_id.is_none() {
784        existing.parent_session_id = child.parent_session_id;
785    }
786    if existing.mutation_scope.is_none() {
787        existing.mutation_scope = child.mutation_scope;
788    }
789    if existing.approval_policy.is_none() {
790        existing.approval_policy = child.approval_policy;
791    }
792    if existing.task.is_empty() {
793        existing.task = child.task;
794    }
795    if existing.request.is_none() {
796        existing.request = child.request;
797    }
798    if existing.provenance.is_none() {
799        existing.provenance = child.provenance;
800    }
801    if existing.status.is_empty() {
802        existing.status = child.status;
803    }
804    if existing.started_at.is_empty() {
805        existing.started_at = child.started_at;
806    }
807    if existing.finished_at.is_none() {
808        existing.finished_at = child.finished_at;
809    }
810    if existing.run_id.is_none() {
811        existing.run_id = child.run_id;
812    }
813    if existing.run_path.is_none() {
814        existing.run_path = child.run_path;
815    }
816    if existing.snapshot_path.is_none() {
817        existing.snapshot_path = child.snapshot_path;
818    }
819    if existing.execution.is_none() {
820        existing.execution = child.execution;
821    }
822}
823
824fn materialize_child_runs_from_stage_metadata(run: &mut RunRecord) {
825    for child in run
826        .stages
827        .iter()
828        .filter_map(run_child_from_stage_metadata)
829        .collect::<Vec<_>>()
830    {
831        match run
832            .child_runs
833            .iter_mut()
834            .find(|existing| existing.worker_id == child.worker_id)
835        {
836            Some(existing) => fill_missing_child_run_fields(existing, child),
837            None => run.child_runs.push(child),
838        }
839    }
840}
841
842#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
843#[serde(default)]
844pub struct RunExecutionRecord {
845    pub cwd: Option<String>,
846    pub source_dir: Option<String>,
847    pub env: BTreeMap<String, String>,
848    pub adapter: Option<String>,
849    pub repo_path: Option<String>,
850    pub worktree_path: Option<String>,
851    pub branch: Option<String>,
852    pub base_ref: Option<String>,
853    pub cleanup: Option<String>,
854}
855
856fn compact_json_value(value: &serde_json::Value) -> String {
857    serde_json::to_string(value).unwrap_or_else(|_| value.to_string())
858}
859
860fn normalize_question_text(text: &str) -> String {
861    text.chars()
862        .map(|ch| {
863            if ch.is_ascii_alphanumeric() || ch.is_whitespace() {
864                ch.to_ascii_lowercase()
865            } else {
866                ' '
867            }
868        })
869        .collect::<String>()
870        .split_whitespace()
871        .collect::<Vec<_>>()
872        .join(" ")
873}
874
875fn clarifying_min_questions(spec: &ClarifyingQuestionEvalSpec) -> usize {
876    spec.min_questions.max(1)
877}
878
879fn clarifying_max_questions(spec: &ClarifyingQuestionEvalSpec) -> usize {
880    spec.max_questions.unwrap_or(1).max(1)
881}
882
883fn read_topic_records(
884    log: &AnyEventLog,
885    topic: &Topic,
886) -> Vec<(crate::event_log::EventId, EventLogRecord)> {
887    let mut from = None;
888    let mut records = Vec::new();
889    loop {
890        let batch =
891            futures::executor::block_on(log.read_range(topic, from, 256)).unwrap_or_default();
892        if batch.is_empty() {
893            break;
894        }
895        from = batch.last().map(|(event_id, _)| *event_id);
896        records.extend(batch);
897    }
898    records
899}
900
901#[derive(Clone, Debug)]
902pub struct AgentSessionReplayEvent {
903    pub event_id: EventId,
904    pub event: AgentEvent,
905}
906
/// Replay all [`AgentEvent`]s recorded for `session_id` from the active
/// event log, in log order.
///
/// Returns an empty list when no event log is active. Records are read from
/// the `observability.agent_events.<sanitized-session>` topic in batches of
/// 1024. A record is included only when both its `session_id` header and
/// the decoded event's own session id equal `session_id` — presumably
/// because `sanitize_topic_component` can map distinct session ids to the
/// same topic (NOTE(review): confirm).
///
/// # Errors
///
/// Returns [`VmError::Runtime`] when the topic name cannot be built, a
/// batch read fails, or a record payload cannot be decoded as an
/// [`AgentEvent`].
pub async fn load_agent_session_replay_events(
    session_id: &str,
) -> Result<Vec<AgentSessionReplayEvent>, VmError> {
    let Some(log) = active_event_log() else {
        return Ok(Vec::new());
    };
    let topic = Topic::new(format!(
        "observability.agent_events.{}",
        sanitize_topic_component(session_id)
    ))
    .map_err(|error| VmError::Runtime(format!("failed to build agent event topic: {error}")))?;

    let mut events = Vec::new();
    let mut from = None;
    loop {
        let batch = log.read_range(&topic, from, 1024).await.map_err(|error| {
            VmError::Runtime(format!(
                "failed to read agent event replay topic {}: {error}",
                topic.as_str()
            ))
        })?;
        let batch_len = batch.len();
        for (event_id, record) in batch {
            // Advance the cursor for every record seen, including ones the
            // filters below skip.
            from = Some(event_id);
            if record.headers.get("session_id").map(String::as_str) != Some(session_id) {
                continue;
            }
            let Some(event_value) = record.payload.get("event").cloned() else {
                continue;
            };
            let event = serde_json::from_value::<AgentEvent>(event_value).map_err(|error| {
                VmError::Runtime(format!(
                    "failed to decode agent event replay record {event_id}: {error}"
                ))
            })?;
            // Double-check against the decoded event, not just the header.
            if event.session_id() == session_id {
                events.push(AgentSessionReplayEvent { event_id, event });
            }
        }
        // A partial batch means the topic is exhausted.
        if batch_len < 1024 {
            break;
        }
    }
    Ok(events)
}
952
/// Merge `hitl.question_asked` records from the active event log into
/// `run.hitl_questions`, deduplicated by request id.
///
/// Questions already on the run are kept; a log record for the same run and
/// request id overwrites the existing entry. Records missing a request id
/// or prompt are skipped. The merged list is re-sorted by
/// `(asked_at, request_id)`. No-op when no event log is active.
fn merge_hitl_questions_from_active_log(run: &mut RunRecord) {
    let Some(log) = active_event_log() else {
        return;
    };
    let topic = Topic::new(crate::HITL_QUESTIONS_TOPIC)
        .expect("static hitl.questions topic should always be valid");
    // Index the run's existing questions by request id so log records can
    // overwrite matching entries.
    let mut merged = run
        .hitl_questions
        .iter()
        .cloned()
        .map(|question| (question.request_id.clone(), question))
        .collect::<BTreeMap<_, _>>();

    for (_, event) in read_topic_records(log.as_ref(), &topic) {
        if event.kind != "hitl.question_asked" {
            continue;
        }
        let payload = &event.payload;
        // A record belongs to this run when either the `run_id` header or
        // the payload's `run_id` field matches.
        let matches_run = event
            .headers
            .get("run_id")
            .is_some_and(|value| value == &run.id)
            || payload
                .get("run_id")
                .and_then(|value| value.as_str())
                .is_some_and(|value| value == run.id);
        if !matches_run {
            continue;
        }
        // The request id may live in the payload or fall back to a header.
        let request_id = payload
            .get("request_id")
            .and_then(|value| value.as_str())
            .or_else(|| event.headers.get("request_id").map(String::as_str))
            .unwrap_or_default();
        let prompt = payload
            .get("payload")
            .and_then(|value| value.get("prompt"))
            .and_then(|value| value.as_str())
            .unwrap_or_default();
        if request_id.is_empty() || prompt.is_empty() {
            continue;
        }
        merged.insert(
            request_id.to_string(),
            RunHitlQuestionRecord {
                request_id: request_id.to_string(),
                prompt: prompt.to_string(),
                agent: payload
                    .get("agent")
                    .and_then(|value| value.as_str())
                    .unwrap_or_default()
                    .to_string(),
                trace_id: payload
                    .get("trace_id")
                    .and_then(|value| value.as_str())
                    .map(str::to_string),
                asked_at: payload
                    .get("requested_at")
                    .and_then(|value| value.as_str())
                    .unwrap_or_default()
                    .to_string(),
            },
        );
    }

    run.hitl_questions = merged.into_values().collect();
    // Present questions in the order they were asked; request id breaks ties.
    run.hitl_questions.sort_by(|left, right| {
        (left.asked_at.as_str(), left.request_id.as_str())
            .cmp(&(right.asked_at.as_str(), right.request_id.as_str()))
    });
}
1024
1025fn signature_status_label(status: &SignatureStatus) -> &'static str {
1026    match status {
1027        SignatureStatus::Verified => "verified",
1028        SignatureStatus::Unsigned => "unsigned",
1029        SignatureStatus::Failed { .. } => "failed",
1030    }
1031}
1032
1033fn trigger_event_from_run(run: &RunRecord) -> Option<TriggerEvent> {
1034    run.metadata
1035        .get("trigger_event")
1036        .cloned()
1037        .and_then(|value| serde_json::from_value(value).ok())
1038}
1039
1040fn run_trace_id(run: &RunRecord, trigger_event: Option<&TriggerEvent>) -> Option<String> {
1041    trigger_event
1042        .map(|event| event.trace_id.0.clone())
1043        .or_else(|| {
1044            run.metadata
1045                .get("trace_id")
1046                .and_then(|value| value.as_str())
1047                .map(str::to_string)
1048        })
1049}
1050
1051fn replay_of_event_id_from_run(run: &RunRecord) -> Option<String> {
1052    run.metadata
1053        .get("replay_of_event_id")
1054        .and_then(|value| value.as_str())
1055        .map(str::to_string)
1056}
1057
1058fn action_graph_kind_for_stage(stage: &RunStageRecord) -> &'static str {
1059    if stage.kind == "condition" {
1060        ACTION_GRAPH_NODE_KIND_PREDICATE
1061    } else {
1062        ACTION_GRAPH_NODE_KIND_STAGE
1063    }
1064}
1065
1066fn trigger_node_metadata(trigger_event: &TriggerEvent) -> BTreeMap<String, serde_json::Value> {
1067    let mut metadata = BTreeMap::new();
1068    metadata.insert(
1069        "provider".to_string(),
1070        serde_json::json!(trigger_event.provider.as_str()),
1071    );
1072    metadata.insert(
1073        "event_kind".to_string(),
1074        serde_json::json!(trigger_event.kind),
1075    );
1076    metadata.insert(
1077        "dedupe_key".to_string(),
1078        serde_json::json!(trigger_event.dedupe_key),
1079    );
1080    metadata.insert(
1081        "signature_status".to_string(),
1082        serde_json::json!(signature_status_label(&trigger_event.signature_status)),
1083    );
1084    metadata
1085}
1086
1087fn stage_node_metadata(stage: &RunStageRecord) -> BTreeMap<String, serde_json::Value> {
1088    let mut metadata = BTreeMap::new();
1089    metadata.insert("stage_kind".to_string(), serde_json::json!(stage.kind));
1090    if let Some(branch) = stage.branch.as_ref() {
1091        metadata.insert("branch".to_string(), serde_json::json!(branch));
1092    }
1093    if let Some(worker_id) = stage
1094        .metadata
1095        .get("worker_id")
1096        .and_then(|value| value.as_str())
1097    {
1098        metadata.insert("worker_id".to_string(), serde_json::json!(worker_id));
1099    }
1100    metadata
1101}
1102
/// Append a node record to the action graph under construction.
///
/// Currently a thin wrapper over `Vec::push` — presumably kept as a single
/// seam for future validation or deduplication of graph nodes
/// (NOTE(review): confirm intent with authors).
fn append_action_graph_node(
    nodes: &mut Vec<RunActionGraphNodeRecord>,
    record: RunActionGraphNodeRecord,
) {
    nodes.push(record);
}
1109
1110pub async fn append_action_graph_update(
1111    headers: BTreeMap<String, String>,
1112    payload: serde_json::Value,
1113) -> Result<(), crate::event_log::LogError> {
1114    let Some(log) = active_event_log() else {
1115        return Ok(());
1116    };
1117    let topic = Topic::new("observability.action_graph")
1118        .expect("static observability.action_graph topic should always be valid");
1119    let record = EventLogRecord::new("action_graph_update", payload).with_headers(headers);
1120    log.append(&topic, record).await.map(|_| ())
1121}
1122
1123fn publish_action_graph_event(
1124    run: &RunRecord,
1125    observability: &RunObservabilityRecord,
1126    path: &Path,
1127) {
1128    let trigger_event = trigger_event_from_run(run);
1129    let mut headers = BTreeMap::new();
1130    headers.insert("run_id".to_string(), run.id.clone());
1131    headers.insert("workflow_id".to_string(), run.workflow_id.clone());
1132    if let Some(trace_id) = run_trace_id(run, trigger_event.as_ref()) {
1133        headers.insert("trace_id".to_string(), trace_id);
1134    }
1135    let payload = serde_json::json!({
1136        "run_id": run.id,
1137        "workflow_id": run.workflow_id,
1138        "persisted_path": path.to_string_lossy(),
1139        "status": run.status,
1140        "observability": observability,
1141    });
1142    if let Ok(handle) = tokio::runtime::Handle::try_current() {
1143        handle.spawn(async move {
1144            let _ = append_action_graph_update(headers, payload).await;
1145        });
1146    } else {
1147        let _ = futures::executor::block_on(append_action_graph_update(headers, payload));
1148    }
1149}
1150
/// Locate the LLM transcript sidecar that accompanies a persisted run file:
/// `<parent>/<stem>-llm/llm_transcript.jsonl`.
///
/// Returns `None` when the run path has no file stem or the stem is not
/// valid UTF-8. A run path with no parent resolves relative to `.`.
fn llm_transcript_sidecar_path(run_path: &Path) -> Option<PathBuf> {
    let stem = run_path.file_stem()?.to_str()?;
    let parent = run_path.parent().unwrap_or_else(|| Path::new("."));
    // Join the sidecar directory and file name as separate path components
    // instead of embedding '/' inside one formatted segment, so the
    // separator stays platform-correct.
    Some(parent.join(format!("{stem}-llm")).join("llm_transcript.jsonl"))
}
1156
1157fn json_string_array(value: Option<&serde_json::Value>) -> Vec<String> {
1158    value
1159        .and_then(|value| value.as_array())
1160        .map(|items| {
1161            items
1162                .iter()
1163                .filter_map(|item| item.as_str().map(str::to_string))
1164                .collect::<Vec<_>>()
1165        })
1166        .unwrap_or_default()
1167}
1168
1169fn json_usize(value: Option<&serde_json::Value>) -> usize {
1170    value.and_then(|value| value.as_u64()).unwrap_or_default() as usize
1171}
1172
1173fn json_bool(value: Option<&serde_json::Value>) -> Option<bool> {
1174    value.and_then(|value| value.as_bool())
1175}
1176
1177fn stage_result_payload(stage: &RunStageRecord) -> Option<&serde_json::Value> {
1178    stage
1179        .artifacts
1180        .iter()
1181        .find_map(|artifact| artifact.data.as_ref())
1182}
1183
1184fn task_ledger_summary_from_value(value: &serde_json::Value) -> Option<RunTaskLedgerSummaryRecord> {
1185    let deliverables = value
1186        .get("deliverables")
1187        .and_then(|raw| raw.as_array())
1188        .map(|items| {
1189            items
1190                .iter()
1191                .map(|item| RunDeliverableSummaryRecord {
1192                    id: item
1193                        .get("id")
1194                        .and_then(|value| value.as_str())
1195                        .unwrap_or_default()
1196                        .to_string(),
1197                    text: item
1198                        .get("text")
1199                        .and_then(|value| value.as_str())
1200                        .unwrap_or_default()
1201                        .to_string(),
1202                    status: item
1203                        .get("status")
1204                        .and_then(|value| value.as_str())
1205                        .unwrap_or_default()
1206                        .to_string(),
1207                    note: item
1208                        .get("note")
1209                        .and_then(|value| value.as_str())
1210                        .map(str::to_string),
1211                })
1212                .collect::<Vec<_>>()
1213        })
1214        .unwrap_or_default();
1215    let observations = json_string_array(value.get("observations"));
1216    let root_task = value
1217        .get("root_task")
1218        .and_then(|value| value.as_str())
1219        .unwrap_or_default()
1220        .to_string();
1221    let rationale = value
1222        .get("rationale")
1223        .and_then(|value| value.as_str())
1224        .unwrap_or_default()
1225        .to_string();
1226    if root_task.is_empty()
1227        && rationale.is_empty()
1228        && deliverables.is_empty()
1229        && observations.is_empty()
1230    {
1231        return None;
1232    }
1233    let blocking_count = deliverables
1234        .iter()
1235        .filter(|deliverable| matches!(deliverable.status.as_str(), "open" | "blocked"))
1236        .count();
1237    Some(RunTaskLedgerSummaryRecord {
1238        root_task,
1239        rationale,
1240        deliverables,
1241        observations,
1242        blocking_count,
1243    })
1244}
1245
/// Extract compaction events from an embedded transcript JSON value.
///
/// Scans `transcript.events` for entries whose `kind` is `"compaction"` and
/// converts each into a [`CompactionEventRecord`]. A snapshot is marked
/// `available` only when the event's `snapshot_asset_id` appears among the
/// transcript's `assets` ids. `location_prefix` names where the transcript
/// lives in the run document (e.g. `run.stages[node].transcript`) and is
/// used to build the snapshot location string; `persisted_path`, when
/// given, is recorded as the snapshot's on-disk path.
fn compaction_events_from_transcript(
    transcript: &serde_json::Value,
    stage_id: Option<&str>,
    node_id: Option<&str>,
    location_prefix: &str,
    persisted_path: Option<&Path>,
) -> Vec<CompactionEventRecord> {
    let transcript_id = transcript
        .get("id")
        .and_then(|value| value.as_str())
        .map(str::to_string);
    // Set of asset ids present in the transcript, used to decide whether a
    // referenced snapshot is actually available.
    let asset_ids = transcript
        .get("assets")
        .and_then(|value| value.as_array())
        .map(|assets| {
            assets
                .iter()
                .filter_map(|asset| {
                    asset
                        .get("id")
                        .and_then(|value| value.as_str())
                        .map(str::to_string)
                })
                .collect::<BTreeSet<_>>()
        })
        .unwrap_or_default();
    transcript
        .get("events")
        .and_then(|value| value.as_array())
        .map(|events| {
            events
                .iter()
                .filter(|event| {
                    event.get("kind").and_then(|value| value.as_str()) == Some("compaction")
                })
                .map(|event| {
                    let metadata = event.get("metadata");
                    let snapshot_asset_id = metadata
                        .and_then(|value| value.get("snapshot_asset_id"))
                        .and_then(|value| value.as_str())
                        .map(str::to_string);
                    let available = snapshot_asset_id
                        .as_ref()
                        .is_some_and(|asset_id| asset_ids.contains(asset_id));
                    // Point at the specific asset when known; otherwise fall
                    // back to the transcript location itself.
                    let snapshot_location = snapshot_asset_id
                        .as_ref()
                        .map(|asset_id| format!("{location_prefix}.assets[{asset_id}]"))
                        .unwrap_or_else(|| location_prefix.to_string());
                    CompactionEventRecord {
                        id: event
                            .get("id")
                            .and_then(|value| value.as_str())
                            .unwrap_or_default()
                            .to_string(),
                        transcript_id: transcript_id.clone(),
                        stage_id: stage_id.map(str::to_string),
                        node_id: node_id.map(str::to_string),
                        mode: metadata
                            .and_then(|value| value.get("mode"))
                            .and_then(|value| value.as_str())
                            .unwrap_or_default()
                            .to_string(),
                        strategy: metadata
                            .and_then(|value| value.get("strategy"))
                            .and_then(|value| value.as_str())
                            .unwrap_or_default()
                            .to_string(),
                        archived_messages: json_usize(
                            metadata.and_then(|value| value.get("archived_messages")),
                        ),
                        estimated_tokens_before: json_usize(
                            metadata.and_then(|value| value.get("estimated_tokens_before")),
                        ),
                        estimated_tokens_after: json_usize(
                            metadata.and_then(|value| value.get("estimated_tokens_after")),
                        ),
                        snapshot_asset_id,
                        snapshot_location,
                        snapshot_path: persisted_path
                            .map(|path| path.to_string_lossy().into_owned()),
                        available,
                    }
                })
                .collect()
        })
        .unwrap_or_default()
}
1333
1334fn daemon_events_from_sidecar(run_path: &Path) -> Vec<DaemonEventRecord> {
1335    let Some(sidecar_path) = llm_transcript_sidecar_path(run_path) else {
1336        return Vec::new();
1337    };
1338    let Ok(content) = std::fs::read_to_string(sidecar_path) else {
1339        return Vec::new();
1340    };
1341
1342    content
1343        .lines()
1344        .filter(|line| !line.trim().is_empty())
1345        .filter_map(|line| serde_json::from_str::<serde_json::Value>(line).ok())
1346        .filter(|event| event.get("type").and_then(|value| value.as_str()) == Some("daemon_event"))
1347        .filter_map(|event| serde_json::from_value::<DaemonEventRecord>(event).ok())
1348        .collect()
1349}
1350
1351pub fn derive_run_observability(
1352    run: &RunRecord,
1353    persisted_path: Option<&Path>,
1354) -> RunObservabilityRecord {
1355    let mut action_graph_nodes = Vec::new();
1356    let mut action_graph_edges = Vec::new();
1357    let mut verification_outcomes = Vec::new();
1358    let mut planner_rounds = Vec::new();
1359    let mut transcript_pointers = Vec::new();
1360    let mut compaction_events = Vec::new();
1361    let mut daemon_events = Vec::new();
1362    let mut research_fact_count = 0usize;
1363
1364    let root_node_id = format!("run:{}", run.id);
1365    let trigger_event = trigger_event_from_run(run);
1366    let propagated_trace_id = run_trace_id(run, trigger_event.as_ref());
1367    append_action_graph_node(
1368        &mut action_graph_nodes,
1369        RunActionGraphNodeRecord {
1370            id: root_node_id.clone(),
1371            label: run
1372                .workflow_name
1373                .clone()
1374                .unwrap_or_else(|| run.workflow_id.clone()),
1375            kind: ACTION_GRAPH_NODE_KIND_RUN.to_string(),
1376            status: run.status.clone(),
1377            outcome: run.status.clone(),
1378            trace_id: propagated_trace_id.clone(),
1379            stage_id: None,
1380            node_id: None,
1381            worker_id: None,
1382            run_id: Some(run.id.clone()),
1383            run_path: run.persisted_path.clone(),
1384            metadata: BTreeMap::from([(
1385                "workflow_id".to_string(),
1386                serde_json::json!(run.workflow_id),
1387            )]),
1388        },
1389    );
1390    let mut entry_node_id = root_node_id.clone();
1391    if let Some(trigger_event) = trigger_event.as_ref() {
1392        if let Some(replay_of_event_id) = replay_of_event_id_from_run(run) {
1393            let replay_source_node_id = format!("trigger:{replay_of_event_id}");
1394            append_action_graph_node(
1395                &mut action_graph_nodes,
1396                RunActionGraphNodeRecord {
1397                    id: replay_source_node_id.clone(),
1398                    label: format!(
1399                        "{}:{} (original {})",
1400                        trigger_event.provider.as_str(),
1401                        trigger_event.kind,
1402                        replay_of_event_id
1403                    ),
1404                    kind: ACTION_GRAPH_NODE_KIND_TRIGGER.to_string(),
1405                    status: "historical".to_string(),
1406                    outcome: "replayed_from".to_string(),
1407                    trace_id: Some(trigger_event.trace_id.0.clone()),
1408                    stage_id: None,
1409                    node_id: None,
1410                    worker_id: None,
1411                    run_id: Some(run.id.clone()),
1412                    run_path: run.persisted_path.clone(),
1413                    metadata: trigger_node_metadata(trigger_event),
1414                },
1415            );
1416            action_graph_edges.push(RunActionGraphEdgeRecord {
1417                from_id: replay_source_node_id,
1418                to_id: format!("trigger:{}", trigger_event.id.0),
1419                kind: ACTION_GRAPH_EDGE_KIND_REPLAY_CHAIN.to_string(),
1420                label: Some("replay chain".to_string()),
1421            });
1422        }
1423        let trigger_node_id = format!("trigger:{}", trigger_event.id.0);
1424        append_action_graph_node(
1425            &mut action_graph_nodes,
1426            RunActionGraphNodeRecord {
1427                id: trigger_node_id.clone(),
1428                label: format!("{}:{}", trigger_event.provider.as_str(), trigger_event.kind),
1429                kind: ACTION_GRAPH_NODE_KIND_TRIGGER.to_string(),
1430                status: "received".to_string(),
1431                outcome: signature_status_label(&trigger_event.signature_status).to_string(),
1432                trace_id: Some(trigger_event.trace_id.0.clone()),
1433                stage_id: None,
1434                node_id: None,
1435                worker_id: None,
1436                run_id: Some(run.id.clone()),
1437                run_path: run.persisted_path.clone(),
1438                metadata: trigger_node_metadata(trigger_event),
1439            },
1440        );
1441        action_graph_edges.push(RunActionGraphEdgeRecord {
1442            from_id: root_node_id.clone(),
1443            to_id: trigger_node_id.clone(),
1444            kind: ACTION_GRAPH_EDGE_KIND_ENTRY.to_string(),
1445            label: Some(trigger_event.id.0.clone()),
1446        });
1447        entry_node_id = trigger_node_id;
1448    }
1449
1450    let stage_node_ids = run
1451        .stages
1452        .iter()
1453        .map(|stage| (stage.id.clone(), format!("stage:{}", stage.id)))
1454        .collect::<BTreeMap<_, _>>();
1455    let stage_by_id = run
1456        .stages
1457        .iter()
1458        .map(|stage| (stage.id.as_str(), stage))
1459        .collect::<BTreeMap<_, _>>();
1460    let stage_by_node_id = run
1461        .stages
1462        .iter()
1463        .map(|stage| (stage.node_id.clone(), format!("stage:{}", stage.id)))
1464        .collect::<BTreeMap<_, _>>();
1465
1466    let incoming_nodes = run
1467        .transitions
1468        .iter()
1469        .map(|transition| transition.to_node_id.clone())
1470        .collect::<BTreeSet<_>>();
1471
1472    for stage in &run.stages {
1473        let graph_node_id = stage_node_ids
1474            .get(&stage.id)
1475            .cloned()
1476            .unwrap_or_else(|| format!("stage:{}", stage.id));
1477        append_action_graph_node(
1478            &mut action_graph_nodes,
1479            RunActionGraphNodeRecord {
1480                id: graph_node_id.clone(),
1481                label: stage.node_id.clone(),
1482                kind: action_graph_kind_for_stage(stage).to_string(),
1483                status: stage.status.clone(),
1484                outcome: stage.outcome.clone(),
1485                trace_id: propagated_trace_id.clone(),
1486                stage_id: Some(stage.id.clone()),
1487                node_id: Some(stage.node_id.clone()),
1488                worker_id: stage
1489                    .metadata
1490                    .get("worker_id")
1491                    .and_then(|value| value.as_str())
1492                    .map(str::to_string),
1493                run_id: None,
1494                run_path: None,
1495                metadata: stage_node_metadata(stage),
1496            },
1497        );
1498        if !incoming_nodes.contains(&stage.node_id) {
1499            action_graph_edges.push(RunActionGraphEdgeRecord {
1500                from_id: entry_node_id.clone(),
1501                to_id: graph_node_id.clone(),
1502                kind: if trigger_event.is_some() {
1503                    ACTION_GRAPH_EDGE_KIND_TRIGGER_DISPATCH.to_string()
1504                } else {
1505                    ACTION_GRAPH_EDGE_KIND_ENTRY.to_string()
1506                },
1507                label: None,
1508            });
1509        }
1510
1511        if stage.kind == "verify" || stage.verification.is_some() {
1512            let passed = json_bool(
1513                stage
1514                    .verification
1515                    .as_ref()
1516                    .and_then(|value| value.get("pass")),
1517            )
1518            .or_else(|| {
1519                json_bool(
1520                    stage
1521                        .verification
1522                        .as_ref()
1523                        .and_then(|value| value.get("success")),
1524                )
1525            })
1526            .or_else(|| {
1527                if stage.status == "completed" && stage.outcome == "success" {
1528                    Some(true)
1529                } else if stage.status == "failed" || stage.outcome == "failed" {
1530                    Some(false)
1531                } else {
1532                    None
1533                }
1534            });
1535            verification_outcomes.push(RunVerificationOutcomeRecord {
1536                stage_id: stage.id.clone(),
1537                node_id: stage.node_id.clone(),
1538                status: stage.status.clone(),
1539                passed,
1540                summary: stage
1541                    .verification
1542                    .as_ref()
1543                    .map(compact_json_value)
1544                    .or_else(|| {
1545                        stage
1546                            .visible_text
1547                            .as_ref()
1548                            .filter(|value| !value.trim().is_empty())
1549                            .cloned()
1550                    }),
1551            });
1552        }
1553
1554        if stage.transcript.is_some() {
1555            transcript_pointers.push(RunTranscriptPointerRecord {
1556                id: format!("stage:{}:transcript", stage.id),
1557                label: format!("Stage {} transcript", stage.node_id),
1558                kind: "embedded_transcript".to_string(),
1559                location: format!("run.stages[{}].transcript", stage.node_id),
1560                path: run.persisted_path.clone(),
1561                available: true,
1562            });
1563            if let Some(transcript) = stage.transcript.as_ref() {
1564                compaction_events.extend(compaction_events_from_transcript(
1565                    transcript,
1566                    Some(&stage.id),
1567                    Some(&stage.node_id),
1568                    &format!("run.stages[{}].transcript", stage.node_id),
1569                    persisted_path,
1570                ));
1571            }
1572        }
1573
1574        if let Some(payload) = stage_result_payload(stage) {
1575            let trace = payload.get("trace");
1576            let task_ledger = payload
1577                .get("task_ledger")
1578                .and_then(task_ledger_summary_from_value);
1579            let research_facts = task_ledger
1580                .as_ref()
1581                .map(|ledger| ledger.observations.clone())
1582                .unwrap_or_default();
1583            research_fact_count += research_facts.len();
1584            let tools_payload = payload.get("tools");
1585            let tools_used = json_string_array(
1586                tools_payload
1587                    .and_then(|tools| tools.get("calls"))
1588                    .or_else(|| trace.and_then(|trace| trace.get("tools_used"))),
1589            );
1590            let successful_tools =
1591                json_string_array(tools_payload.and_then(|tools| tools.get("successful")));
1592            let planner_round = RunPlannerRoundRecord {
1593                stage_id: stage.id.clone(),
1594                node_id: stage.node_id.clone(),
1595                stage_kind: stage.kind.clone(),
1596                status: stage.status.clone(),
1597                outcome: stage.outcome.clone(),
1598                iteration_count: json_usize(trace.and_then(|trace| trace.get("iterations"))),
1599                llm_call_count: json_usize(trace.and_then(|trace| trace.get("llm_calls"))),
1600                tool_execution_count: json_usize(
1601                    trace.and_then(|trace| trace.get("tool_executions")),
1602                ),
1603                tool_rejection_count: json_usize(
1604                    trace.and_then(|trace| trace.get("tool_rejections")),
1605                ),
1606                intervention_count: json_usize(trace.and_then(|trace| trace.get("interventions"))),
1607                compaction_count: json_usize(trace.and_then(|trace| trace.get("compactions"))),
1608                native_text_tool_fallback_count: json_usize(
1609                    trace.and_then(|trace| trace.get("native_text_tool_fallbacks")),
1610                ),
1611                native_text_tool_fallback_rejection_count: json_usize(
1612                    trace.and_then(|trace| trace.get("native_text_tool_fallback_rejections")),
1613                ),
1614                empty_completion_retry_count: json_usize(
1615                    trace.and_then(|trace| trace.get("empty_completion_retries")),
1616                ),
1617                tools_used,
1618                successful_tools,
1619                ledger_done_rejections: json_usize(payload.get("ledger_done_rejections")),
1620                task_ledger,
1621                research_facts,
1622            };
1623            let has_agentic_detail = planner_round.iteration_count > 0
1624                || planner_round.llm_call_count > 0
1625                || planner_round.tool_execution_count > 0
1626                || planner_round.native_text_tool_fallback_count > 0
1627                || planner_round.native_text_tool_fallback_rejection_count > 0
1628                || planner_round.empty_completion_retry_count > 0
1629                || planner_round.ledger_done_rejections > 0
1630                || planner_round.task_ledger.is_some()
1631                || !planner_round.tools_used.is_empty()
1632                || !planner_round.successful_tools.is_empty();
1633            if has_agentic_detail {
1634                planner_rounds.push(planner_round);
1635            }
1636        }
1637    }
1638
1639    for transition in &run.transitions {
1640        let Some(to_id) = stage_by_node_id.get(&transition.to_node_id).cloned() else {
1641            continue;
1642        };
1643        let from_stage = transition
1644            .from_stage_id
1645            .as_deref()
1646            .and_then(|stage_id| stage_by_id.get(stage_id).copied());
1647        let from_id = transition
1648            .from_stage_id
1649            .as_ref()
1650            .and_then(|stage_id| stage_node_ids.get(stage_id))
1651            .cloned()
1652            .or_else(|| {
1653                transition
1654                    .from_node_id
1655                    .as_ref()
1656                    .and_then(|node_id| stage_by_node_id.get(node_id))
1657                    .cloned()
1658            })
1659            .unwrap_or_else(|| root_node_id.clone());
1660        action_graph_edges.push(RunActionGraphEdgeRecord {
1661            from_id,
1662            to_id,
1663            kind: if from_stage.is_some_and(|stage| stage.kind == "condition") {
1664                ACTION_GRAPH_EDGE_KIND_PREDICATE_GATE.to_string()
1665            } else {
1666                ACTION_GRAPH_EDGE_KIND_TRANSITION.to_string()
1667            },
1668            label: transition.branch.clone(),
1669        });
1670    }
1671
1672    let worker_lineage = run
1673        .child_runs
1674        .iter()
1675        .map(|child| {
1676            let worker_node_id = format!("worker:{}", child.worker_id);
1677            append_action_graph_node(
1678                &mut action_graph_nodes,
1679                RunActionGraphNodeRecord {
1680                    id: worker_node_id.clone(),
1681                    label: child.worker_name.clone(),
1682                    kind: ACTION_GRAPH_NODE_KIND_WORKER.to_string(),
1683                    status: child.status.clone(),
1684                    outcome: child.status.clone(),
1685                    trace_id: propagated_trace_id.clone(),
1686                    stage_id: child.parent_stage_id.clone(),
1687                    node_id: None,
1688                    worker_id: Some(child.worker_id.clone()),
1689                    run_id: child.run_id.clone(),
1690                    run_path: child.run_path.clone(),
1691                    metadata: BTreeMap::from([
1692                        (
1693                            "worker_name".to_string(),
1694                            serde_json::json!(child.worker_name),
1695                        ),
1696                        ("task".to_string(), serde_json::json!(child.task)),
1697                    ]),
1698                },
1699            );
1700            if let Some(parent_stage_id) = child.parent_stage_id.as_ref() {
1701                if let Some(stage_node_id) = stage_node_ids.get(parent_stage_id) {
1702                    action_graph_edges.push(RunActionGraphEdgeRecord {
1703                        from_id: stage_node_id.clone(),
1704                        to_id: worker_node_id,
1705                        kind: ACTION_GRAPH_EDGE_KIND_DELEGATES.to_string(),
1706                        label: Some(child.worker_name.clone()),
1707                    });
1708                }
1709            }
1710            RunWorkerLineageRecord {
1711                worker_id: child.worker_id.clone(),
1712                worker_name: child.worker_name.clone(),
1713                parent_stage_id: child.parent_stage_id.clone(),
1714                task: child.task.clone(),
1715                status: child.status.clone(),
1716                session_id: child.session_id.clone(),
1717                parent_session_id: child.parent_session_id.clone(),
1718                run_id: child.run_id.clone(),
1719                run_path: child.run_path.clone(),
1720                snapshot_path: child.snapshot_path.clone(),
1721            }
1722        })
1723        .collect::<Vec<_>>();
1724
1725    if run.transcript.is_some() {
1726        transcript_pointers.push(RunTranscriptPointerRecord {
1727            id: "run:transcript".to_string(),
1728            label: "Run transcript".to_string(),
1729            kind: "embedded_transcript".to_string(),
1730            location: "run.transcript".to_string(),
1731            path: run.persisted_path.clone(),
1732            available: true,
1733        });
1734        if let Some(transcript) = run.transcript.as_ref() {
1735            compaction_events.extend(compaction_events_from_transcript(
1736                transcript,
1737                None,
1738                None,
1739                "run.transcript",
1740                persisted_path,
1741            ));
1742        }
1743    }
1744
1745    if let Some(path) = persisted_path {
1746        if let Some(sidecar_path) = llm_transcript_sidecar_path(path) {
1747            transcript_pointers.push(RunTranscriptPointerRecord {
1748                id: "run:llm_transcript".to_string(),
1749                label: "LLM transcript sidecar".to_string(),
1750                kind: "llm_jsonl".to_string(),
1751                location: "run sidecar".to_string(),
1752                path: Some(sidecar_path.to_string_lossy().into_owned()),
1753                available: sidecar_path.exists(),
1754            });
1755        }
1756        daemon_events.extend(daemon_events_from_sidecar(path));
1757    }
1758
1759    RunObservabilityRecord {
1760        schema_version: 4,
1761        planner_rounds,
1762        research_fact_count,
1763        action_graph_nodes,
1764        action_graph_edges,
1765        worker_lineage,
1766        verification_outcomes,
1767        transcript_pointers,
1768        compaction_events,
1769        daemon_events,
1770    }
1771}
1772
1773fn refresh_run_observability(run: &mut RunRecord, persisted_path: Option<&Path>) {
1774    run.observability = Some(derive_run_observability(run, persisted_path));
1775}
1776
1777pub fn normalize_run_record(value: &VmValue) -> Result<RunRecord, VmError> {
1778    let mut run: RunRecord = parse_json_payload(vm_value_to_json(value), "run_record")?;
1779    if run.type_name.is_empty() {
1780        run.type_name = "run_record".to_string();
1781    }
1782    if run.id.is_empty() {
1783        run.id = new_id("run");
1784    }
1785    if run.started_at.is_empty() {
1786        run.started_at = now_rfc3339();
1787    }
1788    if run.status.is_empty() {
1789        run.status = "running".to_string();
1790    }
1791    if run.root_run_id.is_none() {
1792        run.root_run_id = Some(run.id.clone());
1793    }
1794    if run.replay_fixture.is_none() {
1795        run.replay_fixture = Some(replay_fixture_from_run(&run));
1796    }
1797    merge_hitl_questions_from_active_log(&mut run);
1798    materialize_child_runs_from_stage_metadata(&mut run);
1799    sync_run_handoffs(&mut run);
1800    if run.observability.is_none() {
1801        let persisted_path = run.persisted_path.clone();
1802        let persisted = persisted_path.as_deref().map(Path::new);
1803        refresh_run_observability(&mut run, persisted);
1804    }
1805    Ok(run)
1806}
1807
1808pub fn normalize_eval_suite_manifest(value: &VmValue) -> Result<EvalSuiteManifest, VmError> {
1809    let mut manifest: EvalSuiteManifest = parse_json_value(value)?;
1810    if manifest.type_name.is_empty() {
1811        manifest.type_name = "eval_suite_manifest".to_string();
1812    }
1813    if manifest.id.is_empty() {
1814        manifest.id = new_id("eval_suite");
1815    }
1816    Ok(manifest)
1817}
1818
1819pub fn load_eval_suite_manifest(path: &Path) -> Result<EvalSuiteManifest, VmError> {
1820    let content = std::fs::read_to_string(path)
1821        .map_err(|e| VmError::Runtime(format!("failed to read eval suite manifest: {e}")))?;
1822    let mut manifest: EvalSuiteManifest = serde_json::from_str(&content)
1823        .map_err(|e| VmError::Runtime(format!("failed to parse eval suite manifest: {e}")))?;
1824    if manifest.base_dir.is_none() {
1825        manifest.base_dir = path.parent().map(|parent| parent.display().to_string());
1826    }
1827    Ok(manifest)
1828}
1829
1830pub fn load_eval_pack_manifest(path: &Path) -> Result<EvalPackManifest, VmError> {
1831    let content = std::fs::read_to_string(path)
1832        .map_err(|e| VmError::Runtime(format!("failed to read eval pack manifest: {e}")))?;
1833    let mut manifest: EvalPackManifest =
1834        if path.extension().and_then(|ext| ext.to_str()) == Some("json") {
1835            serde_json::from_str(&content)
1836                .map_err(|e| VmError::Runtime(format!("failed to parse eval pack JSON: {e}")))?
1837        } else {
1838            toml::from_str(&content)
1839                .map_err(|e| VmError::Runtime(format!("failed to parse eval pack TOML: {e}")))?
1840        };
1841    normalize_eval_pack_manifest(&mut manifest);
1842    if manifest.base_dir.is_none() {
1843        manifest.base_dir = path.parent().map(|parent| parent.display().to_string());
1844    }
1845    Ok(manifest)
1846}
1847
1848pub fn normalize_eval_pack_manifest_value(value: &VmValue) -> Result<EvalPackManifest, VmError> {
1849    let mut manifest: EvalPackManifest = parse_json_value(value)?;
1850    normalize_eval_pack_manifest(&mut manifest);
1851    Ok(manifest)
1852}
1853
1854fn normalize_eval_pack_manifest(manifest: &mut EvalPackManifest) {
1855    if manifest.version == 0 {
1856        manifest.version = 1;
1857    }
1858    if manifest.id.is_empty() {
1859        manifest.id = manifest
1860            .name
1861            .clone()
1862            .filter(|name| !name.trim().is_empty())
1863            .unwrap_or_else(|| new_id("eval_pack"));
1864    }
1865    for ladder in &mut manifest.ladders {
1866        super::normalize_persona_eval_ladder_manifest(ladder);
1867    }
1868}
1869
1870fn load_replay_fixture(path: &Path) -> Result<ReplayFixture, VmError> {
1871    let content = std::fs::read_to_string(path)
1872        .map_err(|e| VmError::Runtime(format!("failed to read replay fixture: {e}")))?;
1873    serde_json::from_str(&content)
1874        .map_err(|e| VmError::Runtime(format!("failed to parse replay fixture: {e}")))
1875}
1876
1877fn load_run_record_from_fixture_ref(
1878    fixture: &EvalPackFixtureRef,
1879    base_dir: Option<&Path>,
1880) -> Result<RunRecord, VmError> {
1881    if let Some(inline) = &fixture.inline {
1882        let run: RunRecord = serde_json::from_value(inline.clone())
1883            .map_err(|e| VmError::Runtime(format!("failed to parse inline run record: {e}")))?;
1884        return Ok(run);
1885    }
1886    let path = fixture.path.as_deref().ok_or_else(|| {
1887        VmError::Runtime(format!(
1888            "fixture '{}' is missing path or inline run",
1889            fixture.id
1890        ))
1891    })?;
1892    load_run_record(&resolve_manifest_path(base_dir, path))
1893}
1894
1895fn load_replay_fixture_from_ref(
1896    fixture: &EvalPackFixtureRef,
1897    base_dir: Option<&Path>,
1898) -> Result<ReplayFixture, VmError> {
1899    if let Some(inline) = &fixture.inline {
1900        return serde_json::from_value(inline.clone())
1901            .map_err(|e| VmError::Runtime(format!("failed to parse inline replay fixture: {e}")));
1902    }
1903    let path = fixture.path.as_deref().ok_or_else(|| {
1904        VmError::Runtime(format!(
1905            "fixture '{}' is missing path or inline replay fixture",
1906            fixture.id
1907        ))
1908    })?;
1909    load_replay_fixture(&resolve_manifest_path(base_dir, path))
1910}
1911
/// Resolve a manifest-relative path string: absolute paths pass through
/// unchanged; relative paths are joined onto `base_dir` when one is
/// available and returned as-is otherwise.
fn resolve_manifest_path(base_dir: Option<&Path>, path: &str) -> PathBuf {
    let candidate = PathBuf::from(path);
    match base_dir {
        Some(base) if !candidate.is_absolute() => base.join(candidate),
        _ => candidate,
    }
}
1922
1923pub fn evaluate_run_suite_manifest(
1924    manifest: &EvalSuiteManifest,
1925) -> Result<ReplayEvalSuiteReport, VmError> {
1926    let base_dir = manifest.base_dir.as_deref().map(Path::new);
1927    let mut reports = Vec::new();
1928    for case in &manifest.cases {
1929        let run_path = resolve_manifest_path(base_dir, &case.run_path);
1930        let run = load_run_record(&run_path)?;
1931        let fixture = match &case.fixture_path {
1932            Some(path) => load_replay_fixture(&resolve_manifest_path(base_dir, path))?,
1933            None => run
1934                .replay_fixture
1935                .clone()
1936                .unwrap_or_else(|| replay_fixture_from_run(&run)),
1937        };
1938        let eval = evaluate_run_against_fixture(&run, &fixture);
1939        let mut pass = eval.pass;
1940        let mut failures = eval.failures;
1941        let comparison = match &case.compare_to {
1942            Some(path) => {
1943                let baseline_path = resolve_manifest_path(base_dir, path);
1944                let baseline = load_run_record(&baseline_path)?;
1945                let diff = diff_run_records(&baseline, &run);
1946                if !diff.identical {
1947                    pass = false;
1948                    failures.push(format!(
1949                        "run differs from baseline {} with {} stage changes",
1950                        baseline_path.display(),
1951                        diff.stage_diffs.len()
1952                    ));
1953                }
1954                Some(diff)
1955            }
1956            None => None,
1957        };
1958        reports.push(ReplayEvalCaseReport {
1959            run_id: run.id.clone(),
1960            workflow_id: run.workflow_id.clone(),
1961            label: case.label.clone(),
1962            pass,
1963            failures,
1964            stage_count: eval.stage_count,
1965            source_path: Some(run_path.display().to_string()),
1966            comparison,
1967        });
1968    }
1969    let total = reports.len();
1970    let passed = reports.iter().filter(|report| report.pass).count();
1971    let failed = total.saturating_sub(passed);
1972    Ok(ReplayEvalSuiteReport {
1973        pass: failed == 0,
1974        total,
1975        passed,
1976        failed,
1977        cases: reports,
1978    })
1979}
1980
/// Evaluate every case and persona ladder in an eval pack manifest and
/// aggregate the results into an `EvalPackReport`.
///
/// Friction-event cases are routed to the dedicated friction evaluator;
/// all other cases load a run + replay fixture, apply pack- and
/// case-level thresholds, an optional baseline diff, and any referenced
/// rubrics. Only blocking failures fail the pack as a whole.
pub fn evaluate_eval_pack_manifest(manifest: &EvalPackManifest) -> Result<EvalPackReport, VmError> {
    let base_dir = manifest.base_dir.as_deref().map(Path::new);
    // Fixture references resolve against defaults.fixture_root (itself
    // resolved against base_dir) when set, otherwise against base_dir.
    let fixture_base_dir_buf = manifest
        .defaults
        .fixture_root
        .as_deref()
        .map(|root| resolve_manifest_path(base_dir, root));
    let fixture_base_dir = fixture_base_dir_buf.as_deref().or(base_dir);
    // Index fixtures and rubrics by id for lookup; entries with empty ids
    // cannot be referenced and are skipped.
    let fixtures_by_id: BTreeMap<&str, &EvalPackFixtureRef> = manifest
        .fixtures
        .iter()
        .filter(|fixture| !fixture.id.is_empty())
        .map(|fixture| (fixture.id.as_str(), fixture))
        .collect();
    let rubrics_by_id: BTreeMap<&str, &EvalPackRubric> = manifest
        .rubrics
        .iter()
        .filter(|rubric| !rubric.id.is_empty())
        .map(|rubric| (rubric.id.as_str(), rubric))
        .collect();

    let mut reports = Vec::new();
    for (index, case) in manifest.cases.iter().enumerate() {
        // Fall back to a positional id/label when the case declares none.
        let case_id = case
            .id
            .clone()
            .filter(|id| !id.trim().is_empty())
            .unwrap_or_else(|| format!("case_{}", index + 1));
        let label = case
            .name
            .clone()
            .or_else(|| case.id.clone())
            .unwrap_or_else(|| case_id.clone());
        let severity = eval_pack_case_severity(manifest, case);
        let blocking = severity == "blocking";
        let mut failures = Vec::new();
        let mut warnings = Vec::new();
        let mut informational = Vec::new();

        // Friction cases take a separate evaluation path and skip the
        // run/fixture machinery below entirely.
        if case.friction_events.is_some() {
            let report = evaluate_eval_pack_friction_case(
                manifest,
                case,
                &case_id,
                &label,
                &severity,
                blocking,
                base_dir,
                fixture_base_dir,
                &fixtures_by_id,
                &rubrics_by_id,
            )?;
            reports.push(report);
            continue;
        }

        let run = load_eval_pack_case_run(case, base_dir, fixture_base_dir, &fixtures_by_id)?;
        let fixture =
            load_eval_pack_case_fixture(case, base_dir, fixture_base_dir, &fixtures_by_id, &run)?;
        let eval = evaluate_run_against_fixture(&run, &fixture);
        failures.extend(eval.failures);
        // Pack-level default thresholds apply first, then the case's own
        // thresholds add any further failures.
        apply_eval_pack_thresholds(&run, &manifest.defaults.thresholds, &mut failures);
        apply_eval_pack_thresholds(&run, &case.thresholds, &mut failures);

        // Baseline diff: the case's compare_to takes precedence over the
        // pack-wide baseline.
        let comparison = match case.compare_to.as_ref().or(manifest.baseline.as_ref()) {
            Some(path) => {
                let baseline_path = resolve_manifest_path(base_dir, path);
                let baseline = load_run_record(&baseline_path)?;
                let diff = diff_run_records(&baseline, &run);
                if !diff.identical {
                    failures.push(format!(
                        "run differs from baseline {} with {} stage changes",
                        baseline_path.display(),
                        diff.stage_diffs.len()
                    ));
                }
                Some(diff)
            }
            None => None,
        };

        for rubric_id in &case.rubrics {
            let Some(rubric) = rubrics_by_id.get(rubric_id.as_str()) else {
                failures.push(format!("case references unknown rubric '{rubric_id}'"));
                continue;
            };
            apply_eval_pack_rubric(rubric, &run, &mut failures, &mut warnings);
        }

        // Non-blocking cases pass even with failures; the failure messages
        // are demoted to warnings or informational notes per severity.
        let pass = failures.is_empty() || !blocking;
        if !failures.is_empty() && !blocking {
            if severity == "warning" {
                warnings.append(&mut failures);
            } else {
                informational.append(&mut failures);
            }
        }
        reports.push(EvalPackCaseReport {
            id: case_id,
            label,
            severity,
            pass,
            blocking,
            run_id: run.id.clone(),
            workflow_id: run.workflow_id.clone(),
            source_path: eval_pack_case_source_path(
                case,
                base_dir,
                fixture_base_dir,
                &fixtures_by_id,
            ),
            stage_count: eval.stage_count,
            failures,
            warnings,
            informational,
            comparison,
        });
    }

    // Persona ladders inherit the pack's base_dir when they do not carry
    // their own.
    let mut ladder_reports = Vec::new();
    for ladder in &manifest.ladders {
        let mut ladder = ladder.clone();
        if ladder.base_dir.is_none() {
            ladder.base_dir = manifest.base_dir.clone();
        }
        ladder_reports.push(run_persona_eval_ladder(&ladder)?);
    }

    // Aggregate totals across cases and ladders. Only blocking failures
    // fail the pack; warning/informational counts are reported alongside.
    let case_total = reports.len();
    let ladder_total = ladder_reports.len();
    let total = case_total + ladder_total;
    let case_blocking_failed = reports
        .iter()
        .filter(|report| report.blocking && !report.failures.is_empty())
        .count();
    let ladder_blocking_failed = ladder_reports
        .iter()
        .filter(|report| report.blocking && !report.pass)
        .count();
    let blocking_failed = case_blocking_failed + ladder_blocking_failed;
    let warning_failed = reports
        .iter()
        .filter(|report| !report.warnings.is_empty())
        .count()
        + ladder_reports
            .iter()
            .filter(|report| !report.pass && report.severity == "warning")
            .count();
    let informational_failed = reports
        .iter()
        .filter(|report| !report.informational.is_empty())
        .count()
        + ladder_reports
            .iter()
            .filter(|report| !report.pass && report.severity == "informational")
            .count();
    let passed = reports.iter().filter(|report| report.pass).count()
        + ladder_reports.iter().filter(|report| report.pass).count();
    Ok(EvalPackReport {
        pack_id: manifest.id.clone(),
        pass: blocking_failed == 0,
        total,
        passed,
        failed: total.saturating_sub(passed),
        blocking_failed,
        warning_failed,
        informational_failed,
        cases: reports,
        ladders: ladder_reports,
    })
}
2152
2153#[allow(clippy::too_many_arguments)]
2154fn evaluate_eval_pack_friction_case(
2155    manifest: &EvalPackManifest,
2156    case: &EvalPackCase,
2157    case_id: &str,
2158    label: &str,
2159    severity: &str,
2160    blocking: bool,
2161    base_dir: Option<&Path>,
2162    fixture_base_dir: Option<&Path>,
2163    fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2164    rubrics_by_id: &BTreeMap<&str, &EvalPackRubric>,
2165) -> Result<EvalPackCaseReport, VmError> {
2166    let mut failures = Vec::new();
2167    let mut warnings = Vec::new();
2168    let mut informational = Vec::new();
2169    let events =
2170        load_eval_pack_case_friction_events(case, base_dir, fixture_base_dir, fixtures_by_id)?;
2171    let options = friction_suggestion_options(case, manifest);
2172    let suggestions = generate_context_pack_suggestions(&events, &options);
2173
2174    for rubric_id in &case.rubrics {
2175        let Some(rubric) = rubrics_by_id.get(rubric_id.as_str()) else {
2176            failures.push(format!("case references unknown rubric '{rubric_id}'"));
2177            continue;
2178        };
2179        apply_eval_pack_friction_rubric(rubric, &suggestions, &mut failures, &mut warnings);
2180    }
2181
2182    if case.rubrics.is_empty() && suggestions.is_empty() {
2183        failures.push("friction fixture produced no context-pack suggestions".to_string());
2184    }
2185
2186    let pass = failures.is_empty() || !blocking;
2187    if !failures.is_empty() && !blocking {
2188        if severity == "warning" {
2189            warnings.append(&mut failures);
2190        } else {
2191            informational.append(&mut failures);
2192        }
2193    }
2194
2195    Ok(EvalPackCaseReport {
2196        id: case_id.to_string(),
2197        label: label.to_string(),
2198        severity: severity.to_string(),
2199        pass,
2200        blocking,
2201        run_id: "friction_events".to_string(),
2202        workflow_id: String::new(),
2203        source_path: eval_pack_case_friction_source_path(
2204            case,
2205            base_dir,
2206            fixture_base_dir,
2207            fixtures_by_id,
2208        ),
2209        stage_count: events.len(),
2210        failures,
2211        warnings,
2212        informational,
2213        comparison: None,
2214    })
2215}
2216
2217fn eval_pack_case_severity(manifest: &EvalPackManifest, case: &EvalPackCase) -> String {
2218    normalize_eval_pack_severity(
2219        case.severity
2220            .as_deref()
2221            .or(case.thresholds.severity.as_deref())
2222            .or(manifest.defaults.severity.as_deref())
2223            .or(manifest.defaults.thresholds.severity.as_deref())
2224            .unwrap_or("blocking"),
2225    )
2226}
2227
/// Canonicalize a severity label to "blocking", "warning", or
/// "informational". Matching trims whitespace and ignores ASCII case;
/// unrecognized values default to "blocking".
fn normalize_eval_pack_severity(value: &str) -> String {
    let canonical = value.trim().to_ascii_lowercase();
    let normalized = if canonical == "warn" || canonical == "warning" {
        "warning"
    } else if canonical == "info" || canonical == "informational" {
        "informational"
    } else {
        "blocking"
    };
    normalized.to_string()
}
2235
2236fn load_eval_pack_case_run(
2237    case: &EvalPackCase,
2238    base_dir: Option<&Path>,
2239    fixture_base_dir: Option<&Path>,
2240    fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2241) -> Result<RunRecord, VmError> {
2242    if let Some(run_ref) = case.run.as_deref().or(case.run_path.as_deref()) {
2243        if let Some(fixture) = fixtures_by_id.get(run_ref) {
2244            return load_run_record_from_fixture_ref(fixture, fixture_base_dir);
2245        }
2246        return load_run_record(&resolve_manifest_path(base_dir, run_ref));
2247    }
2248    Err(VmError::Runtime(
2249        "eval pack case is missing run or run_path".to_string(),
2250    ))
2251}
2252
2253fn load_eval_pack_case_fixture(
2254    case: &EvalPackCase,
2255    base_dir: Option<&Path>,
2256    fixture_base_dir: Option<&Path>,
2257    fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2258    run: &RunRecord,
2259) -> Result<ReplayFixture, VmError> {
2260    if let Some(fixture_ref) = case.fixture.as_deref().or(case.fixture_path.as_deref()) {
2261        if let Some(fixture) = fixtures_by_id.get(fixture_ref) {
2262            return load_replay_fixture_from_ref(fixture, fixture_base_dir);
2263        }
2264        return load_replay_fixture(&resolve_manifest_path(base_dir, fixture_ref));
2265    }
2266    Ok(run
2267        .replay_fixture
2268        .clone()
2269        .unwrap_or_else(|| replay_fixture_from_run(run)))
2270}
2271
2272fn eval_pack_case_source_path(
2273    case: &EvalPackCase,
2274    base_dir: Option<&Path>,
2275    fixture_base_dir: Option<&Path>,
2276    fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2277) -> Option<String> {
2278    let run_ref = case.run.as_deref().or(case.run_path.as_deref())?;
2279    if let Some(fixture) = fixtures_by_id.get(run_ref) {
2280        return fixture.path.as_ref().map(|path| {
2281            resolve_manifest_path(fixture_base_dir, path)
2282                .display()
2283                .to_string()
2284        });
2285    }
2286    Some(
2287        resolve_manifest_path(base_dir, run_ref)
2288            .display()
2289            .to_string(),
2290    )
2291}
2292
2293fn load_eval_pack_case_friction_events(
2294    case: &EvalPackCase,
2295    base_dir: Option<&Path>,
2296    fixture_base_dir: Option<&Path>,
2297    fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2298) -> Result<Vec<FrictionEvent>, VmError> {
2299    let event_ref = case.friction_events.as_deref().ok_or_else(|| {
2300        VmError::Runtime("eval pack friction case is missing friction_events".to_string())
2301    })?;
2302    if let Some(fixture) = fixtures_by_id.get(event_ref) {
2303        return load_friction_events_from_fixture_ref(fixture, fixture_base_dir);
2304    }
2305    load_friction_events_from_path(&resolve_manifest_path(base_dir, event_ref))
2306}
2307
2308fn load_friction_events_from_fixture_ref(
2309    fixture: &EvalPackFixtureRef,
2310    base_dir: Option<&Path>,
2311) -> Result<Vec<FrictionEvent>, VmError> {
2312    if let Some(inline) = &fixture.inline {
2313        return normalize_friction_events_json(inline.clone());
2314    }
2315    let path = fixture.path.as_deref().ok_or_else(|| {
2316        VmError::Runtime(format!(
2317            "fixture '{}' is missing path or inline friction events",
2318            fixture.id
2319        ))
2320    })?;
2321    load_friction_events_from_path(&resolve_manifest_path(base_dir, path))
2322}
2323
2324fn load_friction_events_from_path(path: &Path) -> Result<Vec<FrictionEvent>, VmError> {
2325    let content = std::fs::read_to_string(path)
2326        .map_err(|e| VmError::Runtime(format!("failed to read friction events fixture: {e}")))?;
2327    let value: serde_json::Value = serde_json::from_str(&content)
2328        .map_err(|e| VmError::Runtime(format!("failed to parse friction events fixture: {e}")))?;
2329    normalize_friction_events_json(value)
2330}
2331
2332fn eval_pack_case_friction_source_path(
2333    case: &EvalPackCase,
2334    base_dir: Option<&Path>,
2335    fixture_base_dir: Option<&Path>,
2336    fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2337) -> Option<String> {
2338    let event_ref = case.friction_events.as_deref()?;
2339    if let Some(fixture) = fixtures_by_id.get(event_ref) {
2340        return fixture.path.as_ref().map(|path| {
2341            resolve_manifest_path(fixture_base_dir, path)
2342                .display()
2343                .to_string()
2344        });
2345    }
2346    Some(
2347        resolve_manifest_path(base_dir, event_ref)
2348            .display()
2349            .to_string(),
2350    )
2351}
2352
2353fn friction_suggestion_options(
2354    case: &EvalPackCase,
2355    manifest: &EvalPackManifest,
2356) -> ContextPackSuggestionOptions {
2357    let min_occurrences = case
2358        .metadata
2359        .get("min_occurrences")
2360        .or_else(|| manifest.metadata.get("min_occurrences"))
2361        .and_then(|value| value.as_u64())
2362        .unwrap_or(2) as usize;
2363    let owner = case
2364        .metadata
2365        .get("owner")
2366        .or_else(|| manifest.metadata.get("owner"))
2367        .and_then(|value| value.as_str())
2368        .map(str::to_string)
2369        .or_else(|| {
2370            manifest
2371                .package
2372                .as_ref()
2373                .and_then(|package| package.name.clone())
2374        });
2375    ContextPackSuggestionOptions {
2376        min_occurrences,
2377        owner,
2378    }
2379}
2380
2381fn apply_eval_pack_thresholds(
2382    run: &RunRecord,
2383    thresholds: &EvalPackThresholds,
2384    failures: &mut Vec<String>,
2385) {
2386    if let Some(max_stage_count) = thresholds.max_stage_count {
2387        if run.stages.len() > max_stage_count {
2388            failures.push(format!(
2389                "stage count {} exceeds threshold {}",
2390                run.stages.len(),
2391                max_stage_count
2392            ));
2393        }
2394    }
2395    if let Some(max_latency_ms) = thresholds.max_latency_ms {
2396        let actual = run
2397            .usage
2398            .as_ref()
2399            .map(|usage| usage.total_duration_ms)
2400            .unwrap_or_default();
2401        if actual > max_latency_ms {
2402            failures.push(format!(
2403                "latency {actual}ms exceeds threshold {max_latency_ms}ms"
2404            ));
2405        }
2406    }
2407    if let Some(max_cost_usd) = thresholds.max_cost_usd {
2408        let actual = run
2409            .usage
2410            .as_ref()
2411            .map(|usage| usage.total_cost)
2412            .unwrap_or_default();
2413        if actual > max_cost_usd {
2414            failures.push(format!(
2415                "cost ${actual:.6} exceeds threshold ${max_cost_usd:.6}"
2416            ));
2417        }
2418    }
2419    if let Some(max_tokens) = thresholds.max_tokens {
2420        let actual = run
2421            .usage
2422            .as_ref()
2423            .map(|usage| usage.input_tokens + usage.output_tokens)
2424            .unwrap_or_default();
2425        if actual > max_tokens {
2426            failures.push(format!(
2427                "token count {actual} exceeds threshold {max_tokens}"
2428            ));
2429        }
2430    }
2431}
2432
2433fn apply_eval_pack_rubric(
2434    rubric: &EvalPackRubric,
2435    run: &RunRecord,
2436    failures: &mut Vec<String>,
2437    warnings: &mut Vec<String>,
2438) {
2439    match rubric.kind.as_str() {
2440        "" | "deterministic" | "replay" | "budget" | "hitl" | "side-effect" => {
2441            apply_eval_pack_thresholds(run, &rubric.thresholds, failures);
2442            for assertion in &rubric.assertions {
2443                apply_eval_pack_assertion(rubric, assertion, run, failures);
2444            }
2445        }
2446        "llm-judge" | "llm_as_judge" | "judge" => {
2447            let severity = normalize_eval_pack_severity(
2448                rubric.thresholds.severity.as_deref().unwrap_or("blocking"),
2449            );
2450            let message = format!(
2451                "rubric '{}' requires an external LLM judge and was not run locally",
2452                rubric.id
2453            );
2454            if severity == "blocking" {
2455                failures.push(message);
2456            } else {
2457                warnings.push(message);
2458            }
2459        }
2460        other => warnings.push(format!(
2461            "rubric '{}' has unknown kind '{}' and was not run locally",
2462            rubric.id, other
2463        )),
2464    }
2465}
2466
2467fn apply_eval_pack_friction_rubric(
2468    rubric: &EvalPackRubric,
2469    suggestions: &[super::ContextPackSuggestion],
2470    failures: &mut Vec<String>,
2471    warnings: &mut Vec<String>,
2472) {
2473    match rubric.kind.as_str() {
2474        "" | "deterministic" | "friction" | "context-pack-suggestion" => {
2475            let mut expectations = Vec::new();
2476            for assertion in &rubric.assertions {
2477                match assertion.kind.as_str() {
2478                    "context-pack-suggestion" | "context_pack_suggestion" | "suggestion" => {
2479                        let expectation = context_pack_expectation_from_assertion(assertion);
2480                        expectations.push(expectation);
2481                    }
2482                    other => failures.push(format!(
2483                        "rubric '{}' has unsupported friction assertion kind '{}'",
2484                        rubric.id, other
2485                    )),
2486                }
2487            }
2488            failures.extend(evaluate_context_pack_suggestion_expectations(
2489                suggestions,
2490                &expectations,
2491            ));
2492        }
2493        other => warnings.push(format!(
2494            "rubric '{}' has unknown friction kind '{}' and was not run locally",
2495            rubric.id, other
2496        )),
2497    }
2498}
2499
2500fn context_pack_expectation_from_assertion(
2501    assertion: &EvalPackAssertion,
2502) -> ContextPackSuggestionExpectation {
2503    let expected = assertion
2504        .expected
2505        .as_ref()
2506        .and_then(|value| value.as_object());
2507    let expected_string = assertion.expected.as_ref().and_then(|value| value.as_str());
2508    ContextPackSuggestionExpectation {
2509        min_suggestions: expected
2510            .and_then(|map| map.get("min_suggestions"))
2511            .and_then(|value| value.as_u64())
2512            .map(|value| value as usize),
2513        recommended_artifact: expected
2514            .and_then(|map| map.get("recommended_artifact"))
2515            .and_then(|value| value.as_str())
2516            .map(str::to_string)
2517            .or_else(|| expected_string.map(str::to_string)),
2518        title_contains: assertion.contains.clone().or_else(|| {
2519            expected
2520                .and_then(|map| map.get("title_contains"))
2521                .and_then(|value| value.as_str())
2522                .map(str::to_string)
2523        }),
2524        manifest_name_contains: expected
2525            .and_then(|map| map.get("manifest_name_contains"))
2526            .and_then(|value| value.as_str())
2527            .map(str::to_string),
2528        required_capability: expected
2529            .and_then(|map| map.get("required_capability"))
2530            .and_then(|value| value.as_str())
2531            .map(str::to_string),
2532        required_output_slot: expected
2533            .and_then(|map| map.get("required_output_slot"))
2534            .and_then(|value| value.as_str())
2535            .map(str::to_string),
2536    }
2537}
2538
/// Apply a single deterministic eval-pack assertion to a run record,
/// appending a human-readable message to `failures` for every violated check.
///
/// Supported assertion kinds (hyphen/underscore aliases accepted):
/// run status, per-stage status, visible-text substring, and HITL-question
/// substring. An empty kind is a no-op; any other kind is itself a failure.
fn apply_eval_pack_assertion(
    rubric: &EvalPackRubric,
    assertion: &EvalPackAssertion,
    run: &RunRecord,
    failures: &mut Vec<String>,
) {
    match assertion.kind.as_str() {
        "run-status" | "run_status" | "status" => {
            // A non-string (or absent) expected value silently skips the check.
            let expected = assertion.expected.as_ref().and_then(|value| value.as_str());
            if let Some(expected) = expected {
                if run.status != expected {
                    failures.push(format!(
                        "rubric '{}' expected run status {}, got {}",
                        rubric.id, expected, run.status
                    ));
                }
            }
        }
        "stage-status" | "stage_status" => {
            // Both a stage id and an expected string are mandatory here;
            // a missing field is a rubric-authoring error, reported as failure.
            let Some(stage_id) = assertion.stage.as_deref() else {
                failures.push(format!(
                    "rubric '{}' stage-status assertion missing stage",
                    rubric.id
                ));
                return;
            };
            let expected = assertion.expected.as_ref().and_then(|value| value.as_str());
            let Some(expected) = expected else {
                failures.push(format!(
                    "rubric '{}' stage-status assertion missing expected string",
                    rubric.id
                ));
                return;
            };
            // Distinguish "stage exists with wrong status" from "stage missing".
            match run.stages.iter().find(|stage| stage.node_id == stage_id) {
                Some(stage) if stage.status == expected => {}
                Some(stage) => failures.push(format!(
                    "rubric '{}' expected stage {} status {}, got {}",
                    rubric.id, stage_id, expected, stage.status
                )),
                None => failures.push(format!(
                    "rubric '{}' expected stage {} to exist",
                    rubric.id, stage_id
                )),
            }
        }
        "visible-text-contains" | "visible_text_contains" => {
            let Some(needle) = assertion.contains.as_deref() else {
                failures.push(format!(
                    "rubric '{}' visible-text assertion missing contains",
                    rubric.id
                ));
                return;
            };
            // With a stage id the needle must appear in that stage's visible
            // text; without one, any stage's visible text may satisfy it.
            let matched = match assertion.stage.as_deref() {
                Some(stage_id) => run
                    .stages
                    .iter()
                    .find(|stage| stage.node_id == stage_id)
                    .and_then(|stage| stage.visible_text.as_deref())
                    .is_some_and(|text| text.contains(needle)),
                None => run
                    .stages
                    .iter()
                    .filter_map(|stage| stage.visible_text.as_deref())
                    .any(|text| text.contains(needle)),
            };
            if !matched {
                failures.push(format!(
                    "rubric '{}' expected visible text to contain {:?}",
                    rubric.id, needle
                ));
            }
        }
        "hitl-question-contains" | "hitl_question_contains" => {
            let Some(needle) = assertion.contains.as_deref() else {
                failures.push(format!(
                    "rubric '{}' HITL assertion missing contains",
                    rubric.id
                ));
                return;
            };
            // Any recorded human-in-the-loop prompt may match.
            if !run
                .hitl_questions
                .iter()
                .any(|question| question.prompt.contains(needle))
            {
                failures.push(format!(
                    "rubric '{}' expected HITL question to contain {:?}",
                    rubric.id, needle
                ));
            }
        }
        // An empty kind is tolerated as a deliberate no-op.
        "" => {}
        other => failures.push(format!(
            "rubric '{}' has unsupported assertion kind '{}'",
            rubric.id, other
        )),
    }
}
2639
/// Edit operation in a diff sequence.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub(crate) enum DiffOp {
    Equal,
    Delete,
    Insert,
}

/// Compute the shortest edit script using Myers' O(nd) algorithm.
/// Returns a sequence of (DiffOp, line_index_in_before_or_after):
/// `Equal`/`Delete` carry an index into `a`, `Insert` an index into `b`.
/// Time: O(nd) where d = edit distance. Space: O(d * n).
pub(crate) fn myers_diff(a: &[&str], b: &[&str]) -> Vec<(DiffOp, usize)> {
    let n = a.len() as isize;
    let m = b.len() as isize;
    if n == 0 && m == 0 {
        return Vec::new();
    }
    if n == 0 {
        // Everything in `b` is an insertion.
        return (0..m as usize).map(|j| (DiffOp::Insert, j)).collect();
    }
    if m == 0 {
        // Everything in `a` is a deletion.
        return (0..n as usize).map(|i| (DiffOp::Delete, i)).collect();
    }

    let max_d = (n + m) as usize;
    let offset = max_d as isize;
    let v_size = 2 * max_d + 1;
    // v[k + offset] = furthest x reached so far on diagonal k = x - y.
    let mut v = vec![0isize; v_size];
    // trace[d] holds the `v` snapshot BEFORE step d ran — required for backtrack.
    let mut trace: Vec<Vec<isize>> = Vec::new();

    'outer: for d in 0..=max_d as isize {
        trace.push(v.clone());
        for k in (-d..=d).step_by(2) {
            let ki = (k + offset) as usize;
            // Step d only writes diagonals whose parity matches d, while the
            // reads below (ki - 1, ki + 1) touch the opposite parity written
            // during step d - 1, so updating `v` in place is safe and avoids
            // cloning the whole vector every step.
            let mut x = if k == -d || (k != d && v[ki - 1] < v[ki + 1]) {
                v[ki + 1] // came from k + 1: move down (insertion)
            } else {
                v[ki - 1] + 1 // came from k - 1: move right (deletion)
            };
            let mut y = x - k;
            // Follow the "snake": consume the run of matching lines.
            while x < n && y < m && a[x as usize] == b[y as usize] {
                x += 1;
                y += 1;
            }
            v[ki] = x;
            if x >= n && y >= m {
                break 'outer;
            }
        }
    }

    // Backtrack from (n, m) through the snapshots to emit the edit script.
    let mut ops: Vec<(DiffOp, usize)> = Vec::new();
    let mut x = n;
    let mut y = m;
    for d in (1..trace.len() as isize).rev() {
        let k = x - y;
        let v_prev = &trace[d as usize];
        // Re-derive which neighbouring diagonal step d extended (same rule
        // as the forward pass).
        let prev_k = if k == -d
            || (k != d && v_prev[(k - 1 + offset) as usize] < v_prev[(k + 1 + offset) as usize])
        {
            k + 1
        } else {
            k - 1
        };
        let prev_x = v_prev[(prev_k + offset) as usize];
        let prev_y = prev_x - prev_k;

        // Walk back through the snake (matching lines).
        while x > prev_x && y > prev_y {
            x -= 1;
            y -= 1;
            ops.push((DiffOp::Equal, x as usize));
        }
        if prev_k < k {
            x -= 1;
            ops.push((DiffOp::Delete, x as usize));
        } else {
            y -= 1;
            ops.push((DiffOp::Insert, y as usize));
        }
    }
    // Any remaining common prefix is a snake at d = 0.
    while x > 0 && y > 0 {
        x -= 1;
        y -= 1;
        ops.push((DiffOp::Equal, x as usize));
    }
    ops.reverse();
    ops
}
2732
2733pub fn render_unified_diff(path: Option<&str>, before: &str, after: &str) -> String {
2734    let before_lines: Vec<&str> = before.lines().collect();
2735    let after_lines: Vec<&str> = after.lines().collect();
2736    let ops = myers_diff(&before_lines, &after_lines);
2737
2738    let mut diff = String::new();
2739    let file = path.unwrap_or("artifact");
2740    diff.push_str(&format!("--- a/{file}\n+++ b/{file}\n"));
2741    for &(op, idx) in &ops {
2742        match op {
2743            DiffOp::Equal => diff.push_str(&format!(" {}\n", before_lines[idx])),
2744            DiffOp::Delete => diff.push_str(&format!("-{}\n", before_lines[idx])),
2745            DiffOp::Insert => diff.push_str(&format!("+{}\n", after_lines[idx])),
2746        }
2747    }
2748    diff
2749}
2750
/// Persist a run record as pretty-printed JSON, materializing derived state
/// (HITL questions, child runs, replay fixture, handoffs, observability)
/// before writing. Returns the path written to.
///
/// When `path` is `None` the record lands at `<default_run_dir>/<run.id>.json`.
/// The write is atomic; an action-graph event is published afterwards when
/// observability data is present.
pub fn save_run_record(run: &RunRecord, path: Option<&str>) -> Result<String, VmError> {
    let path = path
        .map(PathBuf::from)
        .unwrap_or_else(|| default_run_dir().join(format!("{}.json", run.id)));
    // Work on a copy so the caller's record is untouched.
    let mut materialized = run.clone();
    merge_hitl_questions_from_active_log(&mut materialized);
    materialize_child_runs_from_stage_metadata(&mut materialized);
    // Only synthesize a fixture when none was recorded during the run.
    if materialized.replay_fixture.is_none() {
        materialized.replay_fixture = Some(replay_fixture_from_run(&materialized));
    }
    materialized.persisted_path = Some(path.to_string_lossy().into_owned());
    sync_run_handoffs(&mut materialized);
    // Refresh observability last so it reflects all the mutations above.
    refresh_run_observability(&mut materialized, Some(&path));
    if let Some(parent) = path.parent() {
        std::fs::create_dir_all(parent)
            .map_err(|e| VmError::Runtime(format!("failed to create run directory: {e}")))?;
    }
    let json = serde_json::to_string_pretty(&materialized)
        .map_err(|e| VmError::Runtime(format!("failed to encode run record: {e}")))?;
    // Atomic write avoids torn records if the process dies mid-save.
    crate::atomic_io::atomic_write(&path, json.as_bytes())
        .map_err(|e| VmError::Runtime(format!("failed to persist run record: {e}")))?;
    if let Some(observability) = materialized.observability.as_ref() {
        publish_action_graph_event(&materialized, observability, &path);
    }
    Ok(path.to_string_lossy().into_owned())
}
2777
2778pub fn load_run_record(path: &Path) -> Result<RunRecord, VmError> {
2779    let content = std::fs::read_to_string(path)
2780        .map_err(|e| VmError::Runtime(format!("failed to read run record: {e}")))?;
2781    let mut run: RunRecord = serde_json::from_str(&content)
2782        .map_err(|e| VmError::Runtime(format!("failed to parse run record: {e}")))?;
2783    materialize_child_runs_from_stage_metadata(&mut run);
2784    if run.replay_fixture.is_none() {
2785        run.replay_fixture = Some(replay_fixture_from_run(&run));
2786    }
2787    run.persisted_path
2788        .get_or_insert_with(|| path.to_string_lossy().into_owned());
2789    sync_run_handoffs(&mut run);
2790    refresh_run_observability(&mut run, Some(path));
2791    Ok(run)
2792}
2793
2794pub fn replay_fixture_from_run(run: &RunRecord) -> ReplayFixture {
2795    ReplayFixture {
2796        type_name: "replay_fixture".to_string(),
2797        id: new_id("fixture"),
2798        source_run_id: run.id.clone(),
2799        workflow_id: run.workflow_id.clone(),
2800        workflow_name: run.workflow_name.clone(),
2801        created_at: now_rfc3339(),
2802        eval_kind: Some("replay".to_string()),
2803        clarifying_question: None,
2804        expected_status: run.status.clone(),
2805        stage_assertions: run
2806            .stages
2807            .iter()
2808            .map(|stage| ReplayStageAssertion {
2809                node_id: stage.node_id.clone(),
2810                expected_status: stage.status.clone(),
2811                expected_outcome: stage.outcome.clone(),
2812                expected_branch: stage.branch.clone(),
2813                required_artifact_kinds: stage
2814                    .artifacts
2815                    .iter()
2816                    .map(|artifact| artifact.kind.clone())
2817                    .collect(),
2818                visible_text_contains: stage
2819                    .visible_text
2820                    .as_ref()
2821                    .filter(|text| !text.is_empty())
2822                    .map(|text| text.chars().take(80).collect()),
2823            })
2824            .collect(),
2825    }
2826}
2827
/// Evaluate a run against a replay fixture and report every mismatch.
///
/// Fixtures with `eval_kind == "clarifying_question"` delegate to the
/// question-matching evaluator; otherwise the run status and each stage
/// assertion (status, outcome, branch, required artifact kinds, visible-text
/// snippet) are checked and each violation appended as a failure message.
pub fn evaluate_run_against_fixture(run: &RunRecord, fixture: &ReplayFixture) -> ReplayEvalReport {
    if fixture.eval_kind.as_deref() == Some("clarifying_question") {
        return evaluate_clarifying_question(run, fixture);
    }
    let mut failures = Vec::new();
    if run.status != fixture.expected_status {
        failures.push(format!(
            "run status mismatch: expected {}, got {}",
            fixture.expected_status, run.status
        ));
    }
    // Index stages by node id for O(log n) assertion lookups.
    let stages_by_id: BTreeMap<&str, &RunStageRecord> =
        run.stages.iter().map(|s| (s.node_id.as_str(), s)).collect();
    for assertion in &fixture.stage_assertions {
        let Some(stage) = stages_by_id.get(assertion.node_id.as_str()) else {
            failures.push(format!("missing stage {}", assertion.node_id));
            continue;
        };
        if stage.status != assertion.expected_status {
            failures.push(format!(
                "stage {} status mismatch: expected {}, got {}",
                assertion.node_id, assertion.expected_status, stage.status
            ));
        }
        if stage.outcome != assertion.expected_outcome {
            failures.push(format!(
                "stage {} outcome mismatch: expected {}, got {}",
                assertion.node_id, assertion.expected_outcome, stage.outcome
            ));
        }
        if stage.branch != assertion.expected_branch {
            failures.push(format!(
                "stage {} branch mismatch: expected {:?}, got {:?}",
                assertion.node_id, assertion.expected_branch, stage.branch
            ));
        }
        // Every required artifact kind must appear at least once on the stage.
        for required_kind in &assertion.required_artifact_kinds {
            if !stage
                .artifacts
                .iter()
                .any(|artifact| &artifact.kind == required_kind)
            {
                failures.push(format!(
                    "stage {} missing artifact kind {}",
                    assertion.node_id, required_kind
                ));
            }
        }
        if let Some(snippet) = &assertion.visible_text_contains {
            // Absent visible text is treated as empty, so the check fails
            // for any non-empty snippet.
            let actual = stage.visible_text.clone().unwrap_or_default();
            if !actual.contains(snippet) {
                failures.push(format!(
                    "stage {} visible text does not contain expected snippet {:?}",
                    assertion.node_id, snippet
                ));
            }
        }
    }

    ReplayEvalReport {
        pass: failures.is_empty(),
        failures,
        stage_count: run.stages.len(),
    }
}
2893
/// Evaluate a run's HITL questions against a clarifying-question fixture.
///
/// Checks run status, the min/max question-count bounds from the fixture
/// spec, and — when the spec constrains content at all — that at least one
/// question matches: equals the expected question (if set), is among the
/// accepted questions (if any), contains every required term, and avoids
/// every forbidden term. All comparisons use normalized question text.
fn evaluate_clarifying_question(run: &RunRecord, fixture: &ReplayFixture) -> ReplayEvalReport {
    let mut failures = Vec::new();
    // A missing spec falls back to defaults via Default.
    let spec = fixture.clarifying_question.clone().unwrap_or_default();
    let min_questions = clarifying_min_questions(&spec);
    let max_questions = clarifying_max_questions(&spec);
    let questions = &run.hitl_questions;

    if run.status != fixture.expected_status {
        failures.push(format!(
            "run status mismatch: expected {}, got {}",
            fixture.expected_status, run.status
        ));
    }
    if questions.len() < min_questions {
        failures.push(format!(
            "expected at least {min_questions} clarifying question(s), got {}",
            questions.len()
        ));
    }
    if questions.len() > max_questions {
        failures.push(format!(
            "expected at most {max_questions} clarifying question(s), got {}",
            questions.len()
        ));
    }

    // Normalize fixture text once up front so every question comparison
    // below works on the same canonical form.
    let normalized_expected = spec
        .expected_question
        .as_deref()
        .map(normalize_question_text);
    let normalized_accepted = spec
        .accepted_questions
        .iter()
        .map(|question| normalize_question_text(question))
        .collect::<Vec<_>>();
    let required_terms = spec
        .required_terms
        .iter()
        .map(|term| normalize_question_text(term))
        .collect::<Vec<_>>();
    let forbidden_terms = spec
        .forbidden_terms
        .iter()
        .map(|term| normalize_question_text(term))
        .collect::<Vec<_>>();

    // A question matches when it satisfies every configured constraint;
    // unset constraints (None / empty lists) are vacuously satisfied.
    let matched = questions.iter().any(|question| {
        let normalized = normalize_question_text(&question.prompt);
        let matches_expected = normalized_expected
            .as_ref()
            .is_none_or(|expected| &normalized == expected)
            && (normalized_accepted.is_empty()
                || normalized_accepted
                    .iter()
                    .any(|candidate| candidate == &normalized));
        let has_required_terms = required_terms
            .iter()
            .all(|term| normalized.contains(term.as_str()));
        let avoids_forbidden_terms = forbidden_terms
            .iter()
            .all(|term| !normalized.contains(term.as_str()));
        matches_expected && has_required_terms && avoids_forbidden_terms
    });

    // Only report a content mismatch when the spec actually constrains
    // content and there are questions to match against.
    if !questions.is_empty()
        && (!normalized_accepted.is_empty()
            || normalized_expected.is_some()
            || !required_terms.is_empty()
            || !forbidden_terms.is_empty())
        && !matched
    {
        failures.push(format!(
            "no clarifying question matched fixture; actual questions: {}",
            questions
                .iter()
                .map(|question| format!("{:?}", question.prompt))
                .collect::<Vec<_>>()
                .join(", ")
        ));
    }

    ReplayEvalReport {
        pass: failures.is_empty(),
        failures,
        stage_count: run.stages.len(),
    }
}
2981
2982pub fn evaluate_run_suite(
2983    cases: Vec<(RunRecord, ReplayFixture, Option<String>)>,
2984) -> ReplayEvalSuiteReport {
2985    let mut reports = Vec::new();
2986    for (run, fixture, source_path) in cases {
2987        let report = evaluate_run_against_fixture(&run, &fixture);
2988        reports.push(ReplayEvalCaseReport {
2989            run_id: run.id.clone(),
2990            workflow_id: run.workflow_id.clone(),
2991            label: None,
2992            pass: report.pass,
2993            failures: report.failures,
2994            stage_count: report.stage_count,
2995            source_path,
2996            comparison: None,
2997        });
2998    }
2999    let total = reports.len();
3000    let passed = reports.iter().filter(|report| report.pass).count();
3001    let failed = total.saturating_sub(passed);
3002    ReplayEvalSuiteReport {
3003        pass: failed == 0,
3004        total,
3005        passed,
3006        failed,
3007        cases: reports,
3008    }
3009}
3010
3011pub fn diff_run_records(left: &RunRecord, right: &RunRecord) -> RunDiffReport {
3012    let mut stage_diffs = Vec::new();
3013    let mut all_node_ids = BTreeSet::new();
3014    let left_by_id: BTreeMap<&str, &RunStageRecord> = left
3015        .stages
3016        .iter()
3017        .map(|s| (s.node_id.as_str(), s))
3018        .collect();
3019    let right_by_id: BTreeMap<&str, &RunStageRecord> = right
3020        .stages
3021        .iter()
3022        .map(|s| (s.node_id.as_str(), s))
3023        .collect();
3024    all_node_ids.extend(left_by_id.keys().copied());
3025    all_node_ids.extend(right_by_id.keys().copied());
3026
3027    for node_id in all_node_ids {
3028        let left_stage = left_by_id.get(node_id).copied();
3029        let right_stage = right_by_id.get(node_id).copied();
3030        match (left_stage, right_stage) {
3031            (Some(_), None) => stage_diffs.push(RunStageDiffRecord {
3032                node_id: node_id.to_string(),
3033                change: "removed".to_string(),
3034                details: vec!["stage missing from right run".to_string()],
3035            }),
3036            (None, Some(_)) => stage_diffs.push(RunStageDiffRecord {
3037                node_id: node_id.to_string(),
3038                change: "added".to_string(),
3039                details: vec!["stage missing from left run".to_string()],
3040            }),
3041            (Some(left_stage), Some(right_stage)) => {
3042                let mut details = Vec::new();
3043                if left_stage.status != right_stage.status {
3044                    details.push(format!(
3045                        "status: {} -> {}",
3046                        left_stage.status, right_stage.status
3047                    ));
3048                }
3049                if left_stage.outcome != right_stage.outcome {
3050                    details.push(format!(
3051                        "outcome: {} -> {}",
3052                        left_stage.outcome, right_stage.outcome
3053                    ));
3054                }
3055                if left_stage.branch != right_stage.branch {
3056                    details.push(format!(
3057                        "branch: {:?} -> {:?}",
3058                        left_stage.branch, right_stage.branch
3059                    ));
3060                }
3061                if left_stage.produced_artifact_ids.len() != right_stage.produced_artifact_ids.len()
3062                {
3063                    details.push(format!(
3064                        "produced_artifacts: {} -> {}",
3065                        left_stage.produced_artifact_ids.len(),
3066                        right_stage.produced_artifact_ids.len()
3067                    ));
3068                }
3069                if left_stage.artifacts.len() != right_stage.artifacts.len() {
3070                    details.push(format!(
3071                        "artifact_records: {} -> {}",
3072                        left_stage.artifacts.len(),
3073                        right_stage.artifacts.len()
3074                    ));
3075                }
3076                if !details.is_empty() {
3077                    stage_diffs.push(RunStageDiffRecord {
3078                        node_id: node_id.to_string(),
3079                        change: "changed".to_string(),
3080                        details,
3081                    });
3082                }
3083            }
3084            (None, None) => {}
3085        }
3086    }
3087
3088    let mut tool_diffs = Vec::new();
3089    let left_tools: std::collections::BTreeMap<(String, String), &ToolCallRecord> = left
3090        .tool_recordings
3091        .iter()
3092        .map(|r| ((r.tool_name.clone(), r.args_hash.clone()), r))
3093        .collect();
3094    let right_tools: std::collections::BTreeMap<(String, String), &ToolCallRecord> = right
3095        .tool_recordings
3096        .iter()
3097        .map(|r| ((r.tool_name.clone(), r.args_hash.clone()), r))
3098        .collect();
3099    let all_tool_keys: std::collections::BTreeSet<_> = left_tools
3100        .keys()
3101        .chain(right_tools.keys())
3102        .cloned()
3103        .collect();
3104    for key in &all_tool_keys {
3105        let l = left_tools.get(key);
3106        let r = right_tools.get(key);
3107        let result_changed = match (l, r) {
3108            (Some(a), Some(b)) => a.result != b.result,
3109            _ => true,
3110        };
3111        if result_changed {
3112            tool_diffs.push(ToolCallDiffRecord {
3113                tool_name: key.0.clone(),
3114                args_hash: key.1.clone(),
3115                result_changed,
3116                left_result: l.map(|t| t.result.clone()),
3117                right_result: r.map(|t| t.result.clone()),
3118            });
3119        }
3120    }
3121
3122    let left_observability = left.observability.clone().unwrap_or_else(|| {
3123        derive_run_observability(left, left.persisted_path.as_deref().map(Path::new))
3124    });
3125    let right_observability = right.observability.clone().unwrap_or_else(|| {
3126        derive_run_observability(right, right.persisted_path.as_deref().map(Path::new))
3127    });
3128    let mut observability_diffs = Vec::new();
3129
3130    let left_workers = left_observability
3131        .worker_lineage
3132        .iter()
3133        .map(|worker| {
3134            (
3135                worker.worker_id.clone(),
3136                (
3137                    worker.status.clone(),
3138                    worker.run_id.clone(),
3139                    worker.run_path.clone(),
3140                ),
3141            )
3142        })
3143        .collect::<BTreeMap<_, _>>();
3144    let right_workers = right_observability
3145        .worker_lineage
3146        .iter()
3147        .map(|worker| {
3148            (
3149                worker.worker_id.clone(),
3150                (
3151                    worker.status.clone(),
3152                    worker.run_id.clone(),
3153                    worker.run_path.clone(),
3154                ),
3155            )
3156        })
3157        .collect::<BTreeMap<_, _>>();
3158    let worker_ids = left_workers
3159        .keys()
3160        .chain(right_workers.keys())
3161        .cloned()
3162        .collect::<BTreeSet<_>>();
3163    for worker_id in worker_ids {
3164        match (left_workers.get(&worker_id), right_workers.get(&worker_id)) {
3165            (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3166                section: "worker_lineage".to_string(),
3167                label: worker_id,
3168                details: vec!["worker missing from right run".to_string()],
3169            }),
3170            (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3171                section: "worker_lineage".to_string(),
3172                label: worker_id,
3173                details: vec!["worker missing from left run".to_string()],
3174            }),
3175            (Some(left_worker), Some(right_worker)) if left_worker != right_worker => {
3176                let mut details = Vec::new();
3177                if left_worker.0 != right_worker.0 {
3178                    details.push(format!("status: {} -> {}", left_worker.0, right_worker.0));
3179                }
3180                if left_worker.1 != right_worker.1 {
3181                    details.push(format!(
3182                        "run_id: {:?} -> {:?}",
3183                        left_worker.1, right_worker.1
3184                    ));
3185                }
3186                if left_worker.2 != right_worker.2 {
3187                    details.push(format!(
3188                        "run_path: {:?} -> {:?}",
3189                        left_worker.2, right_worker.2
3190                    ));
3191                }
3192                observability_diffs.push(RunObservabilityDiffRecord {
3193                    section: "worker_lineage".to_string(),
3194                    label: worker_id,
3195                    details,
3196                });
3197            }
3198            _ => {}
3199        }
3200    }
3201
3202    let left_rounds = left_observability
3203        .planner_rounds
3204        .iter()
3205        .map(|round| (round.stage_id.clone(), round))
3206        .collect::<BTreeMap<_, _>>();
3207    let right_rounds = right_observability
3208        .planner_rounds
3209        .iter()
3210        .map(|round| (round.stage_id.clone(), round))
3211        .collect::<BTreeMap<_, _>>();
3212    let round_ids = left_rounds
3213        .keys()
3214        .chain(right_rounds.keys())
3215        .cloned()
3216        .collect::<BTreeSet<_>>();
3217    for stage_id in round_ids {
3218        match (left_rounds.get(&stage_id), right_rounds.get(&stage_id)) {
3219            (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3220                section: "planner_rounds".to_string(),
3221                label: stage_id,
3222                details: vec!["planner summary missing from right run".to_string()],
3223            }),
3224            (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3225                section: "planner_rounds".to_string(),
3226                label: stage_id,
3227                details: vec!["planner summary missing from left run".to_string()],
3228            }),
3229            (Some(left_round), Some(right_round)) => {
3230                let mut details = Vec::new();
3231                if left_round.iteration_count != right_round.iteration_count {
3232                    details.push(format!(
3233                        "iterations: {} -> {}",
3234                        left_round.iteration_count, right_round.iteration_count
3235                    ));
3236                }
3237                if left_round.tool_execution_count != right_round.tool_execution_count {
3238                    details.push(format!(
3239                        "tool_executions: {} -> {}",
3240                        left_round.tool_execution_count, right_round.tool_execution_count
3241                    ));
3242                }
3243                if left_round.native_text_tool_fallback_count
3244                    != right_round.native_text_tool_fallback_count
3245                {
3246                    details.push(format!(
3247                        "native_text_tool_fallbacks: {} -> {}",
3248                        left_round.native_text_tool_fallback_count,
3249                        right_round.native_text_tool_fallback_count
3250                    ));
3251                }
3252                if left_round.native_text_tool_fallback_rejection_count
3253                    != right_round.native_text_tool_fallback_rejection_count
3254                {
3255                    details.push(format!(
3256                        "native_text_tool_fallback_rejections: {} -> {}",
3257                        left_round.native_text_tool_fallback_rejection_count,
3258                        right_round.native_text_tool_fallback_rejection_count
3259                    ));
3260                }
3261                if left_round.empty_completion_retry_count
3262                    != right_round.empty_completion_retry_count
3263                {
3264                    details.push(format!(
3265                        "empty_completion_retries: {} -> {}",
3266                        left_round.empty_completion_retry_count,
3267                        right_round.empty_completion_retry_count
3268                    ));
3269                }
3270                if left_round.research_facts != right_round.research_facts {
3271                    details.push(format!(
3272                        "research_facts: {:?} -> {:?}",
3273                        left_round.research_facts, right_round.research_facts
3274                    ));
3275                }
3276                let left_deliverables = left_round
3277                    .task_ledger
3278                    .as_ref()
3279                    .map(|ledger| {
3280                        ledger
3281                            .deliverables
3282                            .iter()
3283                            .map(|item| format!("{}:{}", item.id, item.status))
3284                            .collect::<Vec<_>>()
3285                    })
3286                    .unwrap_or_default();
3287                let right_deliverables = right_round
3288                    .task_ledger
3289                    .as_ref()
3290                    .map(|ledger| {
3291                        ledger
3292                            .deliverables
3293                            .iter()
3294                            .map(|item| format!("{}:{}", item.id, item.status))
3295                            .collect::<Vec<_>>()
3296                    })
3297                    .unwrap_or_default();
3298                if left_deliverables != right_deliverables {
3299                    details.push(format!(
3300                        "deliverables: {:?} -> {:?}",
3301                        left_deliverables, right_deliverables
3302                    ));
3303                }
3304                if left_round.successful_tools != right_round.successful_tools {
3305                    details.push(format!(
3306                        "successful_tools: {:?} -> {:?}",
3307                        left_round.successful_tools, right_round.successful_tools
3308                    ));
3309                }
3310                if !details.is_empty() {
3311                    observability_diffs.push(RunObservabilityDiffRecord {
3312                        section: "planner_rounds".to_string(),
3313                        label: left_round.node_id.clone(),
3314                        details,
3315                    });
3316                }
3317            }
3318            _ => {}
3319        }
3320    }
3321
3322    let left_pointers = left_observability
3323        .transcript_pointers
3324        .iter()
3325        .map(|pointer| {
3326            (
3327                pointer.id.clone(),
3328                (
3329                    pointer.available,
3330                    pointer.path.clone(),
3331                    pointer.location.clone(),
3332                ),
3333            )
3334        })
3335        .collect::<BTreeMap<_, _>>();
3336    let right_pointers = right_observability
3337        .transcript_pointers
3338        .iter()
3339        .map(|pointer| {
3340            (
3341                pointer.id.clone(),
3342                (
3343                    pointer.available,
3344                    pointer.path.clone(),
3345                    pointer.location.clone(),
3346                ),
3347            )
3348        })
3349        .collect::<BTreeMap<_, _>>();
3350    let pointer_ids = left_pointers
3351        .keys()
3352        .chain(right_pointers.keys())
3353        .cloned()
3354        .collect::<BTreeSet<_>>();
3355    for pointer_id in pointer_ids {
3356        match (
3357            left_pointers.get(&pointer_id),
3358            right_pointers.get(&pointer_id),
3359        ) {
3360            (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3361                section: "transcript_pointers".to_string(),
3362                label: pointer_id,
3363                details: vec!["pointer missing from right run".to_string()],
3364            }),
3365            (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3366                section: "transcript_pointers".to_string(),
3367                label: pointer_id,
3368                details: vec!["pointer missing from left run".to_string()],
3369            }),
3370            (Some(left_pointer), Some(right_pointer)) if left_pointer != right_pointer => {
3371                observability_diffs.push(RunObservabilityDiffRecord {
3372                    section: "transcript_pointers".to_string(),
3373                    label: pointer_id,
3374                    details: vec![format!(
3375                        "pointer: {:?} -> {:?}",
3376                        left_pointer, right_pointer
3377                    )],
3378                });
3379            }
3380            _ => {}
3381        }
3382    }
3383
3384    let left_compactions = left_observability
3385        .compaction_events
3386        .iter()
3387        .map(|event| {
3388            (
3389                event.id.clone(),
3390                (
3391                    event.strategy.clone(),
3392                    event.archived_messages,
3393                    event.snapshot_asset_id.clone(),
3394                    event.available,
3395                ),
3396            )
3397        })
3398        .collect::<BTreeMap<_, _>>();
3399    let right_compactions = right_observability
3400        .compaction_events
3401        .iter()
3402        .map(|event| {
3403            (
3404                event.id.clone(),
3405                (
3406                    event.strategy.clone(),
3407                    event.archived_messages,
3408                    event.snapshot_asset_id.clone(),
3409                    event.available,
3410                ),
3411            )
3412        })
3413        .collect::<BTreeMap<_, _>>();
3414    let compaction_ids = left_compactions
3415        .keys()
3416        .chain(right_compactions.keys())
3417        .cloned()
3418        .collect::<BTreeSet<_>>();
3419    for compaction_id in compaction_ids {
3420        match (
3421            left_compactions.get(&compaction_id),
3422            right_compactions.get(&compaction_id),
3423        ) {
3424            (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3425                section: "compaction_events".to_string(),
3426                label: compaction_id,
3427                details: vec!["compaction event missing from right run".to_string()],
3428            }),
3429            (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3430                section: "compaction_events".to_string(),
3431                label: compaction_id,
3432                details: vec!["compaction event missing from left run".to_string()],
3433            }),
3434            (Some(left_event), Some(right_event)) if left_event != right_event => {
3435                observability_diffs.push(RunObservabilityDiffRecord {
3436                    section: "compaction_events".to_string(),
3437                    label: compaction_id,
3438                    details: vec![format!("event: {:?} -> {:?}", left_event, right_event)],
3439                });
3440            }
3441            _ => {}
3442        }
3443    }
3444
3445    let left_daemons = left_observability
3446        .daemon_events
3447        .iter()
3448        .map(|event| {
3449            (
3450                (event.daemon_id.clone(), event.kind, event.timestamp.clone()),
3451                (
3452                    event.name.clone(),
3453                    event.persist_path.clone(),
3454                    event.payload_summary.clone(),
3455                ),
3456            )
3457        })
3458        .collect::<BTreeMap<_, _>>();
3459    let right_daemons = right_observability
3460        .daemon_events
3461        .iter()
3462        .map(|event| {
3463            (
3464                (event.daemon_id.clone(), event.kind, event.timestamp.clone()),
3465                (
3466                    event.name.clone(),
3467                    event.persist_path.clone(),
3468                    event.payload_summary.clone(),
3469                ),
3470            )
3471        })
3472        .collect::<BTreeMap<_, _>>();
3473    let daemon_keys = left_daemons
3474        .keys()
3475        .chain(right_daemons.keys())
3476        .cloned()
3477        .collect::<BTreeSet<_>>();
3478    for daemon_key in daemon_keys {
3479        let label = format!("{}:{:?}:{}", daemon_key.0, daemon_key.1, daemon_key.2);
3480        match (
3481            left_daemons.get(&daemon_key),
3482            right_daemons.get(&daemon_key),
3483        ) {
3484            (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3485                section: "daemon_events".to_string(),
3486                label,
3487                details: vec!["daemon event missing from right run".to_string()],
3488            }),
3489            (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3490                section: "daemon_events".to_string(),
3491                label,
3492                details: vec!["daemon event missing from left run".to_string()],
3493            }),
3494            (Some(left_event), Some(right_event)) if left_event != right_event => {
3495                observability_diffs.push(RunObservabilityDiffRecord {
3496                    section: "daemon_events".to_string(),
3497                    label,
3498                    details: vec![format!("event: {:?} -> {:?}", left_event, right_event)],
3499                });
3500            }
3501            _ => {}
3502        }
3503    }
3504
3505    let left_verification = left_observability
3506        .verification_outcomes
3507        .iter()
3508        .map(|item| (item.stage_id.clone(), item))
3509        .collect::<BTreeMap<_, _>>();
3510    let right_verification = right_observability
3511        .verification_outcomes
3512        .iter()
3513        .map(|item| (item.stage_id.clone(), item))
3514        .collect::<BTreeMap<_, _>>();
3515    let verification_ids = left_verification
3516        .keys()
3517        .chain(right_verification.keys())
3518        .cloned()
3519        .collect::<BTreeSet<_>>();
3520    for stage_id in verification_ids {
3521        match (
3522            left_verification.get(&stage_id),
3523            right_verification.get(&stage_id),
3524        ) {
3525            (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3526                section: "verification".to_string(),
3527                label: stage_id,
3528                details: vec!["verification missing from right run".to_string()],
3529            }),
3530            (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3531                section: "verification".to_string(),
3532                label: stage_id,
3533                details: vec!["verification missing from left run".to_string()],
3534            }),
3535            (Some(left_item), Some(right_item)) if left_item != right_item => {
3536                let mut details = Vec::new();
3537                if left_item.passed != right_item.passed {
3538                    details.push(format!(
3539                        "passed: {:?} -> {:?}",
3540                        left_item.passed, right_item.passed
3541                    ));
3542                }
3543                if left_item.summary != right_item.summary {
3544                    details.push(format!(
3545                        "summary: {:?} -> {:?}",
3546                        left_item.summary, right_item.summary
3547                    ));
3548                }
3549                observability_diffs.push(RunObservabilityDiffRecord {
3550                    section: "verification".to_string(),
3551                    label: left_item.node_id.clone(),
3552                    details,
3553                });
3554            }
3555            _ => {}
3556        }
3557    }
3558
3559    let left_graph = (
3560        left_observability.action_graph_nodes.len(),
3561        left_observability.action_graph_edges.len(),
3562    );
3563    let right_graph = (
3564        right_observability.action_graph_nodes.len(),
3565        right_observability.action_graph_edges.len(),
3566    );
3567    if left_graph != right_graph {
3568        observability_diffs.push(RunObservabilityDiffRecord {
3569            section: "action_graph".to_string(),
3570            label: "shape".to_string(),
3571            details: vec![format!(
3572                "nodes/edges: {}/{} -> {}/{}",
3573                left_graph.0, left_graph.1, right_graph.0, right_graph.1
3574            )],
3575        });
3576    }
3577
3578    let status_changed = left.status != right.status;
3579    let identical = !status_changed
3580        && stage_diffs.is_empty()
3581        && tool_diffs.is_empty()
3582        && observability_diffs.is_empty()
3583        && left.transitions.len() == right.transitions.len()
3584        && left.artifacts.len() == right.artifacts.len()
3585        && left.checkpoints.len() == right.checkpoints.len();
3586
3587    RunDiffReport {
3588        left_run_id: left.id.clone(),
3589        right_run_id: right.id.clone(),
3590        identical,
3591        status_changed,
3592        left_status: left.status.clone(),
3593        right_status: right.status.clone(),
3594        stage_diffs,
3595        tool_diffs,
3596        observability_diffs,
3597        transition_count_delta: right.transitions.len() as isize - left.transitions.len() as isize,
3598        artifact_count_delta: right.artifacts.len() as isize - left.artifacts.len() as isize,
3599        checkpoint_count_delta: right.checkpoints.len() as isize - left.checkpoints.len() as isize,
3600    }
3601}
3602
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    /// Resolves the repository root as two directory levels above this
    /// crate's manifest directory.
    fn repo_root() -> PathBuf {
        // `ancestors()` yields the path itself, then each parent in turn,
        // so `.nth(2)` is the grandparent of the manifest directory.
        PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .ancestors()
            .nth(2)
            .expect("crate manifest dir should have two parent directories")
            .to_path_buf()
    }

    /// Builds the smallest `RunRecord` the eval-pack tests need: the given
    /// `status`, a tiny usage summary, and a replay fixture expecting a
    /// completed run.
    fn minimal_run(status: &str) -> RunRecord {
        let usage = LlmUsageRecord {
            total_duration_ms: 12,
            total_cost: 0.01,
            input_tokens: 3,
            output_tokens: 4,
            call_count: 1,
            models: vec!["mock".to_string()],
        };
        let replay = ReplayFixture {
            type_name: "replay_fixture".to_string(),
            expected_status: "completed".to_string(),
            ..ReplayFixture::default()
        };
        RunRecord {
            type_name: "workflow_run".to_string(),
            id: "run_1".to_string(),
            workflow_id: "workflow_1".to_string(),
            status: status.to_string(),
            usage: Some(usage),
            replay_fixture: Some(replay),
            ..RunRecord::default()
        }
    }

    #[test]
    fn eval_pack_manifest_toml_runs_replay_case() {
        let workdir = tempfile::tempdir().unwrap();
        let run_json = serde_json::to_string(&minimal_run("completed")).unwrap();
        fs::write(workdir.path().join("run.json"), run_json).unwrap();

        // One case gated by a single deterministic rubric on run status.
        let pack_toml = r#"
version = 1
id = "connector-regressions"
name = "Connector regressions"

[[cases]]
id = "webhook"
name = "Webhook normalization"
run = "run.json"
rubrics = ["status"]

[[rubrics]]
id = "status"
kind = "deterministic"

[[rubrics.assertions]]
kind = "run-status"
expected = "completed"
"#;
        let pack_path = workdir.path().join("harn.eval.toml");
        fs::write(&pack_path, pack_toml).unwrap();

        let manifest = load_eval_pack_manifest(&pack_path).unwrap();
        let report = evaluate_eval_pack_manifest(&manifest).unwrap();

        assert!(report.pass);
        assert_eq!(report.total, 1);
        assert_eq!(report.cases[0].label, "Webhook normalization");
    }

    #[test]
    fn eval_pack_warning_case_does_not_block() {
        let workdir = tempfile::tempdir().unwrap();
        let run_json = serde_json::to_string(&minimal_run("completed")).unwrap();
        fs::write(workdir.path().join("run.json"), run_json).unwrap();

        // The 1ms latency budget is below the run's recorded 12ms duration,
        // so the case trips -- but at `warning` severity the pack still passes.
        let pack_toml = r#"
version = 1
id = "budgets"

[[cases]]
id = "latency-budget"
run = "run.json"
severity = "warning"

[cases.thresholds]
max-latency-ms = 1
"#;
        let pack_path = workdir.path().join("harn.eval.toml");
        fs::write(&pack_path, pack_toml).unwrap();

        let manifest = load_eval_pack_manifest(&pack_path).unwrap();
        let report = evaluate_eval_pack_manifest(&manifest).unwrap();

        assert!(report.pass);
        assert_eq!(report.warning_failed, 1);
        assert!(report.cases[0].warnings[0].contains("latency"));
    }

    #[test]
    fn eval_pack_manifest_runs_persona_ladder() {
        let workdir = tempfile::tempdir().unwrap();
        // Debug-format the path strings so they land in the TOML already
        // quoted, with any backslashes escaped.
        let repo = repo_root().display().to_string();
        let base_dir = format!("{:?}", repo);
        let artifacts = workdir.path().join("artifacts").display().to_string();
        let artifact_root = format!("{:?}", artifacts);
        let pack_toml = format!(
            r#"
version = 1
id = "merge-captain-ladders"
base_dir = {}

[[ladders]]
id = "merge-captain-timeout"
persona = "merge_captain"
artifact-root = {}

[ladders.backend]
kind = "replay"
path = "examples/personas/merge_captain/transcripts/green_pr.jsonl"

[[ladders.model-routes]]
id = "gemma-value"
route = "local/gemma-value"
provider = "llama.cpp"
model = "gemma"
profile = "value"

[[ladders.timeout-tiers]]
id = "tiny"
max-tool-calls = 1

[[ladders.timeout-tiers]]
id = "balanced"
max-tool-calls = 4
max-model-calls = 1
"#,
            base_dir, artifact_root
        );
        let pack_path = workdir.path().join("harn.eval.toml");
        fs::write(&pack_path, pack_toml).unwrap();

        let manifest = load_eval_pack_manifest(&pack_path).unwrap();
        let report = evaluate_eval_pack_manifest(&manifest).unwrap();

        assert!(report.pass);
        assert_eq!(report.total, 1);
        assert_eq!(report.ladders.len(), 1);
        assert_eq!(
            report.ladders[0].first_correct_tier.as_deref(),
            Some("balanced")
        );
        assert_eq!(report.ladders[0].tiers[0].outcome, "degraded");
        assert_eq!(report.ladders[0].tiers[1].outcome, "correct");
    }

    #[test]
    fn eval_pack_manifest_runs_friction_context_pack_case() {
        let workdir = tempfile::tempdir().unwrap();
        // Two recurrences of the same redacted Splunk query; the rubric
        // below requires at least one context-pack suggestion from them.
        let friction_json = r#"
{
  "events": [
    {
      "kind": "repeated_query",
      "source": "incident-triage",
      "actor": "sre",
      "tool": "splunk",
      "provider": "splunk",
      "redacted_summary": "Checkout incidents need the same Splunk search",
      "recurrence_hints": ["checkout incident queries"],
      "estimated_time_ms": 300000,
      "metadata": {
        "query": "index=checkout service=api error",
        "capability": "splunk.search",
        "secret_ref": "SPLUNK_READ_TOKEN",
        "output_slot": "splunk_errors"
      }
    },
    {
      "kind": "repeated_query",
      "source": "incident-triage",
      "actor": "sre",
      "tool": "splunk",
      "provider": "splunk",
      "redacted_summary": "Checkout incident triage repeated the Splunk search",
      "recurrence_hints": ["checkout incident queries"],
      "estimated_time_ms": 240000,
      "metadata": {
        "query": "index=checkout service=api error",
        "capability": "splunk.search",
        "secret_ref": "SPLUNK_READ_TOKEN",
        "output_slot": "splunk_errors"
      }
    }
  ]
}
"#;
        fs::write(workdir.path().join("incident-friction.json"), friction_json).unwrap();

        let pack_toml = r#"
version = 1
id = "team-learning"
name = "Team learning evals"

[[fixtures]]
id = "incident-friction"
kind = "friction-events"
path = "incident-friction.json"

[[cases]]
id = "incident-context-pack"
name = "Incident context pack suggestion"
friction_events = "incident-friction"
rubrics = ["context-pack"]

[[rubrics]]
id = "context-pack"
kind = "friction"

[[rubrics.assertions]]
kind = "context-pack-suggestion"
contains = "incident"
expected = { min_suggestions = 1, recommended_artifact = "context_pack", required_capability = "splunk.search", required_output_slot = "splunk_errors" }
"#;
        let pack_path = workdir.path().join("harn.eval.toml");
        fs::write(&pack_path, pack_toml).unwrap();

        let manifest = load_eval_pack_manifest(&pack_path).unwrap();
        let report = evaluate_eval_pack_manifest(&manifest).unwrap();

        assert!(report.pass);
        assert_eq!(report.total, 1);
        assert_eq!(report.cases[0].run_id, "friction_events");
        assert_eq!(report.cases[0].stage_count, 2);
    }
}