1use std::collections::{BTreeMap, BTreeSet};
4use std::path::{Path, PathBuf};
5
6use serde::{Deserialize, Serialize};
7
8use super::{
9 default_run_dir, evaluate_context_pack_suggestion_expectations,
10 generate_context_pack_suggestions, new_id, normalize_friction_events_json, now_rfc3339,
11 parse_json_payload, parse_json_value, sync_run_handoffs, ArtifactRecord, CapabilityPolicy,
12 ContextPackSuggestionExpectation, ContextPackSuggestionOptions, FrictionEvent, HandoffArtifact,
13};
14use crate::event_log::{
15 active_event_log, AnyEventLog, EventLog, LogEvent as EventLogRecord, Topic,
16};
17use crate::llm::vm_value_to_json;
18use crate::triggers::{SignatureStatus, TriggerEvent};
19use crate::value::{VmError, VmValue};
20
// Node kinds for the run action graph (stored in `RunActionGraphNodeRecord.kind`).
pub const ACTION_GRAPH_NODE_KIND_RUN: &str = "run";
pub const ACTION_GRAPH_NODE_KIND_TRIGGER: &str = "trigger";
pub const ACTION_GRAPH_NODE_KIND_PREDICATE: &str = "predicate";
pub const ACTION_GRAPH_NODE_KIND_TRIGGER_PREDICATE: &str = "trigger_predicate";
pub const ACTION_GRAPH_NODE_KIND_STAGE: &str = "stage";
pub const ACTION_GRAPH_NODE_KIND_WORKER: &str = "worker";
pub const ACTION_GRAPH_NODE_KIND_DISPATCH: &str = "dispatch";
pub const ACTION_GRAPH_NODE_KIND_A2A_HOP: &str = "a2a_hop";
pub const ACTION_GRAPH_NODE_KIND_WORKER_ENQUEUE: &str = "worker_enqueue";
pub const ACTION_GRAPH_NODE_KIND_RETRY: &str = "retry";
pub const ACTION_GRAPH_NODE_KIND_DLQ: &str = "dlq";

// Edge kinds for the run action graph (stored in `RunActionGraphEdgeRecord.kind`).
pub const ACTION_GRAPH_EDGE_KIND_ENTRY: &str = "entry";
pub const ACTION_GRAPH_EDGE_KIND_TRIGGER_DISPATCH: &str = "trigger_dispatch";
pub const ACTION_GRAPH_EDGE_KIND_A2A_DISPATCH: &str = "a2a_dispatch";
pub const ACTION_GRAPH_EDGE_KIND_PREDICATE_GATE: &str = "predicate_gate";
pub const ACTION_GRAPH_EDGE_KIND_REPLAY_CHAIN: &str = "replay_chain";
pub const ACTION_GRAPH_EDGE_KIND_TRANSITION: &str = "transition";
pub const ACTION_GRAPH_EDGE_KIND_DELEGATES: &str = "delegates";
pub const ACTION_GRAPH_EDGE_KIND_RETRY: &str = "retry";
pub const ACTION_GRAPH_EDGE_KIND_DLQ_MOVE: &str = "dlq_move";
42
/// Aggregated LLM usage counters attached to a run or stage.
///
/// `#[serde(default)]`: any field missing from persisted JSON deserializes to
/// its `Default` value, keeping old records loadable.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct LlmUsageRecord {
    pub input_tokens: i64,
    pub output_tokens: i64,
    pub total_duration_ms: i64,
    pub call_count: i64,
    pub total_cost: f64,
    // Model identifiers seen across the aggregated calls.
    pub models: Vec<String>,
}
53
/// Persisted record of a single executed stage within a run.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunStageRecord {
    pub id: String,
    // Workflow node this stage executed (distinct from the stage instance id).
    pub node_id: String,
    pub kind: String,
    pub status: String,
    pub outcome: String,
    pub branch: Option<String>,
    pub started_at: String,
    pub finished_at: Option<String>,
    pub visible_text: Option<String>,
    pub private_reasoning: Option<String>,
    pub transcript: Option<serde_json::Value>,
    pub verification: Option<serde_json::Value>,
    pub usage: Option<LlmUsageRecord>,
    pub artifacts: Vec<ArtifactRecord>,
    pub consumed_artifact_ids: Vec<String>,
    pub produced_artifact_ids: Vec<String>,
    // One entry per execution attempt, including retries.
    pub attempts: Vec<RunStageAttemptRecord>,
    // Free-form extras; `run_child_from_stage_metadata` reads a "worker" key
    // and `stage_node_metadata` reads "worker_id" from here.
    pub metadata: BTreeMap<String, serde_json::Value>,
}
76
/// One execution attempt of a stage (retries produce multiple records).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunStageAttemptRecord {
    // 0-based or 1-based attempt index — set by the executor; not determined here.
    pub attempt: usize,
    pub status: String,
    pub outcome: String,
    pub branch: Option<String>,
    pub error: Option<String>,
    pub verification: Option<serde_json::Value>,
    pub started_at: String,
    pub finished_at: Option<String>,
}
89
/// Edge in the run's execution history: which node was entered, from where,
/// and which artifacts flowed across the transition.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunTransitionRecord {
    pub id: String,
    pub from_stage_id: Option<String>,
    pub from_node_id: Option<String>,
    pub to_node_id: String,
    pub branch: Option<String>,
    pub timestamp: String,
    pub consumed_artifact_ids: Vec<String>,
    pub produced_artifact_ids: Vec<String>,
}
102
/// Snapshot of scheduler progress persisted so a run can be resumed.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunCheckpointRecord {
    pub id: String,
    pub ready_nodes: Vec<String>,
    pub completed_nodes: Vec<String>,
    pub last_stage_id: Option<String>,
    pub persisted_at: String,
    // Why the checkpoint was taken (free-form).
    pub reason: String,
}
113
/// Recorded expectations for replaying a run deterministically and evaluating
/// the replayed result.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayFixture {
    // Serialized as "_type" — a document type tag in persisted JSON.
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub source_run_id: String,
    pub workflow_id: String,
    pub workflow_name: Option<String>,
    pub created_at: String,
    pub eval_kind: Option<String>,
    pub clarifying_question: Option<ClarifyingQuestionEvalSpec>,
    pub expected_status: String,
    pub stage_assertions: Vec<ReplayStageAssertion>,
}
129
/// Eval criteria for clarifying-question behavior. Min/max bounds are
/// normalized by `clarifying_min_questions` / `clarifying_max_questions`
/// (both floor at 1; `max_questions` defaults to 1 when unset).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ClarifyingQuestionEvalSpec {
    pub expected_question: Option<String>,
    pub accepted_questions: Vec<String>,
    pub required_terms: Vec<String>,
    pub forbidden_terms: Vec<String>,
    pub min_questions: usize,
    pub max_questions: Option<usize>,
}
140
/// Per-stage expectation checked when evaluating a replayed run.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayStageAssertion {
    pub node_id: String,
    pub expected_status: String,
    pub expected_outcome: String,
    pub expected_branch: Option<String>,
    pub required_artifact_kinds: Vec<String>,
    pub visible_text_contains: Option<String>,
}
151
/// Pass/fail result of evaluating one replayed run against its fixture.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayEvalReport {
    pub pass: bool,
    pub failures: Vec<String>,
    pub stage_count: usize,
}
159
/// Per-case result inside a replay eval suite run.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayEvalCaseReport {
    pub run_id: String,
    pub workflow_id: String,
    pub label: Option<String>,
    pub pass: bool,
    pub failures: Vec<String>,
    pub stage_count: usize,
    pub source_path: Option<String>,
    // Populated when the case was compared against a baseline run.
    pub comparison: Option<RunDiffReport>,
}
172
/// Aggregated result of a replay eval suite.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayEvalSuiteReport {
    pub pass: bool,
    pub total: usize,
    pub passed: usize,
    pub failed: usize,
    pub cases: Vec<ReplayEvalCaseReport>,
}
182
/// Summary of one deliverable tracked by the task ledger.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunDeliverableSummaryRecord {
    pub id: String,
    pub text: String,
    pub status: String,
    pub note: Option<String>,
}
191
/// Condensed task-ledger state captured for a planner round.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunTaskLedgerSummaryRecord {
    pub root_task: String,
    pub rationale: String,
    pub deliverables: Vec<RunDeliverableSummaryRecord>,
    pub observations: Vec<String>,
    pub blocking_count: usize,
}
201
/// Observability counters for one planner round (a planning stage execution).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunPlannerRoundRecord {
    pub stage_id: String,
    pub node_id: String,
    pub stage_kind: String,
    pub status: String,
    pub outcome: String,
    pub iteration_count: usize,
    pub llm_call_count: usize,
    pub tool_execution_count: usize,
    pub tool_rejection_count: usize,
    pub intervention_count: usize,
    pub compaction_count: usize,
    pub native_text_tool_fallback_count: usize,
    pub native_text_tool_fallback_rejection_count: usize,
    pub empty_completion_retry_count: usize,
    pub tools_used: Vec<String>,
    pub successful_tools: Vec<String>,
    pub ledger_done_rejections: usize,
    pub task_ledger: Option<RunTaskLedgerSummaryRecord>,
    pub research_facts: Vec<String>,
}
225
/// Lineage entry linking a spawned worker back to its parent stage/session.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunWorkerLineageRecord {
    pub worker_id: String,
    pub worker_name: String,
    pub parent_stage_id: Option<String>,
    pub task: String,
    pub status: String,
    pub session_id: Option<String>,
    pub parent_session_id: Option<String>,
    pub run_id: Option<String>,
    pub run_path: Option<String>,
    pub snapshot_path: Option<String>,
}
240
/// Node in the rendered action graph; `kind` is one of the
/// `ACTION_GRAPH_NODE_KIND_*` constants.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunActionGraphNodeRecord {
    pub id: String,
    pub label: String,
    pub kind: String,
    pub status: String,
    pub outcome: String,
    pub trace_id: Option<String>,
    pub stage_id: Option<String>,
    pub node_id: Option<String>,
    pub worker_id: Option<String>,
    pub run_id: Option<String>,
    pub run_path: Option<String>,
    // Kind-specific extras, e.g. from `trigger_node_metadata` / `stage_node_metadata`.
    pub metadata: BTreeMap<String, serde_json::Value>,
}
257
/// Directed edge in the action graph; `kind` is one of the
/// `ACTION_GRAPH_EDGE_KIND_*` constants.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunActionGraphEdgeRecord {
    pub from_id: String,
    pub to_id: String,
    pub kind: String,
    pub label: Option<String>,
}
266
/// Result of a stage's verification step, surfaced for observability.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunVerificationOutcomeRecord {
    pub stage_id: String,
    pub node_id: String,
    pub status: String,
    // None when verification did not produce a definite verdict.
    pub passed: Option<bool>,
    pub summary: Option<String>,
}
276
/// Pointer to a transcript stored elsewhere (inline, on disk, …).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunTranscriptPointerRecord {
    pub id: String,
    pub label: String,
    pub kind: String,
    pub location: String,
    pub path: Option<String>,
    // False when the transcript could not be resolved at record time.
    pub available: bool,
}
287
/// Record of a transcript compaction: how much was archived and where the
/// pre-compaction snapshot lives.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct CompactionEventRecord {
    pub id: String,
    pub transcript_id: Option<String>,
    pub stage_id: Option<String>,
    pub node_id: Option<String>,
    pub mode: String,
    pub strategy: String,
    pub archived_messages: usize,
    pub estimated_tokens_before: usize,
    pub estimated_tokens_after: usize,
    pub snapshot_asset_id: Option<String>,
    pub snapshot_location: String,
    pub snapshot_path: Option<String>,
    pub available: bool,
}
305
/// Lifecycle event kinds for a daemon; serialized in snake_case
/// ("spawned", "triggered", …). Defaults to `Spawned` when absent.
#[derive(Clone, Copy, Debug, Default, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
#[serde(rename_all = "snake_case")]
pub enum DaemonEventKindRecord {
    #[default]
    Spawned,
    Triggered,
    Snapshotted,
    Resumed,
    Stopped,
}
316
/// One daemon lifecycle event captured in run observability.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct DaemonEventRecord {
    pub daemon_id: String,
    pub name: String,
    pub kind: DaemonEventKindRecord,
    pub timestamp: String,
    pub persist_path: String,
    pub payload_summary: Option<String>,
}
327
/// Bundle of observability data persisted alongside a run: planner metrics,
/// the action graph, worker lineage, and transcript/compaction/daemon events.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunObservabilityRecord {
    // Bumped when the shape of this record changes.
    pub schema_version: usize,
    pub planner_rounds: Vec<RunPlannerRoundRecord>,
    pub research_fact_count: usize,
    pub action_graph_nodes: Vec<RunActionGraphNodeRecord>,
    pub action_graph_edges: Vec<RunActionGraphEdgeRecord>,
    pub worker_lineage: Vec<RunWorkerLineageRecord>,
    pub verification_outcomes: Vec<RunVerificationOutcomeRecord>,
    pub transcript_pointers: Vec<RunTranscriptPointerRecord>,
    pub compaction_events: Vec<CompactionEventRecord>,
    pub daemon_events: Vec<DaemonEventRecord>,
}
342
/// Stage-level difference between two compared runs.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunStageDiffRecord {
    pub node_id: String,
    pub change: String,
    pub details: Vec<String>,
}
350
/// Difference in a recorded tool call between two runs, keyed by tool name
/// plus argument hash (see `tool_fixture_hash`).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ToolCallDiffRecord {
    pub tool_name: String,
    pub args_hash: String,
    pub result_changed: bool,
    pub left_result: Option<String>,
    pub right_result: Option<String>,
}
360
/// Difference in an observability section between two compared runs.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunObservabilityDiffRecord {
    pub section: String,
    pub label: String,
    pub details: Vec<String>,
}
368
/// Full comparison of two runs ("left" vs "right"); `identical` summarizes
/// whether any diff section found changes.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunDiffReport {
    pub left_run_id: String,
    pub right_run_id: String,
    pub identical: bool,
    pub status_changed: bool,
    pub left_status: String,
    pub right_status: String,
    pub stage_diffs: Vec<RunStageDiffRecord>,
    pub tool_diffs: Vec<ToolCallDiffRecord>,
    pub observability_diffs: Vec<RunObservabilityDiffRecord>,
    // Signed deltas: right count minus left count (direction assumed — confirm
    // against the diff builder).
    pub transition_count_delta: isize,
    pub artifact_count_delta: isize,
    pub checkpoint_count_delta: isize,
}
385
/// On-disk manifest listing replay eval cases; `base_dir` presumably anchors
/// relative case paths — confirm against the loader.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalSuiteManifest {
    // Serialized as "_type" — document type tag.
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub name: Option<String>,
    pub base_dir: Option<String>,
    pub cases: Vec<EvalSuiteCase>,
}
396
/// One case in an eval suite: a run to evaluate, an optional fixture, and an
/// optional baseline run to diff against.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalSuiteCase {
    pub label: Option<String>,
    pub run_path: String,
    pub fixture_path: Option<String>,
    pub compare_to: Option<String>,
}
405
/// Top-level eval-pack manifest: fixtures, rubrics, judge config, and cases.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackManifest {
    pub version: u32,
    pub id: String,
    pub name: Option<String>,
    pub description: Option<String>,
    pub base_dir: Option<String>,
    pub package: Option<EvalPackPackage>,
    // Pack-wide defaults; per-rubric/per-case settings can override.
    pub defaults: EvalPackDefaults,
    pub fixtures: Vec<EvalPackFixtureRef>,
    pub rubrics: Vec<EvalPackRubric>,
    pub judge: Option<EvalPackJudgeConfig>,
    pub cases: Vec<EvalPackCase>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
422
/// Packaging info for a distributable eval pack.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalPackPackage {
    pub name: Option<String>,
    pub version: Option<String>,
    pub source: Option<String>,
    pub templates: Vec<String>,
}
431
/// Pack-level default settings applied where a case/rubric does not override.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackDefaults {
    pub severity: Option<String>,
    pub fixture_root: Option<String>,
    pub thresholds: EvalPackThresholds,
    pub judge: Option<EvalPackJudgeConfig>,
}
440
/// Reference to a fixture used by eval cases; kebab-case aliases keep
/// hand-written manifests ergonomic.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackFixtureRef {
    pub id: String,
    pub kind: String,
    pub path: Option<String>,
    // Accepts both "trace_id" and "trace-id" keys.
    #[serde(default, alias = "trace-id")]
    pub trace_id: Option<String>,
    pub provider: Option<String>,
    #[serde(default, alias = "event-kind")]
    pub event_kind: Option<String>,
    // Inline fixture payload as an alternative to `path`.
    pub inline: Option<serde_json::Value>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
455
/// Scoring rubric: either assertion-based, judge-based (LLM), or both.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackRubric {
    pub id: String,
    pub kind: String,
    pub description: Option<String>,
    pub prompt: Option<String>,
    pub assertions: Vec<EvalPackAssertion>,
    pub judge: Option<EvalPackJudgeConfig>,
    // Golden examples used to calibrate a judge.
    pub calibration: Vec<EvalPackGoldenExample>,
    pub thresholds: EvalPackThresholds,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
469
/// Declarative assertion within a rubric (path/op/expected style check).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackAssertion {
    pub kind: String,
    pub stage: Option<String>,
    pub path: Option<String>,
    pub op: Option<String>,
    pub expected: Option<serde_json::Value>,
    pub contains: Option<String>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
481
/// LLM-judge configuration; kebab-case aliases accepted for manifest keys.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackJudgeConfig {
    pub model: Option<String>,
    #[serde(default, alias = "prompt-version")]
    pub prompt_version: Option<String>,
    #[serde(default, alias = "tie-break")]
    pub tie_break: Option<String>,
    // Minimum judge confidence for a verdict to count.
    #[serde(default, alias = "confidence-min")]
    pub confidence_min: Option<f64>,
    pub temperature: Option<f64>,
    pub rubric: Option<String>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
496
/// Input/output pair (with optional score) used for judge calibration.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackGoldenExample {
    pub input: serde_json::Value,
    pub output: serde_json::Value,
    pub score: Option<f64>,
    pub explanation: Option<String>,
}
505
/// Numeric pass/fail limits; all optional, kebab-case aliases accepted.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackThresholds {
    pub severity: Option<String>,
    #[serde(default, alias = "min-score")]
    pub min_score: Option<f64>,
    #[serde(default, alias = "min-confidence")]
    pub min_confidence: Option<f64>,
    #[serde(default, alias = "max-cost-usd")]
    pub max_cost_usd: Option<f64>,
    #[serde(default, alias = "max-latency-ms")]
    pub max_latency_ms: Option<i64>,
    #[serde(default, alias = "max-tokens")]
    pub max_tokens: Option<i64>,
    #[serde(default, alias = "max-stage-count")]
    pub max_stage_count: Option<usize>,
}
523
/// One eval-pack case: the run (or fixture) to evaluate, which rubrics apply,
/// and case-specific thresholds/severity overrides.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackCase {
    pub id: Option<String>,
    pub name: Option<String>,
    pub description: Option<String>,
    pub run: Option<String>,
    #[serde(default, alias = "run-path")]
    pub run_path: Option<String>,
    // Both kebab- and snake-case aliases are accepted for this key.
    #[serde(default, alias = "friction-events", alias = "friction_events")]
    pub friction_events: Option<String>,
    pub fixture: Option<String>,
    #[serde(default, alias = "fixture-path")]
    pub fixture_path: Option<String>,
    #[serde(default, alias = "compare-to")]
    pub compare_to: Option<String>,
    // Rubric ids from the pack's `rubrics` list.
    pub rubrics: Vec<String>,
    pub severity: Option<String>,
    pub thresholds: EvalPackThresholds,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
545
/// Aggregated eval-pack result, with failures broken down by severity.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalPackReport {
    pub pack_id: String,
    pub pass: bool,
    pub total: usize,
    pub passed: usize,
    pub failed: usize,
    pub blocking_failed: usize,
    pub warning_failed: usize,
    pub informational_failed: usize,
    pub cases: Vec<EvalPackCaseReport>,
}
559
/// Per-case eval-pack result, with findings split by severity class.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalPackCaseReport {
    pub id: String,
    pub label: String,
    pub severity: String,
    pub pass: bool,
    // True when a failure here should fail the whole pack.
    pub blocking: bool,
    pub run_id: String,
    pub workflow_id: String,
    pub source_path: Option<String>,
    pub stage_count: usize,
    pub failures: Vec<String>,
    pub warnings: Vec<String>,
    pub informational: Vec<String>,
    pub comparison: Option<RunDiffReport>,
}
577
/// A human-in-the-loop question asked during a run; deduplicated by
/// `request_id` when merged from the event log (see
/// `merge_hitl_questions_from_active_log`).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunHitlQuestionRecord {
    pub request_id: String,
    pub prompt: String,
    pub agent: String,
    pub trace_id: Option<String>,
    pub asked_at: String,
}
587
/// The persisted record of a complete workflow run: stages, transitions,
/// checkpoints, artifacts, child runs, and observability data.
///
/// `#[serde(default)]` keeps older persisted runs loadable as fields are added.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunRecord {
    // Serialized as "_type" — document type tag in persisted JSON.
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub workflow_id: String,
    pub workflow_name: Option<String>,
    pub task: String,
    pub status: String,
    pub started_at: String,
    pub finished_at: Option<String>,
    // Set on child runs spawned by another run.
    pub parent_run_id: Option<String>,
    pub root_run_id: Option<String>,
    pub stages: Vec<RunStageRecord>,
    pub transitions: Vec<RunTransitionRecord>,
    pub checkpoints: Vec<RunCheckpointRecord>,
    pub pending_nodes: Vec<String>,
    pub completed_nodes: Vec<String>,
    pub child_runs: Vec<RunChildRecord>,
    pub artifacts: Vec<ArtifactRecord>,
    pub handoffs: Vec<HandoffArtifact>,
    pub policy: CapabilityPolicy,
    pub execution: Option<RunExecutionRecord>,
    pub transcript: Option<serde_json::Value>,
    pub usage: Option<LlmUsageRecord>,
    pub replay_fixture: Option<ReplayFixture>,
    pub observability: Option<RunObservabilityRecord>,
    pub trace_spans: Vec<RunTraceSpanRecord>,
    pub tool_recordings: Vec<ToolCallRecord>,
    pub hitl_questions: Vec<RunHitlQuestionRecord>,
    // Free-form extras; helpers in this module read "trigger_event",
    // "trace_id", and "replay_of_event_id" from here.
    pub metadata: BTreeMap<String, serde_json::Value>,
    pub persisted_path: Option<String>,
}
622
/// Recording of a single tool invocation; `args_hash` pairs with
/// `tool_fixture_hash` for matching calls across runs.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ToolCallRecord {
    pub tool_name: String,
    pub tool_use_id: String,
    pub args_hash: String,
    pub result: String,
    pub is_rejected: bool,
    pub duration_ms: u64,
    pub iteration: usize,
    pub timestamp: String,
}
635
636pub fn tool_fixture_hash(tool_name: &str, args: &serde_json::Value) -> String {
638 use std::hash::{Hash, Hasher};
639 let mut hasher = std::collections::hash_map::DefaultHasher::new();
640 tool_name.hash(&mut hasher);
641 let args_str = serde_json::to_string(args).unwrap_or_default();
642 args_str.hash(&mut hasher);
643 format!("{:016x}", hasher.finish())
644}
645
/// One timing span from the run's trace; `parent_id` links to the enclosing span.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunTraceSpanRecord {
    pub span_id: u64,
    pub parent_id: Option<u64>,
    pub kind: String,
    pub name: String,
    // Offset and duration in milliseconds; the time base is set by the tracer.
    pub start_ms: u64,
    pub duration_ms: u64,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
657
/// A child run spawned by a worker; built from worker metadata by
/// `run_child_record_from_worker_metadata` and merged by
/// `fill_missing_child_run_fields` (keyed on `worker_id`).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunChildRecord {
    pub worker_id: String,
    pub worker_name: String,
    pub parent_stage_id: Option<String>,
    pub session_id: Option<String>,
    pub parent_session_id: Option<String>,
    pub mutation_scope: Option<String>,
    pub approval_policy: Option<super::ToolApprovalPolicy>,
    pub task: String,
    pub request: Option<serde_json::Value>,
    pub provenance: Option<serde_json::Value>,
    pub status: String,
    pub started_at: String,
    pub finished_at: Option<String>,
    pub run_id: Option<String>,
    pub run_path: Option<String>,
    pub snapshot_path: Option<String>,
    pub execution: Option<RunExecutionRecord>,
}
679
680pub(crate) fn run_child_record_from_worker_metadata(
681 parent_stage_id: Option<String>,
682 worker: &serde_json::Value,
683) -> Option<RunChildRecord> {
684 let worker_id = worker.get("id").and_then(|value| value.as_str())?;
685 if worker_id.is_empty() {
686 return None;
687 }
688 Some(RunChildRecord {
689 worker_id: worker_id.to_string(),
690 worker_name: worker
691 .get("name")
692 .and_then(|value| value.as_str())
693 .unwrap_or("worker")
694 .to_string(),
695 parent_stage_id,
696 session_id: worker
697 .get("audit")
698 .and_then(|value| value.get("session_id"))
699 .and_then(|value| value.as_str())
700 .map(str::to_string),
701 parent_session_id: worker
702 .get("audit")
703 .and_then(|value| value.get("parent_session_id"))
704 .and_then(|value| value.as_str())
705 .map(str::to_string),
706 mutation_scope: worker
707 .get("audit")
708 .and_then(|value| value.get("mutation_scope"))
709 .and_then(|value| value.as_str())
710 .map(str::to_string),
711 approval_policy: worker
712 .get("audit")
713 .and_then(|value| value.get("approval_policy"))
714 .and_then(|value| {
715 serde_json::from_value::<super::ToolApprovalPolicy>(value.clone()).ok()
716 }),
717 task: worker
718 .get("task")
719 .and_then(|value| value.as_str())
720 .unwrap_or_default()
721 .to_string(),
722 request: worker.get("request").cloned(),
723 provenance: worker.get("provenance").cloned(),
724 status: worker
725 .get("status")
726 .and_then(|value| value.as_str())
727 .unwrap_or("completed")
728 .to_string(),
729 started_at: worker
730 .get("started_at")
731 .and_then(|value| value.as_str())
732 .unwrap_or_default()
733 .to_string(),
734 finished_at: worker
735 .get("finished_at")
736 .and_then(|value| value.as_str())
737 .map(str::to_string),
738 run_id: worker
739 .get("child_run_id")
740 .and_then(|value| value.as_str())
741 .map(str::to_string),
742 run_path: worker
743 .get("child_run_path")
744 .and_then(|value| value.as_str())
745 .map(str::to_string),
746 snapshot_path: worker
747 .get("snapshot_path")
748 .and_then(|value| value.as_str())
749 .map(str::to_string),
750 execution: worker
751 .get("execution")
752 .cloned()
753 .and_then(|value| serde_json::from_value(value).ok()),
754 })
755}
756
757fn run_child_from_stage_metadata(stage: &RunStageRecord) -> Option<RunChildRecord> {
758 let parent_stage_id = if stage.id.is_empty() {
759 None
760 } else {
761 Some(stage.id.clone())
762 };
763 run_child_record_from_worker_metadata(parent_stage_id, stage.metadata.get("worker")?)
764}
765
766fn fill_missing_child_run_fields(existing: &mut RunChildRecord, child: RunChildRecord) {
767 if existing.worker_name.is_empty() {
768 existing.worker_name = child.worker_name;
769 }
770 if existing.parent_stage_id.is_none() {
771 existing.parent_stage_id = child.parent_stage_id;
772 }
773 if existing.session_id.is_none() {
774 existing.session_id = child.session_id;
775 }
776 if existing.parent_session_id.is_none() {
777 existing.parent_session_id = child.parent_session_id;
778 }
779 if existing.mutation_scope.is_none() {
780 existing.mutation_scope = child.mutation_scope;
781 }
782 if existing.approval_policy.is_none() {
783 existing.approval_policy = child.approval_policy;
784 }
785 if existing.task.is_empty() {
786 existing.task = child.task;
787 }
788 if existing.request.is_none() {
789 existing.request = child.request;
790 }
791 if existing.provenance.is_none() {
792 existing.provenance = child.provenance;
793 }
794 if existing.status.is_empty() {
795 existing.status = child.status;
796 }
797 if existing.started_at.is_empty() {
798 existing.started_at = child.started_at;
799 }
800 if existing.finished_at.is_none() {
801 existing.finished_at = child.finished_at;
802 }
803 if existing.run_id.is_none() {
804 existing.run_id = child.run_id;
805 }
806 if existing.run_path.is_none() {
807 existing.run_path = child.run_path;
808 }
809 if existing.snapshot_path.is_none() {
810 existing.snapshot_path = child.snapshot_path;
811 }
812 if existing.execution.is_none() {
813 existing.execution = child.execution;
814 }
815}
816
817fn materialize_child_runs_from_stage_metadata(run: &mut RunRecord) {
818 for child in run
819 .stages
820 .iter()
821 .filter_map(run_child_from_stage_metadata)
822 .collect::<Vec<_>>()
823 {
824 match run
825 .child_runs
826 .iter_mut()
827 .find(|existing| existing.worker_id == child.worker_id)
828 {
829 Some(existing) => fill_missing_child_run_fields(existing, child),
830 None => run.child_runs.push(child),
831 }
832 }
833}
834
/// Execution environment a run (or child run) was performed in: working
/// directory, environment variables, and optional repo/worktree details.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunExecutionRecord {
    pub cwd: Option<String>,
    pub source_dir: Option<String>,
    pub env: BTreeMap<String, String>,
    pub adapter: Option<String>,
    pub repo_path: Option<String>,
    pub worktree_path: Option<String>,
    pub branch: Option<String>,
    pub base_ref: Option<String>,
    pub cleanup: Option<String>,
}
848
849fn compact_json_value(value: &serde_json::Value) -> String {
850 serde_json::to_string(value).unwrap_or_else(|_| value.to_string())
851}
852
/// Canonicalize question text for fuzzy comparison: lowercase ASCII
/// alphanumerics, treat every other non-whitespace character as a word break,
/// then collapse whitespace runs into single spaces.
fn normalize_question_text(text: &str) -> String {
    let simplified: String = text
        .chars()
        .map(|ch| match ch {
            _ if ch.is_ascii_alphanumeric() => ch.to_ascii_lowercase(),
            _ if ch.is_whitespace() => ch,
            _ => ' ',
        })
        .collect();
    simplified.split_whitespace().collect::<Vec<_>>().join(" ")
}
867
868fn clarifying_min_questions(spec: &ClarifyingQuestionEvalSpec) -> usize {
869 spec.min_questions.max(1)
870}
871
872fn clarifying_max_questions(spec: &ClarifyingQuestionEvalSpec) -> usize {
873 spec.max_questions.unwrap_or(1).max(1)
874}
875
/// Read every record currently available on `topic`, paging through the event
/// log in batches of up to 256.
///
/// NOTE(review): assumes `read_range(topic, from, n)` returns records strictly
/// *after* `from` — if the cursor is inclusive this loop would duplicate the
/// boundary record each page and never terminate; confirm the `EventLog`
/// contract.
fn read_topic_records(
    log: &AnyEventLog,
    topic: &Topic,
) -> Vec<(crate::event_log::EventId, EventLogRecord)> {
    // `None` means "start from the beginning of the topic".
    let mut from = None;
    let mut records = Vec::new();
    loop {
        // Synchronously drive the async read; a read error is treated the same
        // as an empty batch and quietly ends the pagination.
        let batch =
            futures::executor::block_on(log.read_range(topic, from, 256)).unwrap_or_default();
        if batch.is_empty() {
            break;
        }
        // Advance the cursor to the last event received.
        from = batch.last().map(|(event_id, _)| *event_id);
        records.extend(batch);
    }
    records
}
893
/// Fold `hitl.question_asked` events from the active event log into
/// `run.hitl_questions`.
///
/// Events are matched to this run via a `run_id` header or payload field,
/// keyed by `request_id` (a log event *replaces* an existing record with the
/// same id, since `BTreeMap::insert` overwrites), and the merged list is
/// sorted by `(asked_at, request_id)`. No-op when there is no active log.
fn merge_hitl_questions_from_active_log(run: &mut RunRecord) {
    let Some(log) = active_event_log() else {
        return;
    };
    let topic = Topic::new(crate::HITL_QUESTIONS_TOPIC)
        .expect("static hitl.questions topic should always be valid");
    // Seed the dedupe map with the questions already on the run.
    let mut merged = run
        .hitl_questions
        .iter()
        .cloned()
        .map(|question| (question.request_id.clone(), question))
        .collect::<BTreeMap<_, _>>();

    for (_, event) in read_topic_records(log.as_ref(), &topic) {
        if event.kind != "hitl.question_asked" {
            continue;
        }
        let payload = &event.payload;
        // The run id may live in the event headers or in the payload body.
        let matches_run = event
            .headers
            .get("run_id")
            .is_some_and(|value| value == &run.id)
            || payload
                .get("run_id")
                .and_then(|value| value.as_str())
                .is_some_and(|value| value == run.id);
        if !matches_run {
            continue;
        }
        // Same header/payload duality for the request id.
        let request_id = payload
            .get("request_id")
            .and_then(|value| value.as_str())
            .or_else(|| event.headers.get("request_id").map(String::as_str))
            .unwrap_or_default();
        let prompt = payload
            .get("payload")
            .and_then(|value| value.get("prompt"))
            .and_then(|value| value.as_str())
            .unwrap_or_default();
        // Skip events that lack either identity or content.
        if request_id.is_empty() || prompt.is_empty() {
            continue;
        }
        merged.insert(
            request_id.to_string(),
            RunHitlQuestionRecord {
                request_id: request_id.to_string(),
                prompt: prompt.to_string(),
                agent: payload
                    .get("agent")
                    .and_then(|value| value.as_str())
                    .unwrap_or_default()
                    .to_string(),
                trace_id: payload
                    .get("trace_id")
                    .and_then(|value| value.as_str())
                    .map(str::to_string),
                asked_at: payload
                    .get("requested_at")
                    .and_then(|value| value.as_str())
                    .unwrap_or_default()
                    .to_string(),
            },
        );
    }

    run.hitl_questions = merged.into_values().collect();
    // Chronological order, with request id as a deterministic tie-breaker.
    run.hitl_questions.sort_by(|left, right| {
        (left.asked_at.as_str(), left.request_id.as_str())
            .cmp(&(right.asked_at.as_str(), right.request_id.as_str()))
    });
}
965
/// Short lowercase label for a trigger signature verification result.
fn signature_status_label(status: &SignatureStatus) -> &'static str {
    match status {
        SignatureStatus::Verified => "verified",
        SignatureStatus::Unsigned => "unsigned",
        // The failure detail is intentionally dropped; only the class is kept.
        SignatureStatus::Failed { .. } => "failed",
    }
}
973
974fn trigger_event_from_run(run: &RunRecord) -> Option<TriggerEvent> {
975 run.metadata
976 .get("trigger_event")
977 .cloned()
978 .and_then(|value| serde_json::from_value(value).ok())
979}
980
981fn run_trace_id(run: &RunRecord, trigger_event: Option<&TriggerEvent>) -> Option<String> {
982 trigger_event
983 .map(|event| event.trace_id.0.clone())
984 .or_else(|| {
985 run.metadata
986 .get("trace_id")
987 .and_then(|value| value.as_str())
988 .map(str::to_string)
989 })
990}
991
992fn replay_of_event_id_from_run(run: &RunRecord) -> Option<String> {
993 run.metadata
994 .get("replay_of_event_id")
995 .and_then(|value| value.as_str())
996 .map(str::to_string)
997}
998
999fn action_graph_kind_for_stage(stage: &RunStageRecord) -> &'static str {
1000 if stage.kind == "condition" {
1001 ACTION_GRAPH_NODE_KIND_PREDICATE
1002 } else {
1003 ACTION_GRAPH_NODE_KIND_STAGE
1004 }
1005}
1006
1007fn trigger_node_metadata(trigger_event: &TriggerEvent) -> BTreeMap<String, serde_json::Value> {
1008 let mut metadata = BTreeMap::new();
1009 metadata.insert(
1010 "provider".to_string(),
1011 serde_json::json!(trigger_event.provider.as_str()),
1012 );
1013 metadata.insert(
1014 "event_kind".to_string(),
1015 serde_json::json!(trigger_event.kind),
1016 );
1017 metadata.insert(
1018 "dedupe_key".to_string(),
1019 serde_json::json!(trigger_event.dedupe_key),
1020 );
1021 metadata.insert(
1022 "signature_status".to_string(),
1023 serde_json::json!(signature_status_label(&trigger_event.signature_status)),
1024 );
1025 metadata
1026}
1027
1028fn stage_node_metadata(stage: &RunStageRecord) -> BTreeMap<String, serde_json::Value> {
1029 let mut metadata = BTreeMap::new();
1030 metadata.insert("stage_kind".to_string(), serde_json::json!(stage.kind));
1031 if let Some(branch) = stage.branch.as_ref() {
1032 metadata.insert("branch".to_string(), serde_json::json!(branch));
1033 }
1034 if let Some(worker_id) = stage
1035 .metadata
1036 .get("worker_id")
1037 .and_then(|value| value.as_str())
1038 {
1039 metadata.insert("worker_id".to_string(), serde_json::json!(worker_id));
1040 }
1041 metadata
1042}
1043
1044fn append_action_graph_node(
1045 nodes: &mut Vec<RunActionGraphNodeRecord>,
1046 record: RunActionGraphNodeRecord,
1047) {
1048 nodes.push(record);
1049}
1050
1051pub async fn append_action_graph_update(
1052 headers: BTreeMap<String, String>,
1053 payload: serde_json::Value,
1054) -> Result<(), crate::event_log::LogError> {
1055 let Some(log) = active_event_log() else {
1056 return Ok(());
1057 };
1058 let topic = Topic::new("observability.action_graph")
1059 .expect("static observability.action_graph topic should always be valid");
1060 let record = EventLogRecord::new("action_graph_update", payload).with_headers(headers);
1061 log.append(&topic, record).await.map(|_| ())
1062}
1063
1064fn publish_action_graph_event(
1065 run: &RunRecord,
1066 observability: &RunObservabilityRecord,
1067 path: &Path,
1068) {
1069 let trigger_event = trigger_event_from_run(run);
1070 let mut headers = BTreeMap::new();
1071 headers.insert("run_id".to_string(), run.id.clone());
1072 headers.insert("workflow_id".to_string(), run.workflow_id.clone());
1073 if let Some(trace_id) = run_trace_id(run, trigger_event.as_ref()) {
1074 headers.insert("trace_id".to_string(), trace_id);
1075 }
1076 let payload = serde_json::json!({
1077 "run_id": run.id,
1078 "workflow_id": run.workflow_id,
1079 "persisted_path": path.to_string_lossy(),
1080 "status": run.status,
1081 "observability": observability,
1082 });
1083 if let Ok(handle) = tokio::runtime::Handle::try_current() {
1084 handle.spawn(async move {
1085 let _ = append_action_graph_update(headers, payload).await;
1086 });
1087 } else {
1088 let _ = futures::executor::block_on(append_action_graph_update(headers, payload));
1089 }
1090}
1091
/// Derives the path of the JSONL LLM-transcript sidecar that accompanies a
/// persisted run file: `<parent>/<stem>-llm/llm_transcript.jsonl`.
///
/// Returns `None` when the run path has no UTF-8 file stem.
fn llm_transcript_sidecar_path(run_path: &Path) -> Option<PathBuf> {
    let stem = run_path.file_stem().and_then(|stem| stem.to_str())?;
    let dir = match run_path.parent() {
        Some(parent) => parent,
        // Pathological case (e.g. a bare root): anchor in the current dir.
        None => Path::new("."),
    };
    Some(dir.join(format!("{stem}-llm/llm_transcript.jsonl")))
}
1097
1098fn json_string_array(value: Option<&serde_json::Value>) -> Vec<String> {
1099 value
1100 .and_then(|value| value.as_array())
1101 .map(|items| {
1102 items
1103 .iter()
1104 .filter_map(|item| item.as_str().map(str::to_string))
1105 .collect::<Vec<_>>()
1106 })
1107 .unwrap_or_default()
1108}
1109
1110fn json_usize(value: Option<&serde_json::Value>) -> usize {
1111 value.and_then(|value| value.as_u64()).unwrap_or_default() as usize
1112}
1113
1114fn json_bool(value: Option<&serde_json::Value>) -> Option<bool> {
1115 value.and_then(|value| value.as_bool())
1116}
1117
1118fn stage_result_payload(stage: &RunStageRecord) -> Option<&serde_json::Value> {
1119 stage
1120 .artifacts
1121 .iter()
1122 .find_map(|artifact| artifact.data.as_ref())
1123}
1124
1125fn task_ledger_summary_from_value(value: &serde_json::Value) -> Option<RunTaskLedgerSummaryRecord> {
1126 let deliverables = value
1127 .get("deliverables")
1128 .and_then(|raw| raw.as_array())
1129 .map(|items| {
1130 items
1131 .iter()
1132 .map(|item| RunDeliverableSummaryRecord {
1133 id: item
1134 .get("id")
1135 .and_then(|value| value.as_str())
1136 .unwrap_or_default()
1137 .to_string(),
1138 text: item
1139 .get("text")
1140 .and_then(|value| value.as_str())
1141 .unwrap_or_default()
1142 .to_string(),
1143 status: item
1144 .get("status")
1145 .and_then(|value| value.as_str())
1146 .unwrap_or_default()
1147 .to_string(),
1148 note: item
1149 .get("note")
1150 .and_then(|value| value.as_str())
1151 .map(str::to_string),
1152 })
1153 .collect::<Vec<_>>()
1154 })
1155 .unwrap_or_default();
1156 let observations = json_string_array(value.get("observations"));
1157 let root_task = value
1158 .get("root_task")
1159 .and_then(|value| value.as_str())
1160 .unwrap_or_default()
1161 .to_string();
1162 let rationale = value
1163 .get("rationale")
1164 .and_then(|value| value.as_str())
1165 .unwrap_or_default()
1166 .to_string();
1167 if root_task.is_empty()
1168 && rationale.is_empty()
1169 && deliverables.is_empty()
1170 && observations.is_empty()
1171 {
1172 return None;
1173 }
1174 let blocking_count = deliverables
1175 .iter()
1176 .filter(|deliverable| matches!(deliverable.status.as_str(), "open" | "blocked"))
1177 .count();
1178 Some(RunTaskLedgerSummaryRecord {
1179 root_task,
1180 rationale,
1181 deliverables,
1182 observations,
1183 blocking_count,
1184 })
1185}
1186
/// Extracts compaction events recorded in an embedded transcript JSON value.
///
/// `stage_id` / `node_id` tag the resulting events with their owning stage
/// when the transcript is stage-scoped (both `None` for the run-level
/// transcript). `location_prefix` (e.g. `run.transcript`) is used to build
/// human-readable snapshot locations, and `persisted_path` — when known — is
/// reported as the on-disk file containing the snapshot.
fn compaction_events_from_transcript(
    transcript: &serde_json::Value,
    stage_id: Option<&str>,
    node_id: Option<&str>,
    location_prefix: &str,
    persisted_path: Option<&Path>,
) -> Vec<CompactionEventRecord> {
    let transcript_id = transcript
        .get("id")
        .and_then(|value| value.as_str())
        .map(str::to_string);
    // Collect the ids of assets actually present in the transcript so each
    // event can report whether its snapshot asset is available.
    let asset_ids = transcript
        .get("assets")
        .and_then(|value| value.as_array())
        .map(|assets| {
            assets
                .iter()
                .filter_map(|asset| {
                    asset
                        .get("id")
                        .and_then(|value| value.as_str())
                        .map(str::to_string)
                })
                .collect::<BTreeSet<_>>()
        })
        .unwrap_or_default();
    // Only events with kind == "compaction" become records; everything else
    // in the transcript's event stream is ignored here.
    transcript
        .get("events")
        .and_then(|value| value.as_array())
        .map(|events| {
            events
                .iter()
                .filter(|event| {
                    event.get("kind").and_then(|value| value.as_str()) == Some("compaction")
                })
                .map(|event| {
                    let metadata = event.get("metadata");
                    let snapshot_asset_id = metadata
                        .and_then(|value| value.get("snapshot_asset_id"))
                        .and_then(|value| value.as_str())
                        .map(str::to_string);
                    // Available only when the referenced asset id exists in
                    // the transcript's asset list.
                    let available = snapshot_asset_id
                        .as_ref()
                        .is_some_and(|asset_id| asset_ids.contains(asset_id));
                    // Without an asset id, fall back to the bare prefix as
                    // the snapshot location.
                    let snapshot_location = snapshot_asset_id
                        .as_ref()
                        .map(|asset_id| format!("{location_prefix}.assets[{asset_id}]"))
                        .unwrap_or_else(|| location_prefix.to_string());
                    CompactionEventRecord {
                        id: event
                            .get("id")
                            .and_then(|value| value.as_str())
                            .unwrap_or_default()
                            .to_string(),
                        transcript_id: transcript_id.clone(),
                        stage_id: stage_id.map(str::to_string),
                        node_id: node_id.map(str::to_string),
                        mode: metadata
                            .and_then(|value| value.get("mode"))
                            .and_then(|value| value.as_str())
                            .unwrap_or_default()
                            .to_string(),
                        strategy: metadata
                            .and_then(|value| value.get("strategy"))
                            .and_then(|value| value.as_str())
                            .unwrap_or_default()
                            .to_string(),
                        archived_messages: json_usize(
                            metadata.and_then(|value| value.get("archived_messages")),
                        ),
                        estimated_tokens_before: json_usize(
                            metadata.and_then(|value| value.get("estimated_tokens_before")),
                        ),
                        estimated_tokens_after: json_usize(
                            metadata.and_then(|value| value.get("estimated_tokens_after")),
                        ),
                        snapshot_asset_id,
                        snapshot_location,
                        snapshot_path: persisted_path
                            .map(|path| path.to_string_lossy().into_owned()),
                        available,
                    }
                })
                .collect()
        })
        .unwrap_or_default()
}
1274
1275fn daemon_events_from_sidecar(run_path: &Path) -> Vec<DaemonEventRecord> {
1276 let Some(sidecar_path) = llm_transcript_sidecar_path(run_path) else {
1277 return Vec::new();
1278 };
1279 let Ok(content) = std::fs::read_to_string(sidecar_path) else {
1280 return Vec::new();
1281 };
1282
1283 content
1284 .lines()
1285 .filter(|line| !line.trim().is_empty())
1286 .filter_map(|line| serde_json::from_str::<serde_json::Value>(line).ok())
1287 .filter(|event| event.get("type").and_then(|value| value.as_str()) == Some("daemon_event"))
1288 .filter_map(|event| serde_json::from_value::<DaemonEventRecord>(event).ok())
1289 .collect()
1290}
1291
/// Derives the full observability record for a run: the action graph
/// (nodes + edges), planner rounds, worker lineage, verification outcomes,
/// transcript pointers, compaction events, and daemon events.
///
/// `persisted_path` is the run file's on-disk location when known; it is used
/// to locate the LLM-transcript sidecar and to label snapshot paths.
pub fn derive_run_observability(
    run: &RunRecord,
    persisted_path: Option<&Path>,
) -> RunObservabilityRecord {
    let mut action_graph_nodes = Vec::new();
    let mut action_graph_edges = Vec::new();
    let mut verification_outcomes = Vec::new();
    let mut planner_rounds = Vec::new();
    let mut transcript_pointers = Vec::new();
    let mut compaction_events = Vec::new();
    let mut daemon_events = Vec::new();
    let mut research_fact_count = 0usize;

    // Root node representing the run itself; its trace id (if any) is
    // propagated to every node derived from this run.
    let root_node_id = format!("run:{}", run.id);
    let trigger_event = trigger_event_from_run(run);
    let propagated_trace_id = run_trace_id(run, trigger_event.as_ref());
    append_action_graph_node(
        &mut action_graph_nodes,
        RunActionGraphNodeRecord {
            id: root_node_id.clone(),
            label: run
                .workflow_name
                .clone()
                .unwrap_or_else(|| run.workflow_id.clone()),
            kind: ACTION_GRAPH_NODE_KIND_RUN.to_string(),
            status: run.status.clone(),
            outcome: run.status.clone(),
            trace_id: propagated_trace_id.clone(),
            stage_id: None,
            node_id: None,
            worker_id: None,
            run_id: Some(run.id.clone()),
            run_path: run.persisted_path.clone(),
            metadata: BTreeMap::from([(
                "workflow_id".to_string(),
                serde_json::json!(run.workflow_id),
            )]),
        },
    );
    // Entry point for stages without incoming transitions: the trigger node
    // when the run was trigger-dispatched, otherwise the run node itself.
    let mut entry_node_id = root_node_id.clone();
    if let Some(trigger_event) = trigger_event.as_ref() {
        // When this run replays an earlier event, add a "historical" node for
        // the original event and chain it to the live trigger node.
        if let Some(replay_of_event_id) = replay_of_event_id_from_run(run) {
            let replay_source_node_id = format!("trigger:{replay_of_event_id}");
            append_action_graph_node(
                &mut action_graph_nodes,
                RunActionGraphNodeRecord {
                    id: replay_source_node_id.clone(),
                    label: format!(
                        "{}:{} (original {})",
                        trigger_event.provider.as_str(),
                        trigger_event.kind,
                        replay_of_event_id
                    ),
                    kind: ACTION_GRAPH_NODE_KIND_TRIGGER.to_string(),
                    status: "historical".to_string(),
                    outcome: "replayed_from".to_string(),
                    trace_id: Some(trigger_event.trace_id.0.clone()),
                    stage_id: None,
                    node_id: None,
                    worker_id: None,
                    run_id: Some(run.id.clone()),
                    run_path: run.persisted_path.clone(),
                    metadata: trigger_node_metadata(trigger_event),
                },
            );
            action_graph_edges.push(RunActionGraphEdgeRecord {
                from_id: replay_source_node_id,
                to_id: format!("trigger:{}", trigger_event.id.0),
                kind: ACTION_GRAPH_EDGE_KIND_REPLAY_CHAIN.to_string(),
                label: Some("replay chain".to_string()),
            });
        }
        // Node for the trigger event that dispatched this run; its outcome is
        // the signature verification status label.
        let trigger_node_id = format!("trigger:{}", trigger_event.id.0);
        append_action_graph_node(
            &mut action_graph_nodes,
            RunActionGraphNodeRecord {
                id: trigger_node_id.clone(),
                label: format!("{}:{}", trigger_event.provider.as_str(), trigger_event.kind),
                kind: ACTION_GRAPH_NODE_KIND_TRIGGER.to_string(),
                status: "received".to_string(),
                outcome: signature_status_label(&trigger_event.signature_status).to_string(),
                trace_id: Some(trigger_event.trace_id.0.clone()),
                stage_id: None,
                node_id: None,
                worker_id: None,
                run_id: Some(run.id.clone()),
                run_path: run.persisted_path.clone(),
                metadata: trigger_node_metadata(trigger_event),
            },
        );
        action_graph_edges.push(RunActionGraphEdgeRecord {
            from_id: root_node_id.clone(),
            to_id: trigger_node_id.clone(),
            kind: ACTION_GRAPH_EDGE_KIND_ENTRY.to_string(),
            label: Some(trigger_event.id.0.clone()),
        });
        entry_node_id = trigger_node_id;
    }

    // Lookup tables: stage id -> graph node id, stage id -> stage record,
    // and workflow node id -> graph node id (for resolving transitions).
    let stage_node_ids = run
        .stages
        .iter()
        .map(|stage| (stage.id.clone(), format!("stage:{}", stage.id)))
        .collect::<BTreeMap<_, _>>();
    let stage_by_id = run
        .stages
        .iter()
        .map(|stage| (stage.id.as_str(), stage))
        .collect::<BTreeMap<_, _>>();
    let stage_by_node_id = run
        .stages
        .iter()
        .map(|stage| (stage.node_id.clone(), format!("stage:{}", stage.id)))
        .collect::<BTreeMap<_, _>>();

    // Workflow nodes that are the target of some transition; stages NOT in
    // this set are roots and get an edge from the entry node.
    let incoming_nodes = run
        .transitions
        .iter()
        .map(|transition| transition.to_node_id.clone())
        .collect::<BTreeSet<_>>();

    for stage in &run.stages {
        let graph_node_id = stage_node_ids
            .get(&stage.id)
            .cloned()
            .unwrap_or_else(|| format!("stage:{}", stage.id));
        append_action_graph_node(
            &mut action_graph_nodes,
            RunActionGraphNodeRecord {
                id: graph_node_id.clone(),
                label: stage.node_id.clone(),
                kind: action_graph_kind_for_stage(stage).to_string(),
                status: stage.status.clone(),
                outcome: stage.outcome.clone(),
                trace_id: propagated_trace_id.clone(),
                stage_id: Some(stage.id.clone()),
                node_id: Some(stage.node_id.clone()),
                worker_id: stage
                    .metadata
                    .get("worker_id")
                    .and_then(|value| value.as_str())
                    .map(str::to_string),
                run_id: None,
                run_path: None,
                metadata: stage_node_metadata(stage),
            },
        );
        // Root stages (no incoming transition) connect to the entry node:
        // trigger-dispatch when a trigger exists, plain entry otherwise.
        if !incoming_nodes.contains(&stage.node_id) {
            action_graph_edges.push(RunActionGraphEdgeRecord {
                from_id: entry_node_id.clone(),
                to_id: graph_node_id.clone(),
                kind: if trigger_event.is_some() {
                    ACTION_GRAPH_EDGE_KIND_TRIGGER_DISPATCH.to_string()
                } else {
                    ACTION_GRAPH_EDGE_KIND_ENTRY.to_string()
                },
                label: None,
            });
        }

        // Verification outcome for explicit verify stages or any stage with
        // a verification payload attached.
        if stage.kind == "verify" || stage.verification.is_some() {
            // Pass/fail precedence: explicit "pass" field, then "success",
            // then inferred from the stage's status/outcome pair.
            let passed = json_bool(
                stage
                    .verification
                    .as_ref()
                    .and_then(|value| value.get("pass")),
            )
            .or_else(|| {
                json_bool(
                    stage
                        .verification
                        .as_ref()
                        .and_then(|value| value.get("success")),
                )
            })
            .or_else(|| {
                if stage.status == "completed" && stage.outcome == "success" {
                    Some(true)
                } else if stage.status == "failed" || stage.outcome == "failed" {
                    Some(false)
                } else {
                    None
                }
            });
            verification_outcomes.push(RunVerificationOutcomeRecord {
                stage_id: stage.id.clone(),
                node_id: stage.node_id.clone(),
                status: stage.status.clone(),
                passed,
                // Summary: compact verification JSON, or the stage's visible
                // text when no verification payload exists.
                summary: stage
                    .verification
                    .as_ref()
                    .map(compact_json_value)
                    .or_else(|| {
                        stage
                            .visible_text
                            .as_ref()
                            .filter(|value| !value.trim().is_empty())
                            .cloned()
                    }),
            });
        }

        // Embedded stage transcript: record a pointer plus any compaction
        // events found inside it.
        if stage.transcript.is_some() {
            transcript_pointers.push(RunTranscriptPointerRecord {
                id: format!("stage:{}:transcript", stage.id),
                label: format!("Stage {} transcript", stage.node_id),
                kind: "embedded_transcript".to_string(),
                location: format!("run.stages[{}].transcript", stage.node_id),
                path: run.persisted_path.clone(),
                available: true,
            });
            if let Some(transcript) = stage.transcript.as_ref() {
                compaction_events.extend(compaction_events_from_transcript(
                    transcript,
                    Some(&stage.id),
                    Some(&stage.node_id),
                    &format!("run.stages[{}].transcript", stage.node_id),
                    persisted_path,
                ));
            }
        }

        // Planner-round metrics extracted from the stage's result payload
        // ("trace" counters, tool usage, and the task ledger).
        if let Some(payload) = stage_result_payload(stage) {
            let trace = payload.get("trace");
            let task_ledger = payload
                .get("task_ledger")
                .and_then(task_ledger_summary_from_value);
            // Ledger observations double as research facts.
            let research_facts = task_ledger
                .as_ref()
                .map(|ledger| ledger.observations.clone())
                .unwrap_or_default();
            research_fact_count += research_facts.len();
            let tools_payload = payload.get("tools");
            // Tool list: prefer the dedicated "tools" payload, falling back
            // to the trace's "tools_used".
            let tools_used = json_string_array(
                tools_payload
                    .and_then(|tools| tools.get("calls"))
                    .or_else(|| trace.and_then(|trace| trace.get("tools_used"))),
            );
            let successful_tools =
                json_string_array(tools_payload.and_then(|tools| tools.get("successful")));
            let planner_round = RunPlannerRoundRecord {
                stage_id: stage.id.clone(),
                node_id: stage.node_id.clone(),
                stage_kind: stage.kind.clone(),
                status: stage.status.clone(),
                outcome: stage.outcome.clone(),
                iteration_count: json_usize(trace.and_then(|trace| trace.get("iterations"))),
                llm_call_count: json_usize(trace.and_then(|trace| trace.get("llm_calls"))),
                tool_execution_count: json_usize(
                    trace.and_then(|trace| trace.get("tool_executions")),
                ),
                tool_rejection_count: json_usize(
                    trace.and_then(|trace| trace.get("tool_rejections")),
                ),
                intervention_count: json_usize(trace.and_then(|trace| trace.get("interventions"))),
                compaction_count: json_usize(trace.and_then(|trace| trace.get("compactions"))),
                native_text_tool_fallback_count: json_usize(
                    trace.and_then(|trace| trace.get("native_text_tool_fallbacks")),
                ),
                native_text_tool_fallback_rejection_count: json_usize(
                    trace.and_then(|trace| trace.get("native_text_tool_fallback_rejections")),
                ),
                empty_completion_retry_count: json_usize(
                    trace.and_then(|trace| trace.get("empty_completion_retries")),
                ),
                tools_used,
                successful_tools,
                ledger_done_rejections: json_usize(payload.get("ledger_done_rejections")),
                task_ledger,
                research_facts,
            };
            // Only keep rounds that actually show agentic activity; all-zero
            // rounds would just be noise.
            let has_agentic_detail = planner_round.iteration_count > 0
                || planner_round.llm_call_count > 0
                || planner_round.tool_execution_count > 0
                || planner_round.native_text_tool_fallback_count > 0
                || planner_round.native_text_tool_fallback_rejection_count > 0
                || planner_round.empty_completion_retry_count > 0
                || planner_round.ledger_done_rejections > 0
                || planner_round.task_ledger.is_some()
                || !planner_round.tools_used.is_empty()
                || !planner_round.successful_tools.is_empty();
            if has_agentic_detail {
                planner_rounds.push(planner_round);
            }
        }
    }

    // Edges for recorded transitions. Transitions targeting unknown nodes are
    // skipped; unknown sources fall back to the run root node.
    for transition in &run.transitions {
        let Some(to_id) = stage_by_node_id.get(&transition.to_node_id).cloned() else {
            continue;
        };
        let from_stage = transition
            .from_stage_id
            .as_deref()
            .and_then(|stage_id| stage_by_id.get(stage_id).copied());
        // Source resolution: stage id first, then workflow node id, then the
        // run root as a last resort.
        let from_id = transition
            .from_stage_id
            .as_ref()
            .and_then(|stage_id| stage_node_ids.get(stage_id))
            .cloned()
            .or_else(|| {
                transition
                    .from_node_id
                    .as_ref()
                    .and_then(|node_id| stage_by_node_id.get(node_id))
                    .cloned()
            })
            .unwrap_or_else(|| root_node_id.clone());
        action_graph_edges.push(RunActionGraphEdgeRecord {
            from_id,
            to_id,
            // Transitions out of condition stages are predicate gates.
            kind: if from_stage.is_some_and(|stage| stage.kind == "condition") {
                ACTION_GRAPH_EDGE_KIND_PREDICATE_GATE.to_string()
            } else {
                ACTION_GRAPH_EDGE_KIND_TRANSITION.to_string()
            },
            label: transition.branch.clone(),
        });
    }

    // Worker lineage: one node (plus a delegates edge from the parent stage,
    // when resolvable) and one lineage record per child run.
    let worker_lineage = run
        .child_runs
        .iter()
        .map(|child| {
            let worker_node_id = format!("worker:{}", child.worker_id);
            append_action_graph_node(
                &mut action_graph_nodes,
                RunActionGraphNodeRecord {
                    id: worker_node_id.clone(),
                    label: child.worker_name.clone(),
                    kind: ACTION_GRAPH_NODE_KIND_WORKER.to_string(),
                    status: child.status.clone(),
                    outcome: child.status.clone(),
                    trace_id: propagated_trace_id.clone(),
                    stage_id: child.parent_stage_id.clone(),
                    node_id: None,
                    worker_id: Some(child.worker_id.clone()),
                    run_id: child.run_id.clone(),
                    run_path: child.run_path.clone(),
                    metadata: BTreeMap::from([
                        (
                            "worker_name".to_string(),
                            serde_json::json!(child.worker_name),
                        ),
                        ("task".to_string(), serde_json::json!(child.task)),
                    ]),
                },
            );
            if let Some(parent_stage_id) = child.parent_stage_id.as_ref() {
                if let Some(stage_node_id) = stage_node_ids.get(parent_stage_id) {
                    action_graph_edges.push(RunActionGraphEdgeRecord {
                        from_id: stage_node_id.clone(),
                        to_id: worker_node_id,
                        kind: ACTION_GRAPH_EDGE_KIND_DELEGATES.to_string(),
                        label: Some(child.worker_name.clone()),
                    });
                }
            }
            RunWorkerLineageRecord {
                worker_id: child.worker_id.clone(),
                worker_name: child.worker_name.clone(),
                parent_stage_id: child.parent_stage_id.clone(),
                task: child.task.clone(),
                status: child.status.clone(),
                session_id: child.session_id.clone(),
                parent_session_id: child.parent_session_id.clone(),
                run_id: child.run_id.clone(),
                run_path: child.run_path.clone(),
                snapshot_path: child.snapshot_path.clone(),
            }
        })
        .collect::<Vec<_>>();

    // Run-level embedded transcript: pointer plus its compaction events.
    if run.transcript.is_some() {
        transcript_pointers.push(RunTranscriptPointerRecord {
            id: "run:transcript".to_string(),
            label: "Run transcript".to_string(),
            kind: "embedded_transcript".to_string(),
            location: "run.transcript".to_string(),
            path: run.persisted_path.clone(),
            available: true,
        });
        if let Some(transcript) = run.transcript.as_ref() {
            compaction_events.extend(compaction_events_from_transcript(
                transcript,
                None,
                None,
                "run.transcript",
                persisted_path,
            ));
        }
    }

    // Sidecar LLM transcript (pointer recorded even if the file is missing;
    // `available` reflects existence) and daemon events read from it.
    if let Some(path) = persisted_path {
        if let Some(sidecar_path) = llm_transcript_sidecar_path(path) {
            transcript_pointers.push(RunTranscriptPointerRecord {
                id: "run:llm_transcript".to_string(),
                label: "LLM transcript sidecar".to_string(),
                kind: "llm_jsonl".to_string(),
                location: "run sidecar".to_string(),
                path: Some(sidecar_path.to_string_lossy().into_owned()),
                available: sidecar_path.exists(),
            });
        }
        daemon_events.extend(daemon_events_from_sidecar(path));
    }

    RunObservabilityRecord {
        schema_version: 4,
        planner_rounds,
        research_fact_count,
        action_graph_nodes,
        action_graph_edges,
        worker_lineage,
        verification_outcomes,
        transcript_pointers,
        compaction_events,
        daemon_events,
    }
}
1713
1714fn refresh_run_observability(run: &mut RunRecord, persisted_path: Option<&Path>) {
1715 run.observability = Some(derive_run_observability(run, persisted_path));
1716}
1717
1718pub fn normalize_run_record(value: &VmValue) -> Result<RunRecord, VmError> {
1719 let mut run: RunRecord = parse_json_payload(vm_value_to_json(value), "run_record")?;
1720 if run.type_name.is_empty() {
1721 run.type_name = "run_record".to_string();
1722 }
1723 if run.id.is_empty() {
1724 run.id = new_id("run");
1725 }
1726 if run.started_at.is_empty() {
1727 run.started_at = now_rfc3339();
1728 }
1729 if run.status.is_empty() {
1730 run.status = "running".to_string();
1731 }
1732 if run.root_run_id.is_none() {
1733 run.root_run_id = Some(run.id.clone());
1734 }
1735 if run.replay_fixture.is_none() {
1736 run.replay_fixture = Some(replay_fixture_from_run(&run));
1737 }
1738 merge_hitl_questions_from_active_log(&mut run);
1739 materialize_child_runs_from_stage_metadata(&mut run);
1740 sync_run_handoffs(&mut run);
1741 if run.observability.is_none() {
1742 let persisted_path = run.persisted_path.clone();
1743 let persisted = persisted_path.as_deref().map(Path::new);
1744 refresh_run_observability(&mut run, persisted);
1745 }
1746 Ok(run)
1747}
1748
1749pub fn normalize_eval_suite_manifest(value: &VmValue) -> Result<EvalSuiteManifest, VmError> {
1750 let mut manifest: EvalSuiteManifest = parse_json_value(value)?;
1751 if manifest.type_name.is_empty() {
1752 manifest.type_name = "eval_suite_manifest".to_string();
1753 }
1754 if manifest.id.is_empty() {
1755 manifest.id = new_id("eval_suite");
1756 }
1757 Ok(manifest)
1758}
1759
1760pub fn load_eval_suite_manifest(path: &Path) -> Result<EvalSuiteManifest, VmError> {
1761 let content = std::fs::read_to_string(path)
1762 .map_err(|e| VmError::Runtime(format!("failed to read eval suite manifest: {e}")))?;
1763 let mut manifest: EvalSuiteManifest = serde_json::from_str(&content)
1764 .map_err(|e| VmError::Runtime(format!("failed to parse eval suite manifest: {e}")))?;
1765 if manifest.base_dir.is_none() {
1766 manifest.base_dir = path.parent().map(|parent| parent.display().to_string());
1767 }
1768 Ok(manifest)
1769}
1770
1771pub fn load_eval_pack_manifest(path: &Path) -> Result<EvalPackManifest, VmError> {
1772 let content = std::fs::read_to_string(path)
1773 .map_err(|e| VmError::Runtime(format!("failed to read eval pack manifest: {e}")))?;
1774 let mut manifest: EvalPackManifest =
1775 if path.extension().and_then(|ext| ext.to_str()) == Some("json") {
1776 serde_json::from_str(&content)
1777 .map_err(|e| VmError::Runtime(format!("failed to parse eval pack JSON: {e}")))?
1778 } else {
1779 toml::from_str(&content)
1780 .map_err(|e| VmError::Runtime(format!("failed to parse eval pack TOML: {e}")))?
1781 };
1782 normalize_eval_pack_manifest(&mut manifest);
1783 if manifest.base_dir.is_none() {
1784 manifest.base_dir = path.parent().map(|parent| parent.display().to_string());
1785 }
1786 Ok(manifest)
1787}
1788
1789fn normalize_eval_pack_manifest(manifest: &mut EvalPackManifest) {
1790 if manifest.version == 0 {
1791 manifest.version = 1;
1792 }
1793 if manifest.id.is_empty() {
1794 manifest.id = manifest
1795 .name
1796 .clone()
1797 .filter(|name| !name.trim().is_empty())
1798 .unwrap_or_else(|| new_id("eval_pack"));
1799 }
1800}
1801
1802fn load_replay_fixture(path: &Path) -> Result<ReplayFixture, VmError> {
1803 let content = std::fs::read_to_string(path)
1804 .map_err(|e| VmError::Runtime(format!("failed to read replay fixture: {e}")))?;
1805 serde_json::from_str(&content)
1806 .map_err(|e| VmError::Runtime(format!("failed to parse replay fixture: {e}")))
1807}
1808
1809fn load_run_record_from_fixture_ref(
1810 fixture: &EvalPackFixtureRef,
1811 base_dir: Option<&Path>,
1812) -> Result<RunRecord, VmError> {
1813 if let Some(inline) = &fixture.inline {
1814 let run: RunRecord = serde_json::from_value(inline.clone())
1815 .map_err(|e| VmError::Runtime(format!("failed to parse inline run record: {e}")))?;
1816 return Ok(run);
1817 }
1818 let path = fixture.path.as_deref().ok_or_else(|| {
1819 VmError::Runtime(format!(
1820 "fixture '{}' is missing path or inline run",
1821 fixture.id
1822 ))
1823 })?;
1824 load_run_record(&resolve_manifest_path(base_dir, path))
1825}
1826
1827fn load_replay_fixture_from_ref(
1828 fixture: &EvalPackFixtureRef,
1829 base_dir: Option<&Path>,
1830) -> Result<ReplayFixture, VmError> {
1831 if let Some(inline) = &fixture.inline {
1832 return serde_json::from_value(inline.clone())
1833 .map_err(|e| VmError::Runtime(format!("failed to parse inline replay fixture: {e}")));
1834 }
1835 let path = fixture.path.as_deref().ok_or_else(|| {
1836 VmError::Runtime(format!(
1837 "fixture '{}' is missing path or inline replay fixture",
1838 fixture.id
1839 ))
1840 })?;
1841 load_replay_fixture(&resolve_manifest_path(base_dir, path))
1842}
1843
/// Resolves a manifest-relative path: absolute paths pass through unchanged;
/// relative paths are joined onto `base_dir` when one is available, and
/// returned as-is otherwise.
fn resolve_manifest_path(base_dir: Option<&Path>, path: &str) -> PathBuf {
    let candidate = PathBuf::from(path);
    if candidate.is_absolute() {
        return candidate;
    }
    match base_dir {
        Some(base) => base.join(candidate),
        None => candidate,
    }
}
1854
/// Evaluates every case in a replay-eval suite manifest and aggregates the
/// per-case results into a suite report.
///
/// Each case loads its run record, checks it against a replay fixture, and
/// optionally diffs it against a baseline run; any fixture failure or
/// baseline divergence fails that case. The suite passes only when every
/// case passes.
///
/// # Errors
/// Returns an error when a referenced run, fixture, or baseline file cannot
/// be loaded or parsed.
pub fn evaluate_run_suite_manifest(
    manifest: &EvalSuiteManifest,
) -> Result<ReplayEvalSuiteReport, VmError> {
    // Relative paths in the manifest resolve against its base directory.
    let base_dir = manifest.base_dir.as_deref().map(Path::new);
    let mut reports = Vec::new();
    for case in &manifest.cases {
        let run_path = resolve_manifest_path(base_dir, &case.run_path);
        let run = load_run_record(&run_path)?;
        // Fixture precedence: explicit fixture path, then the fixture
        // embedded in the run, then one derived from the run itself.
        let fixture = match &case.fixture_path {
            Some(path) => load_replay_fixture(&resolve_manifest_path(base_dir, path))?,
            None => run
                .replay_fixture
                .clone()
                .unwrap_or_else(|| replay_fixture_from_run(&run)),
        };
        let eval = evaluate_run_against_fixture(&run, &fixture);
        let mut pass = eval.pass;
        let mut failures = eval.failures;
        // Optional baseline comparison: any stage-level drift fails the case.
        let comparison = match &case.compare_to {
            Some(path) => {
                let baseline_path = resolve_manifest_path(base_dir, path);
                let baseline = load_run_record(&baseline_path)?;
                let diff = diff_run_records(&baseline, &run);
                if !diff.identical {
                    pass = false;
                    failures.push(format!(
                        "run differs from baseline {} with {} stage changes",
                        baseline_path.display(),
                        diff.stage_diffs.len()
                    ));
                }
                Some(diff)
            }
            None => None,
        };
        reports.push(ReplayEvalCaseReport {
            run_id: run.id.clone(),
            workflow_id: run.workflow_id.clone(),
            label: case.label.clone(),
            pass,
            failures,
            stage_count: eval.stage_count,
            source_path: Some(run_path.display().to_string()),
            comparison,
        });
    }
    let total = reports.len();
    let passed = reports.iter().filter(|report| report.pass).count();
    let failed = total.saturating_sub(passed);
    Ok(ReplayEvalSuiteReport {
        pass: failed == 0,
        total,
        passed,
        failed,
        cases: reports,
    })
}
1912
/// Evaluates every case in an eval pack manifest and aggregates the results
/// into a pack report.
///
/// Each case is checked against its fixture, the pack-default and per-case
/// thresholds, an optional baseline diff, and any referenced rubrics.
/// Failures on non-blocking cases are reclassified as warnings (severity
/// `"warning"`) or informational notes; only blocking failures fail the
/// pack. Friction-event cases are delegated to a dedicated evaluator.
///
/// # Errors
/// Returns an error when a referenced run, fixture, or baseline file cannot
/// be loaded or parsed.
pub fn evaluate_eval_pack_manifest(manifest: &EvalPackManifest) -> Result<EvalPackReport, VmError> {
    let base_dir = manifest.base_dir.as_deref().map(Path::new);
    // Fixtures may resolve against a dedicated fixture root, falling back to
    // the manifest's own directory.
    let fixture_base_dir_buf = manifest
        .defaults
        .fixture_root
        .as_deref()
        .map(|root| resolve_manifest_path(base_dir, root));
    let fixture_base_dir = fixture_base_dir_buf.as_deref().or(base_dir);
    // Id-keyed lookup tables; entries with empty ids are unreferencable and
    // therefore dropped.
    let fixtures_by_id: BTreeMap<&str, &EvalPackFixtureRef> = manifest
        .fixtures
        .iter()
        .filter(|fixture| !fixture.id.is_empty())
        .map(|fixture| (fixture.id.as_str(), fixture))
        .collect();
    let rubrics_by_id: BTreeMap<&str, &EvalPackRubric> = manifest
        .rubrics
        .iter()
        .filter(|rubric| !rubric.id.is_empty())
        .map(|rubric| (rubric.id.as_str(), rubric))
        .collect();

    let mut reports = Vec::new();
    for (index, case) in manifest.cases.iter().enumerate() {
        // Fallback case id is positional ("case_1", …); the label prefers the
        // human-readable name, then the id.
        let case_id = case
            .id
            .clone()
            .filter(|id| !id.trim().is_empty())
            .unwrap_or_else(|| format!("case_{}", index + 1));
        let label = case
            .name
            .clone()
            .or_else(|| case.id.clone())
            .unwrap_or_else(|| case_id.clone());
        let severity = eval_pack_case_severity(manifest, case);
        let blocking = severity == "blocking";
        let mut failures = Vec::new();
        let mut warnings = Vec::new();
        let mut informational = Vec::new();

        // Friction-event cases use a separate evaluation path entirely.
        if case.friction_events.is_some() {
            let report = evaluate_eval_pack_friction_case(
                manifest,
                case,
                &case_id,
                &label,
                &severity,
                blocking,
                base_dir,
                fixture_base_dir,
                &fixtures_by_id,
                &rubrics_by_id,
            )?;
            reports.push(report);
            continue;
        }

        let run = load_eval_pack_case_run(case, base_dir, fixture_base_dir, &fixtures_by_id)?;
        let fixture =
            load_eval_pack_case_fixture(case, base_dir, fixture_base_dir, &fixtures_by_id, &run)?;
        let eval = evaluate_run_against_fixture(&run, &fixture);
        failures.extend(eval.failures);
        // Pack-wide defaults apply first, then per-case thresholds.
        apply_eval_pack_thresholds(&run, &manifest.defaults.thresholds, &mut failures);
        apply_eval_pack_thresholds(&run, &case.thresholds, &mut failures);

        // Optional baseline comparison: stage-level drift is a failure.
        let comparison = match &case.compare_to {
            Some(path) => {
                let baseline_path = resolve_manifest_path(base_dir, path);
                let baseline = load_run_record(&baseline_path)?;
                let diff = diff_run_records(&baseline, &run);
                if !diff.identical {
                    failures.push(format!(
                        "run differs from baseline {} with {} stage changes",
                        baseline_path.display(),
                        diff.stage_diffs.len()
                    ));
                }
                Some(diff)
            }
            None => None,
        };

        // Referencing a rubric that doesn't exist is itself a failure.
        for rubric_id in &case.rubrics {
            let Some(rubric) = rubrics_by_id.get(rubric_id.as_str()) else {
                failures.push(format!("case references unknown rubric '{rubric_id}'"));
                continue;
            };
            apply_eval_pack_rubric(rubric, &run, &mut failures, &mut warnings);
        }

        // Non-blocking cases always "pass"; their failures are demoted to
        // warnings or informational notes depending on severity.
        let pass = failures.is_empty() || !blocking;
        if !failures.is_empty() && !blocking {
            if severity == "warning" {
                warnings.append(&mut failures);
            } else {
                informational.append(&mut failures);
            }
        }
        reports.push(EvalPackCaseReport {
            id: case_id,
            label,
            severity,
            pass,
            blocking,
            run_id: run.id.clone(),
            workflow_id: run.workflow_id.clone(),
            source_path: eval_pack_case_source_path(
                case,
                base_dir,
                fixture_base_dir,
                &fixtures_by_id,
            ),
            stage_count: eval.stage_count,
            failures,
            warnings,
            informational,
            comparison,
        });
    }

    // Aggregate counts; only blocking failures fail the pack as a whole.
    let total = reports.len();
    let blocking_failed = reports
        .iter()
        .filter(|report| report.blocking && !report.failures.is_empty())
        .count();
    let warning_failed = reports
        .iter()
        .filter(|report| !report.warnings.is_empty())
        .count();
    let informational_failed = reports
        .iter()
        .filter(|report| !report.informational.is_empty())
        .count();
    let passed = reports.iter().filter(|report| report.pass).count();
    Ok(EvalPackReport {
        pack_id: manifest.id.clone(),
        pass: blocking_failed == 0,
        total,
        passed,
        failed: total.saturating_sub(passed),
        blocking_failed,
        warning_failed,
        informational_failed,
        cases: reports,
    })
}
2058
2059#[allow(clippy::too_many_arguments)]
2060fn evaluate_eval_pack_friction_case(
2061 manifest: &EvalPackManifest,
2062 case: &EvalPackCase,
2063 case_id: &str,
2064 label: &str,
2065 severity: &str,
2066 blocking: bool,
2067 base_dir: Option<&Path>,
2068 fixture_base_dir: Option<&Path>,
2069 fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2070 rubrics_by_id: &BTreeMap<&str, &EvalPackRubric>,
2071) -> Result<EvalPackCaseReport, VmError> {
2072 let mut failures = Vec::new();
2073 let mut warnings = Vec::new();
2074 let mut informational = Vec::new();
2075 let events =
2076 load_eval_pack_case_friction_events(case, base_dir, fixture_base_dir, fixtures_by_id)?;
2077 let options = friction_suggestion_options(case, manifest);
2078 let suggestions = generate_context_pack_suggestions(&events, &options);
2079
2080 for rubric_id in &case.rubrics {
2081 let Some(rubric) = rubrics_by_id.get(rubric_id.as_str()) else {
2082 failures.push(format!("case references unknown rubric '{rubric_id}'"));
2083 continue;
2084 };
2085 apply_eval_pack_friction_rubric(rubric, &suggestions, &mut failures, &mut warnings);
2086 }
2087
2088 if case.rubrics.is_empty() && suggestions.is_empty() {
2089 failures.push("friction fixture produced no context-pack suggestions".to_string());
2090 }
2091
2092 let pass = failures.is_empty() || !blocking;
2093 if !failures.is_empty() && !blocking {
2094 if severity == "warning" {
2095 warnings.append(&mut failures);
2096 } else {
2097 informational.append(&mut failures);
2098 }
2099 }
2100
2101 Ok(EvalPackCaseReport {
2102 id: case_id.to_string(),
2103 label: label.to_string(),
2104 severity: severity.to_string(),
2105 pass,
2106 blocking,
2107 run_id: "friction_events".to_string(),
2108 workflow_id: String::new(),
2109 source_path: eval_pack_case_friction_source_path(
2110 case,
2111 base_dir,
2112 fixture_base_dir,
2113 fixtures_by_id,
2114 ),
2115 stage_count: events.len(),
2116 failures,
2117 warnings,
2118 informational,
2119 comparison: None,
2120 })
2121}
2122
2123fn eval_pack_case_severity(manifest: &EvalPackManifest, case: &EvalPackCase) -> String {
2124 normalize_eval_pack_severity(
2125 case.severity
2126 .as_deref()
2127 .or(case.thresholds.severity.as_deref())
2128 .or(manifest.defaults.severity.as_deref())
2129 .or(manifest.defaults.thresholds.severity.as_deref())
2130 .unwrap_or("blocking"),
2131 )
2132}
2133
/// Canonicalizes a severity label to "blocking", "warning", or
/// "informational". Input is trimmed and compared case-insensitively;
/// unrecognized values default to "blocking".
fn normalize_eval_pack_severity(value: &str) -> String {
    let canonical = value.trim().to_ascii_lowercase();
    let normalized = if canonical == "warn" || canonical == "warning" {
        "warning"
    } else if canonical == "info" || canonical == "informational" {
        "informational"
    } else {
        "blocking"
    };
    normalized.to_string()
}
2141
2142fn load_eval_pack_case_run(
2143 case: &EvalPackCase,
2144 base_dir: Option<&Path>,
2145 fixture_base_dir: Option<&Path>,
2146 fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2147) -> Result<RunRecord, VmError> {
2148 if let Some(run_ref) = case.run.as_deref().or(case.run_path.as_deref()) {
2149 if let Some(fixture) = fixtures_by_id.get(run_ref) {
2150 return load_run_record_from_fixture_ref(fixture, fixture_base_dir);
2151 }
2152 return load_run_record(&resolve_manifest_path(base_dir, run_ref));
2153 }
2154 Err(VmError::Runtime(
2155 "eval pack case is missing run or run_path".to_string(),
2156 ))
2157}
2158
2159fn load_eval_pack_case_fixture(
2160 case: &EvalPackCase,
2161 base_dir: Option<&Path>,
2162 fixture_base_dir: Option<&Path>,
2163 fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2164 run: &RunRecord,
2165) -> Result<ReplayFixture, VmError> {
2166 if let Some(fixture_ref) = case.fixture.as_deref().or(case.fixture_path.as_deref()) {
2167 if let Some(fixture) = fixtures_by_id.get(fixture_ref) {
2168 return load_replay_fixture_from_ref(fixture, fixture_base_dir);
2169 }
2170 return load_replay_fixture(&resolve_manifest_path(base_dir, fixture_ref));
2171 }
2172 Ok(run
2173 .replay_fixture
2174 .clone()
2175 .unwrap_or_else(|| replay_fixture_from_run(run)))
2176}
2177
2178fn eval_pack_case_source_path(
2179 case: &EvalPackCase,
2180 base_dir: Option<&Path>,
2181 fixture_base_dir: Option<&Path>,
2182 fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2183) -> Option<String> {
2184 let run_ref = case.run.as_deref().or(case.run_path.as_deref())?;
2185 if let Some(fixture) = fixtures_by_id.get(run_ref) {
2186 return fixture.path.as_ref().map(|path| {
2187 resolve_manifest_path(fixture_base_dir, path)
2188 .display()
2189 .to_string()
2190 });
2191 }
2192 Some(
2193 resolve_manifest_path(base_dir, run_ref)
2194 .display()
2195 .to_string(),
2196 )
2197}
2198
2199fn load_eval_pack_case_friction_events(
2200 case: &EvalPackCase,
2201 base_dir: Option<&Path>,
2202 fixture_base_dir: Option<&Path>,
2203 fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2204) -> Result<Vec<FrictionEvent>, VmError> {
2205 let event_ref = case.friction_events.as_deref().ok_or_else(|| {
2206 VmError::Runtime("eval pack friction case is missing friction_events".to_string())
2207 })?;
2208 if let Some(fixture) = fixtures_by_id.get(event_ref) {
2209 return load_friction_events_from_fixture_ref(fixture, fixture_base_dir);
2210 }
2211 load_friction_events_from_path(&resolve_manifest_path(base_dir, event_ref))
2212}
2213
2214fn load_friction_events_from_fixture_ref(
2215 fixture: &EvalPackFixtureRef,
2216 base_dir: Option<&Path>,
2217) -> Result<Vec<FrictionEvent>, VmError> {
2218 if let Some(inline) = &fixture.inline {
2219 return normalize_friction_events_json(inline.clone());
2220 }
2221 let path = fixture.path.as_deref().ok_or_else(|| {
2222 VmError::Runtime(format!(
2223 "fixture '{}' is missing path or inline friction events",
2224 fixture.id
2225 ))
2226 })?;
2227 load_friction_events_from_path(&resolve_manifest_path(base_dir, path))
2228}
2229
2230fn load_friction_events_from_path(path: &Path) -> Result<Vec<FrictionEvent>, VmError> {
2231 let content = std::fs::read_to_string(path)
2232 .map_err(|e| VmError::Runtime(format!("failed to read friction events fixture: {e}")))?;
2233 let value: serde_json::Value = serde_json::from_str(&content)
2234 .map_err(|e| VmError::Runtime(format!("failed to parse friction events fixture: {e}")))?;
2235 normalize_friction_events_json(value)
2236}
2237
2238fn eval_pack_case_friction_source_path(
2239 case: &EvalPackCase,
2240 base_dir: Option<&Path>,
2241 fixture_base_dir: Option<&Path>,
2242 fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2243) -> Option<String> {
2244 let event_ref = case.friction_events.as_deref()?;
2245 if let Some(fixture) = fixtures_by_id.get(event_ref) {
2246 return fixture.path.as_ref().map(|path| {
2247 resolve_manifest_path(fixture_base_dir, path)
2248 .display()
2249 .to_string()
2250 });
2251 }
2252 Some(
2253 resolve_manifest_path(base_dir, event_ref)
2254 .display()
2255 .to_string(),
2256 )
2257}
2258
2259fn friction_suggestion_options(
2260 case: &EvalPackCase,
2261 manifest: &EvalPackManifest,
2262) -> ContextPackSuggestionOptions {
2263 let min_occurrences = case
2264 .metadata
2265 .get("min_occurrences")
2266 .or_else(|| manifest.metadata.get("min_occurrences"))
2267 .and_then(|value| value.as_u64())
2268 .unwrap_or(2) as usize;
2269 let owner = case
2270 .metadata
2271 .get("owner")
2272 .or_else(|| manifest.metadata.get("owner"))
2273 .and_then(|value| value.as_str())
2274 .map(str::to_string)
2275 .or_else(|| {
2276 manifest
2277 .package
2278 .as_ref()
2279 .and_then(|package| package.name.clone())
2280 });
2281 ContextPackSuggestionOptions {
2282 min_occurrences,
2283 owner,
2284 }
2285}
2286
2287fn apply_eval_pack_thresholds(
2288 run: &RunRecord,
2289 thresholds: &EvalPackThresholds,
2290 failures: &mut Vec<String>,
2291) {
2292 if let Some(max_stage_count) = thresholds.max_stage_count {
2293 if run.stages.len() > max_stage_count {
2294 failures.push(format!(
2295 "stage count {} exceeds threshold {}",
2296 run.stages.len(),
2297 max_stage_count
2298 ));
2299 }
2300 }
2301 if let Some(max_latency_ms) = thresholds.max_latency_ms {
2302 let actual = run
2303 .usage
2304 .as_ref()
2305 .map(|usage| usage.total_duration_ms)
2306 .unwrap_or_default();
2307 if actual > max_latency_ms {
2308 failures.push(format!(
2309 "latency {actual}ms exceeds threshold {max_latency_ms}ms"
2310 ));
2311 }
2312 }
2313 if let Some(max_cost_usd) = thresholds.max_cost_usd {
2314 let actual = run
2315 .usage
2316 .as_ref()
2317 .map(|usage| usage.total_cost)
2318 .unwrap_or_default();
2319 if actual > max_cost_usd {
2320 failures.push(format!(
2321 "cost ${actual:.6} exceeds threshold ${max_cost_usd:.6}"
2322 ));
2323 }
2324 }
2325 if let Some(max_tokens) = thresholds.max_tokens {
2326 let actual = run
2327 .usage
2328 .as_ref()
2329 .map(|usage| usage.input_tokens + usage.output_tokens)
2330 .unwrap_or_default();
2331 if actual > max_tokens {
2332 failures.push(format!(
2333 "token count {actual} exceeds threshold {max_tokens}"
2334 ));
2335 }
2336 }
2337}
2338
2339fn apply_eval_pack_rubric(
2340 rubric: &EvalPackRubric,
2341 run: &RunRecord,
2342 failures: &mut Vec<String>,
2343 warnings: &mut Vec<String>,
2344) {
2345 match rubric.kind.as_str() {
2346 "" | "deterministic" | "replay" | "budget" | "hitl" | "side-effect" => {
2347 apply_eval_pack_thresholds(run, &rubric.thresholds, failures);
2348 for assertion in &rubric.assertions {
2349 apply_eval_pack_assertion(rubric, assertion, run, failures);
2350 }
2351 }
2352 "llm-judge" | "llm_as_judge" | "judge" => {
2353 let severity = normalize_eval_pack_severity(
2354 rubric.thresholds.severity.as_deref().unwrap_or("blocking"),
2355 );
2356 let message = format!(
2357 "rubric '{}' requires an external LLM judge and was not run locally",
2358 rubric.id
2359 );
2360 if severity == "blocking" {
2361 failures.push(message);
2362 } else {
2363 warnings.push(message);
2364 }
2365 }
2366 other => warnings.push(format!(
2367 "rubric '{}' has unknown kind '{}' and was not run locally",
2368 rubric.id, other
2369 )),
2370 }
2371}
2372
2373fn apply_eval_pack_friction_rubric(
2374 rubric: &EvalPackRubric,
2375 suggestions: &[super::ContextPackSuggestion],
2376 failures: &mut Vec<String>,
2377 warnings: &mut Vec<String>,
2378) {
2379 match rubric.kind.as_str() {
2380 "" | "deterministic" | "friction" | "context-pack-suggestion" => {
2381 let mut expectations = Vec::new();
2382 for assertion in &rubric.assertions {
2383 match assertion.kind.as_str() {
2384 "context-pack-suggestion" | "context_pack_suggestion" | "suggestion" => {
2385 let expectation = context_pack_expectation_from_assertion(assertion);
2386 expectations.push(expectation);
2387 }
2388 other => failures.push(format!(
2389 "rubric '{}' has unsupported friction assertion kind '{}'",
2390 rubric.id, other
2391 )),
2392 }
2393 }
2394 failures.extend(evaluate_context_pack_suggestion_expectations(
2395 suggestions,
2396 &expectations,
2397 ));
2398 }
2399 other => warnings.push(format!(
2400 "rubric '{}' has unknown friction kind '{}' and was not run locally",
2401 rubric.id, other
2402 )),
2403 }
2404}
2405
2406fn context_pack_expectation_from_assertion(
2407 assertion: &EvalPackAssertion,
2408) -> ContextPackSuggestionExpectation {
2409 let expected = assertion
2410 .expected
2411 .as_ref()
2412 .and_then(|value| value.as_object());
2413 let expected_string = assertion.expected.as_ref().and_then(|value| value.as_str());
2414 ContextPackSuggestionExpectation {
2415 min_suggestions: expected
2416 .and_then(|map| map.get("min_suggestions"))
2417 .and_then(|value| value.as_u64())
2418 .map(|value| value as usize),
2419 recommended_artifact: expected
2420 .and_then(|map| map.get("recommended_artifact"))
2421 .and_then(|value| value.as_str())
2422 .map(str::to_string)
2423 .or_else(|| expected_string.map(str::to_string)),
2424 title_contains: assertion.contains.clone().or_else(|| {
2425 expected
2426 .and_then(|map| map.get("title_contains"))
2427 .and_then(|value| value.as_str())
2428 .map(str::to_string)
2429 }),
2430 manifest_name_contains: expected
2431 .and_then(|map| map.get("manifest_name_contains"))
2432 .and_then(|value| value.as_str())
2433 .map(str::to_string),
2434 required_capability: expected
2435 .and_then(|map| map.get("required_capability"))
2436 .and_then(|value| value.as_str())
2437 .map(str::to_string),
2438 required_output_slot: expected
2439 .and_then(|map| map.get("required_output_slot"))
2440 .and_then(|value| value.as_str())
2441 .map(str::to_string),
2442 }
2443}
2444
/// Evaluates a single deterministic assertion against a run, appending a
/// human-readable message to `failures` for every violated expectation.
///
/// Supported assertion kinds (hyphen and underscore spellings accepted):
/// run status, per-stage status, visible-text substring (optionally scoped to
/// one stage), and HITL-question substring. An empty kind is a no-op; any
/// other kind is reported as unsupported.
fn apply_eval_pack_assertion(
    rubric: &EvalPackRubric,
    assertion: &EvalPackAssertion,
    run: &RunRecord,
    failures: &mut Vec<String>,
) {
    match assertion.kind.as_str() {
        "run-status" | "run_status" | "status" => {
            // Only string-typed `expected` values are checked; a missing or
            // non-string expectation silently passes.
            let expected = assertion.expected.as_ref().and_then(|value| value.as_str());
            if let Some(expected) = expected {
                if run.status != expected {
                    failures.push(format!(
                        "rubric '{}' expected run status {}, got {}",
                        rubric.id, expected, run.status
                    ));
                }
            }
        }
        "stage-status" | "stage_status" => {
            // Both a stage id and a string expectation are mandatory here.
            let Some(stage_id) = assertion.stage.as_deref() else {
                failures.push(format!(
                    "rubric '{}' stage-status assertion missing stage",
                    rubric.id
                ));
                return;
            };
            let expected = assertion.expected.as_ref().and_then(|value| value.as_str());
            let Some(expected) = expected else {
                failures.push(format!(
                    "rubric '{}' stage-status assertion missing expected string",
                    rubric.id
                ));
                return;
            };
            // First stage with a matching node_id wins; a missing stage is
            // itself a failure, distinct from a status mismatch.
            match run.stages.iter().find(|stage| stage.node_id == stage_id) {
                Some(stage) if stage.status == expected => {}
                Some(stage) => failures.push(format!(
                    "rubric '{}' expected stage {} status {}, got {}",
                    rubric.id, stage_id, expected, stage.status
                )),
                None => failures.push(format!(
                    "rubric '{}' expected stage {} to exist",
                    rubric.id, stage_id
                )),
            }
        }
        "visible-text-contains" | "visible_text_contains" => {
            let Some(needle) = assertion.contains.as_deref() else {
                failures.push(format!(
                    "rubric '{}' visible-text assertion missing contains",
                    rubric.id
                ));
                return;
            };
            // With a stage id, only that stage's visible text is searched;
            // otherwise any stage's visible text may satisfy the assertion.
            let matched = match assertion.stage.as_deref() {
                Some(stage_id) => run
                    .stages
                    .iter()
                    .find(|stage| stage.node_id == stage_id)
                    .and_then(|stage| stage.visible_text.as_deref())
                    .is_some_and(|text| text.contains(needle)),
                None => run
                    .stages
                    .iter()
                    .filter_map(|stage| stage.visible_text.as_deref())
                    .any(|text| text.contains(needle)),
            };
            if !matched {
                failures.push(format!(
                    "rubric '{}' expected visible text to contain {:?}",
                    rubric.id, needle
                ));
            }
        }
        "hitl-question-contains" | "hitl_question_contains" => {
            let Some(needle) = assertion.contains.as_deref() else {
                failures.push(format!(
                    "rubric '{}' HITL assertion missing contains",
                    rubric.id
                ));
                return;
            };
            // Any recorded HITL prompt containing the needle satisfies this.
            if !run
                .hitl_questions
                .iter()
                .any(|question| question.prompt.contains(needle))
            {
                failures.push(format!(
                    "rubric '{}' expected HITL question to contain {:?}",
                    rubric.id, needle
                ));
            }
        }
        // An unset kind is deliberately a no-op (not an error).
        "" => {}
        other => failures.push(format!(
            "rubric '{}' has unsupported assertion kind '{}'",
            rubric.id, other
        )),
    }
}
2545
/// Edit operations emitted by [`myers_diff`]. The index paired with each op
/// refers to the left sequence for `Equal`/`Delete` and to the right sequence
/// for `Insert`.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub(crate) enum DiffOp {
    Equal,
    Delete,
    Insert,
}

/// Computes a shortest edit script from `a` to `b` with Myers' O((N+M)D)
/// greedy algorithm: a forward pass records the furthest-reaching frontier
/// per edit distance, then a backtracking pass replays those snapshots from
/// (n, m) down to (0, 0) to emit `(op, index)` pairs in order.
pub(crate) fn myers_diff(a: &[&str], b: &[&str]) -> Vec<(DiffOp, usize)> {
    let n = a.len() as isize;
    let m = b.len() as isize;
    // Degenerate inputs need no search at all.
    if n == 0 && m == 0 {
        return Vec::new();
    }
    if n == 0 {
        return (0..b.len()).map(|j| (DiffOp::Insert, j)).collect();
    }
    if m == 0 {
        return (0..a.len()).map(|i| (DiffOp::Delete, i)).collect();
    }

    let max_d = (n + m) as usize;
    let offset = max_d as isize;
    // frontier[k + offset] = furthest x reached on diagonal k so far.
    let mut frontier = vec![0isize; 2 * max_d + 1];
    // One frontier snapshot per explored edit distance, for backtracking.
    let mut snapshots: Vec<Vec<isize>> = Vec::new();

    'search: for d in 0..=max_d as isize {
        snapshots.push(frontier.clone());
        let mut next = frontier.clone();
        for k in (-d..=d).step_by(2) {
            let slot = (k + offset) as usize;
            // Step down (insert) from k+1 or right (delete) from k-1,
            // whichever neighbour diagonal reaches further.
            let mut x = if k == -d || (k != d && frontier[slot - 1] < frontier[slot + 1]) {
                frontier[slot + 1]
            } else {
                frontier[slot - 1] + 1
            };
            let mut y = x - k;
            // Follow the "snake" of matching lines.
            while x < n && y < m && a[x as usize] == b[y as usize] {
                x += 1;
                y += 1;
            }
            next[slot] = x;
            if x >= n && y >= m {
                break 'search;
            }
        }
        frontier = next;
    }

    // Backtrack from the end, reconstructing the path edit by edit.
    let mut script: Vec<(DiffOp, usize)> = Vec::new();
    let mut x = n;
    let mut y = m;
    for d in (1..snapshots.len() as isize).rev() {
        let k = x - y;
        let prev = &snapshots[d as usize];
        // Mirror the forward-pass neighbour choice to find the predecessor.
        let prev_k = if k == -d
            || (k != d && prev[(k - 1 + offset) as usize] < prev[(k + 1 + offset) as usize])
        {
            k + 1
        } else {
            k - 1
        };
        let prev_x = prev[(prev_k + offset) as usize];
        let prev_y = prev_x - prev_k;

        while x > prev_x && y > prev_y {
            x -= 1;
            y -= 1;
            script.push((DiffOp::Equal, x as usize));
        }
        if prev_k < k {
            x -= 1;
            script.push((DiffOp::Delete, x as usize));
        } else {
            y -= 1;
            script.push((DiffOp::Insert, y as usize));
        }
    }
    // Whatever remains is the leading run of equal lines from the d=0 snake.
    while x > 0 && y > 0 {
        x -= 1;
        y -= 1;
        script.push((DiffOp::Equal, x as usize));
    }
    script.reverse();
    script
}

/// Renders a simplified unified diff: `---`/`+++` file headers followed by
/// context/removed/added lines. Note that no `@@` hunk headers are emitted,
/// so the output is a single flat hunk rather than a strict unified diff.
pub fn render_unified_diff(path: Option<&str>, before: &str, after: &str) -> String {
    let before_lines: Vec<&str> = before.lines().collect();
    let after_lines: Vec<&str> = after.lines().collect();
    let file = path.unwrap_or("artifact");

    let mut diff = format!("--- a/{file}\n+++ b/{file}\n");
    for (op, idx) in myers_diff(&before_lines, &after_lines) {
        let (marker, line) = match op {
            DiffOp::Equal => (' ', before_lines[idx]),
            DiffOp::Delete => ('-', before_lines[idx]),
            DiffOp::Insert => ('+', after_lines[idx]),
        };
        diff.push(marker);
        diff.push_str(line);
        diff.push('\n');
    }
    diff
}
2656
/// Persists a run record as pretty-printed JSON and returns the final path.
///
/// When `path` is `None`, the record is written to
/// `default_run_dir()/<run_id>.json`. The caller's record is not mutated; a
/// materialized clone is enriched (HITL questions from the active log, child
/// runs from stage metadata, a synthesized replay fixture when absent,
/// handoffs, observability) before serialization. The write goes through a
/// `.json.tmp` sibling followed by a rename for atomicity on the happy path.
pub fn save_run_record(run: &RunRecord, path: Option<&str>) -> Result<String, VmError> {
    let path = path
        .map(PathBuf::from)
        .unwrap_or_else(|| default_run_dir().join(format!("{}.json", run.id)));
    // Enrich a clone so the caller's record stays untouched.
    let mut materialized = run.clone();
    merge_hitl_questions_from_active_log(&mut materialized);
    materialize_child_runs_from_stage_metadata(&mut materialized);
    if materialized.replay_fixture.is_none() {
        materialized.replay_fixture = Some(replay_fixture_from_run(&materialized));
    }
    // Record where this run will live so later loads can find it.
    materialized.persisted_path = Some(path.to_string_lossy().into_owned());
    sync_run_handoffs(&mut materialized);
    refresh_run_observability(&mut materialized, Some(&path));
    if let Some(parent) = path.parent() {
        std::fs::create_dir_all(parent)
            .map_err(|e| VmError::Runtime(format!("failed to create run directory: {e}")))?;
    }
    let json = serde_json::to_string_pretty(&materialized)
        .map_err(|e| VmError::Runtime(format!("failed to encode run record: {e}")))?;
    // Write-then-rename so readers never observe a half-written record.
    let tmp_path = path.with_extension("json.tmp");
    std::fs::write(&tmp_path, &json)
        .map_err(|e| VmError::Runtime(format!("failed to persist run record: {e}")))?;
    std::fs::rename(&tmp_path, &path).map_err(|e| {
        // Best-effort direct write if the rename fails (e.g. cross-device);
        // the error is still surfaced to the caller. NOTE(review): the tmp
        // file is left behind on this path — confirm that is intentional.
        let _ = std::fs::write(&path, &json);
        VmError::Runtime(format!("failed to finalize run record: {e}"))
    })?;
    if let Some(observability) = materialized.observability.as_ref() {
        publish_action_graph_event(&materialized, observability, &path);
    }
    Ok(path.to_string_lossy().into_owned())
}
2690
2691pub fn load_run_record(path: &Path) -> Result<RunRecord, VmError> {
2692 let content = std::fs::read_to_string(path)
2693 .map_err(|e| VmError::Runtime(format!("failed to read run record: {e}")))?;
2694 let mut run: RunRecord = serde_json::from_str(&content)
2695 .map_err(|e| VmError::Runtime(format!("failed to parse run record: {e}")))?;
2696 materialize_child_runs_from_stage_metadata(&mut run);
2697 if run.replay_fixture.is_none() {
2698 run.replay_fixture = Some(replay_fixture_from_run(&run));
2699 }
2700 run.persisted_path
2701 .get_or_insert_with(|| path.to_string_lossy().into_owned());
2702 sync_run_handoffs(&mut run);
2703 refresh_run_observability(&mut run, Some(path));
2704 Ok(run)
2705}
2706
2707pub fn replay_fixture_from_run(run: &RunRecord) -> ReplayFixture {
2708 ReplayFixture {
2709 type_name: "replay_fixture".to_string(),
2710 id: new_id("fixture"),
2711 source_run_id: run.id.clone(),
2712 workflow_id: run.workflow_id.clone(),
2713 workflow_name: run.workflow_name.clone(),
2714 created_at: now_rfc3339(),
2715 eval_kind: Some("replay".to_string()),
2716 clarifying_question: None,
2717 expected_status: run.status.clone(),
2718 stage_assertions: run
2719 .stages
2720 .iter()
2721 .map(|stage| ReplayStageAssertion {
2722 node_id: stage.node_id.clone(),
2723 expected_status: stage.status.clone(),
2724 expected_outcome: stage.outcome.clone(),
2725 expected_branch: stage.branch.clone(),
2726 required_artifact_kinds: stage
2727 .artifacts
2728 .iter()
2729 .map(|artifact| artifact.kind.clone())
2730 .collect(),
2731 visible_text_contains: stage
2732 .visible_text
2733 .as_ref()
2734 .filter(|text| !text.is_empty())
2735 .map(|text| text.chars().take(80).collect()),
2736 })
2737 .collect(),
2738 }
2739}
2740
2741pub fn evaluate_run_against_fixture(run: &RunRecord, fixture: &ReplayFixture) -> ReplayEvalReport {
2742 if fixture.eval_kind.as_deref() == Some("clarifying_question") {
2743 return evaluate_clarifying_question(run, fixture);
2744 }
2745 let mut failures = Vec::new();
2746 if run.status != fixture.expected_status {
2747 failures.push(format!(
2748 "run status mismatch: expected {}, got {}",
2749 fixture.expected_status, run.status
2750 ));
2751 }
2752 let stages_by_id: BTreeMap<&str, &RunStageRecord> =
2753 run.stages.iter().map(|s| (s.node_id.as_str(), s)).collect();
2754 for assertion in &fixture.stage_assertions {
2755 let Some(stage) = stages_by_id.get(assertion.node_id.as_str()) else {
2756 failures.push(format!("missing stage {}", assertion.node_id));
2757 continue;
2758 };
2759 if stage.status != assertion.expected_status {
2760 failures.push(format!(
2761 "stage {} status mismatch: expected {}, got {}",
2762 assertion.node_id, assertion.expected_status, stage.status
2763 ));
2764 }
2765 if stage.outcome != assertion.expected_outcome {
2766 failures.push(format!(
2767 "stage {} outcome mismatch: expected {}, got {}",
2768 assertion.node_id, assertion.expected_outcome, stage.outcome
2769 ));
2770 }
2771 if stage.branch != assertion.expected_branch {
2772 failures.push(format!(
2773 "stage {} branch mismatch: expected {:?}, got {:?}",
2774 assertion.node_id, assertion.expected_branch, stage.branch
2775 ));
2776 }
2777 for required_kind in &assertion.required_artifact_kinds {
2778 if !stage
2779 .artifacts
2780 .iter()
2781 .any(|artifact| &artifact.kind == required_kind)
2782 {
2783 failures.push(format!(
2784 "stage {} missing artifact kind {}",
2785 assertion.node_id, required_kind
2786 ));
2787 }
2788 }
2789 if let Some(snippet) = &assertion.visible_text_contains {
2790 let actual = stage.visible_text.clone().unwrap_or_default();
2791 if !actual.contains(snippet) {
2792 failures.push(format!(
2793 "stage {} visible text does not contain expected snippet {:?}",
2794 assertion.node_id, snippet
2795 ));
2796 }
2797 }
2798 }
2799
2800 ReplayEvalReport {
2801 pass: failures.is_empty(),
2802 failures,
2803 stage_count: run.stages.len(),
2804 }
2805}
2806
/// Evaluates a clarifying-question fixture against a run's HITL questions:
/// checks run status, question-count bounds, and — when the spec constrains
/// content — that at least one question satisfies every content rule.
fn evaluate_clarifying_question(run: &RunRecord, fixture: &ReplayFixture) -> ReplayEvalReport {
    let mut failures = Vec::new();
    // A fixture without a spec falls back to the spec type's defaults.
    let spec = fixture.clarifying_question.clone().unwrap_or_default();
    let min_questions = clarifying_min_questions(&spec);
    let max_questions = clarifying_max_questions(&spec);
    let questions = &run.hitl_questions;

    if run.status != fixture.expected_status {
        failures.push(format!(
            "run status mismatch: expected {}, got {}",
            fixture.expected_status, run.status
        ));
    }
    // Count bounds are checked independently of content matching.
    if questions.len() < min_questions {
        failures.push(format!(
            "expected at least {min_questions} clarifying question(s), got {}",
            questions.len()
        ));
    }
    if questions.len() > max_questions {
        failures.push(format!(
            "expected at most {max_questions} clarifying question(s), got {}",
            questions.len()
        ));
    }

    // Normalize spec text once up front so each question is compared against
    // the same canonical form.
    let normalized_expected = spec
        .expected_question
        .as_deref()
        .map(normalize_question_text);
    let normalized_accepted = spec
        .accepted_questions
        .iter()
        .map(|question| normalize_question_text(question))
        .collect::<Vec<_>>();
    let required_terms = spec
        .required_terms
        .iter()
        .map(|term| normalize_question_text(term))
        .collect::<Vec<_>>();
    let forbidden_terms = spec
        .forbidden_terms
        .iter()
        .map(|term| normalize_question_text(term))
        .collect::<Vec<_>>();

    // A question matches when it (a) equals the expected question if one is
    // set, (b) appears in the accepted list if that list is non-empty,
    // (c) contains every required term, and (d) contains no forbidden term.
    // Unset constraints are vacuously satisfied (`is_none_or` / empty lists).
    let matched = questions.iter().any(|question| {
        let normalized = normalize_question_text(&question.prompt);
        let matches_expected = normalized_expected
            .as_ref()
            .is_none_or(|expected| &normalized == expected)
            && (normalized_accepted.is_empty()
                || normalized_accepted
                    .iter()
                    .any(|candidate| candidate == &normalized));
        let has_required_terms = required_terms
            .iter()
            .all(|term| normalized.contains(term.as_str()));
        let avoids_forbidden_terms = forbidden_terms
            .iter()
            .all(|term| !normalized.contains(term.as_str()));
        matches_expected && has_required_terms && avoids_forbidden_terms
    });

    // Content matching only fails when questions exist AND the spec actually
    // constrains content; a fully unconstrained spec never fails here.
    if !questions.is_empty()
        && (!normalized_accepted.is_empty()
            || normalized_expected.is_some()
            || !required_terms.is_empty()
            || !forbidden_terms.is_empty())
        && !matched
    {
        failures.push(format!(
            "no clarifying question matched fixture; actual questions: {}",
            questions
                .iter()
                .map(|question| format!("{:?}", question.prompt))
                .collect::<Vec<_>>()
                .join(", ")
        ));
    }

    ReplayEvalReport {
        pass: failures.is_empty(),
        failures,
        stage_count: run.stages.len(),
    }
}
2894
2895pub fn evaluate_run_suite(
2896 cases: Vec<(RunRecord, ReplayFixture, Option<String>)>,
2897) -> ReplayEvalSuiteReport {
2898 let mut reports = Vec::new();
2899 for (run, fixture, source_path) in cases {
2900 let report = evaluate_run_against_fixture(&run, &fixture);
2901 reports.push(ReplayEvalCaseReport {
2902 run_id: run.id.clone(),
2903 workflow_id: run.workflow_id.clone(),
2904 label: None,
2905 pass: report.pass,
2906 failures: report.failures,
2907 stage_count: report.stage_count,
2908 source_path,
2909 comparison: None,
2910 });
2911 }
2912 let total = reports.len();
2913 let passed = reports.iter().filter(|report| report.pass).count();
2914 let failed = total.saturating_sub(passed);
2915 ReplayEvalSuiteReport {
2916 pass: failed == 0,
2917 total,
2918 passed,
2919 failed,
2920 cases: reports,
2921 }
2922}
2923
2924pub fn diff_run_records(left: &RunRecord, right: &RunRecord) -> RunDiffReport {
2925 let mut stage_diffs = Vec::new();
2926 let mut all_node_ids = BTreeSet::new();
2927 let left_by_id: BTreeMap<&str, &RunStageRecord> = left
2928 .stages
2929 .iter()
2930 .map(|s| (s.node_id.as_str(), s))
2931 .collect();
2932 let right_by_id: BTreeMap<&str, &RunStageRecord> = right
2933 .stages
2934 .iter()
2935 .map(|s| (s.node_id.as_str(), s))
2936 .collect();
2937 all_node_ids.extend(left_by_id.keys().copied());
2938 all_node_ids.extend(right_by_id.keys().copied());
2939
2940 for node_id in all_node_ids {
2941 let left_stage = left_by_id.get(node_id).copied();
2942 let right_stage = right_by_id.get(node_id).copied();
2943 match (left_stage, right_stage) {
2944 (Some(_), None) => stage_diffs.push(RunStageDiffRecord {
2945 node_id: node_id.to_string(),
2946 change: "removed".to_string(),
2947 details: vec!["stage missing from right run".to_string()],
2948 }),
2949 (None, Some(_)) => stage_diffs.push(RunStageDiffRecord {
2950 node_id: node_id.to_string(),
2951 change: "added".to_string(),
2952 details: vec!["stage missing from left run".to_string()],
2953 }),
2954 (Some(left_stage), Some(right_stage)) => {
2955 let mut details = Vec::new();
2956 if left_stage.status != right_stage.status {
2957 details.push(format!(
2958 "status: {} -> {}",
2959 left_stage.status, right_stage.status
2960 ));
2961 }
2962 if left_stage.outcome != right_stage.outcome {
2963 details.push(format!(
2964 "outcome: {} -> {}",
2965 left_stage.outcome, right_stage.outcome
2966 ));
2967 }
2968 if left_stage.branch != right_stage.branch {
2969 details.push(format!(
2970 "branch: {:?} -> {:?}",
2971 left_stage.branch, right_stage.branch
2972 ));
2973 }
2974 if left_stage.produced_artifact_ids.len() != right_stage.produced_artifact_ids.len()
2975 {
2976 details.push(format!(
2977 "produced_artifacts: {} -> {}",
2978 left_stage.produced_artifact_ids.len(),
2979 right_stage.produced_artifact_ids.len()
2980 ));
2981 }
2982 if left_stage.artifacts.len() != right_stage.artifacts.len() {
2983 details.push(format!(
2984 "artifact_records: {} -> {}",
2985 left_stage.artifacts.len(),
2986 right_stage.artifacts.len()
2987 ));
2988 }
2989 if !details.is_empty() {
2990 stage_diffs.push(RunStageDiffRecord {
2991 node_id: node_id.to_string(),
2992 change: "changed".to_string(),
2993 details,
2994 });
2995 }
2996 }
2997 (None, None) => {}
2998 }
2999 }
3000
3001 let mut tool_diffs = Vec::new();
3002 let left_tools: std::collections::BTreeMap<(String, String), &ToolCallRecord> = left
3003 .tool_recordings
3004 .iter()
3005 .map(|r| ((r.tool_name.clone(), r.args_hash.clone()), r))
3006 .collect();
3007 let right_tools: std::collections::BTreeMap<(String, String), &ToolCallRecord> = right
3008 .tool_recordings
3009 .iter()
3010 .map(|r| ((r.tool_name.clone(), r.args_hash.clone()), r))
3011 .collect();
3012 let all_tool_keys: std::collections::BTreeSet<_> = left_tools
3013 .keys()
3014 .chain(right_tools.keys())
3015 .cloned()
3016 .collect();
3017 for key in &all_tool_keys {
3018 let l = left_tools.get(key);
3019 let r = right_tools.get(key);
3020 let result_changed = match (l, r) {
3021 (Some(a), Some(b)) => a.result != b.result,
3022 _ => true,
3023 };
3024 if result_changed {
3025 tool_diffs.push(ToolCallDiffRecord {
3026 tool_name: key.0.clone(),
3027 args_hash: key.1.clone(),
3028 result_changed,
3029 left_result: l.map(|t| t.result.clone()),
3030 right_result: r.map(|t| t.result.clone()),
3031 });
3032 }
3033 }
3034
3035 let left_observability = left.observability.clone().unwrap_or_else(|| {
3036 derive_run_observability(left, left.persisted_path.as_deref().map(Path::new))
3037 });
3038 let right_observability = right.observability.clone().unwrap_or_else(|| {
3039 derive_run_observability(right, right.persisted_path.as_deref().map(Path::new))
3040 });
3041 let mut observability_diffs = Vec::new();
3042
3043 let left_workers = left_observability
3044 .worker_lineage
3045 .iter()
3046 .map(|worker| {
3047 (
3048 worker.worker_id.clone(),
3049 (
3050 worker.status.clone(),
3051 worker.run_id.clone(),
3052 worker.run_path.clone(),
3053 ),
3054 )
3055 })
3056 .collect::<BTreeMap<_, _>>();
3057 let right_workers = right_observability
3058 .worker_lineage
3059 .iter()
3060 .map(|worker| {
3061 (
3062 worker.worker_id.clone(),
3063 (
3064 worker.status.clone(),
3065 worker.run_id.clone(),
3066 worker.run_path.clone(),
3067 ),
3068 )
3069 })
3070 .collect::<BTreeMap<_, _>>();
3071 let worker_ids = left_workers
3072 .keys()
3073 .chain(right_workers.keys())
3074 .cloned()
3075 .collect::<BTreeSet<_>>();
3076 for worker_id in worker_ids {
3077 match (left_workers.get(&worker_id), right_workers.get(&worker_id)) {
3078 (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3079 section: "worker_lineage".to_string(),
3080 label: worker_id,
3081 details: vec!["worker missing from right run".to_string()],
3082 }),
3083 (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3084 section: "worker_lineage".to_string(),
3085 label: worker_id,
3086 details: vec!["worker missing from left run".to_string()],
3087 }),
3088 (Some(left_worker), Some(right_worker)) if left_worker != right_worker => {
3089 let mut details = Vec::new();
3090 if left_worker.0 != right_worker.0 {
3091 details.push(format!("status: {} -> {}", left_worker.0, right_worker.0));
3092 }
3093 if left_worker.1 != right_worker.1 {
3094 details.push(format!(
3095 "run_id: {:?} -> {:?}",
3096 left_worker.1, right_worker.1
3097 ));
3098 }
3099 if left_worker.2 != right_worker.2 {
3100 details.push(format!(
3101 "run_path: {:?} -> {:?}",
3102 left_worker.2, right_worker.2
3103 ));
3104 }
3105 observability_diffs.push(RunObservabilityDiffRecord {
3106 section: "worker_lineage".to_string(),
3107 label: worker_id,
3108 details,
3109 });
3110 }
3111 _ => {}
3112 }
3113 }
3114
3115 let left_rounds = left_observability
3116 .planner_rounds
3117 .iter()
3118 .map(|round| (round.stage_id.clone(), round))
3119 .collect::<BTreeMap<_, _>>();
3120 let right_rounds = right_observability
3121 .planner_rounds
3122 .iter()
3123 .map(|round| (round.stage_id.clone(), round))
3124 .collect::<BTreeMap<_, _>>();
3125 let round_ids = left_rounds
3126 .keys()
3127 .chain(right_rounds.keys())
3128 .cloned()
3129 .collect::<BTreeSet<_>>();
3130 for stage_id in round_ids {
3131 match (left_rounds.get(&stage_id), right_rounds.get(&stage_id)) {
3132 (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3133 section: "planner_rounds".to_string(),
3134 label: stage_id,
3135 details: vec!["planner summary missing from right run".to_string()],
3136 }),
3137 (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3138 section: "planner_rounds".to_string(),
3139 label: stage_id,
3140 details: vec!["planner summary missing from left run".to_string()],
3141 }),
3142 (Some(left_round), Some(right_round)) => {
3143 let mut details = Vec::new();
3144 if left_round.iteration_count != right_round.iteration_count {
3145 details.push(format!(
3146 "iterations: {} -> {}",
3147 left_round.iteration_count, right_round.iteration_count
3148 ));
3149 }
3150 if left_round.tool_execution_count != right_round.tool_execution_count {
3151 details.push(format!(
3152 "tool_executions: {} -> {}",
3153 left_round.tool_execution_count, right_round.tool_execution_count
3154 ));
3155 }
3156 if left_round.native_text_tool_fallback_count
3157 != right_round.native_text_tool_fallback_count
3158 {
3159 details.push(format!(
3160 "native_text_tool_fallbacks: {} -> {}",
3161 left_round.native_text_tool_fallback_count,
3162 right_round.native_text_tool_fallback_count
3163 ));
3164 }
3165 if left_round.native_text_tool_fallback_rejection_count
3166 != right_round.native_text_tool_fallback_rejection_count
3167 {
3168 details.push(format!(
3169 "native_text_tool_fallback_rejections: {} -> {}",
3170 left_round.native_text_tool_fallback_rejection_count,
3171 right_round.native_text_tool_fallback_rejection_count
3172 ));
3173 }
3174 if left_round.empty_completion_retry_count
3175 != right_round.empty_completion_retry_count
3176 {
3177 details.push(format!(
3178 "empty_completion_retries: {} -> {}",
3179 left_round.empty_completion_retry_count,
3180 right_round.empty_completion_retry_count
3181 ));
3182 }
3183 if left_round.research_facts != right_round.research_facts {
3184 details.push(format!(
3185 "research_facts: {:?} -> {:?}",
3186 left_round.research_facts, right_round.research_facts
3187 ));
3188 }
3189 let left_deliverables = left_round
3190 .task_ledger
3191 .as_ref()
3192 .map(|ledger| {
3193 ledger
3194 .deliverables
3195 .iter()
3196 .map(|item| format!("{}:{}", item.id, item.status))
3197 .collect::<Vec<_>>()
3198 })
3199 .unwrap_or_default();
3200 let right_deliverables = right_round
3201 .task_ledger
3202 .as_ref()
3203 .map(|ledger| {
3204 ledger
3205 .deliverables
3206 .iter()
3207 .map(|item| format!("{}:{}", item.id, item.status))
3208 .collect::<Vec<_>>()
3209 })
3210 .unwrap_or_default();
3211 if left_deliverables != right_deliverables {
3212 details.push(format!(
3213 "deliverables: {:?} -> {:?}",
3214 left_deliverables, right_deliverables
3215 ));
3216 }
3217 if left_round.successful_tools != right_round.successful_tools {
3218 details.push(format!(
3219 "successful_tools: {:?} -> {:?}",
3220 left_round.successful_tools, right_round.successful_tools
3221 ));
3222 }
3223 if !details.is_empty() {
3224 observability_diffs.push(RunObservabilityDiffRecord {
3225 section: "planner_rounds".to_string(),
3226 label: left_round.node_id.clone(),
3227 details,
3228 });
3229 }
3230 }
3231 _ => {}
3232 }
3233 }
3234
3235 let left_pointers = left_observability
3236 .transcript_pointers
3237 .iter()
3238 .map(|pointer| {
3239 (
3240 pointer.id.clone(),
3241 (
3242 pointer.available,
3243 pointer.path.clone(),
3244 pointer.location.clone(),
3245 ),
3246 )
3247 })
3248 .collect::<BTreeMap<_, _>>();
3249 let right_pointers = right_observability
3250 .transcript_pointers
3251 .iter()
3252 .map(|pointer| {
3253 (
3254 pointer.id.clone(),
3255 (
3256 pointer.available,
3257 pointer.path.clone(),
3258 pointer.location.clone(),
3259 ),
3260 )
3261 })
3262 .collect::<BTreeMap<_, _>>();
3263 let pointer_ids = left_pointers
3264 .keys()
3265 .chain(right_pointers.keys())
3266 .cloned()
3267 .collect::<BTreeSet<_>>();
3268 for pointer_id in pointer_ids {
3269 match (
3270 left_pointers.get(&pointer_id),
3271 right_pointers.get(&pointer_id),
3272 ) {
3273 (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3274 section: "transcript_pointers".to_string(),
3275 label: pointer_id,
3276 details: vec!["pointer missing from right run".to_string()],
3277 }),
3278 (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3279 section: "transcript_pointers".to_string(),
3280 label: pointer_id,
3281 details: vec!["pointer missing from left run".to_string()],
3282 }),
3283 (Some(left_pointer), Some(right_pointer)) if left_pointer != right_pointer => {
3284 observability_diffs.push(RunObservabilityDiffRecord {
3285 section: "transcript_pointers".to_string(),
3286 label: pointer_id,
3287 details: vec![format!(
3288 "pointer: {:?} -> {:?}",
3289 left_pointer, right_pointer
3290 )],
3291 });
3292 }
3293 _ => {}
3294 }
3295 }
3296
3297 let left_compactions = left_observability
3298 .compaction_events
3299 .iter()
3300 .map(|event| {
3301 (
3302 event.id.clone(),
3303 (
3304 event.strategy.clone(),
3305 event.archived_messages,
3306 event.snapshot_asset_id.clone(),
3307 event.available,
3308 ),
3309 )
3310 })
3311 .collect::<BTreeMap<_, _>>();
3312 let right_compactions = right_observability
3313 .compaction_events
3314 .iter()
3315 .map(|event| {
3316 (
3317 event.id.clone(),
3318 (
3319 event.strategy.clone(),
3320 event.archived_messages,
3321 event.snapshot_asset_id.clone(),
3322 event.available,
3323 ),
3324 )
3325 })
3326 .collect::<BTreeMap<_, _>>();
3327 let compaction_ids = left_compactions
3328 .keys()
3329 .chain(right_compactions.keys())
3330 .cloned()
3331 .collect::<BTreeSet<_>>();
3332 for compaction_id in compaction_ids {
3333 match (
3334 left_compactions.get(&compaction_id),
3335 right_compactions.get(&compaction_id),
3336 ) {
3337 (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3338 section: "compaction_events".to_string(),
3339 label: compaction_id,
3340 details: vec!["compaction event missing from right run".to_string()],
3341 }),
3342 (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3343 section: "compaction_events".to_string(),
3344 label: compaction_id,
3345 details: vec!["compaction event missing from left run".to_string()],
3346 }),
3347 (Some(left_event), Some(right_event)) if left_event != right_event => {
3348 observability_diffs.push(RunObservabilityDiffRecord {
3349 section: "compaction_events".to_string(),
3350 label: compaction_id,
3351 details: vec![format!("event: {:?} -> {:?}", left_event, right_event)],
3352 });
3353 }
3354 _ => {}
3355 }
3356 }
3357
3358 let left_daemons = left_observability
3359 .daemon_events
3360 .iter()
3361 .map(|event| {
3362 (
3363 (event.daemon_id.clone(), event.kind, event.timestamp.clone()),
3364 (
3365 event.name.clone(),
3366 event.persist_path.clone(),
3367 event.payload_summary.clone(),
3368 ),
3369 )
3370 })
3371 .collect::<BTreeMap<_, _>>();
3372 let right_daemons = right_observability
3373 .daemon_events
3374 .iter()
3375 .map(|event| {
3376 (
3377 (event.daemon_id.clone(), event.kind, event.timestamp.clone()),
3378 (
3379 event.name.clone(),
3380 event.persist_path.clone(),
3381 event.payload_summary.clone(),
3382 ),
3383 )
3384 })
3385 .collect::<BTreeMap<_, _>>();
3386 let daemon_keys = left_daemons
3387 .keys()
3388 .chain(right_daemons.keys())
3389 .cloned()
3390 .collect::<BTreeSet<_>>();
3391 for daemon_key in daemon_keys {
3392 let label = format!("{}:{:?}:{}", daemon_key.0, daemon_key.1, daemon_key.2);
3393 match (
3394 left_daemons.get(&daemon_key),
3395 right_daemons.get(&daemon_key),
3396 ) {
3397 (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3398 section: "daemon_events".to_string(),
3399 label,
3400 details: vec!["daemon event missing from right run".to_string()],
3401 }),
3402 (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3403 section: "daemon_events".to_string(),
3404 label,
3405 details: vec!["daemon event missing from left run".to_string()],
3406 }),
3407 (Some(left_event), Some(right_event)) if left_event != right_event => {
3408 observability_diffs.push(RunObservabilityDiffRecord {
3409 section: "daemon_events".to_string(),
3410 label,
3411 details: vec![format!("event: {:?} -> {:?}", left_event, right_event)],
3412 });
3413 }
3414 _ => {}
3415 }
3416 }
3417
3418 let left_verification = left_observability
3419 .verification_outcomes
3420 .iter()
3421 .map(|item| (item.stage_id.clone(), item))
3422 .collect::<BTreeMap<_, _>>();
3423 let right_verification = right_observability
3424 .verification_outcomes
3425 .iter()
3426 .map(|item| (item.stage_id.clone(), item))
3427 .collect::<BTreeMap<_, _>>();
3428 let verification_ids = left_verification
3429 .keys()
3430 .chain(right_verification.keys())
3431 .cloned()
3432 .collect::<BTreeSet<_>>();
3433 for stage_id in verification_ids {
3434 match (
3435 left_verification.get(&stage_id),
3436 right_verification.get(&stage_id),
3437 ) {
3438 (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3439 section: "verification".to_string(),
3440 label: stage_id,
3441 details: vec!["verification missing from right run".to_string()],
3442 }),
3443 (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3444 section: "verification".to_string(),
3445 label: stage_id,
3446 details: vec!["verification missing from left run".to_string()],
3447 }),
3448 (Some(left_item), Some(right_item)) if left_item != right_item => {
3449 let mut details = Vec::new();
3450 if left_item.passed != right_item.passed {
3451 details.push(format!(
3452 "passed: {:?} -> {:?}",
3453 left_item.passed, right_item.passed
3454 ));
3455 }
3456 if left_item.summary != right_item.summary {
3457 details.push(format!(
3458 "summary: {:?} -> {:?}",
3459 left_item.summary, right_item.summary
3460 ));
3461 }
3462 observability_diffs.push(RunObservabilityDiffRecord {
3463 section: "verification".to_string(),
3464 label: left_item.node_id.clone(),
3465 details,
3466 });
3467 }
3468 _ => {}
3469 }
3470 }
3471
3472 let left_graph = (
3473 left_observability.action_graph_nodes.len(),
3474 left_observability.action_graph_edges.len(),
3475 );
3476 let right_graph = (
3477 right_observability.action_graph_nodes.len(),
3478 right_observability.action_graph_edges.len(),
3479 );
3480 if left_graph != right_graph {
3481 observability_diffs.push(RunObservabilityDiffRecord {
3482 section: "action_graph".to_string(),
3483 label: "shape".to_string(),
3484 details: vec![format!(
3485 "nodes/edges: {}/{} -> {}/{}",
3486 left_graph.0, left_graph.1, right_graph.0, right_graph.1
3487 )],
3488 });
3489 }
3490
3491 let status_changed = left.status != right.status;
3492 let identical = !status_changed
3493 && stage_diffs.is_empty()
3494 && tool_diffs.is_empty()
3495 && observability_diffs.is_empty()
3496 && left.transitions.len() == right.transitions.len()
3497 && left.artifacts.len() == right.artifacts.len()
3498 && left.checkpoints.len() == right.checkpoints.len();
3499
3500 RunDiffReport {
3501 left_run_id: left.id.clone(),
3502 right_run_id: right.id.clone(),
3503 identical,
3504 status_changed,
3505 left_status: left.status.clone(),
3506 right_status: right.status.clone(),
3507 stage_diffs,
3508 tool_diffs,
3509 observability_diffs,
3510 transition_count_delta: right.transitions.len() as isize - left.transitions.len() as isize,
3511 artifact_count_delta: right.artifacts.len() as isize - left.artifacts.len() as isize,
3512 checkpoint_count_delta: right.checkpoints.len() as isize - left.checkpoints.len() as isize,
3513 }
3514}
3515
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    /// Smallest `RunRecord` the eval-pack tests need: one run with mock LLM
    /// usage numbers and a replay fixture that expects completion.
    fn minimal_run(status: &str) -> RunRecord {
        let usage = LlmUsageRecord {
            total_duration_ms: 12,
            total_cost: 0.01,
            input_tokens: 3,
            output_tokens: 4,
            call_count: 1,
            models: vec!["mock".to_string()],
        };
        let fixture = ReplayFixture {
            type_name: "replay_fixture".to_string(),
            expected_status: "completed".to_string(),
            ..ReplayFixture::default()
        };
        RunRecord {
            type_name: "workflow_run".to_string(),
            id: "run_1".to_string(),
            workflow_id: "workflow_1".to_string(),
            status: status.to_string(),
            usage: Some(usage),
            replay_fixture: Some(fixture),
            ..RunRecord::default()
        }
    }

    /// Serializes a minimal "completed" run to `run.json` under `dir`, the
    /// relative path the eval-pack manifests below reference.
    fn write_completed_run(dir: &std::path::Path) {
        let payload = serde_json::to_string(&minimal_run("completed")).unwrap();
        fs::write(dir.join("run.json"), payload).unwrap();
    }

    /// Writes `contents` as `harn.eval.toml` under `dir` and returns the path.
    fn write_pack(dir: &std::path::Path, contents: &str) -> std::path::PathBuf {
        let pack_path = dir.join("harn.eval.toml");
        fs::write(&pack_path, contents).unwrap();
        pack_path
    }

    #[test]
    fn eval_pack_manifest_toml_runs_replay_case() {
        let temp = tempfile::tempdir().unwrap();
        write_completed_run(temp.path());
        let pack_path = write_pack(
            temp.path(),
            r#"
version = 1
id = "connector-regressions"
name = "Connector regressions"

[[cases]]
id = "webhook"
name = "Webhook normalization"
run = "run.json"
rubrics = ["status"]

[[rubrics]]
id = "status"
kind = "deterministic"

[[rubrics.assertions]]
kind = "run-status"
expected = "completed"
"#,
        );

        let manifest = load_eval_pack_manifest(&pack_path).unwrap();
        let report = evaluate_eval_pack_manifest(&manifest).unwrap();

        // A completed run must satisfy the deterministic run-status rubric.
        assert!(report.pass);
        assert_eq!(report.total, 1);
        assert_eq!(report.cases[0].label, "Webhook normalization");
    }

    #[test]
    fn eval_pack_warning_case_does_not_block() {
        let temp = tempfile::tempdir().unwrap();
        write_completed_run(temp.path());
        let pack_path = write_pack(
            temp.path(),
            r#"
version = 1
id = "budgets"

[[cases]]
id = "latency-budget"
run = "run.json"
severity = "warning"

[cases.thresholds]
max-latency-ms = 1
"#,
        );

        let manifest = load_eval_pack_manifest(&pack_path).unwrap();
        let report = evaluate_eval_pack_manifest(&manifest).unwrap();

        // The 1ms latency budget fails, but a warning-severity case only
        // surfaces as a warning; the pack as a whole still passes.
        assert!(report.pass);
        assert_eq!(report.warning_failed, 1);
        assert!(report.cases[0].warnings[0].contains("latency"));
    }

    #[test]
    fn eval_pack_manifest_runs_friction_context_pack_case() {
        let temp = tempfile::tempdir().unwrap();
        // Two recurrences of the same redacted Splunk query, enough signal for
        // a context-pack suggestion.
        fs::write(
            temp.path().join("incident-friction.json"),
            r#"
{
  "events": [
    {
      "kind": "repeated_query",
      "source": "incident-triage",
      "actor": "sre",
      "tool": "splunk",
      "provider": "splunk",
      "redacted_summary": "Checkout incidents need the same Splunk search",
      "recurrence_hints": ["checkout incident queries"],
      "estimated_time_ms": 300000,
      "metadata": {
        "query": "index=checkout service=api error",
        "capability": "splunk.search",
        "secret_ref": "SPLUNK_READ_TOKEN",
        "output_slot": "splunk_errors"
      }
    },
    {
      "kind": "repeated_query",
      "source": "incident-triage",
      "actor": "sre",
      "tool": "splunk",
      "provider": "splunk",
      "redacted_summary": "Checkout incident triage repeated the Splunk search",
      "recurrence_hints": ["checkout incident queries"],
      "estimated_time_ms": 240000,
      "metadata": {
        "query": "index=checkout service=api error",
        "capability": "splunk.search",
        "secret_ref": "SPLUNK_READ_TOKEN",
        "output_slot": "splunk_errors"
      }
    }
  ]
}
"#,
        )
        .unwrap();
        let pack_path = write_pack(
            temp.path(),
            r#"
version = 1
id = "team-learning"
name = "Team learning evals"

[[fixtures]]
id = "incident-friction"
kind = "friction-events"
path = "incident-friction.json"

[[cases]]
id = "incident-context-pack"
name = "Incident context pack suggestion"
friction_events = "incident-friction"
rubrics = ["context-pack"]

[[rubrics]]
id = "context-pack"
kind = "friction"

[[rubrics.assertions]]
kind = "context-pack-suggestion"
contains = "incident"
expected = { min_suggestions = 1, recommended_artifact = "context_pack", required_capability = "splunk.search", required_output_slot = "splunk_errors" }
"#,
        );

        let manifest = load_eval_pack_manifest(&pack_path).unwrap();
        let report = evaluate_eval_pack_manifest(&manifest).unwrap();

        assert!(report.pass);
        assert_eq!(report.total, 1);
        // Friction-driven cases report a synthetic run id and one stage per
        // friction event.
        assert_eq!(report.cases[0].run_id, "friction_events");
        assert_eq!(report.cases[0].stage_count, 2);
    }
}
3706}