1use std::collections::{BTreeMap, BTreeSet};
4use std::path::{Path, PathBuf};
5
6use serde::{Deserialize, Serialize};
7
8use super::{
9 default_run_dir, evaluate_context_pack_suggestion_expectations,
10 generate_context_pack_suggestions, new_id, normalize_friction_events_json, now_rfc3339,
11 parse_json_payload, parse_json_value, sync_run_handoffs, ArtifactRecord, CapabilityPolicy,
12 ContextPackSuggestionExpectation, ContextPackSuggestionOptions, FrictionEvent, HandoffArtifact,
13};
14use crate::event_log::{
15 active_event_log, AnyEventLog, EventLog, LogEvent as EventLogRecord, Topic,
16};
17use crate::llm::vm_value_to_json;
18use crate::triggers::{SignatureStatus, TriggerEvent};
19use crate::value::{VmError, VmValue};
20
// Node kinds for the persisted action graph (see `RunActionGraphNodeRecord.kind`).
pub const ACTION_GRAPH_NODE_KIND_RUN: &str = "run";
pub const ACTION_GRAPH_NODE_KIND_TRIGGER: &str = "trigger";
pub const ACTION_GRAPH_NODE_KIND_PREDICATE: &str = "predicate";
pub const ACTION_GRAPH_NODE_KIND_TRIGGER_PREDICATE: &str = "trigger_predicate";
pub const ACTION_GRAPH_NODE_KIND_STAGE: &str = "stage";
pub const ACTION_GRAPH_NODE_KIND_WORKER: &str = "worker";
pub const ACTION_GRAPH_NODE_KIND_DISPATCH: &str = "dispatch";
pub const ACTION_GRAPH_NODE_KIND_A2A_HOP: &str = "a2a_hop";
pub const ACTION_GRAPH_NODE_KIND_WORKER_ENQUEUE: &str = "worker_enqueue";
pub const ACTION_GRAPH_NODE_KIND_RETRY: &str = "retry";
pub const ACTION_GRAPH_NODE_KIND_DLQ: &str = "dlq";

// Edge kinds for the persisted action graph (see `RunActionGraphEdgeRecord.kind`).
pub const ACTION_GRAPH_EDGE_KIND_ENTRY: &str = "entry";
pub const ACTION_GRAPH_EDGE_KIND_TRIGGER_DISPATCH: &str = "trigger_dispatch";
pub const ACTION_GRAPH_EDGE_KIND_A2A_DISPATCH: &str = "a2a_dispatch";
pub const ACTION_GRAPH_EDGE_KIND_PREDICATE_GATE: &str = "predicate_gate";
pub const ACTION_GRAPH_EDGE_KIND_REPLAY_CHAIN: &str = "replay_chain";
pub const ACTION_GRAPH_EDGE_KIND_TRANSITION: &str = "transition";
pub const ACTION_GRAPH_EDGE_KIND_DELEGATES: &str = "delegates";
pub const ACTION_GRAPH_EDGE_KIND_RETRY: &str = "retry";
pub const ACTION_GRAPH_EDGE_KIND_DLQ_MOVE: &str = "dlq_move";
42
/// Aggregated LLM usage counters for a run or stage: token totals, wall time,
/// call count, accumulated cost, and the model names involved.
/// All fields are optional in serialized form (`#[serde(default)]`).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct LlmUsageRecord {
    pub input_tokens: i64,
    pub output_tokens: i64,
    pub total_duration_ms: i64,
    pub call_count: i64,
    // NOTE(review): cost unit is not stated here — presumably USD; confirm at call sites.
    pub total_cost: f64,
    pub models: Vec<String>,
}
53
/// Persisted record of a single executed stage within a run, including its
/// outcome, artifacts, retry attempts, and free-form metadata.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunStageRecord {
    pub id: String,
    // Workflow-graph node this stage executed.
    pub node_id: String,
    pub kind: String,
    pub status: String,
    pub outcome: String,
    pub branch: Option<String>,
    pub started_at: String,
    pub finished_at: Option<String>,
    // User-visible output vs. model-private reasoning, kept separate.
    pub visible_text: Option<String>,
    pub private_reasoning: Option<String>,
    pub transcript: Option<serde_json::Value>,
    pub verification: Option<serde_json::Value>,
    pub usage: Option<LlmUsageRecord>,
    pub artifacts: Vec<ArtifactRecord>,
    pub consumed_artifact_ids: Vec<String>,
    pub produced_artifact_ids: Vec<String>,
    // One entry per execution attempt when the stage was retried.
    pub attempts: Vec<RunStageAttemptRecord>,
    // Free-form metadata; a "worker" entry here is used to materialize child runs.
    pub metadata: BTreeMap<String, serde_json::Value>,
}
76
/// A single execution attempt of a stage (stages may be retried; see
/// `RunStageRecord::attempts`).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunStageAttemptRecord {
    // 0- or 1-based attempt index — not established here; confirm against writer.
    pub attempt: usize,
    pub status: String,
    pub outcome: String,
    pub branch: Option<String>,
    pub error: Option<String>,
    pub verification: Option<serde_json::Value>,
    pub started_at: String,
    pub finished_at: Option<String>,
}
89
/// One step in the run's execution history: movement from a stage/node to the
/// next node, with the artifacts consumed and produced along the way.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunTransitionRecord {
    pub id: String,
    // `None` origin presumably marks the run's entry transition — confirm.
    pub from_stage_id: Option<String>,
    pub from_node_id: Option<String>,
    pub to_node_id: String,
    pub branch: Option<String>,
    pub timestamp: String,
    pub consumed_artifact_ids: Vec<String>,
    pub produced_artifact_ids: Vec<String>,
}
102
/// Snapshot of scheduler progress (which nodes are ready/completed) persisted
/// at a point in time, with the reason the checkpoint was taken.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunCheckpointRecord {
    pub id: String,
    pub ready_nodes: Vec<String>,
    pub completed_nodes: Vec<String>,
    pub last_stage_id: Option<String>,
    pub persisted_at: String,
    pub reason: String,
}
113
/// Serialized expectations for replaying a recorded run: overall expected
/// status plus per-stage assertions, optionally specialized for
/// clarifying-question evals. Serialized with a `_type` discriminator field.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayFixture {
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub source_run_id: String,
    pub workflow_id: String,
    pub workflow_name: Option<String>,
    pub created_at: String,
    pub eval_kind: Option<String>,
    pub clarifying_question: Option<ClarifyingQuestionEvalSpec>,
    pub expected_status: String,
    pub stage_assertions: Vec<ReplayStageAssertion>,
}
129
/// Expectations for a clarifying-question eval: acceptable question texts,
/// required/forbidden terms, and question-count bounds. Zero/absent bounds
/// are clamped to at least 1 by `clarifying_min_questions` /
/// `clarifying_max_questions`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ClarifyingQuestionEvalSpec {
    pub expected_question: Option<String>,
    pub accepted_questions: Vec<String>,
    pub required_terms: Vec<String>,
    pub forbidden_terms: Vec<String>,
    pub min_questions: usize,
    pub max_questions: Option<usize>,
}
140
/// Per-node expectations checked when evaluating a replayed run against a
/// `ReplayFixture`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayStageAssertion {
    pub node_id: String,
    pub expected_status: String,
    pub expected_outcome: String,
    pub expected_branch: Option<String>,
    pub required_artifact_kinds: Vec<String>,
    pub visible_text_contains: Option<String>,
}
151
/// Result of evaluating one replayed run: overall pass/fail with the list of
/// failure messages and the number of stages inspected.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayEvalReport {
    pub pass: bool,
    pub failures: Vec<String>,
    pub stage_count: usize,
}
159
/// Per-case result inside a replay eval suite, optionally carrying a diff
/// against a comparison run.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayEvalCaseReport {
    pub run_id: String,
    pub workflow_id: String,
    pub label: Option<String>,
    pub pass: bool,
    pub failures: Vec<String>,
    pub stage_count: usize,
    pub source_path: Option<String>,
    pub comparison: Option<RunDiffReport>,
}
172
/// Aggregate result for a replay eval suite: counts plus per-case reports.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayEvalSuiteReport {
    pub pass: bool,
    pub total: usize,
    pub passed: usize,
    pub failed: usize,
    pub cases: Vec<ReplayEvalCaseReport>,
}
182
/// Summary of one deliverable tracked in a run's task ledger.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunDeliverableSummaryRecord {
    pub id: String,
    pub text: String,
    pub status: String,
    pub note: Option<String>,
}
191
/// Summary of a planner round's task ledger: root task, rationale,
/// deliverables, observations, and how many deliverables are blocking.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunTaskLedgerSummaryRecord {
    pub root_task: String,
    pub rationale: String,
    pub deliverables: Vec<RunDeliverableSummaryRecord>,
    pub observations: Vec<String>,
    pub blocking_count: usize,
}
201
/// Observability counters for one planner round of a stage: LLM/tool call
/// volumes, fallback and retry counts, tools used, and the ledger snapshot.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunPlannerRoundRecord {
    pub stage_id: String,
    pub node_id: String,
    pub stage_kind: String,
    pub status: String,
    pub outcome: String,
    pub iteration_count: usize,
    pub llm_call_count: usize,
    pub tool_execution_count: usize,
    pub tool_rejection_count: usize,
    pub intervention_count: usize,
    pub compaction_count: usize,
    // Counters for the plain-text tool-call fallback path used when the
    // provider emits tool calls as text — semantics inferred from names; confirm.
    pub native_text_tool_fallback_count: usize,
    pub native_text_tool_fallback_rejection_count: usize,
    pub empty_completion_retry_count: usize,
    pub tools_used: Vec<String>,
    pub successful_tools: Vec<String>,
    pub ledger_done_rejections: usize,
    pub task_ledger: Option<RunTaskLedgerSummaryRecord>,
    pub research_facts: Vec<String>,
}
225
/// Lineage entry describing a worker spawned by a run: who spawned it, which
/// session/run it belongs to, and where its artifacts were persisted.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunWorkerLineageRecord {
    pub worker_id: String,
    pub worker_name: String,
    pub parent_stage_id: Option<String>,
    pub task: String,
    pub status: String,
    pub session_id: Option<String>,
    pub parent_session_id: Option<String>,
    pub run_id: Option<String>,
    pub run_path: Option<String>,
    pub snapshot_path: Option<String>,
}
240
/// Node in the persisted action graph. `kind` takes one of the
/// `ACTION_GRAPH_NODE_KIND_*` constants; the optional ids link the node back
/// to stages, workers, and runs.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunActionGraphNodeRecord {
    pub id: String,
    pub label: String,
    pub kind: String,
    pub status: String,
    pub outcome: String,
    pub trace_id: Option<String>,
    pub stage_id: Option<String>,
    pub node_id: Option<String>,
    pub worker_id: Option<String>,
    pub run_id: Option<String>,
    pub run_path: Option<String>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
257
/// Directed edge in the persisted action graph. `kind` takes one of the
/// `ACTION_GRAPH_EDGE_KIND_*` constants.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunActionGraphEdgeRecord {
    pub from_id: String,
    pub to_id: String,
    pub kind: String,
    pub label: Option<String>,
}
266
/// Verification outcome recorded for a stage: status string plus an optional
/// boolean verdict and human-readable summary.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunVerificationOutcomeRecord {
    pub stage_id: String,
    pub node_id: String,
    pub status: String,
    pub passed: Option<bool>,
    pub summary: Option<String>,
}
276
/// Pointer to a transcript stored elsewhere; `available` indicates whether
/// the referenced location could still be resolved.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunTranscriptPointerRecord {
    pub id: String,
    pub label: String,
    pub kind: String,
    pub location: String,
    pub path: Option<String>,
    pub available: bool,
}
287
/// Record of a transcript compaction: what was archived, the estimated token
/// savings, and where the pre-compaction snapshot was stored.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct CompactionEventRecord {
    pub id: String,
    pub transcript_id: Option<String>,
    pub stage_id: Option<String>,
    pub node_id: Option<String>,
    pub mode: String,
    pub strategy: String,
    pub archived_messages: usize,
    // Token counts are estimates, per the field names.
    pub estimated_tokens_before: usize,
    pub estimated_tokens_after: usize,
    pub snapshot_asset_id: Option<String>,
    pub snapshot_location: String,
    pub snapshot_path: Option<String>,
    pub available: bool,
}
305
/// Lifecycle event kinds for a daemon, serialized in snake_case
/// (e.g. `spawned`). Defaults to `Spawned`.
#[derive(Clone, Copy, Debug, Default, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
#[serde(rename_all = "snake_case")]
pub enum DaemonEventKindRecord {
    #[default]
    Spawned,
    Triggered,
    Snapshotted,
    Resumed,
    Stopped,
}
316
/// One daemon lifecycle event: which daemon, what happened, when, and where
/// its state was persisted.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct DaemonEventRecord {
    pub daemon_id: String,
    pub name: String,
    pub kind: DaemonEventKindRecord,
    pub timestamp: String,
    pub persist_path: String,
    pub payload_summary: Option<String>,
}
327
/// Versioned observability bundle attached to a run: planner rounds, the
/// action graph, worker lineage, verification outcomes, transcript pointers,
/// and compaction/daemon events.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunObservabilityRecord {
    pub schema_version: usize,
    pub planner_rounds: Vec<RunPlannerRoundRecord>,
    pub research_fact_count: usize,
    pub action_graph_nodes: Vec<RunActionGraphNodeRecord>,
    pub action_graph_edges: Vec<RunActionGraphEdgeRecord>,
    pub worker_lineage: Vec<RunWorkerLineageRecord>,
    pub verification_outcomes: Vec<RunVerificationOutcomeRecord>,
    pub transcript_pointers: Vec<RunTranscriptPointerRecord>,
    pub compaction_events: Vec<CompactionEventRecord>,
    pub daemon_events: Vec<DaemonEventRecord>,
}
342
/// One per-stage difference found when comparing two runs.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunStageDiffRecord {
    pub node_id: String,
    pub change: String,
    pub details: Vec<String>,
}
350
/// Difference between matching tool calls (paired by name + args hash) in
/// two runs being compared.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ToolCallDiffRecord {
    pub tool_name: String,
    pub args_hash: String,
    pub result_changed: bool,
    pub left_result: Option<String>,
    pub right_result: Option<String>,
}
360
/// Difference in an observability section between two compared runs.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunObservabilityDiffRecord {
    pub section: String,
    pub label: String,
    pub details: Vec<String>,
}
368
/// Full comparison of two runs ("left" vs "right"): status change, per-stage
/// and per-tool-call diffs, observability diffs, and count deltas
/// (right minus left, presumably — confirm against the diff producer).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunDiffReport {
    pub left_run_id: String,
    pub right_run_id: String,
    pub identical: bool,
    pub status_changed: bool,
    pub left_status: String,
    pub right_status: String,
    pub stage_diffs: Vec<RunStageDiffRecord>,
    pub tool_diffs: Vec<ToolCallDiffRecord>,
    pub observability_diffs: Vec<RunObservabilityDiffRecord>,
    pub transition_count_delta: isize,
    pub artifact_count_delta: isize,
    pub checkpoint_count_delta: isize,
}
385
/// Manifest describing a replay eval suite: a set of cases, each pointing at
/// a recorded run. Serialized with a `_type` discriminator field.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalSuiteManifest {
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub name: Option<String>,
    // Base directory for resolving the cases' relative paths — confirm.
    pub base_dir: Option<String>,
    pub cases: Vec<EvalSuiteCase>,
}
396
/// One case in an eval suite: the run to evaluate, an optional fixture with
/// expectations, and an optional run to diff against.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalSuiteCase {
    pub label: Option<String>,
    pub run_path: String,
    pub fixture_path: Option<String>,
    pub compare_to: Option<String>,
}
405
/// Top-level manifest of an eval pack: fixtures, rubrics, judge configuration,
/// defaults, and the cases to evaluate.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackManifest {
    pub version: u32,
    pub id: String,
    pub name: Option<String>,
    pub description: Option<String>,
    pub base_dir: Option<String>,
    pub baseline: Option<String>,
    pub package: Option<EvalPackPackage>,
    // Pack-wide defaults; cases and rubrics may override them.
    pub defaults: EvalPackDefaults,
    pub fixtures: Vec<EvalPackFixtureRef>,
    pub rubrics: Vec<EvalPackRubric>,
    pub judge: Option<EvalPackJudgeConfig>,
    pub cases: Vec<EvalPackCase>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
423
/// Packaging information for an eval pack (name/version/source plus template
/// identifiers).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalPackPackage {
    pub name: Option<String>,
    pub version: Option<String>,
    pub source: Option<String>,
    pub templates: Vec<String>,
}
432
/// Pack-wide defaults applied to cases unless overridden: severity,
/// fixture root, thresholds, and judge configuration.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackDefaults {
    pub severity: Option<String>,
    pub fixture_root: Option<String>,
    pub thresholds: EvalPackThresholds,
    pub judge: Option<EvalPackJudgeConfig>,
}
441
/// Reference to a fixture used by an eval pack: either a file (`path`) or an
/// `inline` JSON value. Kebab-case aliases are accepted for some keys.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackFixtureRef {
    pub id: String,
    pub kind: String,
    pub path: Option<String>,
    #[serde(default, alias = "trace-id")]
    pub trace_id: Option<String>,
    pub provider: Option<String>,
    #[serde(default, alias = "event-kind")]
    pub event_kind: Option<String>,
    pub inline: Option<serde_json::Value>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
456
/// A scoring rubric in an eval pack: mechanical assertions, an optional LLM
/// judge with calibration examples, and pass thresholds.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackRubric {
    pub id: String,
    pub kind: String,
    pub description: Option<String>,
    pub prompt: Option<String>,
    pub assertions: Vec<EvalPackAssertion>,
    pub judge: Option<EvalPackJudgeConfig>,
    // Golden input/output examples used to calibrate the judge.
    pub calibration: Vec<EvalPackGoldenExample>,
    pub thresholds: EvalPackThresholds,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
470
/// One mechanical assertion inside a rubric: a path/operator/expected-value
/// check, optionally scoped to a stage.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackAssertion {
    pub kind: String,
    pub stage: Option<String>,
    pub path: Option<String>,
    pub op: Option<String>,
    pub expected: Option<serde_json::Value>,
    pub contains: Option<String>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
482
/// Configuration for an LLM judge: model, prompt version, tie-breaking rule,
/// confidence floor, and sampling temperature. Kebab-case aliases accepted.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackJudgeConfig {
    pub model: Option<String>,
    #[serde(default, alias = "prompt-version")]
    pub prompt_version: Option<String>,
    #[serde(default, alias = "tie-break")]
    pub tie_break: Option<String>,
    #[serde(default, alias = "confidence-min")]
    pub confidence_min: Option<f64>,
    pub temperature: Option<f64>,
    pub rubric: Option<String>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
497
/// A golden input/output pair (optionally with score and explanation) used to
/// calibrate a judge.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackGoldenExample {
    pub input: serde_json::Value,
    pub output: serde_json::Value,
    pub score: Option<f64>,
    pub explanation: Option<String>,
}
506
/// Pass/fail thresholds for a case or rubric; all limits are optional and
/// kebab-case aliases are accepted for each.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackThresholds {
    pub severity: Option<String>,
    #[serde(default, alias = "min-score")]
    pub min_score: Option<f64>,
    #[serde(default, alias = "min-confidence")]
    pub min_confidence: Option<f64>,
    #[serde(default, alias = "max-cost-usd")]
    pub max_cost_usd: Option<f64>,
    #[serde(default, alias = "max-latency-ms")]
    pub max_latency_ms: Option<i64>,
    #[serde(default, alias = "max-tokens")]
    pub max_tokens: Option<i64>,
    #[serde(default, alias = "max-stage-count")]
    pub max_stage_count: Option<usize>,
}
524
/// One case in an eval pack: which run (or fixture) to evaluate, which
/// rubrics apply, and case-level severity/threshold overrides.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackCase {
    pub id: Option<String>,
    pub name: Option<String>,
    pub description: Option<String>,
    pub run: Option<String>,
    #[serde(default, alias = "run-path")]
    pub run_path: Option<String>,
    #[serde(default, alias = "friction-events", alias = "friction_events")]
    pub friction_events: Option<String>,
    pub fixture: Option<String>,
    #[serde(default, alias = "fixture-path")]
    pub fixture_path: Option<String>,
    #[serde(default, alias = "compare-to")]
    pub compare_to: Option<String>,
    // Rubric ids from the pack manifest to apply to this case.
    pub rubrics: Vec<String>,
    pub severity: Option<String>,
    pub thresholds: EvalPackThresholds,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
546
/// Aggregate result of running an eval pack, with failure counts broken down
/// by severity (blocking / warning / informational).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalPackReport {
    pub pack_id: String,
    pub pass: bool,
    pub total: usize,
    pub passed: usize,
    pub failed: usize,
    pub blocking_failed: usize,
    pub warning_failed: usize,
    pub informational_failed: usize,
    pub cases: Vec<EvalPackCaseReport>,
}
560
/// Per-case result of an eval pack run, with messages grouped by severity and
/// an optional diff against a comparison run.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalPackCaseReport {
    pub id: String,
    pub label: String,
    pub severity: String,
    pub pass: bool,
    pub blocking: bool,
    pub run_id: String,
    pub workflow_id: String,
    pub source_path: Option<String>,
    pub stage_count: usize,
    pub failures: Vec<String>,
    pub warnings: Vec<String>,
    pub informational: Vec<String>,
    pub comparison: Option<RunDiffReport>,
}
578
/// A human-in-the-loop question asked during a run, keyed by `request_id`
/// (see `merge_hitl_questions_from_active_log`).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunHitlQuestionRecord {
    pub request_id: String,
    pub prompt: String,
    pub agent: String,
    pub trace_id: Option<String>,
    pub asked_at: String,
}
588
/// The top-level persisted record of a workflow run: identity and lineage,
/// execution history (stages, transitions, checkpoints), artifacts and
/// handoffs, policy, observability data, and free-form metadata.
/// Serialized with a `_type` discriminator field.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunRecord {
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub workflow_id: String,
    pub workflow_name: Option<String>,
    pub task: String,
    pub status: String,
    pub started_at: String,
    pub finished_at: Option<String>,
    // Lineage for nested runs; `root_run_id` is the top of the chain.
    pub parent_run_id: Option<String>,
    pub root_run_id: Option<String>,
    pub stages: Vec<RunStageRecord>,
    pub transitions: Vec<RunTransitionRecord>,
    pub checkpoints: Vec<RunCheckpointRecord>,
    pub pending_nodes: Vec<String>,
    pub completed_nodes: Vec<String>,
    pub child_runs: Vec<RunChildRecord>,
    pub artifacts: Vec<ArtifactRecord>,
    pub handoffs: Vec<HandoffArtifact>,
    pub policy: CapabilityPolicy,
    pub execution: Option<RunExecutionRecord>,
    pub transcript: Option<serde_json::Value>,
    pub usage: Option<LlmUsageRecord>,
    pub replay_fixture: Option<ReplayFixture>,
    pub observability: Option<RunObservabilityRecord>,
    pub trace_spans: Vec<RunTraceSpanRecord>,
    pub tool_recordings: Vec<ToolCallRecord>,
    pub hitl_questions: Vec<RunHitlQuestionRecord>,
    // Known keys read in this module: "trigger_event", "trace_id",
    // "replay_of_event_id".
    pub metadata: BTreeMap<String, serde_json::Value>,
    pub persisted_path: Option<String>,
}
623
/// Recording of one tool invocation; `args_hash` is produced by
/// `tool_fixture_hash` and pairs recordings with replays.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ToolCallRecord {
    pub tool_name: String,
    pub tool_use_id: String,
    pub args_hash: String,
    pub result: String,
    pub is_rejected: bool,
    pub duration_ms: u64,
    pub iteration: usize,
    pub timestamp: String,
}
636
637pub fn tool_fixture_hash(tool_name: &str, args: &serde_json::Value) -> String {
639 use std::hash::{Hash, Hasher};
640 let mut hasher = std::collections::hash_map::DefaultHasher::new();
641 tool_name.hash(&mut hasher);
642 let args_str = serde_json::to_string(args).unwrap_or_default();
643 args_str.hash(&mut hasher);
644 format!("{:016x}", hasher.finish())
645}
646
/// One trace span captured during a run: parent/child linkage via span ids,
/// timing in milliseconds, and arbitrary metadata.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunTraceSpanRecord {
    pub span_id: u64,
    pub parent_id: Option<u64>,
    pub kind: String,
    pub name: String,
    // `start_ms` offset origin (run start vs epoch) is not established here — confirm.
    pub start_ms: u64,
    pub duration_ms: u64,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
658
/// A child run spawned via a worker: identity, session linkage, policy,
/// request/provenance payloads, and where the child run was persisted.
/// Populated from stage "worker" metadata by
/// `run_child_record_from_worker_metadata`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunChildRecord {
    pub worker_id: String,
    pub worker_name: String,
    pub parent_stage_id: Option<String>,
    pub session_id: Option<String>,
    pub parent_session_id: Option<String>,
    pub mutation_scope: Option<String>,
    pub approval_policy: Option<super::ToolApprovalPolicy>,
    pub task: String,
    pub request: Option<serde_json::Value>,
    pub provenance: Option<serde_json::Value>,
    pub status: String,
    pub started_at: String,
    pub finished_at: Option<String>,
    pub run_id: Option<String>,
    pub run_path: Option<String>,
    pub snapshot_path: Option<String>,
    pub execution: Option<RunExecutionRecord>,
}
680
681pub(crate) fn run_child_record_from_worker_metadata(
682 parent_stage_id: Option<String>,
683 worker: &serde_json::Value,
684) -> Option<RunChildRecord> {
685 let worker_id = worker.get("id").and_then(|value| value.as_str())?;
686 if worker_id.is_empty() {
687 return None;
688 }
689 Some(RunChildRecord {
690 worker_id: worker_id.to_string(),
691 worker_name: worker
692 .get("name")
693 .and_then(|value| value.as_str())
694 .unwrap_or("worker")
695 .to_string(),
696 parent_stage_id,
697 session_id: worker
698 .get("audit")
699 .and_then(|value| value.get("session_id"))
700 .and_then(|value| value.as_str())
701 .map(str::to_string),
702 parent_session_id: worker
703 .get("audit")
704 .and_then(|value| value.get("parent_session_id"))
705 .and_then(|value| value.as_str())
706 .map(str::to_string),
707 mutation_scope: worker
708 .get("audit")
709 .and_then(|value| value.get("mutation_scope"))
710 .and_then(|value| value.as_str())
711 .map(str::to_string),
712 approval_policy: worker
713 .get("audit")
714 .and_then(|value| value.get("approval_policy"))
715 .and_then(|value| {
716 serde_json::from_value::<super::ToolApprovalPolicy>(value.clone()).ok()
717 }),
718 task: worker
719 .get("task")
720 .and_then(|value| value.as_str())
721 .unwrap_or_default()
722 .to_string(),
723 request: worker.get("request").cloned(),
724 provenance: worker.get("provenance").cloned(),
725 status: worker
726 .get("status")
727 .and_then(|value| value.as_str())
728 .unwrap_or("completed")
729 .to_string(),
730 started_at: worker
731 .get("started_at")
732 .and_then(|value| value.as_str())
733 .unwrap_or_default()
734 .to_string(),
735 finished_at: worker
736 .get("finished_at")
737 .and_then(|value| value.as_str())
738 .map(str::to_string),
739 run_id: worker
740 .get("child_run_id")
741 .and_then(|value| value.as_str())
742 .map(str::to_string),
743 run_path: worker
744 .get("child_run_path")
745 .and_then(|value| value.as_str())
746 .map(str::to_string),
747 snapshot_path: worker
748 .get("snapshot_path")
749 .and_then(|value| value.as_str())
750 .map(str::to_string),
751 execution: worker
752 .get("execution")
753 .cloned()
754 .and_then(|value| serde_json::from_value(value).ok()),
755 })
756}
757
758fn run_child_from_stage_metadata(stage: &RunStageRecord) -> Option<RunChildRecord> {
759 let parent_stage_id = if stage.id.is_empty() {
760 None
761 } else {
762 Some(stage.id.clone())
763 };
764 run_child_record_from_worker_metadata(parent_stage_id, stage.metadata.get("worker")?)
765}
766
767fn fill_missing_child_run_fields(existing: &mut RunChildRecord, child: RunChildRecord) {
768 if existing.worker_name.is_empty() {
769 existing.worker_name = child.worker_name;
770 }
771 if existing.parent_stage_id.is_none() {
772 existing.parent_stage_id = child.parent_stage_id;
773 }
774 if existing.session_id.is_none() {
775 existing.session_id = child.session_id;
776 }
777 if existing.parent_session_id.is_none() {
778 existing.parent_session_id = child.parent_session_id;
779 }
780 if existing.mutation_scope.is_none() {
781 existing.mutation_scope = child.mutation_scope;
782 }
783 if existing.approval_policy.is_none() {
784 existing.approval_policy = child.approval_policy;
785 }
786 if existing.task.is_empty() {
787 existing.task = child.task;
788 }
789 if existing.request.is_none() {
790 existing.request = child.request;
791 }
792 if existing.provenance.is_none() {
793 existing.provenance = child.provenance;
794 }
795 if existing.status.is_empty() {
796 existing.status = child.status;
797 }
798 if existing.started_at.is_empty() {
799 existing.started_at = child.started_at;
800 }
801 if existing.finished_at.is_none() {
802 existing.finished_at = child.finished_at;
803 }
804 if existing.run_id.is_none() {
805 existing.run_id = child.run_id;
806 }
807 if existing.run_path.is_none() {
808 existing.run_path = child.run_path;
809 }
810 if existing.snapshot_path.is_none() {
811 existing.snapshot_path = child.snapshot_path;
812 }
813 if existing.execution.is_none() {
814 existing.execution = child.execution;
815 }
816}
817
818fn materialize_child_runs_from_stage_metadata(run: &mut RunRecord) {
819 for child in run
820 .stages
821 .iter()
822 .filter_map(run_child_from_stage_metadata)
823 .collect::<Vec<_>>()
824 {
825 match run
826 .child_runs
827 .iter_mut()
828 .find(|existing| existing.worker_id == child.worker_id)
829 {
830 Some(existing) => fill_missing_child_run_fields(existing, child),
831 None => run.child_runs.push(child),
832 }
833 }
834}
835
/// Execution environment of a run: working directory, environment variables,
/// adapter, and git worktree details (repo, worktree path, branch, base ref,
/// cleanup mode).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunExecutionRecord {
    pub cwd: Option<String>,
    pub source_dir: Option<String>,
    pub env: BTreeMap<String, String>,
    pub adapter: Option<String>,
    pub repo_path: Option<String>,
    pub worktree_path: Option<String>,
    pub branch: Option<String>,
    pub base_ref: Option<String>,
    pub cleanup: Option<String>,
}
849
850fn compact_json_value(value: &serde_json::Value) -> String {
851 serde_json::to_string(value).unwrap_or_else(|_| value.to_string())
852}
853
/// Canonicalizes question text for fuzzy comparison: keeps only ASCII
/// alphanumerics (lowercased), treating every other character as a separator,
/// and joins the resulting tokens with single spaces.
fn normalize_question_text(text: &str) -> String {
    text.split(|ch: char| !ch.is_ascii_alphanumeric())
        .filter(|token| !token.is_empty())
        .map(|token| token.to_ascii_lowercase())
        .collect::<Vec<_>>()
        .join(" ")
}
868
869fn clarifying_min_questions(spec: &ClarifyingQuestionEvalSpec) -> usize {
870 spec.min_questions.max(1)
871}
872
873fn clarifying_max_questions(spec: &ClarifyingQuestionEvalSpec) -> usize {
874 spec.max_questions.unwrap_or(1).max(1)
875}
876
877fn read_topic_records(
878 log: &AnyEventLog,
879 topic: &Topic,
880) -> Vec<(crate::event_log::EventId, EventLogRecord)> {
881 let mut from = None;
882 let mut records = Vec::new();
883 loop {
884 let batch =
885 futures::executor::block_on(log.read_range(topic, from, 256)).unwrap_or_default();
886 if batch.is_empty() {
887 break;
888 }
889 from = batch.last().map(|(event_id, _)| *event_id);
890 records.extend(batch);
891 }
892 records
893}
894
/// Merges human-in-the-loop question events from the active event log into
/// `run.hitl_questions`, deduplicating by request id.
///
/// Log events with the same `request_id` overwrite any in-memory record; the
/// merged list is sorted by `(asked_at, request_id)` so ordering is
/// deterministic. A no-op when no event log is active.
fn merge_hitl_questions_from_active_log(run: &mut RunRecord) {
    let Some(log) = active_event_log() else {
        return;
    };
    let topic = Topic::new(crate::HITL_QUESTIONS_TOPIC)
        .expect("static hitl.questions topic should always be valid");
    // Seed the merge map with the questions already on the run record.
    let mut merged = run
        .hitl_questions
        .iter()
        .cloned()
        .map(|question| (question.request_id.clone(), question))
        .collect::<BTreeMap<_, _>>();

    for (_, event) in read_topic_records(log.as_ref(), &topic) {
        if event.kind != "hitl.question_asked" {
            continue;
        }
        let payload = &event.payload;
        // The run id may arrive either as an event header or embedded in the
        // payload; accept either location.
        let matches_run = event
            .headers
            .get("run_id")
            .is_some_and(|value| value == &run.id)
            || payload
                .get("run_id")
                .and_then(|value| value.as_str())
                .is_some_and(|value| value == run.id);
        if !matches_run {
            continue;
        }
        // Request id: prefer the payload, fall back to the headers.
        let request_id = payload
            .get("request_id")
            .and_then(|value| value.as_str())
            .or_else(|| event.headers.get("request_id").map(String::as_str))
            .unwrap_or_default();
        // The prompt lives under the event's nested `payload` object.
        let prompt = payload
            .get("payload")
            .and_then(|value| value.get("prompt"))
            .and_then(|value| value.as_str())
            .unwrap_or_default();
        // Skip malformed events that lack an id or a prompt.
        if request_id.is_empty() || prompt.is_empty() {
            continue;
        }
        merged.insert(
            request_id.to_string(),
            RunHitlQuestionRecord {
                request_id: request_id.to_string(),
                prompt: prompt.to_string(),
                agent: payload
                    .get("agent")
                    .and_then(|value| value.as_str())
                    .unwrap_or_default()
                    .to_string(),
                trace_id: payload
                    .get("trace_id")
                    .and_then(|value| value.as_str())
                    .map(str::to_string),
                asked_at: payload
                    .get("requested_at")
                    .and_then(|value| value.as_str())
                    .unwrap_or_default()
                    .to_string(),
            },
        );
    }

    run.hitl_questions = merged.into_values().collect();
    // Stable ordering: ascending by timestamp, then request id.
    run.hitl_questions.sort_by(|left, right| {
        (left.asked_at.as_str(), left.request_id.as_str())
            .cmp(&(right.asked_at.as_str(), right.request_id.as_str()))
    });
}
966
967fn signature_status_label(status: &SignatureStatus) -> &'static str {
968 match status {
969 SignatureStatus::Verified => "verified",
970 SignatureStatus::Unsigned => "unsigned",
971 SignatureStatus::Failed { .. } => "failed",
972 }
973}
974
975fn trigger_event_from_run(run: &RunRecord) -> Option<TriggerEvent> {
976 run.metadata
977 .get("trigger_event")
978 .cloned()
979 .and_then(|value| serde_json::from_value(value).ok())
980}
981
982fn run_trace_id(run: &RunRecord, trigger_event: Option<&TriggerEvent>) -> Option<String> {
983 trigger_event
984 .map(|event| event.trace_id.0.clone())
985 .or_else(|| {
986 run.metadata
987 .get("trace_id")
988 .and_then(|value| value.as_str())
989 .map(str::to_string)
990 })
991}
992
993fn replay_of_event_id_from_run(run: &RunRecord) -> Option<String> {
994 run.metadata
995 .get("replay_of_event_id")
996 .and_then(|value| value.as_str())
997 .map(str::to_string)
998}
999
1000fn action_graph_kind_for_stage(stage: &RunStageRecord) -> &'static str {
1001 if stage.kind == "condition" {
1002 ACTION_GRAPH_NODE_KIND_PREDICATE
1003 } else {
1004 ACTION_GRAPH_NODE_KIND_STAGE
1005 }
1006}
1007
1008fn trigger_node_metadata(trigger_event: &TriggerEvent) -> BTreeMap<String, serde_json::Value> {
1009 let mut metadata = BTreeMap::new();
1010 metadata.insert(
1011 "provider".to_string(),
1012 serde_json::json!(trigger_event.provider.as_str()),
1013 );
1014 metadata.insert(
1015 "event_kind".to_string(),
1016 serde_json::json!(trigger_event.kind),
1017 );
1018 metadata.insert(
1019 "dedupe_key".to_string(),
1020 serde_json::json!(trigger_event.dedupe_key),
1021 );
1022 metadata.insert(
1023 "signature_status".to_string(),
1024 serde_json::json!(signature_status_label(&trigger_event.signature_status)),
1025 );
1026 metadata
1027}
1028
1029fn stage_node_metadata(stage: &RunStageRecord) -> BTreeMap<String, serde_json::Value> {
1030 let mut metadata = BTreeMap::new();
1031 metadata.insert("stage_kind".to_string(), serde_json::json!(stage.kind));
1032 if let Some(branch) = stage.branch.as_ref() {
1033 metadata.insert("branch".to_string(), serde_json::json!(branch));
1034 }
1035 if let Some(worker_id) = stage
1036 .metadata
1037 .get("worker_id")
1038 .and_then(|value| value.as_str())
1039 {
1040 metadata.insert("worker_id".to_string(), serde_json::json!(worker_id));
1041 }
1042 metadata
1043}
1044
1045fn append_action_graph_node(
1046 nodes: &mut Vec<RunActionGraphNodeRecord>,
1047 record: RunActionGraphNodeRecord,
1048) {
1049 nodes.push(record);
1050}
1051
1052pub async fn append_action_graph_update(
1053 headers: BTreeMap<String, String>,
1054 payload: serde_json::Value,
1055) -> Result<(), crate::event_log::LogError> {
1056 let Some(log) = active_event_log() else {
1057 return Ok(());
1058 };
1059 let topic = Topic::new("observability.action_graph")
1060 .expect("static observability.action_graph topic should always be valid");
1061 let record = EventLogRecord::new("action_graph_update", payload).with_headers(headers);
1062 log.append(&topic, record).await.map(|_| ())
1063}
1064
1065fn publish_action_graph_event(
1066 run: &RunRecord,
1067 observability: &RunObservabilityRecord,
1068 path: &Path,
1069) {
1070 let trigger_event = trigger_event_from_run(run);
1071 let mut headers = BTreeMap::new();
1072 headers.insert("run_id".to_string(), run.id.clone());
1073 headers.insert("workflow_id".to_string(), run.workflow_id.clone());
1074 if let Some(trace_id) = run_trace_id(run, trigger_event.as_ref()) {
1075 headers.insert("trace_id".to_string(), trace_id);
1076 }
1077 let payload = serde_json::json!({
1078 "run_id": run.id,
1079 "workflow_id": run.workflow_id,
1080 "persisted_path": path.to_string_lossy(),
1081 "status": run.status,
1082 "observability": observability,
1083 });
1084 if let Ok(handle) = tokio::runtime::Handle::try_current() {
1085 handle.spawn(async move {
1086 let _ = append_action_graph_update(headers, payload).await;
1087 });
1088 } else {
1089 let _ = futures::executor::block_on(append_action_graph_update(headers, payload));
1090 }
1091}
1092
/// Derive the path of the LLM transcript sidecar for a persisted run file.
///
/// For `<dir>/<stem>.<ext>` this is `<dir>/<stem>-llm/llm_transcript.jsonl`.
/// Returns `None` when the run path has no file stem or the stem is not UTF-8.
fn llm_transcript_sidecar_path(run_path: &Path) -> Option<PathBuf> {
    let stem = run_path.file_stem()?.to_str()?;
    // `Path::new(".")` is free — no closure needed (clippy: unnecessary_lazy_evaluations).
    // NOTE(review): for bare filenames `parent()` yields `Some("")`, so the "."
    // fallback only fires for root-like paths — confirm that is the intent.
    let parent = run_path.parent().unwrap_or(Path::new("."));
    Some(parent.join(format!("{stem}-llm/llm_transcript.jsonl")))
}
1098
1099fn json_string_array(value: Option<&serde_json::Value>) -> Vec<String> {
1100 value
1101 .and_then(|value| value.as_array())
1102 .map(|items| {
1103 items
1104 .iter()
1105 .filter_map(|item| item.as_str().map(str::to_string))
1106 .collect::<Vec<_>>()
1107 })
1108 .unwrap_or_default()
1109}
1110
1111fn json_usize(value: Option<&serde_json::Value>) -> usize {
1112 value.and_then(|value| value.as_u64()).unwrap_or_default() as usize
1113}
1114
1115fn json_bool(value: Option<&serde_json::Value>) -> Option<bool> {
1116 value.and_then(|value| value.as_bool())
1117}
1118
1119fn stage_result_payload(stage: &RunStageRecord) -> Option<&serde_json::Value> {
1120 stage
1121 .artifacts
1122 .iter()
1123 .find_map(|artifact| artifact.data.as_ref())
1124}
1125
1126fn task_ledger_summary_from_value(value: &serde_json::Value) -> Option<RunTaskLedgerSummaryRecord> {
1127 let deliverables = value
1128 .get("deliverables")
1129 .and_then(|raw| raw.as_array())
1130 .map(|items| {
1131 items
1132 .iter()
1133 .map(|item| RunDeliverableSummaryRecord {
1134 id: item
1135 .get("id")
1136 .and_then(|value| value.as_str())
1137 .unwrap_or_default()
1138 .to_string(),
1139 text: item
1140 .get("text")
1141 .and_then(|value| value.as_str())
1142 .unwrap_or_default()
1143 .to_string(),
1144 status: item
1145 .get("status")
1146 .and_then(|value| value.as_str())
1147 .unwrap_or_default()
1148 .to_string(),
1149 note: item
1150 .get("note")
1151 .and_then(|value| value.as_str())
1152 .map(str::to_string),
1153 })
1154 .collect::<Vec<_>>()
1155 })
1156 .unwrap_or_default();
1157 let observations = json_string_array(value.get("observations"));
1158 let root_task = value
1159 .get("root_task")
1160 .and_then(|value| value.as_str())
1161 .unwrap_or_default()
1162 .to_string();
1163 let rationale = value
1164 .get("rationale")
1165 .and_then(|value| value.as_str())
1166 .unwrap_or_default()
1167 .to_string();
1168 if root_task.is_empty()
1169 && rationale.is_empty()
1170 && deliverables.is_empty()
1171 && observations.is_empty()
1172 {
1173 return None;
1174 }
1175 let blocking_count = deliverables
1176 .iter()
1177 .filter(|deliverable| matches!(deliverable.status.as_str(), "open" | "blocked"))
1178 .count();
1179 Some(RunTaskLedgerSummaryRecord {
1180 root_task,
1181 rationale,
1182 deliverables,
1183 observations,
1184 blocking_count,
1185 })
1186}
1187
1188fn compaction_events_from_transcript(
1189 transcript: &serde_json::Value,
1190 stage_id: Option<&str>,
1191 node_id: Option<&str>,
1192 location_prefix: &str,
1193 persisted_path: Option<&Path>,
1194) -> Vec<CompactionEventRecord> {
1195 let transcript_id = transcript
1196 .get("id")
1197 .and_then(|value| value.as_str())
1198 .map(str::to_string);
1199 let asset_ids = transcript
1200 .get("assets")
1201 .and_then(|value| value.as_array())
1202 .map(|assets| {
1203 assets
1204 .iter()
1205 .filter_map(|asset| {
1206 asset
1207 .get("id")
1208 .and_then(|value| value.as_str())
1209 .map(str::to_string)
1210 })
1211 .collect::<BTreeSet<_>>()
1212 })
1213 .unwrap_or_default();
1214 transcript
1215 .get("events")
1216 .and_then(|value| value.as_array())
1217 .map(|events| {
1218 events
1219 .iter()
1220 .filter(|event| {
1221 event.get("kind").and_then(|value| value.as_str()) == Some("compaction")
1222 })
1223 .map(|event| {
1224 let metadata = event.get("metadata");
1225 let snapshot_asset_id = metadata
1226 .and_then(|value| value.get("snapshot_asset_id"))
1227 .and_then(|value| value.as_str())
1228 .map(str::to_string);
1229 let available = snapshot_asset_id
1230 .as_ref()
1231 .is_some_and(|asset_id| asset_ids.contains(asset_id));
1232 let snapshot_location = snapshot_asset_id
1233 .as_ref()
1234 .map(|asset_id| format!("{location_prefix}.assets[{asset_id}]"))
1235 .unwrap_or_else(|| location_prefix.to_string());
1236 CompactionEventRecord {
1237 id: event
1238 .get("id")
1239 .and_then(|value| value.as_str())
1240 .unwrap_or_default()
1241 .to_string(),
1242 transcript_id: transcript_id.clone(),
1243 stage_id: stage_id.map(str::to_string),
1244 node_id: node_id.map(str::to_string),
1245 mode: metadata
1246 .and_then(|value| value.get("mode"))
1247 .and_then(|value| value.as_str())
1248 .unwrap_or_default()
1249 .to_string(),
1250 strategy: metadata
1251 .and_then(|value| value.get("strategy"))
1252 .and_then(|value| value.as_str())
1253 .unwrap_or_default()
1254 .to_string(),
1255 archived_messages: json_usize(
1256 metadata.and_then(|value| value.get("archived_messages")),
1257 ),
1258 estimated_tokens_before: json_usize(
1259 metadata.and_then(|value| value.get("estimated_tokens_before")),
1260 ),
1261 estimated_tokens_after: json_usize(
1262 metadata.and_then(|value| value.get("estimated_tokens_after")),
1263 ),
1264 snapshot_asset_id,
1265 snapshot_location,
1266 snapshot_path: persisted_path
1267 .map(|path| path.to_string_lossy().into_owned()),
1268 available,
1269 }
1270 })
1271 .collect()
1272 })
1273 .unwrap_or_default()
1274}
1275
1276fn daemon_events_from_sidecar(run_path: &Path) -> Vec<DaemonEventRecord> {
1277 let Some(sidecar_path) = llm_transcript_sidecar_path(run_path) else {
1278 return Vec::new();
1279 };
1280 let Ok(content) = std::fs::read_to_string(sidecar_path) else {
1281 return Vec::new();
1282 };
1283
1284 content
1285 .lines()
1286 .filter(|line| !line.trim().is_empty())
1287 .filter_map(|line| serde_json::from_str::<serde_json::Value>(line).ok())
1288 .filter(|event| event.get("type").and_then(|value| value.as_str()) == Some("daemon_event"))
1289 .filter_map(|event| serde_json::from_value::<DaemonEventRecord>(event).ok())
1290 .collect()
1291}
1292
/// Build the full observability snapshot for a run record: the action graph
/// (nodes + edges), planner rounds, verification outcomes, worker lineage,
/// transcript pointers, compaction events, and daemon events.
///
/// `persisted_path` is the on-disk location of the run file, used to locate
/// the LLM transcript sidecar and to label compaction snapshot paths.
pub fn derive_run_observability(
    run: &RunRecord,
    persisted_path: Option<&Path>,
) -> RunObservabilityRecord {
    let mut action_graph_nodes = Vec::new();
    let mut action_graph_edges = Vec::new();
    let mut verification_outcomes = Vec::new();
    let mut planner_rounds = Vec::new();
    let mut transcript_pointers = Vec::new();
    let mut compaction_events = Vec::new();
    let mut daemon_events = Vec::new();
    let mut research_fact_count = 0usize;

    // Root node representing the run itself; everything else hangs off it.
    let root_node_id = format!("run:{}", run.id);
    let trigger_event = trigger_event_from_run(run);
    let propagated_trace_id = run_trace_id(run, trigger_event.as_ref());
    append_action_graph_node(
        &mut action_graph_nodes,
        RunActionGraphNodeRecord {
            id: root_node_id.clone(),
            label: run
                .workflow_name
                .clone()
                .unwrap_or_else(|| run.workflow_id.clone()),
            kind: ACTION_GRAPH_NODE_KIND_RUN.to_string(),
            status: run.status.clone(),
            outcome: run.status.clone(),
            trace_id: propagated_trace_id.clone(),
            stage_id: None,
            node_id: None,
            worker_id: None,
            run_id: Some(run.id.clone()),
            run_path: run.persisted_path.clone(),
            metadata: BTreeMap::from([(
                "workflow_id".to_string(),
                serde_json::json!(run.workflow_id),
            )]),
        },
    );
    // Stages without explicit incoming transitions attach to this node:
    // the run root by default, or the trigger node when one exists.
    let mut entry_node_id = root_node_id.clone();
    if let Some(trigger_event) = trigger_event.as_ref() {
        // When this run replays an earlier event, add a historical trigger
        // node and a replay-chain edge pointing at the current trigger.
        if let Some(replay_of_event_id) = replay_of_event_id_from_run(run) {
            let replay_source_node_id = format!("trigger:{replay_of_event_id}");
            append_action_graph_node(
                &mut action_graph_nodes,
                RunActionGraphNodeRecord {
                    id: replay_source_node_id.clone(),
                    label: format!(
                        "{}:{} (original {})",
                        trigger_event.provider.as_str(),
                        trigger_event.kind,
                        replay_of_event_id
                    ),
                    kind: ACTION_GRAPH_NODE_KIND_TRIGGER.to_string(),
                    status: "historical".to_string(),
                    outcome: "replayed_from".to_string(),
                    trace_id: Some(trigger_event.trace_id.0.clone()),
                    stage_id: None,
                    node_id: None,
                    worker_id: None,
                    run_id: Some(run.id.clone()),
                    run_path: run.persisted_path.clone(),
                    metadata: trigger_node_metadata(trigger_event),
                },
            );
            action_graph_edges.push(RunActionGraphEdgeRecord {
                from_id: replay_source_node_id,
                to_id: format!("trigger:{}", trigger_event.id.0),
                kind: ACTION_GRAPH_EDGE_KIND_REPLAY_CHAIN.to_string(),
                label: Some("replay chain".to_string()),
            });
        }
        // Node for the trigger event that started this run, plus an entry
        // edge from the run root.
        let trigger_node_id = format!("trigger:{}", trigger_event.id.0);
        append_action_graph_node(
            &mut action_graph_nodes,
            RunActionGraphNodeRecord {
                id: trigger_node_id.clone(),
                label: format!("{}:{}", trigger_event.provider.as_str(), trigger_event.kind),
                kind: ACTION_GRAPH_NODE_KIND_TRIGGER.to_string(),
                status: "received".to_string(),
                outcome: signature_status_label(&trigger_event.signature_status).to_string(),
                trace_id: Some(trigger_event.trace_id.0.clone()),
                stage_id: None,
                node_id: None,
                worker_id: None,
                run_id: Some(run.id.clone()),
                run_path: run.persisted_path.clone(),
                metadata: trigger_node_metadata(trigger_event),
            },
        );
        action_graph_edges.push(RunActionGraphEdgeRecord {
            from_id: root_node_id.clone(),
            to_id: trigger_node_id.clone(),
            kind: ACTION_GRAPH_EDGE_KIND_ENTRY.to_string(),
            label: Some(trigger_event.id.0.clone()),
        });
        entry_node_id = trigger_node_id;
    }

    // Lookup tables: stage id -> graph node id, stage id -> stage record,
    // workflow node id -> graph node id.
    let stage_node_ids = run
        .stages
        .iter()
        .map(|stage| (stage.id.clone(), format!("stage:{}", stage.id)))
        .collect::<BTreeMap<_, _>>();
    let stage_by_id = run
        .stages
        .iter()
        .map(|stage| (stage.id.as_str(), stage))
        .collect::<BTreeMap<_, _>>();
    let stage_by_node_id = run
        .stages
        .iter()
        .map(|stage| (stage.node_id.clone(), format!("stage:{}", stage.id)))
        .collect::<BTreeMap<_, _>>();

    // Workflow nodes that are the target of at least one recorded transition;
    // stages NOT in this set are entry points and get an edge from entry_node_id.
    let incoming_nodes = run
        .transitions
        .iter()
        .map(|transition| transition.to_node_id.clone())
        .collect::<BTreeSet<_>>();

    for stage in &run.stages {
        let graph_node_id = stage_node_ids
            .get(&stage.id)
            .cloned()
            .unwrap_or_else(|| format!("stage:{}", stage.id));
        append_action_graph_node(
            &mut action_graph_nodes,
            RunActionGraphNodeRecord {
                id: graph_node_id.clone(),
                label: stage.node_id.clone(),
                kind: action_graph_kind_for_stage(stage).to_string(),
                status: stage.status.clone(),
                outcome: stage.outcome.clone(),
                trace_id: propagated_trace_id.clone(),
                stage_id: Some(stage.id.clone()),
                node_id: Some(stage.node_id.clone()),
                worker_id: stage
                    .metadata
                    .get("worker_id")
                    .and_then(|value| value.as_str())
                    .map(str::to_string),
                run_id: None,
                run_path: None,
                metadata: stage_node_metadata(stage),
            },
        );
        // Entry-point stage: connect it to the trigger (or run root).
        if !incoming_nodes.contains(&stage.node_id) {
            action_graph_edges.push(RunActionGraphEdgeRecord {
                from_id: entry_node_id.clone(),
                to_id: graph_node_id.clone(),
                kind: if trigger_event.is_some() {
                    ACTION_GRAPH_EDGE_KIND_TRIGGER_DISPATCH.to_string()
                } else {
                    ACTION_GRAPH_EDGE_KIND_ENTRY.to_string()
                },
                label: None,
            });
        }

        // Verification outcome: explicit "pass"/"success" flags win; otherwise
        // infer from the stage's status/outcome, or leave undetermined.
        if stage.kind == "verify" || stage.verification.is_some() {
            let passed = json_bool(
                stage
                    .verification
                    .as_ref()
                    .and_then(|value| value.get("pass")),
            )
            .or_else(|| {
                json_bool(
                    stage
                        .verification
                        .as_ref()
                        .and_then(|value| value.get("success")),
                )
            })
            .or_else(|| {
                if stage.status == "completed" && stage.outcome == "success" {
                    Some(true)
                } else if stage.status == "failed" || stage.outcome == "failed" {
                    Some(false)
                } else {
                    None
                }
            });
            verification_outcomes.push(RunVerificationOutcomeRecord {
                stage_id: stage.id.clone(),
                node_id: stage.node_id.clone(),
                status: stage.status.clone(),
                passed,
                summary: stage
                    .verification
                    .as_ref()
                    .map(compact_json_value)
                    .or_else(|| {
                        stage
                            .visible_text
                            .as_ref()
                            .filter(|value| !value.trim().is_empty())
                            .cloned()
                    }),
            });
        }

        // Embedded stage transcript: record a pointer and harvest compaction events.
        if stage.transcript.is_some() {
            transcript_pointers.push(RunTranscriptPointerRecord {
                id: format!("stage:{}:transcript", stage.id),
                label: format!("Stage {} transcript", stage.node_id),
                kind: "embedded_transcript".to_string(),
                location: format!("run.stages[{}].transcript", stage.node_id),
                path: run.persisted_path.clone(),
                available: true,
            });
            if let Some(transcript) = stage.transcript.as_ref() {
                compaction_events.extend(compaction_events_from_transcript(
                    transcript,
                    Some(&stage.id),
                    Some(&stage.node_id),
                    &format!("run.stages[{}].transcript", stage.node_id),
                    persisted_path,
                ));
            }
        }

        // Planner-round metrics come from the stage's first inline artifact
        // payload; rounds with no agentic activity at all are dropped.
        if let Some(payload) = stage_result_payload(stage) {
            let trace = payload.get("trace");
            let task_ledger = payload
                .get("task_ledger")
                .and_then(task_ledger_summary_from_value);
            let research_facts = task_ledger
                .as_ref()
                .map(|ledger| ledger.observations.clone())
                .unwrap_or_default();
            research_fact_count += research_facts.len();
            let tools_payload = payload.get("tools");
            // Tool call list: prefer tools.calls, fall back to trace.tools_used.
            let tools_used = json_string_array(
                tools_payload
                    .and_then(|tools| tools.get("calls"))
                    .or_else(|| trace.and_then(|trace| trace.get("tools_used"))),
            );
            let successful_tools =
                json_string_array(tools_payload.and_then(|tools| tools.get("successful")));
            let planner_round = RunPlannerRoundRecord {
                stage_id: stage.id.clone(),
                node_id: stage.node_id.clone(),
                stage_kind: stage.kind.clone(),
                status: stage.status.clone(),
                outcome: stage.outcome.clone(),
                iteration_count: json_usize(trace.and_then(|trace| trace.get("iterations"))),
                llm_call_count: json_usize(trace.and_then(|trace| trace.get("llm_calls"))),
                tool_execution_count: json_usize(
                    trace.and_then(|trace| trace.get("tool_executions")),
                ),
                tool_rejection_count: json_usize(
                    trace.and_then(|trace| trace.get("tool_rejections")),
                ),
                intervention_count: json_usize(trace.and_then(|trace| trace.get("interventions"))),
                compaction_count: json_usize(trace.and_then(|trace| trace.get("compactions"))),
                native_text_tool_fallback_count: json_usize(
                    trace.and_then(|trace| trace.get("native_text_tool_fallbacks")),
                ),
                native_text_tool_fallback_rejection_count: json_usize(
                    trace.and_then(|trace| trace.get("native_text_tool_fallback_rejections")),
                ),
                empty_completion_retry_count: json_usize(
                    trace.and_then(|trace| trace.get("empty_completion_retries")),
                ),
                tools_used,
                successful_tools,
                ledger_done_rejections: json_usize(payload.get("ledger_done_rejections")),
                task_ledger,
                research_facts,
            };
            let has_agentic_detail = planner_round.iteration_count > 0
                || planner_round.llm_call_count > 0
                || planner_round.tool_execution_count > 0
                || planner_round.native_text_tool_fallback_count > 0
                || planner_round.native_text_tool_fallback_rejection_count > 0
                || planner_round.empty_completion_retry_count > 0
                || planner_round.ledger_done_rejections > 0
                || planner_round.task_ledger.is_some()
                || !planner_round.tools_used.is_empty()
                || !planner_round.successful_tools.is_empty();
            if has_agentic_detail {
                planner_rounds.push(planner_round);
            }
        }
    }

    // Edges for recorded transitions. Sources resolve: explicit from_stage_id,
    // then from_node_id, then the run root; condition sources gate predicates.
    for transition in &run.transitions {
        let Some(to_id) = stage_by_node_id.get(&transition.to_node_id).cloned() else {
            continue;
        };
        let from_stage = transition
            .from_stage_id
            .as_deref()
            .and_then(|stage_id| stage_by_id.get(stage_id).copied());
        let from_id = transition
            .from_stage_id
            .as_ref()
            .and_then(|stage_id| stage_node_ids.get(stage_id))
            .cloned()
            .or_else(|| {
                transition
                    .from_node_id
                    .as_ref()
                    .and_then(|node_id| stage_by_node_id.get(node_id))
                    .cloned()
            })
            .unwrap_or_else(|| root_node_id.clone());
        action_graph_edges.push(RunActionGraphEdgeRecord {
            from_id,
            to_id,
            kind: if from_stage.is_some_and(|stage| stage.kind == "condition") {
                ACTION_GRAPH_EDGE_KIND_PREDICATE_GATE.to_string()
            } else {
                ACTION_GRAPH_EDGE_KIND_TRANSITION.to_string()
            },
            label: transition.branch.clone(),
        });
    }

    // Worker nodes for each child run, delegation edges from their parent
    // stages, and the flat lineage records returned to the caller.
    let worker_lineage = run
        .child_runs
        .iter()
        .map(|child| {
            let worker_node_id = format!("worker:{}", child.worker_id);
            append_action_graph_node(
                &mut action_graph_nodes,
                RunActionGraphNodeRecord {
                    id: worker_node_id.clone(),
                    label: child.worker_name.clone(),
                    kind: ACTION_GRAPH_NODE_KIND_WORKER.to_string(),
                    status: child.status.clone(),
                    outcome: child.status.clone(),
                    trace_id: propagated_trace_id.clone(),
                    stage_id: child.parent_stage_id.clone(),
                    node_id: None,
                    worker_id: Some(child.worker_id.clone()),
                    run_id: child.run_id.clone(),
                    run_path: child.run_path.clone(),
                    metadata: BTreeMap::from([
                        (
                            "worker_name".to_string(),
                            serde_json::json!(child.worker_name),
                        ),
                        ("task".to_string(), serde_json::json!(child.task)),
                    ]),
                },
            );
            if let Some(parent_stage_id) = child.parent_stage_id.as_ref() {
                if let Some(stage_node_id) = stage_node_ids.get(parent_stage_id) {
                    action_graph_edges.push(RunActionGraphEdgeRecord {
                        from_id: stage_node_id.clone(),
                        to_id: worker_node_id,
                        kind: ACTION_GRAPH_EDGE_KIND_DELEGATES.to_string(),
                        label: Some(child.worker_name.clone()),
                    });
                }
            }
            RunWorkerLineageRecord {
                worker_id: child.worker_id.clone(),
                worker_name: child.worker_name.clone(),
                parent_stage_id: child.parent_stage_id.clone(),
                task: child.task.clone(),
                status: child.status.clone(),
                session_id: child.session_id.clone(),
                parent_session_id: child.parent_session_id.clone(),
                run_id: child.run_id.clone(),
                run_path: child.run_path.clone(),
                snapshot_path: child.snapshot_path.clone(),
            }
        })
        .collect::<Vec<_>>();

    // Run-level embedded transcript: pointer plus compaction events.
    if run.transcript.is_some() {
        transcript_pointers.push(RunTranscriptPointerRecord {
            id: "run:transcript".to_string(),
            label: "Run transcript".to_string(),
            kind: "embedded_transcript".to_string(),
            location: "run.transcript".to_string(),
            path: run.persisted_path.clone(),
            available: true,
        });
        if let Some(transcript) = run.transcript.as_ref() {
            compaction_events.extend(compaction_events_from_transcript(
                transcript,
                None,
                None,
                "run.transcript",
                persisted_path,
            ));
        }
    }

    // LLM transcript sidecar pointer (availability checked on disk) and any
    // daemon events recorded in it.
    if let Some(path) = persisted_path {
        if let Some(sidecar_path) = llm_transcript_sidecar_path(path) {
            transcript_pointers.push(RunTranscriptPointerRecord {
                id: "run:llm_transcript".to_string(),
                label: "LLM transcript sidecar".to_string(),
                kind: "llm_jsonl".to_string(),
                location: "run sidecar".to_string(),
                path: Some(sidecar_path.to_string_lossy().into_owned()),
                available: sidecar_path.exists(),
            });
        }
        daemon_events.extend(daemon_events_from_sidecar(path));
    }

    RunObservabilityRecord {
        schema_version: 4,
        planner_rounds,
        research_fact_count,
        action_graph_nodes,
        action_graph_edges,
        worker_lineage,
        verification_outcomes,
        transcript_pointers,
        compaction_events,
        daemon_events,
    }
}
1714
1715fn refresh_run_observability(run: &mut RunRecord, persisted_path: Option<&Path>) {
1716 run.observability = Some(derive_run_observability(run, persisted_path));
1717}
1718
1719pub fn normalize_run_record(value: &VmValue) -> Result<RunRecord, VmError> {
1720 let mut run: RunRecord = parse_json_payload(vm_value_to_json(value), "run_record")?;
1721 if run.type_name.is_empty() {
1722 run.type_name = "run_record".to_string();
1723 }
1724 if run.id.is_empty() {
1725 run.id = new_id("run");
1726 }
1727 if run.started_at.is_empty() {
1728 run.started_at = now_rfc3339();
1729 }
1730 if run.status.is_empty() {
1731 run.status = "running".to_string();
1732 }
1733 if run.root_run_id.is_none() {
1734 run.root_run_id = Some(run.id.clone());
1735 }
1736 if run.replay_fixture.is_none() {
1737 run.replay_fixture = Some(replay_fixture_from_run(&run));
1738 }
1739 merge_hitl_questions_from_active_log(&mut run);
1740 materialize_child_runs_from_stage_metadata(&mut run);
1741 sync_run_handoffs(&mut run);
1742 if run.observability.is_none() {
1743 let persisted_path = run.persisted_path.clone();
1744 let persisted = persisted_path.as_deref().map(Path::new);
1745 refresh_run_observability(&mut run, persisted);
1746 }
1747 Ok(run)
1748}
1749
1750pub fn normalize_eval_suite_manifest(value: &VmValue) -> Result<EvalSuiteManifest, VmError> {
1751 let mut manifest: EvalSuiteManifest = parse_json_value(value)?;
1752 if manifest.type_name.is_empty() {
1753 manifest.type_name = "eval_suite_manifest".to_string();
1754 }
1755 if manifest.id.is_empty() {
1756 manifest.id = new_id("eval_suite");
1757 }
1758 Ok(manifest)
1759}
1760
1761pub fn load_eval_suite_manifest(path: &Path) -> Result<EvalSuiteManifest, VmError> {
1762 let content = std::fs::read_to_string(path)
1763 .map_err(|e| VmError::Runtime(format!("failed to read eval suite manifest: {e}")))?;
1764 let mut manifest: EvalSuiteManifest = serde_json::from_str(&content)
1765 .map_err(|e| VmError::Runtime(format!("failed to parse eval suite manifest: {e}")))?;
1766 if manifest.base_dir.is_none() {
1767 manifest.base_dir = path.parent().map(|parent| parent.display().to_string());
1768 }
1769 Ok(manifest)
1770}
1771
1772pub fn load_eval_pack_manifest(path: &Path) -> Result<EvalPackManifest, VmError> {
1773 let content = std::fs::read_to_string(path)
1774 .map_err(|e| VmError::Runtime(format!("failed to read eval pack manifest: {e}")))?;
1775 let mut manifest: EvalPackManifest =
1776 if path.extension().and_then(|ext| ext.to_str()) == Some("json") {
1777 serde_json::from_str(&content)
1778 .map_err(|e| VmError::Runtime(format!("failed to parse eval pack JSON: {e}")))?
1779 } else {
1780 toml::from_str(&content)
1781 .map_err(|e| VmError::Runtime(format!("failed to parse eval pack TOML: {e}")))?
1782 };
1783 normalize_eval_pack_manifest(&mut manifest);
1784 if manifest.base_dir.is_none() {
1785 manifest.base_dir = path.parent().map(|parent| parent.display().to_string());
1786 }
1787 Ok(manifest)
1788}
1789
1790pub fn normalize_eval_pack_manifest_value(value: &VmValue) -> Result<EvalPackManifest, VmError> {
1791 let mut manifest: EvalPackManifest = parse_json_value(value)?;
1792 normalize_eval_pack_manifest(&mut manifest);
1793 Ok(manifest)
1794}
1795
1796fn normalize_eval_pack_manifest(manifest: &mut EvalPackManifest) {
1797 if manifest.version == 0 {
1798 manifest.version = 1;
1799 }
1800 if manifest.id.is_empty() {
1801 manifest.id = manifest
1802 .name
1803 .clone()
1804 .filter(|name| !name.trim().is_empty())
1805 .unwrap_or_else(|| new_id("eval_pack"));
1806 }
1807}
1808
1809fn load_replay_fixture(path: &Path) -> Result<ReplayFixture, VmError> {
1810 let content = std::fs::read_to_string(path)
1811 .map_err(|e| VmError::Runtime(format!("failed to read replay fixture: {e}")))?;
1812 serde_json::from_str(&content)
1813 .map_err(|e| VmError::Runtime(format!("failed to parse replay fixture: {e}")))
1814}
1815
1816fn load_run_record_from_fixture_ref(
1817 fixture: &EvalPackFixtureRef,
1818 base_dir: Option<&Path>,
1819) -> Result<RunRecord, VmError> {
1820 if let Some(inline) = &fixture.inline {
1821 let run: RunRecord = serde_json::from_value(inline.clone())
1822 .map_err(|e| VmError::Runtime(format!("failed to parse inline run record: {e}")))?;
1823 return Ok(run);
1824 }
1825 let path = fixture.path.as_deref().ok_or_else(|| {
1826 VmError::Runtime(format!(
1827 "fixture '{}' is missing path or inline run",
1828 fixture.id
1829 ))
1830 })?;
1831 load_run_record(&resolve_manifest_path(base_dir, path))
1832}
1833
1834fn load_replay_fixture_from_ref(
1835 fixture: &EvalPackFixtureRef,
1836 base_dir: Option<&Path>,
1837) -> Result<ReplayFixture, VmError> {
1838 if let Some(inline) = &fixture.inline {
1839 return serde_json::from_value(inline.clone())
1840 .map_err(|e| VmError::Runtime(format!("failed to parse inline replay fixture: {e}")));
1841 }
1842 let path = fixture.path.as_deref().ok_or_else(|| {
1843 VmError::Runtime(format!(
1844 "fixture '{}' is missing path or inline replay fixture",
1845 fixture.id
1846 ))
1847 })?;
1848 load_replay_fixture(&resolve_manifest_path(base_dir, path))
1849}
1850
/// Resolve a manifest-relative path: absolute paths pass through unchanged;
/// relative ones are joined onto `base_dir` when one is available.
fn resolve_manifest_path(base_dir: Option<&Path>, path: &str) -> PathBuf {
    let candidate = PathBuf::from(path);
    match base_dir {
        Some(base) if !candidate.is_absolute() => base.join(candidate),
        _ => candidate,
    }
}
1861
1862pub fn evaluate_run_suite_manifest(
1863 manifest: &EvalSuiteManifest,
1864) -> Result<ReplayEvalSuiteReport, VmError> {
1865 let base_dir = manifest.base_dir.as_deref().map(Path::new);
1866 let mut reports = Vec::new();
1867 for case in &manifest.cases {
1868 let run_path = resolve_manifest_path(base_dir, &case.run_path);
1869 let run = load_run_record(&run_path)?;
1870 let fixture = match &case.fixture_path {
1871 Some(path) => load_replay_fixture(&resolve_manifest_path(base_dir, path))?,
1872 None => run
1873 .replay_fixture
1874 .clone()
1875 .unwrap_or_else(|| replay_fixture_from_run(&run)),
1876 };
1877 let eval = evaluate_run_against_fixture(&run, &fixture);
1878 let mut pass = eval.pass;
1879 let mut failures = eval.failures;
1880 let comparison = match &case.compare_to {
1881 Some(path) => {
1882 let baseline_path = resolve_manifest_path(base_dir, path);
1883 let baseline = load_run_record(&baseline_path)?;
1884 let diff = diff_run_records(&baseline, &run);
1885 if !diff.identical {
1886 pass = false;
1887 failures.push(format!(
1888 "run differs from baseline {} with {} stage changes",
1889 baseline_path.display(),
1890 diff.stage_diffs.len()
1891 ));
1892 }
1893 Some(diff)
1894 }
1895 None => None,
1896 };
1897 reports.push(ReplayEvalCaseReport {
1898 run_id: run.id.clone(),
1899 workflow_id: run.workflow_id.clone(),
1900 label: case.label.clone(),
1901 pass,
1902 failures,
1903 stage_count: eval.stage_count,
1904 source_path: Some(run_path.display().to_string()),
1905 comparison,
1906 });
1907 }
1908 let total = reports.len();
1909 let passed = reports.iter().filter(|report| report.pass).count();
1910 let failed = total.saturating_sub(passed);
1911 Ok(ReplayEvalSuiteReport {
1912 pass: failed == 0,
1913 total,
1914 passed,
1915 failed,
1916 cases: reports,
1917 })
1918}
1919
/// Evaluate every case in an eval pack and aggregate a report.
///
/// Cases with `friction_events` are delegated to the friction evaluator.
/// Other cases run fixture evaluation, threshold checks (pack defaults then
/// per-case), optional baseline diffing, and rubric application. A case's
/// severity decides routing: failures on non-blocking cases are demoted to
/// warnings ("warning" severity) or informational notes, and only blocking
/// failures can fail the pack as a whole.
pub fn evaluate_eval_pack_manifest(manifest: &EvalPackManifest) -> Result<EvalPackReport, VmError> {
    let base_dir = manifest.base_dir.as_deref().map(Path::new);
    // Fixtures resolve against the configured fixture root when present,
    // otherwise against the manifest's base dir.
    let fixture_base_dir_buf = manifest
        .defaults
        .fixture_root
        .as_deref()
        .map(|root| resolve_manifest_path(base_dir, root));
    let fixture_base_dir = fixture_base_dir_buf.as_deref().or(base_dir);
    // Id-keyed lookup tables; entries with empty ids are unaddressable and skipped.
    let fixtures_by_id: BTreeMap<&str, &EvalPackFixtureRef> = manifest
        .fixtures
        .iter()
        .filter(|fixture| !fixture.id.is_empty())
        .map(|fixture| (fixture.id.as_str(), fixture))
        .collect();
    let rubrics_by_id: BTreeMap<&str, &EvalPackRubric> = manifest
        .rubrics
        .iter()
        .filter(|rubric| !rubric.id.is_empty())
        .map(|rubric| (rubric.id.as_str(), rubric))
        .collect();

    let mut reports = Vec::new();
    for (index, case) in manifest.cases.iter().enumerate() {
        // Stable case id/label fallbacks: blank ids become "case_<n>" (1-based).
        let case_id = case
            .id
            .clone()
            .filter(|id| !id.trim().is_empty())
            .unwrap_or_else(|| format!("case_{}", index + 1));
        let label = case
            .name
            .clone()
            .or_else(|| case.id.clone())
            .unwrap_or_else(|| case_id.clone());
        let severity = eval_pack_case_severity(manifest, case);
        let blocking = severity == "blocking";
        let mut failures = Vec::new();
        let mut warnings = Vec::new();
        let mut informational = Vec::new();

        // Friction cases follow a dedicated evaluation path.
        if case.friction_events.is_some() {
            let report = evaluate_eval_pack_friction_case(
                manifest,
                case,
                &case_id,
                &label,
                &severity,
                blocking,
                base_dir,
                fixture_base_dir,
                &fixtures_by_id,
                &rubrics_by_id,
            )?;
            reports.push(report);
            continue;
        }

        let run = load_eval_pack_case_run(case, base_dir, fixture_base_dir, &fixtures_by_id)?;
        let fixture =
            load_eval_pack_case_fixture(case, base_dir, fixture_base_dir, &fixtures_by_id, &run)?;
        let eval = evaluate_run_against_fixture(&run, &fixture);
        failures.extend(eval.failures);
        // Pack-wide thresholds first, then the case's own.
        apply_eval_pack_thresholds(&run, &manifest.defaults.thresholds, &mut failures);
        apply_eval_pack_thresholds(&run, &case.thresholds, &mut failures);

        // Baseline: the case override wins over the pack-level baseline.
        let comparison = match case.compare_to.as_ref().or(manifest.baseline.as_ref()) {
            Some(path) => {
                let baseline_path = resolve_manifest_path(base_dir, path);
                let baseline = load_run_record(&baseline_path)?;
                let diff = diff_run_records(&baseline, &run);
                if !diff.identical {
                    failures.push(format!(
                        "run differs from baseline {} with {} stage changes",
                        baseline_path.display(),
                        diff.stage_diffs.len()
                    ));
                }
                Some(diff)
            }
            None => None,
        };

        // Unknown rubric references are themselves failures.
        for rubric_id in &case.rubrics {
            let Some(rubric) = rubrics_by_id.get(rubric_id.as_str()) else {
                failures.push(format!("case references unknown rubric '{rubric_id}'"));
                continue;
            };
            apply_eval_pack_rubric(rubric, &run, &mut failures, &mut warnings);
        }

        // Non-blocking cases always pass; their failures are demoted to
        // warnings or informational notes according to severity.
        let pass = failures.is_empty() || !blocking;
        if !failures.is_empty() && !blocking {
            if severity == "warning" {
                warnings.append(&mut failures);
            } else {
                informational.append(&mut failures);
            }
        }
        reports.push(EvalPackCaseReport {
            id: case_id,
            label,
            severity,
            pass,
            blocking,
            run_id: run.id.clone(),
            workflow_id: run.workflow_id.clone(),
            source_path: eval_pack_case_source_path(
                case,
                base_dir,
                fixture_base_dir,
                &fixtures_by_id,
            ),
            stage_count: eval.stage_count,
            failures,
            warnings,
            informational,
            comparison,
        });
    }

    // Aggregate counters; only blocking failures fail the pack overall.
    let total = reports.len();
    let blocking_failed = reports
        .iter()
        .filter(|report| report.blocking && !report.failures.is_empty())
        .count();
    let warning_failed = reports
        .iter()
        .filter(|report| !report.warnings.is_empty())
        .count();
    let informational_failed = reports
        .iter()
        .filter(|report| !report.informational.is_empty())
        .count();
    let passed = reports.iter().filter(|report| report.pass).count();
    Ok(EvalPackReport {
        pack_id: manifest.id.clone(),
        pass: blocking_failed == 0,
        total,
        passed,
        failed: total.saturating_sub(passed),
        blocking_failed,
        warning_failed,
        informational_failed,
        cases: reports,
    })
}
2065
2066#[allow(clippy::too_many_arguments)]
2067fn evaluate_eval_pack_friction_case(
2068 manifest: &EvalPackManifest,
2069 case: &EvalPackCase,
2070 case_id: &str,
2071 label: &str,
2072 severity: &str,
2073 blocking: bool,
2074 base_dir: Option<&Path>,
2075 fixture_base_dir: Option<&Path>,
2076 fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2077 rubrics_by_id: &BTreeMap<&str, &EvalPackRubric>,
2078) -> Result<EvalPackCaseReport, VmError> {
2079 let mut failures = Vec::new();
2080 let mut warnings = Vec::new();
2081 let mut informational = Vec::new();
2082 let events =
2083 load_eval_pack_case_friction_events(case, base_dir, fixture_base_dir, fixtures_by_id)?;
2084 let options = friction_suggestion_options(case, manifest);
2085 let suggestions = generate_context_pack_suggestions(&events, &options);
2086
2087 for rubric_id in &case.rubrics {
2088 let Some(rubric) = rubrics_by_id.get(rubric_id.as_str()) else {
2089 failures.push(format!("case references unknown rubric '{rubric_id}'"));
2090 continue;
2091 };
2092 apply_eval_pack_friction_rubric(rubric, &suggestions, &mut failures, &mut warnings);
2093 }
2094
2095 if case.rubrics.is_empty() && suggestions.is_empty() {
2096 failures.push("friction fixture produced no context-pack suggestions".to_string());
2097 }
2098
2099 let pass = failures.is_empty() || !blocking;
2100 if !failures.is_empty() && !blocking {
2101 if severity == "warning" {
2102 warnings.append(&mut failures);
2103 } else {
2104 informational.append(&mut failures);
2105 }
2106 }
2107
2108 Ok(EvalPackCaseReport {
2109 id: case_id.to_string(),
2110 label: label.to_string(),
2111 severity: severity.to_string(),
2112 pass,
2113 blocking,
2114 run_id: "friction_events".to_string(),
2115 workflow_id: String::new(),
2116 source_path: eval_pack_case_friction_source_path(
2117 case,
2118 base_dir,
2119 fixture_base_dir,
2120 fixtures_by_id,
2121 ),
2122 stage_count: events.len(),
2123 failures,
2124 warnings,
2125 informational,
2126 comparison: None,
2127 })
2128}
2129
2130fn eval_pack_case_severity(manifest: &EvalPackManifest, case: &EvalPackCase) -> String {
2131 normalize_eval_pack_severity(
2132 case.severity
2133 .as_deref()
2134 .or(case.thresholds.severity.as_deref())
2135 .or(manifest.defaults.severity.as_deref())
2136 .or(manifest.defaults.thresholds.severity.as_deref())
2137 .unwrap_or("blocking"),
2138 )
2139}
2140
fn normalize_eval_pack_severity(value: &str) -> String {
    // Canonicalize a user-supplied severity label; anything unrecognized is
    // treated as the strictest level, "blocking".
    let canonical = value.trim().to_ascii_lowercase();
    if canonical == "warn" || canonical == "warning" {
        "warning".to_string()
    } else if canonical == "info" || canonical == "informational" {
        "informational".to_string()
    } else {
        "blocking".to_string()
    }
}
2148
2149fn load_eval_pack_case_run(
2150 case: &EvalPackCase,
2151 base_dir: Option<&Path>,
2152 fixture_base_dir: Option<&Path>,
2153 fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2154) -> Result<RunRecord, VmError> {
2155 if let Some(run_ref) = case.run.as_deref().or(case.run_path.as_deref()) {
2156 if let Some(fixture) = fixtures_by_id.get(run_ref) {
2157 return load_run_record_from_fixture_ref(fixture, fixture_base_dir);
2158 }
2159 return load_run_record(&resolve_manifest_path(base_dir, run_ref));
2160 }
2161 Err(VmError::Runtime(
2162 "eval pack case is missing run or run_path".to_string(),
2163 ))
2164}
2165
2166fn load_eval_pack_case_fixture(
2167 case: &EvalPackCase,
2168 base_dir: Option<&Path>,
2169 fixture_base_dir: Option<&Path>,
2170 fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2171 run: &RunRecord,
2172) -> Result<ReplayFixture, VmError> {
2173 if let Some(fixture_ref) = case.fixture.as_deref().or(case.fixture_path.as_deref()) {
2174 if let Some(fixture) = fixtures_by_id.get(fixture_ref) {
2175 return load_replay_fixture_from_ref(fixture, fixture_base_dir);
2176 }
2177 return load_replay_fixture(&resolve_manifest_path(base_dir, fixture_ref));
2178 }
2179 Ok(run
2180 .replay_fixture
2181 .clone()
2182 .unwrap_or_else(|| replay_fixture_from_run(run)))
2183}
2184
2185fn eval_pack_case_source_path(
2186 case: &EvalPackCase,
2187 base_dir: Option<&Path>,
2188 fixture_base_dir: Option<&Path>,
2189 fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2190) -> Option<String> {
2191 let run_ref = case.run.as_deref().or(case.run_path.as_deref())?;
2192 if let Some(fixture) = fixtures_by_id.get(run_ref) {
2193 return fixture.path.as_ref().map(|path| {
2194 resolve_manifest_path(fixture_base_dir, path)
2195 .display()
2196 .to_string()
2197 });
2198 }
2199 Some(
2200 resolve_manifest_path(base_dir, run_ref)
2201 .display()
2202 .to_string(),
2203 )
2204}
2205
2206fn load_eval_pack_case_friction_events(
2207 case: &EvalPackCase,
2208 base_dir: Option<&Path>,
2209 fixture_base_dir: Option<&Path>,
2210 fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2211) -> Result<Vec<FrictionEvent>, VmError> {
2212 let event_ref = case.friction_events.as_deref().ok_or_else(|| {
2213 VmError::Runtime("eval pack friction case is missing friction_events".to_string())
2214 })?;
2215 if let Some(fixture) = fixtures_by_id.get(event_ref) {
2216 return load_friction_events_from_fixture_ref(fixture, fixture_base_dir);
2217 }
2218 load_friction_events_from_path(&resolve_manifest_path(base_dir, event_ref))
2219}
2220
2221fn load_friction_events_from_fixture_ref(
2222 fixture: &EvalPackFixtureRef,
2223 base_dir: Option<&Path>,
2224) -> Result<Vec<FrictionEvent>, VmError> {
2225 if let Some(inline) = &fixture.inline {
2226 return normalize_friction_events_json(inline.clone());
2227 }
2228 let path = fixture.path.as_deref().ok_or_else(|| {
2229 VmError::Runtime(format!(
2230 "fixture '{}' is missing path or inline friction events",
2231 fixture.id
2232 ))
2233 })?;
2234 load_friction_events_from_path(&resolve_manifest_path(base_dir, path))
2235}
2236
2237fn load_friction_events_from_path(path: &Path) -> Result<Vec<FrictionEvent>, VmError> {
2238 let content = std::fs::read_to_string(path)
2239 .map_err(|e| VmError::Runtime(format!("failed to read friction events fixture: {e}")))?;
2240 let value: serde_json::Value = serde_json::from_str(&content)
2241 .map_err(|e| VmError::Runtime(format!("failed to parse friction events fixture: {e}")))?;
2242 normalize_friction_events_json(value)
2243}
2244
2245fn eval_pack_case_friction_source_path(
2246 case: &EvalPackCase,
2247 base_dir: Option<&Path>,
2248 fixture_base_dir: Option<&Path>,
2249 fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2250) -> Option<String> {
2251 let event_ref = case.friction_events.as_deref()?;
2252 if let Some(fixture) = fixtures_by_id.get(event_ref) {
2253 return fixture.path.as_ref().map(|path| {
2254 resolve_manifest_path(fixture_base_dir, path)
2255 .display()
2256 .to_string()
2257 });
2258 }
2259 Some(
2260 resolve_manifest_path(base_dir, event_ref)
2261 .display()
2262 .to_string(),
2263 )
2264}
2265
2266fn friction_suggestion_options(
2267 case: &EvalPackCase,
2268 manifest: &EvalPackManifest,
2269) -> ContextPackSuggestionOptions {
2270 let min_occurrences = case
2271 .metadata
2272 .get("min_occurrences")
2273 .or_else(|| manifest.metadata.get("min_occurrences"))
2274 .and_then(|value| value.as_u64())
2275 .unwrap_or(2) as usize;
2276 let owner = case
2277 .metadata
2278 .get("owner")
2279 .or_else(|| manifest.metadata.get("owner"))
2280 .and_then(|value| value.as_str())
2281 .map(str::to_string)
2282 .or_else(|| {
2283 manifest
2284 .package
2285 .as_ref()
2286 .and_then(|package| package.name.clone())
2287 });
2288 ContextPackSuggestionOptions {
2289 min_occurrences,
2290 owner,
2291 }
2292}
2293
2294fn apply_eval_pack_thresholds(
2295 run: &RunRecord,
2296 thresholds: &EvalPackThresholds,
2297 failures: &mut Vec<String>,
2298) {
2299 if let Some(max_stage_count) = thresholds.max_stage_count {
2300 if run.stages.len() > max_stage_count {
2301 failures.push(format!(
2302 "stage count {} exceeds threshold {}",
2303 run.stages.len(),
2304 max_stage_count
2305 ));
2306 }
2307 }
2308 if let Some(max_latency_ms) = thresholds.max_latency_ms {
2309 let actual = run
2310 .usage
2311 .as_ref()
2312 .map(|usage| usage.total_duration_ms)
2313 .unwrap_or_default();
2314 if actual > max_latency_ms {
2315 failures.push(format!(
2316 "latency {actual}ms exceeds threshold {max_latency_ms}ms"
2317 ));
2318 }
2319 }
2320 if let Some(max_cost_usd) = thresholds.max_cost_usd {
2321 let actual = run
2322 .usage
2323 .as_ref()
2324 .map(|usage| usage.total_cost)
2325 .unwrap_or_default();
2326 if actual > max_cost_usd {
2327 failures.push(format!(
2328 "cost ${actual:.6} exceeds threshold ${max_cost_usd:.6}"
2329 ));
2330 }
2331 }
2332 if let Some(max_tokens) = thresholds.max_tokens {
2333 let actual = run
2334 .usage
2335 .as_ref()
2336 .map(|usage| usage.input_tokens + usage.output_tokens)
2337 .unwrap_or_default();
2338 if actual > max_tokens {
2339 failures.push(format!(
2340 "token count {actual} exceeds threshold {max_tokens}"
2341 ));
2342 }
2343 }
2344}
2345
2346fn apply_eval_pack_rubric(
2347 rubric: &EvalPackRubric,
2348 run: &RunRecord,
2349 failures: &mut Vec<String>,
2350 warnings: &mut Vec<String>,
2351) {
2352 match rubric.kind.as_str() {
2353 "" | "deterministic" | "replay" | "budget" | "hitl" | "side-effect" => {
2354 apply_eval_pack_thresholds(run, &rubric.thresholds, failures);
2355 for assertion in &rubric.assertions {
2356 apply_eval_pack_assertion(rubric, assertion, run, failures);
2357 }
2358 }
2359 "llm-judge" | "llm_as_judge" | "judge" => {
2360 let severity = normalize_eval_pack_severity(
2361 rubric.thresholds.severity.as_deref().unwrap_or("blocking"),
2362 );
2363 let message = format!(
2364 "rubric '{}' requires an external LLM judge and was not run locally",
2365 rubric.id
2366 );
2367 if severity == "blocking" {
2368 failures.push(message);
2369 } else {
2370 warnings.push(message);
2371 }
2372 }
2373 other => warnings.push(format!(
2374 "rubric '{}' has unknown kind '{}' and was not run locally",
2375 rubric.id, other
2376 )),
2377 }
2378}
2379
2380fn apply_eval_pack_friction_rubric(
2381 rubric: &EvalPackRubric,
2382 suggestions: &[super::ContextPackSuggestion],
2383 failures: &mut Vec<String>,
2384 warnings: &mut Vec<String>,
2385) {
2386 match rubric.kind.as_str() {
2387 "" | "deterministic" | "friction" | "context-pack-suggestion" => {
2388 let mut expectations = Vec::new();
2389 for assertion in &rubric.assertions {
2390 match assertion.kind.as_str() {
2391 "context-pack-suggestion" | "context_pack_suggestion" | "suggestion" => {
2392 let expectation = context_pack_expectation_from_assertion(assertion);
2393 expectations.push(expectation);
2394 }
2395 other => failures.push(format!(
2396 "rubric '{}' has unsupported friction assertion kind '{}'",
2397 rubric.id, other
2398 )),
2399 }
2400 }
2401 failures.extend(evaluate_context_pack_suggestion_expectations(
2402 suggestions,
2403 &expectations,
2404 ));
2405 }
2406 other => warnings.push(format!(
2407 "rubric '{}' has unknown friction kind '{}' and was not run locally",
2408 rubric.id, other
2409 )),
2410 }
2411}
2412
2413fn context_pack_expectation_from_assertion(
2414 assertion: &EvalPackAssertion,
2415) -> ContextPackSuggestionExpectation {
2416 let expected = assertion
2417 .expected
2418 .as_ref()
2419 .and_then(|value| value.as_object());
2420 let expected_string = assertion.expected.as_ref().and_then(|value| value.as_str());
2421 ContextPackSuggestionExpectation {
2422 min_suggestions: expected
2423 .and_then(|map| map.get("min_suggestions"))
2424 .and_then(|value| value.as_u64())
2425 .map(|value| value as usize),
2426 recommended_artifact: expected
2427 .and_then(|map| map.get("recommended_artifact"))
2428 .and_then(|value| value.as_str())
2429 .map(str::to_string)
2430 .or_else(|| expected_string.map(str::to_string)),
2431 title_contains: assertion.contains.clone().or_else(|| {
2432 expected
2433 .and_then(|map| map.get("title_contains"))
2434 .and_then(|value| value.as_str())
2435 .map(str::to_string)
2436 }),
2437 manifest_name_contains: expected
2438 .and_then(|map| map.get("manifest_name_contains"))
2439 .and_then(|value| value.as_str())
2440 .map(str::to_string),
2441 required_capability: expected
2442 .and_then(|map| map.get("required_capability"))
2443 .and_then(|value| value.as_str())
2444 .map(str::to_string),
2445 required_output_slot: expected
2446 .and_then(|map| map.get("required_output_slot"))
2447 .and_then(|value| value.as_str())
2448 .map(str::to_string),
2449 }
2450}
2451
/// Applies one deterministic rubric assertion to a recorded run, appending a
/// descriptive message to `failures` for every violated expectation.
///
/// An empty assertion kind is a no-op; an unrecognized kind is itself a
/// failure.
fn apply_eval_pack_assertion(
    rubric: &EvalPackRubric,
    assertion: &EvalPackAssertion,
    run: &RunRecord,
    failures: &mut Vec<String>,
) {
    match assertion.kind.as_str() {
        // Overall run status must equal the expected string. When `expected`
        // is absent or not a string, this assertion silently passes.
        "run-status" | "run_status" | "status" => {
            let expected = assertion.expected.as_ref().and_then(|value| value.as_str());
            if let Some(expected) = expected {
                if run.status != expected {
                    failures.push(format!(
                        "rubric '{}' expected run status {}, got {}",
                        rubric.id, expected, run.status
                    ));
                }
            }
        }
        // A named stage must exist and carry the expected status; both the
        // `stage` and a string `expected` are mandatory here.
        "stage-status" | "stage_status" => {
            let Some(stage_id) = assertion.stage.as_deref() else {
                failures.push(format!(
                    "rubric '{}' stage-status assertion missing stage",
                    rubric.id
                ));
                return;
            };
            let expected = assertion.expected.as_ref().and_then(|value| value.as_str());
            let Some(expected) = expected else {
                failures.push(format!(
                    "rubric '{}' stage-status assertion missing expected string",
                    rubric.id
                ));
                return;
            };
            match run.stages.iter().find(|stage| stage.node_id == stage_id) {
                Some(stage) if stage.status == expected => {}
                Some(stage) => failures.push(format!(
                    "rubric '{}' expected stage {} status {}, got {}",
                    rubric.id, stage_id, expected, stage.status
                )),
                None => failures.push(format!(
                    "rubric '{}' expected stage {} to exist",
                    rubric.id, stage_id
                )),
            }
        }
        // `contains` must appear in the visible text of the named stage, or
        // of any stage when no stage is specified.
        "visible-text-contains" | "visible_text_contains" => {
            let Some(needle) = assertion.contains.as_deref() else {
                failures.push(format!(
                    "rubric '{}' visible-text assertion missing contains",
                    rubric.id
                ));
                return;
            };
            let matched = match assertion.stage.as_deref() {
                // A missing stage or a stage without visible text counts as
                // "not matched" rather than a separate failure.
                Some(stage_id) => run
                    .stages
                    .iter()
                    .find(|stage| stage.node_id == stage_id)
                    .and_then(|stage| stage.visible_text.as_deref())
                    .is_some_and(|text| text.contains(needle)),
                None => run
                    .stages
                    .iter()
                    .filter_map(|stage| stage.visible_text.as_deref())
                    .any(|text| text.contains(needle)),
            };
            if !matched {
                failures.push(format!(
                    "rubric '{}' expected visible text to contain {:?}",
                    rubric.id, needle
                ));
            }
        }
        // `contains` must appear in the prompt of at least one HITL question.
        "hitl-question-contains" | "hitl_question_contains" => {
            let Some(needle) = assertion.contains.as_deref() else {
                failures.push(format!(
                    "rubric '{}' HITL assertion missing contains",
                    rubric.id
                ));
                return;
            };
            if !run
                .hitl_questions
                .iter()
                .any(|question| question.prompt.contains(needle))
            {
                failures.push(format!(
                    "rubric '{}' expected HITL question to contain {:?}",
                    rubric.id, needle
                ));
            }
        }
        // Empty kind: intentionally a no-op.
        "" => {}
        other => failures.push(format!(
            "rubric '{}' has unsupported assertion kind '{}'",
            rubric.id, other
        )),
    }
}
2552
/// A single edit operation produced by `myers_diff`.
///
/// The `usize` paired with each op indexes into the *left* sequence for
/// `Equal` and `Delete`, and into the *right* sequence for `Insert`.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub(crate) enum DiffOp {
    /// Line present in both sequences.
    Equal,
    /// Line removed from the left sequence.
    Delete,
    /// Line added by the right sequence.
    Insert,
}
2560
/// Computes a line-level diff between `a` and `b` with Myers' O((N+M)·D)
/// shortest-edit-script algorithm, returning ops in left-to-right order.
///
/// Each op carries an index into `a` (`Equal`/`Delete`) or `b` (`Insert`).
pub(crate) fn myers_diff(a: &[&str], b: &[&str]) -> Vec<(DiffOp, usize)> {
    let n = a.len() as isize;
    let m = b.len() as isize;
    // Trivial cases: an empty side means pure inserts or pure deletes.
    if n == 0 && m == 0 {
        return Vec::new();
    }
    if n == 0 {
        return (0..m as usize).map(|j| (DiffOp::Insert, j)).collect();
    }
    if m == 0 {
        return (0..n as usize).map(|i| (DiffOp::Delete, i)).collect();
    }

    // `v[k + offset]` is the furthest x reached on diagonal k (k = x - y);
    // `offset` shifts k, which ranges over [-max_d, max_d], into array space.
    let max_d = (n + m) as usize;
    let offset = max_d as isize;
    let v_size = 2 * max_d + 1;
    let mut v = vec![0isize; v_size];
    // `trace[d]` snapshots the frontier *before* round d (i.e. after d-1 edit
    // steps); the backtrack below replays these to recover the edit path.
    let mut trace: Vec<Vec<isize>> = Vec::new();

    // Forward phase: for increasing edit distance d, extend the
    // furthest-reaching paths until the endpoint (n, m) is reached.
    'outer: for d in 0..=max_d as isize {
        trace.push(v.clone());
        let mut new_v = v.clone();
        for k in (-d..=d).step_by(2) {
            let ki = (k + offset) as usize;
            // Step down (from k+1) or right (from k-1), whichever neighbor
            // reaches further; the `k == -d`/`k != d` guards also keep the
            // `ki - 1`/`ki + 1` indexing in bounds via short-circuiting.
            let mut x = if k == -d || (k != d && v[ki - 1] < v[ki + 1]) {
                v[ki + 1]
            } else {
                v[ki - 1] + 1
            };
            let mut y = x - k;
            // Follow the "snake": consume matching lines along the diagonal.
            while x < n && y < m && a[x as usize] == b[y as usize] {
                x += 1;
                y += 1;
            }
            new_v[ki] = x;
            if x >= n && y >= m {
                // Endpoint reached; `new_v` is deliberately discarded — the
                // backtrack only needs the snapshots already in `trace`.
                let _ = new_v;
                break 'outer;
            }
        }
        v = new_v;
    }

    // Backtrack phase: walk from (n, m) to (0, 0) through the recorded
    // frontiers, emitting ops in reverse order.
    let mut ops: Vec<(DiffOp, usize)> = Vec::new();
    let mut x = n;
    let mut y = m;
    for d in (1..trace.len() as isize).rev() {
        let k = x - y;
        // `trace[d]` is the frontier after d-1 steps, i.e. the state this
        // step was taken from.
        let v_prev = &trace[d as usize];
        // Recover which neighboring diagonal the d-th edit step came from.
        let prev_k = if k == -d
            || (k != d && v_prev[(k - 1 + offset) as usize] < v_prev[(k + 1 + offset) as usize])
        {
            k + 1
        } else {
            k - 1
        };
        let prev_x = v_prev[(prev_k + offset) as usize];
        let prev_y = prev_x - prev_k;

        // Re-emit the snake (matching lines) that followed the edit step.
        while x > prev_x && y > prev_y {
            x -= 1;
            y -= 1;
            ops.push((DiffOp::Equal, x as usize));
        }
        // prev_k = k + 1 means we stepped down (insert); otherwise right
        // (delete).
        if prev_k < k {
            x -= 1;
            ops.push((DiffOp::Delete, x as usize));
        } else {
            y -= 1;
            ops.push((DiffOp::Insert, y as usize));
        }
    }
    // Any remaining prefix is a common snake at distance 0 (here x == y).
    while x > 0 && y > 0 {
        x -= 1;
        y -= 1;
        ops.push((DiffOp::Equal, x as usize));
    }
    ops.reverse();
    ops
}
2645
2646pub fn render_unified_diff(path: Option<&str>, before: &str, after: &str) -> String {
2647 let before_lines: Vec<&str> = before.lines().collect();
2648 let after_lines: Vec<&str> = after.lines().collect();
2649 let ops = myers_diff(&before_lines, &after_lines);
2650
2651 let mut diff = String::new();
2652 let file = path.unwrap_or("artifact");
2653 diff.push_str(&format!("--- a/{file}\n+++ b/{file}\n"));
2654 for &(op, idx) in &ops {
2655 match op {
2656 DiffOp::Equal => diff.push_str(&format!(" {}\n", before_lines[idx])),
2657 DiffOp::Delete => diff.push_str(&format!("-{}\n", before_lines[idx])),
2658 DiffOp::Insert => diff.push_str(&format!("+{}\n", after_lines[idx])),
2659 }
2660 }
2661 diff
2662}
2663
/// Persists a run record as pretty-printed JSON, defaulting to
/// `<default run dir>/<run id>.json` when no explicit path is given.
///
/// The record is materialized before writing: HITL questions are merged from
/// the active event log, child runs are lifted out of stage metadata, a
/// replay fixture is derived if missing, and handoffs plus observability are
/// refreshed. Returns the path the record was written to.
pub fn save_run_record(run: &RunRecord, path: Option<&str>) -> Result<String, VmError> {
    let path = path
        .map(PathBuf::from)
        .unwrap_or_else(|| default_run_dir().join(format!("{}.json", run.id)));
    // Work on a clone so the caller's record stays untouched.
    let mut materialized = run.clone();
    merge_hitl_questions_from_active_log(&mut materialized);
    materialize_child_runs_from_stage_metadata(&mut materialized);
    if materialized.replay_fixture.is_none() {
        materialized.replay_fixture = Some(replay_fixture_from_run(&materialized));
    }
    materialized.persisted_path = Some(path.to_string_lossy().into_owned());
    // NOTE(review): handoffs are synced before the observability refresh —
    // presumably so the derived view includes them; confirm before reordering.
    sync_run_handoffs(&mut materialized);
    refresh_run_observability(&mut materialized, Some(&path));
    if let Some(parent) = path.parent() {
        std::fs::create_dir_all(parent)
            .map_err(|e| VmError::Runtime(format!("failed to create run directory: {e}")))?;
    }
    let json = serde_json::to_string_pretty(&materialized)
        .map_err(|e| VmError::Runtime(format!("failed to encode run record: {e}")))?;
    // Atomic write prevents readers from observing a half-written record.
    crate::atomic_io::atomic_write(&path, json.as_bytes())
        .map_err(|e| VmError::Runtime(format!("failed to persist run record: {e}")))?;
    // Announce the action graph only when observability was derived.
    if let Some(observability) = materialized.observability.as_ref() {
        publish_action_graph_event(&materialized, observability, &path);
    }
    Ok(path.to_string_lossy().into_owned())
}
2690
/// Loads a run record from disk and re-materializes derived state (child
/// runs, replay fixture, persisted path, handoffs, observability) so the
/// returned record is in the same shape `save_run_record` produces.
pub fn load_run_record(path: &Path) -> Result<RunRecord, VmError> {
    let content = std::fs::read_to_string(path)
        .map_err(|e| VmError::Runtime(format!("failed to read run record: {e}")))?;
    let mut run: RunRecord = serde_json::from_str(&content)
        .map_err(|e| VmError::Runtime(format!("failed to parse run record: {e}")))?;
    materialize_child_runs_from_stage_metadata(&mut run);
    // Derive a replay fixture when the stored record does not carry one.
    if run.replay_fixture.is_none() {
        run.replay_fixture = Some(replay_fixture_from_run(&run));
    }
    // Keep an existing persisted_path; only fill it in when absent.
    run.persisted_path
        .get_or_insert_with(|| path.to_string_lossy().into_owned());
    sync_run_handoffs(&mut run);
    refresh_run_observability(&mut run, Some(path));
    Ok(run)
}
2706
2707pub fn replay_fixture_from_run(run: &RunRecord) -> ReplayFixture {
2708 ReplayFixture {
2709 type_name: "replay_fixture".to_string(),
2710 id: new_id("fixture"),
2711 source_run_id: run.id.clone(),
2712 workflow_id: run.workflow_id.clone(),
2713 workflow_name: run.workflow_name.clone(),
2714 created_at: now_rfc3339(),
2715 eval_kind: Some("replay".to_string()),
2716 clarifying_question: None,
2717 expected_status: run.status.clone(),
2718 stage_assertions: run
2719 .stages
2720 .iter()
2721 .map(|stage| ReplayStageAssertion {
2722 node_id: stage.node_id.clone(),
2723 expected_status: stage.status.clone(),
2724 expected_outcome: stage.outcome.clone(),
2725 expected_branch: stage.branch.clone(),
2726 required_artifact_kinds: stage
2727 .artifacts
2728 .iter()
2729 .map(|artifact| artifact.kind.clone())
2730 .collect(),
2731 visible_text_contains: stage
2732 .visible_text
2733 .as_ref()
2734 .filter(|text| !text.is_empty())
2735 .map(|text| text.chars().take(80).collect()),
2736 })
2737 .collect(),
2738 }
2739}
2740
2741pub fn evaluate_run_against_fixture(run: &RunRecord, fixture: &ReplayFixture) -> ReplayEvalReport {
2742 if fixture.eval_kind.as_deref() == Some("clarifying_question") {
2743 return evaluate_clarifying_question(run, fixture);
2744 }
2745 let mut failures = Vec::new();
2746 if run.status != fixture.expected_status {
2747 failures.push(format!(
2748 "run status mismatch: expected {}, got {}",
2749 fixture.expected_status, run.status
2750 ));
2751 }
2752 let stages_by_id: BTreeMap<&str, &RunStageRecord> =
2753 run.stages.iter().map(|s| (s.node_id.as_str(), s)).collect();
2754 for assertion in &fixture.stage_assertions {
2755 let Some(stage) = stages_by_id.get(assertion.node_id.as_str()) else {
2756 failures.push(format!("missing stage {}", assertion.node_id));
2757 continue;
2758 };
2759 if stage.status != assertion.expected_status {
2760 failures.push(format!(
2761 "stage {} status mismatch: expected {}, got {}",
2762 assertion.node_id, assertion.expected_status, stage.status
2763 ));
2764 }
2765 if stage.outcome != assertion.expected_outcome {
2766 failures.push(format!(
2767 "stage {} outcome mismatch: expected {}, got {}",
2768 assertion.node_id, assertion.expected_outcome, stage.outcome
2769 ));
2770 }
2771 if stage.branch != assertion.expected_branch {
2772 failures.push(format!(
2773 "stage {} branch mismatch: expected {:?}, got {:?}",
2774 assertion.node_id, assertion.expected_branch, stage.branch
2775 ));
2776 }
2777 for required_kind in &assertion.required_artifact_kinds {
2778 if !stage
2779 .artifacts
2780 .iter()
2781 .any(|artifact| &artifact.kind == required_kind)
2782 {
2783 failures.push(format!(
2784 "stage {} missing artifact kind {}",
2785 assertion.node_id, required_kind
2786 ));
2787 }
2788 }
2789 if let Some(snippet) = &assertion.visible_text_contains {
2790 let actual = stage.visible_text.clone().unwrap_or_default();
2791 if !actual.contains(snippet) {
2792 failures.push(format!(
2793 "stage {} visible text does not contain expected snippet {:?}",
2794 assertion.node_id, snippet
2795 ));
2796 }
2797 }
2798 }
2799
2800 ReplayEvalReport {
2801 pass: failures.is_empty(),
2802 failures,
2803 stage_count: run.stages.len(),
2804 }
2805}
2806
/// Scores a run against a clarifying-question fixture: checks run status,
/// bounds the number of HITL questions, and verifies that at least one
/// question satisfies the fixture's textual constraints.
fn evaluate_clarifying_question(run: &RunRecord, fixture: &ReplayFixture) -> ReplayEvalReport {
    let mut failures = Vec::new();
    let spec = fixture.clarifying_question.clone().unwrap_or_default();
    let min_questions = clarifying_min_questions(&spec);
    let max_questions = clarifying_max_questions(&spec);
    let questions = &run.hitl_questions;

    if run.status != fixture.expected_status {
        failures.push(format!(
            "run status mismatch: expected {}, got {}",
            fixture.expected_status, run.status
        ));
    }
    // The question count must fall within [min_questions, max_questions].
    if questions.len() < min_questions {
        failures.push(format!(
            "expected at least {min_questions} clarifying question(s), got {}",
            questions.len()
        ));
    }
    if questions.len() > max_questions {
        failures.push(format!(
            "expected at most {max_questions} clarifying question(s), got {}",
            questions.len()
        ));
    }

    // Normalize every fixture text once so comparisons below are insensitive
    // to whatever normalize_question_text canonicalizes.
    let normalized_expected = spec
        .expected_question
        .as_deref()
        .map(normalize_question_text);
    let normalized_accepted = spec
        .accepted_questions
        .iter()
        .map(|question| normalize_question_text(question))
        .collect::<Vec<_>>();
    let required_terms = spec
        .required_terms
        .iter()
        .map(|term| normalize_question_text(term))
        .collect::<Vec<_>>();
    let forbidden_terms = spec
        .forbidden_terms
        .iter()
        .map(|term| normalize_question_text(term))
        .collect::<Vec<_>>();

    // A question matches when it (a) equals the expected question if one is
    // set, (b) equals one of the accepted questions if any are listed, (c)
    // contains every required term, and (d) contains no forbidden term.
    let matched = questions.iter().any(|question| {
        let normalized = normalize_question_text(&question.prompt);
        let matches_expected = normalized_expected
            .as_ref()
            .is_none_or(|expected| &normalized == expected)
            && (normalized_accepted.is_empty()
                || normalized_accepted
                    .iter()
                    .any(|candidate| candidate == &normalized));
        let has_required_terms = required_terms
            .iter()
            .all(|term| normalized.contains(term.as_str()));
        let avoids_forbidden_terms = forbidden_terms
            .iter()
            .all(|term| !normalized.contains(term.as_str()));
        matches_expected && has_required_terms && avoids_forbidden_terms
    });

    // Only report "no match" when the spec actually constrains the text and
    // at least one question was asked (count violations are reported above).
    if !questions.is_empty()
        && (!normalized_accepted.is_empty()
            || normalized_expected.is_some()
            || !required_terms.is_empty()
            || !forbidden_terms.is_empty())
        && !matched
    {
        failures.push(format!(
            "no clarifying question matched fixture; actual questions: {}",
            questions
                .iter()
                .map(|question| format!("{:?}", question.prompt))
                .collect::<Vec<_>>()
                .join(", ")
        ));
    }

    ReplayEvalReport {
        pass: failures.is_empty(),
        failures,
        stage_count: run.stages.len(),
    }
}
2894
2895pub fn evaluate_run_suite(
2896 cases: Vec<(RunRecord, ReplayFixture, Option<String>)>,
2897) -> ReplayEvalSuiteReport {
2898 let mut reports = Vec::new();
2899 for (run, fixture, source_path) in cases {
2900 let report = evaluate_run_against_fixture(&run, &fixture);
2901 reports.push(ReplayEvalCaseReport {
2902 run_id: run.id.clone(),
2903 workflow_id: run.workflow_id.clone(),
2904 label: None,
2905 pass: report.pass,
2906 failures: report.failures,
2907 stage_count: report.stage_count,
2908 source_path,
2909 comparison: None,
2910 });
2911 }
2912 let total = reports.len();
2913 let passed = reports.iter().filter(|report| report.pass).count();
2914 let failed = total.saturating_sub(passed);
2915 ReplayEvalSuiteReport {
2916 pass: failed == 0,
2917 total,
2918 passed,
2919 failed,
2920 cases: reports,
2921 }
2922}
2923
2924pub fn diff_run_records(left: &RunRecord, right: &RunRecord) -> RunDiffReport {
2925 let mut stage_diffs = Vec::new();
2926 let mut all_node_ids = BTreeSet::new();
2927 let left_by_id: BTreeMap<&str, &RunStageRecord> = left
2928 .stages
2929 .iter()
2930 .map(|s| (s.node_id.as_str(), s))
2931 .collect();
2932 let right_by_id: BTreeMap<&str, &RunStageRecord> = right
2933 .stages
2934 .iter()
2935 .map(|s| (s.node_id.as_str(), s))
2936 .collect();
2937 all_node_ids.extend(left_by_id.keys().copied());
2938 all_node_ids.extend(right_by_id.keys().copied());
2939
2940 for node_id in all_node_ids {
2941 let left_stage = left_by_id.get(node_id).copied();
2942 let right_stage = right_by_id.get(node_id).copied();
2943 match (left_stage, right_stage) {
2944 (Some(_), None) => stage_diffs.push(RunStageDiffRecord {
2945 node_id: node_id.to_string(),
2946 change: "removed".to_string(),
2947 details: vec!["stage missing from right run".to_string()],
2948 }),
2949 (None, Some(_)) => stage_diffs.push(RunStageDiffRecord {
2950 node_id: node_id.to_string(),
2951 change: "added".to_string(),
2952 details: vec!["stage missing from left run".to_string()],
2953 }),
2954 (Some(left_stage), Some(right_stage)) => {
2955 let mut details = Vec::new();
2956 if left_stage.status != right_stage.status {
2957 details.push(format!(
2958 "status: {} -> {}",
2959 left_stage.status, right_stage.status
2960 ));
2961 }
2962 if left_stage.outcome != right_stage.outcome {
2963 details.push(format!(
2964 "outcome: {} -> {}",
2965 left_stage.outcome, right_stage.outcome
2966 ));
2967 }
2968 if left_stage.branch != right_stage.branch {
2969 details.push(format!(
2970 "branch: {:?} -> {:?}",
2971 left_stage.branch, right_stage.branch
2972 ));
2973 }
2974 if left_stage.produced_artifact_ids.len() != right_stage.produced_artifact_ids.len()
2975 {
2976 details.push(format!(
2977 "produced_artifacts: {} -> {}",
2978 left_stage.produced_artifact_ids.len(),
2979 right_stage.produced_artifact_ids.len()
2980 ));
2981 }
2982 if left_stage.artifacts.len() != right_stage.artifacts.len() {
2983 details.push(format!(
2984 "artifact_records: {} -> {}",
2985 left_stage.artifacts.len(),
2986 right_stage.artifacts.len()
2987 ));
2988 }
2989 if !details.is_empty() {
2990 stage_diffs.push(RunStageDiffRecord {
2991 node_id: node_id.to_string(),
2992 change: "changed".to_string(),
2993 details,
2994 });
2995 }
2996 }
2997 (None, None) => {}
2998 }
2999 }
3000
3001 let mut tool_diffs = Vec::new();
3002 let left_tools: std::collections::BTreeMap<(String, String), &ToolCallRecord> = left
3003 .tool_recordings
3004 .iter()
3005 .map(|r| ((r.tool_name.clone(), r.args_hash.clone()), r))
3006 .collect();
3007 let right_tools: std::collections::BTreeMap<(String, String), &ToolCallRecord> = right
3008 .tool_recordings
3009 .iter()
3010 .map(|r| ((r.tool_name.clone(), r.args_hash.clone()), r))
3011 .collect();
3012 let all_tool_keys: std::collections::BTreeSet<_> = left_tools
3013 .keys()
3014 .chain(right_tools.keys())
3015 .cloned()
3016 .collect();
3017 for key in &all_tool_keys {
3018 let l = left_tools.get(key);
3019 let r = right_tools.get(key);
3020 let result_changed = match (l, r) {
3021 (Some(a), Some(b)) => a.result != b.result,
3022 _ => true,
3023 };
3024 if result_changed {
3025 tool_diffs.push(ToolCallDiffRecord {
3026 tool_name: key.0.clone(),
3027 args_hash: key.1.clone(),
3028 result_changed,
3029 left_result: l.map(|t| t.result.clone()),
3030 right_result: r.map(|t| t.result.clone()),
3031 });
3032 }
3033 }
3034
3035 let left_observability = left.observability.clone().unwrap_or_else(|| {
3036 derive_run_observability(left, left.persisted_path.as_deref().map(Path::new))
3037 });
3038 let right_observability = right.observability.clone().unwrap_or_else(|| {
3039 derive_run_observability(right, right.persisted_path.as_deref().map(Path::new))
3040 });
3041 let mut observability_diffs = Vec::new();
3042
3043 let left_workers = left_observability
3044 .worker_lineage
3045 .iter()
3046 .map(|worker| {
3047 (
3048 worker.worker_id.clone(),
3049 (
3050 worker.status.clone(),
3051 worker.run_id.clone(),
3052 worker.run_path.clone(),
3053 ),
3054 )
3055 })
3056 .collect::<BTreeMap<_, _>>();
3057 let right_workers = right_observability
3058 .worker_lineage
3059 .iter()
3060 .map(|worker| {
3061 (
3062 worker.worker_id.clone(),
3063 (
3064 worker.status.clone(),
3065 worker.run_id.clone(),
3066 worker.run_path.clone(),
3067 ),
3068 )
3069 })
3070 .collect::<BTreeMap<_, _>>();
3071 let worker_ids = left_workers
3072 .keys()
3073 .chain(right_workers.keys())
3074 .cloned()
3075 .collect::<BTreeSet<_>>();
3076 for worker_id in worker_ids {
3077 match (left_workers.get(&worker_id), right_workers.get(&worker_id)) {
3078 (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3079 section: "worker_lineage".to_string(),
3080 label: worker_id,
3081 details: vec!["worker missing from right run".to_string()],
3082 }),
3083 (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3084 section: "worker_lineage".to_string(),
3085 label: worker_id,
3086 details: vec!["worker missing from left run".to_string()],
3087 }),
3088 (Some(left_worker), Some(right_worker)) if left_worker != right_worker => {
3089 let mut details = Vec::new();
3090 if left_worker.0 != right_worker.0 {
3091 details.push(format!("status: {} -> {}", left_worker.0, right_worker.0));
3092 }
3093 if left_worker.1 != right_worker.1 {
3094 details.push(format!(
3095 "run_id: {:?} -> {:?}",
3096 left_worker.1, right_worker.1
3097 ));
3098 }
3099 if left_worker.2 != right_worker.2 {
3100 details.push(format!(
3101 "run_path: {:?} -> {:?}",
3102 left_worker.2, right_worker.2
3103 ));
3104 }
3105 observability_diffs.push(RunObservabilityDiffRecord {
3106 section: "worker_lineage".to_string(),
3107 label: worker_id,
3108 details,
3109 });
3110 }
3111 _ => {}
3112 }
3113 }
3114
3115 let left_rounds = left_observability
3116 .planner_rounds
3117 .iter()
3118 .map(|round| (round.stage_id.clone(), round))
3119 .collect::<BTreeMap<_, _>>();
3120 let right_rounds = right_observability
3121 .planner_rounds
3122 .iter()
3123 .map(|round| (round.stage_id.clone(), round))
3124 .collect::<BTreeMap<_, _>>();
3125 let round_ids = left_rounds
3126 .keys()
3127 .chain(right_rounds.keys())
3128 .cloned()
3129 .collect::<BTreeSet<_>>();
3130 for stage_id in round_ids {
3131 match (left_rounds.get(&stage_id), right_rounds.get(&stage_id)) {
3132 (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3133 section: "planner_rounds".to_string(),
3134 label: stage_id,
3135 details: vec!["planner summary missing from right run".to_string()],
3136 }),
3137 (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3138 section: "planner_rounds".to_string(),
3139 label: stage_id,
3140 details: vec!["planner summary missing from left run".to_string()],
3141 }),
3142 (Some(left_round), Some(right_round)) => {
3143 let mut details = Vec::new();
3144 if left_round.iteration_count != right_round.iteration_count {
3145 details.push(format!(
3146 "iterations: {} -> {}",
3147 left_round.iteration_count, right_round.iteration_count
3148 ));
3149 }
3150 if left_round.tool_execution_count != right_round.tool_execution_count {
3151 details.push(format!(
3152 "tool_executions: {} -> {}",
3153 left_round.tool_execution_count, right_round.tool_execution_count
3154 ));
3155 }
3156 if left_round.native_text_tool_fallback_count
3157 != right_round.native_text_tool_fallback_count
3158 {
3159 details.push(format!(
3160 "native_text_tool_fallbacks: {} -> {}",
3161 left_round.native_text_tool_fallback_count,
3162 right_round.native_text_tool_fallback_count
3163 ));
3164 }
3165 if left_round.native_text_tool_fallback_rejection_count
3166 != right_round.native_text_tool_fallback_rejection_count
3167 {
3168 details.push(format!(
3169 "native_text_tool_fallback_rejections: {} -> {}",
3170 left_round.native_text_tool_fallback_rejection_count,
3171 right_round.native_text_tool_fallback_rejection_count
3172 ));
3173 }
3174 if left_round.empty_completion_retry_count
3175 != right_round.empty_completion_retry_count
3176 {
3177 details.push(format!(
3178 "empty_completion_retries: {} -> {}",
3179 left_round.empty_completion_retry_count,
3180 right_round.empty_completion_retry_count
3181 ));
3182 }
3183 if left_round.research_facts != right_round.research_facts {
3184 details.push(format!(
3185 "research_facts: {:?} -> {:?}",
3186 left_round.research_facts, right_round.research_facts
3187 ));
3188 }
3189 let left_deliverables = left_round
3190 .task_ledger
3191 .as_ref()
3192 .map(|ledger| {
3193 ledger
3194 .deliverables
3195 .iter()
3196 .map(|item| format!("{}:{}", item.id, item.status))
3197 .collect::<Vec<_>>()
3198 })
3199 .unwrap_or_default();
3200 let right_deliverables = right_round
3201 .task_ledger
3202 .as_ref()
3203 .map(|ledger| {
3204 ledger
3205 .deliverables
3206 .iter()
3207 .map(|item| format!("{}:{}", item.id, item.status))
3208 .collect::<Vec<_>>()
3209 })
3210 .unwrap_or_default();
3211 if left_deliverables != right_deliverables {
3212 details.push(format!(
3213 "deliverables: {:?} -> {:?}",
3214 left_deliverables, right_deliverables
3215 ));
3216 }
3217 if left_round.successful_tools != right_round.successful_tools {
3218 details.push(format!(
3219 "successful_tools: {:?} -> {:?}",
3220 left_round.successful_tools, right_round.successful_tools
3221 ));
3222 }
3223 if !details.is_empty() {
3224 observability_diffs.push(RunObservabilityDiffRecord {
3225 section: "planner_rounds".to_string(),
3226 label: left_round.node_id.clone(),
3227 details,
3228 });
3229 }
3230 }
3231 _ => {}
3232 }
3233 }
3234
3235 let left_pointers = left_observability
3236 .transcript_pointers
3237 .iter()
3238 .map(|pointer| {
3239 (
3240 pointer.id.clone(),
3241 (
3242 pointer.available,
3243 pointer.path.clone(),
3244 pointer.location.clone(),
3245 ),
3246 )
3247 })
3248 .collect::<BTreeMap<_, _>>();
3249 let right_pointers = right_observability
3250 .transcript_pointers
3251 .iter()
3252 .map(|pointer| {
3253 (
3254 pointer.id.clone(),
3255 (
3256 pointer.available,
3257 pointer.path.clone(),
3258 pointer.location.clone(),
3259 ),
3260 )
3261 })
3262 .collect::<BTreeMap<_, _>>();
3263 let pointer_ids = left_pointers
3264 .keys()
3265 .chain(right_pointers.keys())
3266 .cloned()
3267 .collect::<BTreeSet<_>>();
3268 for pointer_id in pointer_ids {
3269 match (
3270 left_pointers.get(&pointer_id),
3271 right_pointers.get(&pointer_id),
3272 ) {
3273 (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3274 section: "transcript_pointers".to_string(),
3275 label: pointer_id,
3276 details: vec!["pointer missing from right run".to_string()],
3277 }),
3278 (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3279 section: "transcript_pointers".to_string(),
3280 label: pointer_id,
3281 details: vec!["pointer missing from left run".to_string()],
3282 }),
3283 (Some(left_pointer), Some(right_pointer)) if left_pointer != right_pointer => {
3284 observability_diffs.push(RunObservabilityDiffRecord {
3285 section: "transcript_pointers".to_string(),
3286 label: pointer_id,
3287 details: vec![format!(
3288 "pointer: {:?} -> {:?}",
3289 left_pointer, right_pointer
3290 )],
3291 });
3292 }
3293 _ => {}
3294 }
3295 }
3296
3297 let left_compactions = left_observability
3298 .compaction_events
3299 .iter()
3300 .map(|event| {
3301 (
3302 event.id.clone(),
3303 (
3304 event.strategy.clone(),
3305 event.archived_messages,
3306 event.snapshot_asset_id.clone(),
3307 event.available,
3308 ),
3309 )
3310 })
3311 .collect::<BTreeMap<_, _>>();
3312 let right_compactions = right_observability
3313 .compaction_events
3314 .iter()
3315 .map(|event| {
3316 (
3317 event.id.clone(),
3318 (
3319 event.strategy.clone(),
3320 event.archived_messages,
3321 event.snapshot_asset_id.clone(),
3322 event.available,
3323 ),
3324 )
3325 })
3326 .collect::<BTreeMap<_, _>>();
3327 let compaction_ids = left_compactions
3328 .keys()
3329 .chain(right_compactions.keys())
3330 .cloned()
3331 .collect::<BTreeSet<_>>();
3332 for compaction_id in compaction_ids {
3333 match (
3334 left_compactions.get(&compaction_id),
3335 right_compactions.get(&compaction_id),
3336 ) {
3337 (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3338 section: "compaction_events".to_string(),
3339 label: compaction_id,
3340 details: vec!["compaction event missing from right run".to_string()],
3341 }),
3342 (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3343 section: "compaction_events".to_string(),
3344 label: compaction_id,
3345 details: vec!["compaction event missing from left run".to_string()],
3346 }),
3347 (Some(left_event), Some(right_event)) if left_event != right_event => {
3348 observability_diffs.push(RunObservabilityDiffRecord {
3349 section: "compaction_events".to_string(),
3350 label: compaction_id,
3351 details: vec![format!("event: {:?} -> {:?}", left_event, right_event)],
3352 });
3353 }
3354 _ => {}
3355 }
3356 }
3357
3358 let left_daemons = left_observability
3359 .daemon_events
3360 .iter()
3361 .map(|event| {
3362 (
3363 (event.daemon_id.clone(), event.kind, event.timestamp.clone()),
3364 (
3365 event.name.clone(),
3366 event.persist_path.clone(),
3367 event.payload_summary.clone(),
3368 ),
3369 )
3370 })
3371 .collect::<BTreeMap<_, _>>();
3372 let right_daemons = right_observability
3373 .daemon_events
3374 .iter()
3375 .map(|event| {
3376 (
3377 (event.daemon_id.clone(), event.kind, event.timestamp.clone()),
3378 (
3379 event.name.clone(),
3380 event.persist_path.clone(),
3381 event.payload_summary.clone(),
3382 ),
3383 )
3384 })
3385 .collect::<BTreeMap<_, _>>();
3386 let daemon_keys = left_daemons
3387 .keys()
3388 .chain(right_daemons.keys())
3389 .cloned()
3390 .collect::<BTreeSet<_>>();
3391 for daemon_key in daemon_keys {
3392 let label = format!("{}:{:?}:{}", daemon_key.0, daemon_key.1, daemon_key.2);
3393 match (
3394 left_daemons.get(&daemon_key),
3395 right_daemons.get(&daemon_key),
3396 ) {
3397 (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3398 section: "daemon_events".to_string(),
3399 label,
3400 details: vec!["daemon event missing from right run".to_string()],
3401 }),
3402 (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3403 section: "daemon_events".to_string(),
3404 label,
3405 details: vec!["daemon event missing from left run".to_string()],
3406 }),
3407 (Some(left_event), Some(right_event)) if left_event != right_event => {
3408 observability_diffs.push(RunObservabilityDiffRecord {
3409 section: "daemon_events".to_string(),
3410 label,
3411 details: vec![format!("event: {:?} -> {:?}", left_event, right_event)],
3412 });
3413 }
3414 _ => {}
3415 }
3416 }
3417
3418 let left_verification = left_observability
3419 .verification_outcomes
3420 .iter()
3421 .map(|item| (item.stage_id.clone(), item))
3422 .collect::<BTreeMap<_, _>>();
3423 let right_verification = right_observability
3424 .verification_outcomes
3425 .iter()
3426 .map(|item| (item.stage_id.clone(), item))
3427 .collect::<BTreeMap<_, _>>();
3428 let verification_ids = left_verification
3429 .keys()
3430 .chain(right_verification.keys())
3431 .cloned()
3432 .collect::<BTreeSet<_>>();
3433 for stage_id in verification_ids {
3434 match (
3435 left_verification.get(&stage_id),
3436 right_verification.get(&stage_id),
3437 ) {
3438 (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3439 section: "verification".to_string(),
3440 label: stage_id,
3441 details: vec!["verification missing from right run".to_string()],
3442 }),
3443 (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3444 section: "verification".to_string(),
3445 label: stage_id,
3446 details: vec!["verification missing from left run".to_string()],
3447 }),
3448 (Some(left_item), Some(right_item)) if left_item != right_item => {
3449 let mut details = Vec::new();
3450 if left_item.passed != right_item.passed {
3451 details.push(format!(
3452 "passed: {:?} -> {:?}",
3453 left_item.passed, right_item.passed
3454 ));
3455 }
3456 if left_item.summary != right_item.summary {
3457 details.push(format!(
3458 "summary: {:?} -> {:?}",
3459 left_item.summary, right_item.summary
3460 ));
3461 }
3462 observability_diffs.push(RunObservabilityDiffRecord {
3463 section: "verification".to_string(),
3464 label: left_item.node_id.clone(),
3465 details,
3466 });
3467 }
3468 _ => {}
3469 }
3470 }
3471
3472 let left_graph = (
3473 left_observability.action_graph_nodes.len(),
3474 left_observability.action_graph_edges.len(),
3475 );
3476 let right_graph = (
3477 right_observability.action_graph_nodes.len(),
3478 right_observability.action_graph_edges.len(),
3479 );
3480 if left_graph != right_graph {
3481 observability_diffs.push(RunObservabilityDiffRecord {
3482 section: "action_graph".to_string(),
3483 label: "shape".to_string(),
3484 details: vec![format!(
3485 "nodes/edges: {}/{} -> {}/{}",
3486 left_graph.0, left_graph.1, right_graph.0, right_graph.1
3487 )],
3488 });
3489 }
3490
3491 let status_changed = left.status != right.status;
3492 let identical = !status_changed
3493 && stage_diffs.is_empty()
3494 && tool_diffs.is_empty()
3495 && observability_diffs.is_empty()
3496 && left.transitions.len() == right.transitions.len()
3497 && left.artifacts.len() == right.artifacts.len()
3498 && left.checkpoints.len() == right.checkpoints.len();
3499
3500 RunDiffReport {
3501 left_run_id: left.id.clone(),
3502 right_run_id: right.id.clone(),
3503 identical,
3504 status_changed,
3505 left_status: left.status.clone(),
3506 right_status: right.status.clone(),
3507 stage_diffs,
3508 tool_diffs,
3509 observability_diffs,
3510 transition_count_delta: right.transitions.len() as isize - left.transitions.len() as isize,
3511 artifact_count_delta: right.artifacts.len() as isize - left.artifacts.len() as isize,
3512 checkpoint_count_delta: right.checkpoints.len() as isize - left.checkpoints.len() as isize,
3513 }
3514}
3515
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    /// Smallest `RunRecord` the eval-pack tests need: the requested terminal
    /// status plus canned usage numbers and a replay fixture that expects a
    /// "completed" run.
    fn minimal_run(status: &str) -> RunRecord {
        let usage = LlmUsageRecord {
            total_duration_ms: 12,
            total_cost: 0.01,
            input_tokens: 3,
            output_tokens: 4,
            call_count: 1,
            models: vec!["mock".to_string()],
        };
        let fixture = ReplayFixture {
            type_name: "replay_fixture".to_string(),
            expected_status: "completed".to_string(),
            ..ReplayFixture::default()
        };
        RunRecord {
            type_name: "workflow_run".to_string(),
            id: "run_1".to_string(),
            workflow_id: "workflow_1".to_string(),
            status: status.to_string(),
            usage: Some(usage),
            replay_fixture: Some(fixture),
            ..RunRecord::default()
        }
    }

    #[test]
    fn eval_pack_manifest_toml_runs_replay_case() {
        let dir = tempfile::tempdir().unwrap();
        // Persist a completed run for the manifest's replay case to point at.
        let run_json = serde_json::to_string(&minimal_run("completed")).unwrap();
        fs::write(dir.path().join("run.json"), run_json).unwrap();
        let manifest_path = dir.path().join("harn.eval.toml");
        let manifest_toml = r#"
version = 1
id = "connector-regressions"
name = "Connector regressions"

[[cases]]
id = "webhook"
name = "Webhook normalization"
run = "run.json"
rubrics = ["status"]

[[rubrics]]
id = "status"
kind = "deterministic"

[[rubrics.assertions]]
kind = "run-status"
expected = "completed"
"#;
        fs::write(&manifest_path, manifest_toml).unwrap();

        let manifest = load_eval_pack_manifest(&manifest_path).unwrap();
        let report = evaluate_eval_pack_manifest(&manifest).unwrap();

        assert!(report.pass);
        assert_eq!(report.total, 1);
        assert_eq!(report.cases[0].label, "Webhook normalization");
    }

    #[test]
    fn eval_pack_warning_case_does_not_block() {
        let dir = tempfile::tempdir().unwrap();
        let run_json = serde_json::to_string(&minimal_run("completed")).unwrap();
        fs::write(dir.path().join("run.json"), run_json).unwrap();
        let manifest_path = dir.path().join("harn.eval.toml");
        // The case deliberately trips its 1ms latency budget, but is marked
        // `severity = "warning"`, so the pack as a whole must still pass.
        let manifest_toml = r#"
version = 1
id = "budgets"

[[cases]]
id = "latency-budget"
run = "run.json"
severity = "warning"

[cases.thresholds]
max-latency-ms = 1
"#;
        fs::write(&manifest_path, manifest_toml).unwrap();

        let manifest = load_eval_pack_manifest(&manifest_path).unwrap();
        let report = evaluate_eval_pack_manifest(&manifest).unwrap();

        assert!(report.pass);
        assert_eq!(report.warning_failed, 1);
        assert!(report.cases[0].warnings[0].contains("latency"));
    }

    #[test]
    fn eval_pack_manifest_runs_friction_context_pack_case() {
        let dir = tempfile::tempdir().unwrap();
        // Two matching repeated-query events so the suggestion engine sees a
        // recurring pattern worth packaging.
        let events_json = r#"
{
  "events": [
    {
      "kind": "repeated_query",
      "source": "incident-triage",
      "actor": "sre",
      "tool": "splunk",
      "provider": "splunk",
      "redacted_summary": "Checkout incidents need the same Splunk search",
      "recurrence_hints": ["checkout incident queries"],
      "estimated_time_ms": 300000,
      "metadata": {
        "query": "index=checkout service=api error",
        "capability": "splunk.search",
        "secret_ref": "SPLUNK_READ_TOKEN",
        "output_slot": "splunk_errors"
      }
    },
    {
      "kind": "repeated_query",
      "source": "incident-triage",
      "actor": "sre",
      "tool": "splunk",
      "provider": "splunk",
      "redacted_summary": "Checkout incident triage repeated the Splunk search",
      "recurrence_hints": ["checkout incident queries"],
      "estimated_time_ms": 240000,
      "metadata": {
        "query": "index=checkout service=api error",
        "capability": "splunk.search",
        "secret_ref": "SPLUNK_READ_TOKEN",
        "output_slot": "splunk_errors"
      }
    }
  ]
}
"#;
        fs::write(dir.path().join("incident-friction.json"), events_json).unwrap();
        let manifest_path = dir.path().join("harn.eval.toml");
        let manifest_toml = r#"
version = 1
id = "team-learning"
name = "Team learning evals"

[[fixtures]]
id = "incident-friction"
kind = "friction-events"
path = "incident-friction.json"

[[cases]]
id = "incident-context-pack"
name = "Incident context pack suggestion"
friction_events = "incident-friction"
rubrics = ["context-pack"]

[[rubrics]]
id = "context-pack"
kind = "friction"

[[rubrics.assertions]]
kind = "context-pack-suggestion"
contains = "incident"
expected = { min_suggestions = 1, recommended_artifact = "context_pack", required_capability = "splunk.search", required_output_slot = "splunk_errors" }
"#;
        fs::write(&manifest_path, manifest_toml).unwrap();

        let manifest = load_eval_pack_manifest(&manifest_path).unwrap();
        let report = evaluate_eval_pack_manifest(&manifest).unwrap();

        assert!(report.pass);
        assert_eq!(report.total, 1);
        assert_eq!(report.cases[0].run_id, "friction_events");
        assert_eq!(report.cases[0].stage_count, 2);
    }
}
3706}