1use std::collections::{BTreeMap, BTreeSet};
4use std::path::{Path, PathBuf};
5
6use serde::{Deserialize, Serialize};
7
8use super::{
9 default_run_dir, evaluate_context_pack_suggestion_expectations,
10 generate_context_pack_suggestions, new_id, normalize_friction_events_json, now_rfc3339,
11 parse_json_payload, parse_json_value, run_persona_eval_ladder, sync_run_handoffs,
12 ArtifactRecord, CapabilityPolicy, ContextPackSuggestionExpectation,
13 ContextPackSuggestionOptions, FrictionEvent, HandoffArtifact, PersonaEvalLadderManifest,
14 PersonaEvalLadderReport,
15};
16use crate::agent_events::AgentEvent;
17use crate::event_log::{
18 active_event_log, sanitize_topic_component, AnyEventLog, EventId, EventLog,
19 LogEvent as EventLogRecord, Topic,
20};
21use crate::llm::vm_value_to_json;
22use crate::personas::{
23 PersonaAssignmentStatus, PersonaBudgetStatus, PersonaHandoffInboxItem, PersonaQueuedWork,
24 PersonaStatus, PersonaValueReceipt,
25};
26use crate::triggers::{SignatureStatus, TriggerEvent};
27use crate::value::{VmError, VmValue};
28
// Node-kind discriminator strings for action-graph nodes (by naming
// convention these populate `RunActionGraphNodeRecord.kind` — confirm at
// the graph-building call sites).
pub const ACTION_GRAPH_NODE_KIND_RUN: &str = "run";
pub const ACTION_GRAPH_NODE_KIND_TRIGGER: &str = "trigger";
pub const ACTION_GRAPH_NODE_KIND_PREDICATE: &str = "predicate";
pub const ACTION_GRAPH_NODE_KIND_TRIGGER_PREDICATE: &str = "trigger_predicate";
pub const ACTION_GRAPH_NODE_KIND_STAGE: &str = "stage";
pub const ACTION_GRAPH_NODE_KIND_WORKER: &str = "worker";
pub const ACTION_GRAPH_NODE_KIND_DISPATCH: &str = "dispatch";
pub const ACTION_GRAPH_NODE_KIND_A2A_HOP: &str = "a2a_hop";
pub const ACTION_GRAPH_NODE_KIND_WORKER_ENQUEUE: &str = "worker_enqueue";
pub const ACTION_GRAPH_NODE_KIND_RETRY: &str = "retry";
pub const ACTION_GRAPH_NODE_KIND_DLQ: &str = "dlq";

// Edge-kind discriminator strings for action-graph edges (presumably
// `RunActionGraphEdgeRecord.kind`; same caveat as above).
pub const ACTION_GRAPH_EDGE_KIND_ENTRY: &str = "entry";
pub const ACTION_GRAPH_EDGE_KIND_TRIGGER_DISPATCH: &str = "trigger_dispatch";
pub const ACTION_GRAPH_EDGE_KIND_A2A_DISPATCH: &str = "a2a_dispatch";
pub const ACTION_GRAPH_EDGE_KIND_PREDICATE_GATE: &str = "predicate_gate";
pub const ACTION_GRAPH_EDGE_KIND_REPLAY_CHAIN: &str = "replay_chain";
pub const ACTION_GRAPH_EDGE_KIND_TRANSITION: &str = "transition";
pub const ACTION_GRAPH_EDGE_KIND_DELEGATES: &str = "delegates";
pub const ACTION_GRAPH_EDGE_KIND_RETRY: &str = "retry";
pub const ACTION_GRAPH_EDGE_KIND_DLQ_MOVE: &str = "dlq_move";
50
/// Aggregated LLM usage counters: tokens, wall-clock time, call count,
/// accumulated cost, and the model names involved. All fields default when
/// absent from serialized input (`serde(default)`).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct LlmUsageRecord {
    pub input_tokens: i64,
    pub output_tokens: i64,
    pub total_duration_ms: i64,
    pub call_count: i64,
    pub total_cost: f64,
    pub models: Vec<String>,
}
61
/// Persisted record of a single executed stage within a run, including its
/// outcome, transcript/verification payloads, artifact linkage, and every
/// retry attempt.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunStageRecord {
    pub id: String,
    /// Workflow node this stage executed.
    pub node_id: String,
    pub kind: String,
    pub status: String,
    pub outcome: String,
    pub branch: Option<String>,
    pub started_at: String,
    pub finished_at: Option<String>,
    /// User-visible output text, when any was produced.
    pub visible_text: Option<String>,
    pub private_reasoning: Option<String>,
    /// Free-form JSON transcript payload (shape not fixed here).
    pub transcript: Option<serde_json::Value>,
    pub verification: Option<serde_json::Value>,
    pub usage: Option<LlmUsageRecord>,
    pub artifacts: Vec<ArtifactRecord>,
    pub consumed_artifact_ids: Vec<String>,
    pub produced_artifact_ids: Vec<String>,
    /// One entry per execution attempt (retries append here).
    pub attempts: Vec<RunStageAttemptRecord>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
84
/// One execution attempt of a stage (see `RunStageRecord.attempts`), with
/// its own status/outcome, optional error, and timing.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunStageAttemptRecord {
    /// Ordinal of this attempt within the stage.
    pub attempt: usize,
    pub status: String,
    pub outcome: String,
    pub branch: Option<String>,
    pub error: Option<String>,
    pub verification: Option<serde_json::Value>,
    pub started_at: String,
    pub finished_at: Option<String>,
}
97
/// A recorded transition in run execution: movement from an (optional)
/// source stage/node to a destination node, with the artifacts consumed
/// and produced along the way.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunTransitionRecord {
    pub id: String,
    /// `None` for the run's entry transition, presumably — confirm at writers.
    pub from_stage_id: Option<String>,
    pub from_node_id: Option<String>,
    pub to_node_id: String,
    pub branch: Option<String>,
    pub timestamp: String,
    pub consumed_artifact_ids: Vec<String>,
    pub produced_artifact_ids: Vec<String>,
}
110
/// Scheduler checkpoint persisted during a run: which nodes were ready and
/// completed at `persisted_at`, and why the checkpoint was taken.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunCheckpointRecord {
    pub id: String,
    pub ready_nodes: Vec<String>,
    pub completed_nodes: Vec<String>,
    pub last_stage_id: Option<String>,
    pub persisted_at: String,
    pub reason: String,
}
121
/// Fixture describing how a source run should be replayed and evaluated:
/// expected final status plus per-stage assertions, optionally with a
/// clarifying-question eval spec.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayFixture {
    /// Serialized under the `_type` key for document-type tagging.
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub source_run_id: String,
    pub workflow_id: String,
    pub workflow_name: Option<String>,
    pub created_at: String,
    pub eval_kind: Option<String>,
    pub clarifying_question: Option<ClarifyingQuestionEvalSpec>,
    pub expected_status: String,
    pub stage_assertions: Vec<ReplayStageAssertion>,
}
137
/// Expectations for a clarifying-question eval: exact/accepted question
/// text, required and forbidden terms, and bounds on question count (see
/// `clarifying_min_questions` / `clarifying_max_questions` for how the
/// bounds are interpreted).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ClarifyingQuestionEvalSpec {
    pub expected_question: Option<String>,
    pub accepted_questions: Vec<String>,
    pub required_terms: Vec<String>,
    pub forbidden_terms: Vec<String>,
    pub min_questions: usize,
    pub max_questions: Option<usize>,
}
148
/// Per-stage expectation checked during replay evaluation, keyed by the
/// workflow node id.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayStageAssertion {
    pub node_id: String,
    pub expected_status: String,
    pub expected_outcome: String,
    pub expected_branch: Option<String>,
    pub required_artifact_kinds: Vec<String>,
    pub visible_text_contains: Option<String>,
}
159
/// Result of evaluating a single replay: overall pass/fail, collected
/// failure messages, and how many stages were inspected.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayEvalReport {
    pub pass: bool,
    pub failures: Vec<String>,
    pub stage_count: usize,
}
167
/// Per-case result within a replay eval suite, optionally including a
/// run-to-run diff when a comparison target was configured.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayEvalCaseReport {
    pub run_id: String,
    pub workflow_id: String,
    pub label: Option<String>,
    pub pass: bool,
    pub failures: Vec<String>,
    pub stage_count: usize,
    pub source_path: Option<String>,
    pub comparison: Option<RunDiffReport>,
}
180
/// Aggregate result of a replay eval suite: counts plus every case report.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayEvalSuiteReport {
    pub pass: bool,
    pub total: usize,
    pub passed: usize,
    pub failed: usize,
    pub cases: Vec<ReplayEvalCaseReport>,
}
190
/// Summary of one deliverable tracked in a run's task ledger.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunDeliverableSummaryRecord {
    pub id: String,
    pub text: String,
    pub status: String,
    pub note: Option<String>,
}
199
/// Snapshot of a task ledger: the root task, its rationale, deliverables,
/// observations, and the number of blocking items.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunTaskLedgerSummaryRecord {
    pub root_task: String,
    pub rationale: String,
    pub deliverables: Vec<RunDeliverableSummaryRecord>,
    pub observations: Vec<String>,
    pub blocking_count: usize,
}
209
/// Observability counters for one planner round (a stage execution),
/// covering LLM calls, tool usage/rejections, interventions, compaction,
/// fallbacks, and the ledger/research state at that point.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunPlannerRoundRecord {
    pub stage_id: String,
    pub node_id: String,
    pub stage_kind: String,
    pub status: String,
    pub outcome: String,
    pub iteration_count: usize,
    pub llm_call_count: usize,
    pub tool_execution_count: usize,
    pub tool_rejection_count: usize,
    pub intervention_count: usize,
    pub compaction_count: usize,
    /// Times a native text-mode tool fallback was taken / rejected.
    pub native_text_tool_fallback_count: usize,
    pub native_text_tool_fallback_rejection_count: usize,
    pub empty_completion_retry_count: usize,
    pub tools_used: Vec<String>,
    pub successful_tools: Vec<String>,
    /// Times a "done" claim was rejected against the ledger.
    pub ledger_done_rejections: usize,
    pub task_ledger: Option<RunTaskLedgerSummaryRecord>,
    pub research_facts: Vec<String>,
}
233
/// Lineage entry linking a spawned worker back to its parent stage and
/// sessions, plus pointers to the worker's own run/snapshot.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunWorkerLineageRecord {
    pub worker_id: String,
    pub worker_name: String,
    pub parent_stage_id: Option<String>,
    pub task: String,
    pub status: String,
    pub session_id: Option<String>,
    pub parent_session_id: Option<String>,
    pub run_id: Option<String>,
    pub run_path: Option<String>,
    pub snapshot_path: Option<String>,
}
248
/// Node of the run's action graph; `kind` takes one of the
/// `ACTION_GRAPH_NODE_KIND_*` values, and the optional ids cross-link to
/// stages, workers, and runs.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunActionGraphNodeRecord {
    pub id: String,
    pub label: String,
    pub kind: String,
    pub status: String,
    pub outcome: String,
    pub trace_id: Option<String>,
    pub stage_id: Option<String>,
    pub node_id: Option<String>,
    pub worker_id: Option<String>,
    pub run_id: Option<String>,
    pub run_path: Option<String>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
265
/// Directed edge of the run's action graph; `kind` takes one of the
/// `ACTION_GRAPH_EDGE_KIND_*` values.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunActionGraphEdgeRecord {
    pub from_id: String,
    pub to_id: String,
    pub kind: String,
    pub label: Option<String>,
}
274
/// Verification result for a stage; `passed` is `None` when no verdict
/// was produced.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunVerificationOutcomeRecord {
    pub stage_id: String,
    pub node_id: String,
    pub status: String,
    pub passed: Option<bool>,
    pub summary: Option<String>,
}
284
/// Pointer to a transcript stored elsewhere; `available` indicates whether
/// the referenced location could be resolved.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunTranscriptPointerRecord {
    pub id: String,
    pub label: String,
    pub kind: String,
    pub location: String,
    pub path: Option<String>,
    pub available: bool,
}
295
/// Record of a transcript-compaction event: what was archived, the token
/// estimates before/after, and where the pre-compaction snapshot lives.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct CompactionEventRecord {
    pub id: String,
    pub transcript_id: Option<String>,
    pub stage_id: Option<String>,
    pub node_id: Option<String>,
    pub mode: String,
    pub strategy: String,
    pub archived_messages: usize,
    /// Token counts are estimates, not exact tokenizer output.
    pub estimated_tokens_before: usize,
    pub estimated_tokens_after: usize,
    pub snapshot_asset_id: Option<String>,
    pub snapshot_location: String,
    pub snapshot_path: Option<String>,
    pub available: bool,
}
313
/// Lifecycle event kinds for a daemon, serialized in snake_case
/// ("spawned", "triggered", …). Defaults to `Spawned`.
#[derive(Clone, Copy, Debug, Default, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
#[serde(rename_all = "snake_case")]
pub enum DaemonEventKindRecord {
    #[default]
    Spawned,
    Triggered,
    Snapshotted,
    Resumed,
    Stopped,
}
324
/// One daemon lifecycle event, with its persistence path and an optional
/// human-readable payload summary.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct DaemonEventRecord {
    pub daemon_id: String,
    pub name: String,
    pub kind: DaemonEventKindRecord,
    pub timestamp: String,
    pub persist_path: String,
    pub payload_summary: Option<String>,
}
335
/// Versioned container for all observability data attached to a run:
/// planner rounds, the action graph, worker lineage, verification
/// outcomes, transcript pointers, compaction and daemon events.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunObservabilityRecord {
    /// Bump when the layout of this record changes.
    pub schema_version: usize,
    pub planner_rounds: Vec<RunPlannerRoundRecord>,
    pub research_fact_count: usize,
    pub action_graph_nodes: Vec<RunActionGraphNodeRecord>,
    pub action_graph_edges: Vec<RunActionGraphEdgeRecord>,
    pub worker_lineage: Vec<RunWorkerLineageRecord>,
    pub verification_outcomes: Vec<RunVerificationOutcomeRecord>,
    pub transcript_pointers: Vec<RunTranscriptPointerRecord>,
    pub compaction_events: Vec<CompactionEventRecord>,
    pub daemon_events: Vec<DaemonEventRecord>,
}
350
/// One stage-level difference found when diffing two runs.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunStageDiffRecord {
    pub node_id: String,
    pub change: String,
    pub details: Vec<String>,
}
358
/// Difference between matching tool calls (same tool + args hash) across
/// two runs; left/right results are kept when they diverge.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ToolCallDiffRecord {
    pub tool_name: String,
    pub args_hash: String,
    pub result_changed: bool,
    pub left_result: Option<String>,
    pub right_result: Option<String>,
}
368
/// Difference found in a named section of two runs' observability records.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunObservabilityDiffRecord {
    pub section: String,
    pub label: String,
    pub details: Vec<String>,
}
376
/// Full comparison of two runs ("left" vs "right"): status change, stage /
/// tool / observability diffs, and signed deltas of transition, artifact,
/// and checkpoint counts (right minus left, presumably — confirm at the
/// diff builder).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunDiffReport {
    pub left_run_id: String,
    pub right_run_id: String,
    pub identical: bool,
    pub status_changed: bool,
    pub left_status: String,
    pub right_status: String,
    pub stage_diffs: Vec<RunStageDiffRecord>,
    pub tool_diffs: Vec<ToolCallDiffRecord>,
    pub observability_diffs: Vec<RunObservabilityDiffRecord>,
    pub transition_count_delta: isize,
    pub artifact_count_delta: isize,
    pub checkpoint_count_delta: isize,
}
393
/// Manifest listing replay-eval cases; `base_dir` optionally anchors the
/// relative paths in `cases`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalSuiteManifest {
    /// Serialized under the `_type` key for document-type tagging.
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub name: Option<String>,
    pub base_dir: Option<String>,
    pub cases: Vec<EvalSuiteCase>,
}
404
/// One case in an eval suite: the run to evaluate, an optional fixture to
/// assert against, and an optional run to diff against.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalSuiteCase {
    pub label: Option<String>,
    pub run_path: String,
    pub fixture_path: Option<String>,
    pub compare_to: Option<String>,
}
413
/// Top-level eval-pack manifest: packaging info, defaults, fixtures,
/// rubrics, an optional shared judge config, concrete cases, and persona
/// eval ladders.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackManifest {
    pub version: u32,
    pub id: String,
    pub name: Option<String>,
    pub description: Option<String>,
    pub base_dir: Option<String>,
    pub baseline: Option<String>,
    pub package: Option<EvalPackPackage>,
    pub defaults: EvalPackDefaults,
    pub fixtures: Vec<EvalPackFixtureRef>,
    pub rubrics: Vec<EvalPackRubric>,
    pub judge: Option<EvalPackJudgeConfig>,
    pub cases: Vec<EvalPackCase>,
    pub ladders: Vec<PersonaEvalLadderManifest>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
432
/// Packaging metadata for an eval pack (name/version/source and template
/// references).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalPackPackage {
    pub name: Option<String>,
    pub version: Option<String>,
    pub source: Option<String>,
    pub templates: Vec<String>,
}
441
/// Pack-wide defaults applied to cases that do not override them.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackDefaults {
    pub severity: Option<String>,
    pub fixture_root: Option<String>,
    pub thresholds: EvalPackThresholds,
    pub judge: Option<EvalPackJudgeConfig>,
}
450
/// Reference to a fixture used by eval-pack cases; content may come from a
/// file (`path`) or be embedded (`inline`). Kebab-case aliases accept
/// hand-written manifests.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackFixtureRef {
    pub id: String,
    pub kind: String,
    pub path: Option<String>,
    #[serde(default, alias = "trace-id")]
    pub trace_id: Option<String>,
    pub provider: Option<String>,
    #[serde(default, alias = "event-kind")]
    pub event_kind: Option<String>,
    pub inline: Option<serde_json::Value>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
465
/// A rubric an eval case can reference: declarative assertions and/or an
/// LLM-judge prompt with calibration examples and pass thresholds.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackRubric {
    pub id: String,
    pub kind: String,
    pub description: Option<String>,
    pub prompt: Option<String>,
    pub assertions: Vec<EvalPackAssertion>,
    pub judge: Option<EvalPackJudgeConfig>,
    /// Golden examples used to calibrate the judge.
    pub calibration: Vec<EvalPackGoldenExample>,
    pub thresholds: EvalPackThresholds,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
479
/// One declarative assertion within a rubric: a path/op/expected triple
/// and/or a substring check, optionally scoped to a stage.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackAssertion {
    pub kind: String,
    pub stage: Option<String>,
    pub path: Option<String>,
    pub op: Option<String>,
    pub expected: Option<serde_json::Value>,
    pub contains: Option<String>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
491
/// LLM-judge configuration: model, prompt version, tie-break policy,
/// minimum confidence, sampling temperature, and an optional rubric
/// reference. Kebab-case aliases accept hand-written manifests.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackJudgeConfig {
    pub model: Option<String>,
    #[serde(default, alias = "prompt-version")]
    pub prompt_version: Option<String>,
    #[serde(default, alias = "tie-break")]
    pub tie_break: Option<String>,
    #[serde(default, alias = "confidence-min")]
    pub confidence_min: Option<f64>,
    pub temperature: Option<f64>,
    pub rubric: Option<String>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
506
/// Golden input/output pair (with optional score and explanation) used to
/// calibrate a judge.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackGoldenExample {
    pub input: serde_json::Value,
    pub output: serde_json::Value,
    pub score: Option<f64>,
    pub explanation: Option<String>,
}
515
/// Pass/fail thresholds for a case or rubric; every bound is optional and
/// kebab-case aliases accept hand-written manifests.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackThresholds {
    pub severity: Option<String>,
    #[serde(default, alias = "min-score")]
    pub min_score: Option<f64>,
    #[serde(default, alias = "min-confidence")]
    pub min_confidence: Option<f64>,
    #[serde(default, alias = "max-cost-usd")]
    pub max_cost_usd: Option<f64>,
    #[serde(default, alias = "max-latency-ms")]
    pub max_latency_ms: Option<i64>,
    #[serde(default, alias = "max-tokens")]
    pub max_tokens: Option<i64>,
    #[serde(default, alias = "max-stage-count")]
    pub max_stage_count: Option<usize>,
}
533
/// One eval-pack case: which run/fixture to evaluate, which rubrics to
/// apply, and case-level severity/threshold overrides. Kebab-case aliases
/// accept hand-written manifests.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackCase {
    pub id: Option<String>,
    pub name: Option<String>,
    pub description: Option<String>,
    pub run: Option<String>,
    #[serde(default, alias = "run-path")]
    pub run_path: Option<String>,
    #[serde(default, alias = "friction-events", alias = "friction_events")]
    pub friction_events: Option<String>,
    pub fixture: Option<String>,
    #[serde(default, alias = "fixture-path")]
    pub fixture_path: Option<String>,
    #[serde(default, alias = "compare-to")]
    pub compare_to: Option<String>,
    pub rubrics: Vec<String>,
    pub severity: Option<String>,
    pub thresholds: EvalPackThresholds,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
555
/// Aggregate eval-pack result: totals, failure counts broken down by
/// severity, and the individual case and ladder reports.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackReport {
    pub pack_id: String,
    pub pass: bool,
    pub total: usize,
    pub passed: usize,
    pub failed: usize,
    pub blocking_failed: usize,
    pub warning_failed: usize,
    pub informational_failed: usize,
    pub cases: Vec<EvalPackCaseReport>,
    pub ladders: Vec<PersonaEvalLadderReport>,
}
570
/// Per-case eval-pack result with findings split by severity and an
/// optional run comparison.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalPackCaseReport {
    pub id: String,
    pub label: String,
    pub severity: String,
    pub pass: bool,
    /// Whether a failure of this case blocks the pack.
    pub blocking: bool,
    pub run_id: String,
    pub workflow_id: String,
    pub source_path: Option<String>,
    pub stage_count: usize,
    pub failures: Vec<String>,
    pub warnings: Vec<String>,
    pub informational: Vec<String>,
    pub comparison: Option<RunDiffReport>,
}
588
/// A human-in-the-loop question asked during a run, keyed by request id
/// (see `merge_hitl_questions_from_active_log`).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunHitlQuestionRecord {
    pub request_id: String,
    pub prompt: String,
    pub agent: String,
    pub trace_id: Option<String>,
    pub asked_at: String,
}
598
/// Serializable snapshot of a persona's runtime state attached to a run;
/// produced from a live `PersonaStatus` via the `From` impl below.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunPersonaRuntimeRecord {
    pub name: String,
    pub role: String,
    pub template_ref: Option<String>,
    /// Persona state flattened to its string label.
    pub state: String,
    pub entry_workflow: String,
    pub current_assignment: Option<PersonaAssignmentStatus>,
    pub queued_work: Vec<PersonaQueuedWork>,
    pub handoff_inbox: Vec<PersonaHandoffInboxItem>,
    pub budget: PersonaBudgetStatus,
    pub value_receipts: Vec<PersonaValueReceipt>,
    pub last_run: Option<String>,
    pub last_error: Option<String>,
}
615
/// Snapshot a live `PersonaStatus` into the serializable run-record form.
impl From<&PersonaStatus> for RunPersonaRuntimeRecord {
    fn from(status: &PersonaStatus) -> Self {
        Self {
            name: status.name.clone(),
            role: status.role.clone(),
            template_ref: status.template_ref.clone(),
            // The state enum is persisted as its string label.
            state: status.state.as_str().to_string(),
            entry_workflow: status.entry_workflow.clone(),
            current_assignment: status.current_assignment.clone(),
            queued_work: status.queued_work.clone(),
            handoff_inbox: status.handoff_inbox.clone(),
            budget: status.budget.clone(),
            value_receipts: status.value_receipts.clone(),
            last_run: status.last_run.clone(),
            last_error: status.last_error.clone(),
        }
    }
}
634
/// The root persisted record of a workflow run: identity and lineage,
/// execution status and timing, every stage/transition/checkpoint, child
/// runs, artifacts and handoffs, policy, observability, recordings, HITL
/// questions, and persona snapshots.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunRecord {
    /// Serialized under the `_type` key for document-type tagging.
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub workflow_id: String,
    pub workflow_name: Option<String>,
    pub task: String,
    pub status: String,
    pub started_at: String,
    pub finished_at: Option<String>,
    /// Lineage: immediate parent and root of the run tree, when nested.
    pub parent_run_id: Option<String>,
    pub root_run_id: Option<String>,
    pub stages: Vec<RunStageRecord>,
    pub transitions: Vec<RunTransitionRecord>,
    pub checkpoints: Vec<RunCheckpointRecord>,
    pub pending_nodes: Vec<String>,
    pub completed_nodes: Vec<String>,
    pub child_runs: Vec<RunChildRecord>,
    pub artifacts: Vec<ArtifactRecord>,
    pub handoffs: Vec<HandoffArtifact>,
    pub policy: CapabilityPolicy,
    pub execution: Option<RunExecutionRecord>,
    pub transcript: Option<serde_json::Value>,
    pub usage: Option<LlmUsageRecord>,
    pub replay_fixture: Option<ReplayFixture>,
    pub observability: Option<RunObservabilityRecord>,
    pub trace_spans: Vec<RunTraceSpanRecord>,
    /// Recorded tool calls (see `tool_fixture_hash` for the args hash).
    pub tool_recordings: Vec<ToolCallRecord>,
    pub hitl_questions: Vec<RunHitlQuestionRecord>,
    pub persona_runtime: Vec<RunPersonaRuntimeRecord>,
    pub metadata: BTreeMap<String, serde_json::Value>,
    /// Where this record was written, when known.
    pub persisted_path: Option<String>,
}
670
/// Recording of a single tool invocation: identity, argument fingerprint
/// (see `tool_fixture_hash`), result text, rejection flag, and timing.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ToolCallRecord {
    pub tool_name: String,
    pub tool_use_id: String,
    pub args_hash: String,
    pub result: String,
    pub is_rejected: bool,
    pub duration_ms: u64,
    /// Planner iteration during which the call was made.
    pub iteration: usize,
    pub timestamp: String,
}
683
684pub fn tool_fixture_hash(tool_name: &str, args: &serde_json::Value) -> String {
686 use std::hash::{Hash, Hasher};
687 let mut hasher = std::collections::hash_map::DefaultHasher::new();
688 tool_name.hash(&mut hasher);
689 let args_str = serde_json::to_string(args).unwrap_or_default();
690 args_str.hash(&mut hasher);
691 format!("{:016x}", hasher.finish())
692}
693
/// One trace span captured during a run; `parent_id` links spans into a
/// tree, and times are in milliseconds.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunTraceSpanRecord {
    pub span_id: u64,
    pub parent_id: Option<u64>,
    pub kind: String,
    pub name: String,
    pub start_ms: u64,
    pub duration_ms: u64,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
705
/// Record of a child run spawned by a worker: identity, sessions, audit
/// fields (mutation scope, approval policy), task, status/timing, and
/// pointers to the child's own run and snapshot. Built from worker
/// metadata by `run_child_record_from_worker_metadata`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunChildRecord {
    pub worker_id: String,
    pub worker_name: String,
    pub parent_stage_id: Option<String>,
    pub session_id: Option<String>,
    pub parent_session_id: Option<String>,
    pub mutation_scope: Option<String>,
    pub approval_policy: Option<super::ToolApprovalPolicy>,
    pub task: String,
    /// Raw request/provenance JSON as captured from worker metadata.
    pub request: Option<serde_json::Value>,
    pub provenance: Option<serde_json::Value>,
    pub status: String,
    pub started_at: String,
    pub finished_at: Option<String>,
    pub run_id: Option<String>,
    pub run_path: Option<String>,
    pub snapshot_path: Option<String>,
    pub execution: Option<RunExecutionRecord>,
}
727
728pub(crate) fn run_child_record_from_worker_metadata(
729 parent_stage_id: Option<String>,
730 worker: &serde_json::Value,
731) -> Option<RunChildRecord> {
732 let worker_id = worker.get("id").and_then(|value| value.as_str())?;
733 if worker_id.is_empty() {
734 return None;
735 }
736 Some(RunChildRecord {
737 worker_id: worker_id.to_string(),
738 worker_name: worker
739 .get("name")
740 .and_then(|value| value.as_str())
741 .unwrap_or("worker")
742 .to_string(),
743 parent_stage_id,
744 session_id: worker
745 .get("audit")
746 .and_then(|value| value.get("session_id"))
747 .and_then(|value| value.as_str())
748 .map(str::to_string),
749 parent_session_id: worker
750 .get("audit")
751 .and_then(|value| value.get("parent_session_id"))
752 .and_then(|value| value.as_str())
753 .map(str::to_string),
754 mutation_scope: worker
755 .get("audit")
756 .and_then(|value| value.get("mutation_scope"))
757 .and_then(|value| value.as_str())
758 .map(str::to_string),
759 approval_policy: worker
760 .get("audit")
761 .and_then(|value| value.get("approval_policy"))
762 .and_then(|value| {
763 serde_json::from_value::<super::ToolApprovalPolicy>(value.clone()).ok()
764 }),
765 task: worker
766 .get("task")
767 .and_then(|value| value.as_str())
768 .unwrap_or_default()
769 .to_string(),
770 request: worker.get("request").cloned(),
771 provenance: worker.get("provenance").cloned(),
772 status: worker
773 .get("status")
774 .and_then(|value| value.as_str())
775 .unwrap_or("completed")
776 .to_string(),
777 started_at: worker
778 .get("started_at")
779 .and_then(|value| value.as_str())
780 .unwrap_or_default()
781 .to_string(),
782 finished_at: worker
783 .get("finished_at")
784 .and_then(|value| value.as_str())
785 .map(str::to_string),
786 run_id: worker
787 .get("child_run_id")
788 .and_then(|value| value.as_str())
789 .map(str::to_string),
790 run_path: worker
791 .get("child_run_path")
792 .and_then(|value| value.as_str())
793 .map(str::to_string),
794 snapshot_path: worker
795 .get("snapshot_path")
796 .and_then(|value| value.as_str())
797 .map(str::to_string),
798 execution: worker
799 .get("execution")
800 .cloned()
801 .and_then(|value| serde_json::from_value(value).ok()),
802 })
803}
804
805fn run_child_from_stage_metadata(stage: &RunStageRecord) -> Option<RunChildRecord> {
806 let parent_stage_id = if stage.id.is_empty() {
807 None
808 } else {
809 Some(stage.id.clone())
810 };
811 run_child_record_from_worker_metadata(parent_stage_id, stage.metadata.get("worker")?)
812}
813
814fn fill_missing_child_run_fields(existing: &mut RunChildRecord, child: RunChildRecord) {
815 if existing.worker_name.is_empty() {
816 existing.worker_name = child.worker_name;
817 }
818 if existing.parent_stage_id.is_none() {
819 existing.parent_stage_id = child.parent_stage_id;
820 }
821 if existing.session_id.is_none() {
822 existing.session_id = child.session_id;
823 }
824 if existing.parent_session_id.is_none() {
825 existing.parent_session_id = child.parent_session_id;
826 }
827 if existing.mutation_scope.is_none() {
828 existing.mutation_scope = child.mutation_scope;
829 }
830 if existing.approval_policy.is_none() {
831 existing.approval_policy = child.approval_policy;
832 }
833 if existing.task.is_empty() {
834 existing.task = child.task;
835 }
836 if existing.request.is_none() {
837 existing.request = child.request;
838 }
839 if existing.provenance.is_none() {
840 existing.provenance = child.provenance;
841 }
842 if existing.status.is_empty() {
843 existing.status = child.status;
844 }
845 if existing.started_at.is_empty() {
846 existing.started_at = child.started_at;
847 }
848 if existing.finished_at.is_none() {
849 existing.finished_at = child.finished_at;
850 }
851 if existing.run_id.is_none() {
852 existing.run_id = child.run_id;
853 }
854 if existing.run_path.is_none() {
855 existing.run_path = child.run_path;
856 }
857 if existing.snapshot_path.is_none() {
858 existing.snapshot_path = child.snapshot_path;
859 }
860 if existing.execution.is_none() {
861 existing.execution = child.execution;
862 }
863}
864
865fn materialize_child_runs_from_stage_metadata(run: &mut RunRecord) {
866 for child in run
867 .stages
868 .iter()
869 .filter_map(run_child_from_stage_metadata)
870 .collect::<Vec<_>>()
871 {
872 match run
873 .child_runs
874 .iter_mut()
875 .find(|existing| existing.worker_id == child.worker_id)
876 {
877 Some(existing) => fill_missing_child_run_fields(existing, child),
878 None => run.child_runs.push(child),
879 }
880 }
881}
882
/// Execution environment captured for a run: working directories,
/// environment variables, adapter name, and repo/worktree checkout details.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunExecutionRecord {
    pub cwd: Option<String>,
    pub source_dir: Option<String>,
    pub env: BTreeMap<String, String>,
    pub adapter: Option<String>,
    pub repo_path: Option<String>,
    pub worktree_path: Option<String>,
    pub branch: Option<String>,
    pub base_ref: Option<String>,
    /// Cleanup policy label, presumably — confirm against producers.
    pub cleanup: Option<String>,
}
896
897fn compact_json_value(value: &serde_json::Value) -> String {
898 serde_json::to_string(value).unwrap_or_else(|_| value.to_string())
899}
900
/// Canonicalize question text for comparison: ASCII alphanumeric runs
/// become lowercase words, every other character is a separator, and
/// separator runs collapse into single spaces with no leading or trailing
/// space.
fn normalize_question_text(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());
    let mut separator_pending = false;
    for ch in text.chars() {
        if ch.is_ascii_alphanumeric() {
            // Emit one space for any run of separators between words.
            if separator_pending && !normalized.is_empty() {
                normalized.push(' ');
            }
            separator_pending = false;
            normalized.push(ch.to_ascii_lowercase());
        } else {
            separator_pending = true;
        }
    }
    normalized
}
915
916fn clarifying_min_questions(spec: &ClarifyingQuestionEvalSpec) -> usize {
917 spec.min_questions.max(1)
918}
919
920fn clarifying_max_questions(spec: &ClarifyingQuestionEvalSpec) -> usize {
921 spec.max_questions.unwrap_or(1).max(1)
922}
923
/// Synchronously drain all records currently stored on `topic`, paging
/// through the log 256 records at a time.
///
/// Read errors are swallowed (`unwrap_or_default` yields an empty batch),
/// which ends the loop as if the topic were exhausted — best effort by
/// design.
fn read_topic_records(
    log: &AnyEventLog,
    topic: &Topic,
) -> Vec<(crate::event_log::EventId, EventLogRecord)> {
    let mut from = None;
    let mut records = Vec::new();
    loop {
        // Blocking bridge into the async log API.
        let batch =
            futures::executor::block_on(log.read_range(topic, from, 256)).unwrap_or_default();
        if batch.is_empty() {
            break;
        }
        // Resume after the last id seen.
        // NOTE(review): assumes `read_range` treats `from` as exclusive;
        // if it were inclusive this loop would never terminate — confirm.
        from = batch.last().map(|(event_id, _)| *event_id);
        records.extend(batch);
    }
    records
}
941
/// An agent event read back from the event log for replay, paired with the
/// log id it was stored under.
#[derive(Clone, Debug)]
pub struct AgentSessionReplayEvent {
    pub event_id: EventId,
    pub event: AgentEvent,
}
947
/// Load every `AgentEvent` recorded for `session_id` from the active event
/// log's per-session observability topic.
///
/// Returns `Ok(vec![])` when no event log is active. A record is kept only
/// if both its `session_id` header and the decoded event's own session id
/// match (the topic name alone is sanitized and so, presumably, could
/// collide across distinct ids — confirm). Paging stops at the first batch
/// shorter than 1024 records.
///
/// # Errors
/// Fails if the topic name cannot be built, a log read fails, or a stored
/// payload cannot be decoded as an `AgentEvent`.
pub async fn load_agent_session_replay_events(
    session_id: &str,
) -> Result<Vec<AgentSessionReplayEvent>, VmError> {
    let Some(log) = active_event_log() else {
        return Ok(Vec::new());
    };
    let topic = Topic::new(format!(
        "observability.agent_events.{}",
        sanitize_topic_component(session_id)
    ))
    .map_err(|error| VmError::Runtime(format!("failed to build agent event topic: {error}")))?;

    let mut events = Vec::new();
    let mut from = None;
    loop {
        let batch = log.read_range(&topic, from, 1024).await.map_err(|error| {
            VmError::Runtime(format!(
                "failed to read agent event replay topic {}: {error}",
                topic.as_str()
            ))
        })?;
        let batch_len = batch.len();
        for (event_id, record) in batch {
            // Advance the cursor even for records we skip, so paging
            // always makes progress.
            from = Some(event_id);
            if record.headers.get("session_id").map(String::as_str) != Some(session_id) {
                continue;
            }
            let Some(event_value) = record.payload.get("event").cloned() else {
                continue;
            };
            let event = serde_json::from_value::<AgentEvent>(event_value).map_err(|error| {
                VmError::Runtime(format!(
                    "failed to decode agent event replay record {event_id}: {error}"
                ))
            })?;
            if event.session_id() == session_id {
                events.push(AgentSessionReplayEvent { event_id, event });
            }
        }
        // A short batch means the topic is exhausted.
        if batch_len < 1024 {
            break;
        }
    }
    Ok(events)
}
993
/// Merge HITL "question asked" events from the active event log into
/// `run.hitl_questions`, deduplicating by request id (log entries win over
/// pre-existing records) and sorting by `(asked_at, request_id)`.
///
/// A no-op when no event log is active. Events are matched to the run via
/// the `run_id` header or the `run_id` payload field.
fn merge_hitl_questions_from_active_log(run: &mut RunRecord) {
    let Some(log) = active_event_log() else {
        return;
    };
    let topic = Topic::new(crate::HITL_QUESTIONS_TOPIC)
        .expect("static hitl.questions topic should always be valid");
    // Index existing questions by request id so log records can overwrite
    // or extend them.
    let mut merged = run
        .hitl_questions
        .iter()
        .cloned()
        .map(|question| (question.request_id.clone(), question))
        .collect::<BTreeMap<_, _>>();

    for (_, event) in read_topic_records(log.as_ref(), &topic) {
        if event.kind != "hitl.question_asked" {
            continue;
        }
        let payload = &event.payload;
        let matches_run = event
            .headers
            .get("run_id")
            .is_some_and(|value| value == &run.id)
            || payload
                .get("run_id")
                .and_then(|value| value.as_str())
                .is_some_and(|value| value == run.id);
        if !matches_run {
            continue;
        }
        // Prefer the payload's request id, falling back to the header.
        let request_id = payload
            .get("request_id")
            .and_then(|value| value.as_str())
            .or_else(|| event.headers.get("request_id").map(String::as_str))
            .unwrap_or_default();
        let prompt = payload
            .get("payload")
            .and_then(|value| value.get("prompt"))
            .and_then(|value| value.as_str())
            .unwrap_or_default();
        // Records without both an id and a prompt are unusable.
        if request_id.is_empty() || prompt.is_empty() {
            continue;
        }
        merged.insert(
            request_id.to_string(),
            RunHitlQuestionRecord {
                request_id: request_id.to_string(),
                prompt: prompt.to_string(),
                agent: payload
                    .get("agent")
                    .and_then(|value| value.as_str())
                    .unwrap_or_default()
                    .to_string(),
                trace_id: payload
                    .get("trace_id")
                    .and_then(|value| value.as_str())
                    .map(str::to_string),
                asked_at: payload
                    .get("requested_at")
                    .and_then(|value| value.as_str())
                    .unwrap_or_default()
                    .to_string(),
            },
        );
    }

    run.hitl_questions = merged.into_values().collect();
    // Deterministic ordering: chronological, then by request id.
    run.hitl_questions.sort_by(|left, right| {
        (left.asked_at.as_str(), left.request_id.as_str())
            .cmp(&(right.asked_at.as_str(), right.request_id.as_str()))
    });
}
1065
1066fn signature_status_label(status: &SignatureStatus) -> &'static str {
1067 match status {
1068 SignatureStatus::Verified => "verified",
1069 SignatureStatus::Unsigned => "unsigned",
1070 SignatureStatus::Failed { .. } => "failed",
1071 }
1072}
1073
1074fn trigger_event_from_run(run: &RunRecord) -> Option<TriggerEvent> {
1075 run.metadata
1076 .get("trigger_event")
1077 .cloned()
1078 .and_then(|value| serde_json::from_value(value).ok())
1079}
1080
1081fn run_trace_id(run: &RunRecord, trigger_event: Option<&TriggerEvent>) -> Option<String> {
1082 trigger_event
1083 .map(|event| event.trace_id.0.clone())
1084 .or_else(|| {
1085 run.metadata
1086 .get("trace_id")
1087 .and_then(|value| value.as_str())
1088 .map(str::to_string)
1089 })
1090}
1091
1092fn replay_of_event_id_from_run(run: &RunRecord) -> Option<String> {
1093 run.metadata
1094 .get("replay_of_event_id")
1095 .and_then(|value| value.as_str())
1096 .map(str::to_string)
1097}
1098
1099fn action_graph_kind_for_stage(stage: &RunStageRecord) -> &'static str {
1100 if stage.kind == "condition" {
1101 ACTION_GRAPH_NODE_KIND_PREDICATE
1102 } else {
1103 ACTION_GRAPH_NODE_KIND_STAGE
1104 }
1105}
1106
1107fn trigger_node_metadata(trigger_event: &TriggerEvent) -> BTreeMap<String, serde_json::Value> {
1108 let mut metadata = BTreeMap::new();
1109 metadata.insert(
1110 "provider".to_string(),
1111 serde_json::json!(trigger_event.provider.as_str()),
1112 );
1113 metadata.insert(
1114 "event_kind".to_string(),
1115 serde_json::json!(trigger_event.kind),
1116 );
1117 metadata.insert(
1118 "dedupe_key".to_string(),
1119 serde_json::json!(trigger_event.dedupe_key),
1120 );
1121 metadata.insert(
1122 "signature_status".to_string(),
1123 serde_json::json!(signature_status_label(&trigger_event.signature_status)),
1124 );
1125 metadata
1126}
1127
1128fn stage_node_metadata(stage: &RunStageRecord) -> BTreeMap<String, serde_json::Value> {
1129 let mut metadata = BTreeMap::new();
1130 metadata.insert("stage_kind".to_string(), serde_json::json!(stage.kind));
1131 if let Some(branch) = stage.branch.as_ref() {
1132 metadata.insert("branch".to_string(), serde_json::json!(branch));
1133 }
1134 if let Some(worker_id) = stage
1135 .metadata
1136 .get("worker_id")
1137 .and_then(|value| value.as_str())
1138 {
1139 metadata.insert("worker_id".to_string(), serde_json::json!(worker_id));
1140 }
1141 metadata
1142}
1143
/// Appends a node to the in-progress action graph.
///
/// NOTE(review): currently a plain `push`; presumably kept as a dedicated
/// helper so every node insertion flows through one place.
fn append_action_graph_node(
    nodes: &mut Vec<RunActionGraphNodeRecord>,
    record: RunActionGraphNodeRecord,
) {
    nodes.push(record);
}
1150
1151pub async fn append_action_graph_update(
1152 headers: BTreeMap<String, String>,
1153 payload: serde_json::Value,
1154) -> Result<(), crate::event_log::LogError> {
1155 let Some(log) = active_event_log() else {
1156 return Ok(());
1157 };
1158 let topic = Topic::new("observability.action_graph")
1159 .expect("static observability.action_graph topic should always be valid");
1160 let record = EventLogRecord::new("action_graph_update", payload).with_headers(headers);
1161 log.append(&topic, record).await.map(|_| ())
1162}
1163
/// Publishes an `action_graph_update` event describing the run's current
/// observability snapshot. Best-effort: append failures are silently ignored.
fn publish_action_graph_event(
    run: &RunRecord,
    observability: &RunObservabilityRecord,
    path: &Path,
) {
    let trigger_event = trigger_event_from_run(run);
    let mut headers = BTreeMap::new();
    headers.insert("run_id".to_string(), run.id.clone());
    headers.insert("workflow_id".to_string(), run.workflow_id.clone());
    if let Some(trace_id) = run_trace_id(run, trigger_event.as_ref()) {
        headers.insert("trace_id".to_string(), trace_id);
    }
    let payload = serde_json::json!({
        "run_id": run.id,
        "workflow_id": run.workflow_id,
        "persisted_path": path.to_string_lossy(),
        "status": run.status,
        "observability": observability,
    });
    // Inside a tokio runtime, append asynchronously (fire-and-forget) so the
    // caller is not blocked; outside one, fall back to a blocking append.
    if let Ok(handle) = tokio::runtime::Handle::try_current() {
        handle.spawn(async move {
            let _ = append_action_graph_update(headers, payload).await;
        });
    } else {
        let _ = futures::executor::block_on(append_action_graph_update(headers, payload));
    }
}
1191
/// Computes the LLM transcript sidecar path for a run document: a run stored
/// at `runs/foo.json` maps to `runs/foo-llm/llm_transcript.jsonl`.
///
/// Returns `None` when the run path has no file stem or the stem is not
/// valid UTF-8.
fn llm_transcript_sidecar_path(run_path: &Path) -> Option<PathBuf> {
    let stem = run_path.file_stem()?.to_str()?;
    let parent = run_path.parent().unwrap_or_else(|| Path::new("."));
    // Join path components instead of embedding a literal '/' in the format
    // string so the result always uses the platform's separator.
    Some(parent.join(format!("{stem}-llm")).join("llm_transcript.jsonl"))
}
1197
1198fn json_string_array(value: Option<&serde_json::Value>) -> Vec<String> {
1199 value
1200 .and_then(|value| value.as_array())
1201 .map(|items| {
1202 items
1203 .iter()
1204 .filter_map(|item| item.as_str().map(str::to_string))
1205 .collect::<Vec<_>>()
1206 })
1207 .unwrap_or_default()
1208}
1209
1210fn json_usize(value: Option<&serde_json::Value>) -> usize {
1211 value.and_then(|value| value.as_u64()).unwrap_or_default() as usize
1212}
1213
1214fn json_bool(value: Option<&serde_json::Value>) -> Option<bool> {
1215 value.and_then(|value| value.as_bool())
1216}
1217
1218fn stage_result_payload(stage: &RunStageRecord) -> Option<&serde_json::Value> {
1219 stage
1220 .artifacts
1221 .iter()
1222 .find_map(|artifact| artifact.data.as_ref())
1223}
1224
1225fn task_ledger_summary_from_value(value: &serde_json::Value) -> Option<RunTaskLedgerSummaryRecord> {
1226 let deliverables = value
1227 .get("deliverables")
1228 .and_then(|raw| raw.as_array())
1229 .map(|items| {
1230 items
1231 .iter()
1232 .map(|item| RunDeliverableSummaryRecord {
1233 id: item
1234 .get("id")
1235 .and_then(|value| value.as_str())
1236 .unwrap_or_default()
1237 .to_string(),
1238 text: item
1239 .get("text")
1240 .and_then(|value| value.as_str())
1241 .unwrap_or_default()
1242 .to_string(),
1243 status: item
1244 .get("status")
1245 .and_then(|value| value.as_str())
1246 .unwrap_or_default()
1247 .to_string(),
1248 note: item
1249 .get("note")
1250 .and_then(|value| value.as_str())
1251 .map(str::to_string),
1252 })
1253 .collect::<Vec<_>>()
1254 })
1255 .unwrap_or_default();
1256 let observations = json_string_array(value.get("observations"));
1257 let root_task = value
1258 .get("root_task")
1259 .and_then(|value| value.as_str())
1260 .unwrap_or_default()
1261 .to_string();
1262 let rationale = value
1263 .get("rationale")
1264 .and_then(|value| value.as_str())
1265 .unwrap_or_default()
1266 .to_string();
1267 if root_task.is_empty()
1268 && rationale.is_empty()
1269 && deliverables.is_empty()
1270 && observations.is_empty()
1271 {
1272 return None;
1273 }
1274 let blocking_count = deliverables
1275 .iter()
1276 .filter(|deliverable| matches!(deliverable.status.as_str(), "open" | "blocked"))
1277 .count();
1278 Some(RunTaskLedgerSummaryRecord {
1279 root_task,
1280 rationale,
1281 deliverables,
1282 observations,
1283 blocking_count,
1284 })
1285}
1286
/// Extracts `compaction` events from an embedded transcript value.
///
/// `location_prefix` names where the transcript lives in the run document
/// (e.g. `run.transcript` or `run.stages[node].transcript`); snapshot
/// locations are derived from it. An event is marked `available` only when
/// the snapshot asset it references is present in the transcript's `assets`
/// array. `persisted_path` (the run document's on-disk location, if known)
/// is recorded as the snapshot path.
fn compaction_events_from_transcript(
    transcript: &serde_json::Value,
    stage_id: Option<&str>,
    node_id: Option<&str>,
    location_prefix: &str,
    persisted_path: Option<&Path>,
) -> Vec<CompactionEventRecord> {
    let transcript_id = transcript
        .get("id")
        .and_then(|value| value.as_str())
        .map(str::to_string);
    // Ids of assets actually embedded in this transcript, used to decide
    // whether a referenced snapshot is resolvable.
    let asset_ids = transcript
        .get("assets")
        .and_then(|value| value.as_array())
        .map(|assets| {
            assets
                .iter()
                .filter_map(|asset| {
                    asset
                        .get("id")
                        .and_then(|value| value.as_str())
                        .map(str::to_string)
                })
                .collect::<BTreeSet<_>>()
        })
        .unwrap_or_default();
    transcript
        .get("events")
        .and_then(|value| value.as_array())
        .map(|events| {
            events
                .iter()
                // Only transcript events of kind "compaction" are summarized.
                .filter(|event| {
                    event.get("kind").and_then(|value| value.as_str()) == Some("compaction")
                })
                .map(|event| {
                    let metadata = event.get("metadata");
                    let snapshot_asset_id = metadata
                        .and_then(|value| value.get("snapshot_asset_id"))
                        .and_then(|value| value.as_str())
                        .map(str::to_string);
                    let available = snapshot_asset_id
                        .as_ref()
                        .is_some_and(|asset_id| asset_ids.contains(asset_id));
                    // Without a snapshot asset, point at the transcript itself.
                    let snapshot_location = snapshot_asset_id
                        .as_ref()
                        .map(|asset_id| format!("{location_prefix}.assets[{asset_id}]"))
                        .unwrap_or_else(|| location_prefix.to_string());
                    CompactionEventRecord {
                        id: event
                            .get("id")
                            .and_then(|value| value.as_str())
                            .unwrap_or_default()
                            .to_string(),
                        transcript_id: transcript_id.clone(),
                        stage_id: stage_id.map(str::to_string),
                        node_id: node_id.map(str::to_string),
                        mode: metadata
                            .and_then(|value| value.get("mode"))
                            .and_then(|value| value.as_str())
                            .unwrap_or_default()
                            .to_string(),
                        strategy: metadata
                            .and_then(|value| value.get("strategy"))
                            .and_then(|value| value.as_str())
                            .unwrap_or_default()
                            .to_string(),
                        archived_messages: json_usize(
                            metadata.and_then(|value| value.get("archived_messages")),
                        ),
                        estimated_tokens_before: json_usize(
                            metadata.and_then(|value| value.get("estimated_tokens_before")),
                        ),
                        estimated_tokens_after: json_usize(
                            metadata.and_then(|value| value.get("estimated_tokens_after")),
                        ),
                        snapshot_asset_id,
                        snapshot_location,
                        snapshot_path: persisted_path
                            .map(|path| path.to_string_lossy().into_owned()),
                        available,
                    }
                })
                .collect()
        })
        .unwrap_or_default()
}
1374
1375fn daemon_events_from_sidecar(run_path: &Path) -> Vec<DaemonEventRecord> {
1376 let Some(sidecar_path) = llm_transcript_sidecar_path(run_path) else {
1377 return Vec::new();
1378 };
1379 let Ok(content) = std::fs::read_to_string(sidecar_path) else {
1380 return Vec::new();
1381 };
1382
1383 content
1384 .lines()
1385 .filter(|line| !line.trim().is_empty())
1386 .filter_map(|line| serde_json::from_str::<serde_json::Value>(line).ok())
1387 .filter(|event| event.get("type").and_then(|value| value.as_str()) == Some("daemon_event"))
1388 .filter_map(|event| serde_json::from_value::<DaemonEventRecord>(event).ok())
1389 .collect()
1390}
1391
/// Derives the full observability record for a run: the action graph
/// (nodes and edges), planner rounds, verification outcomes, worker lineage,
/// transcript pointers, compaction events, and daemon events.
///
/// `persisted_path` is the run document's on-disk location, if known; it is
/// used to locate the LLM transcript sidecar and to label snapshot paths.
pub fn derive_run_observability(
    run: &RunRecord,
    persisted_path: Option<&Path>,
) -> RunObservabilityRecord {
    // Accumulators for every section of the observability record.
    let mut action_graph_nodes = Vec::new();
    let mut action_graph_edges = Vec::new();
    let mut verification_outcomes = Vec::new();
    let mut planner_rounds = Vec::new();
    let mut transcript_pointers = Vec::new();
    let mut compaction_events = Vec::new();
    let mut daemon_events = Vec::new();
    let mut research_fact_count = 0usize;

    // Root node representing the run itself. It is the default entry point
    // for stages until a trigger node (below) supersedes it.
    let root_node_id = format!("run:{}", run.id);
    let trigger_event = trigger_event_from_run(run);
    let propagated_trace_id = run_trace_id(run, trigger_event.as_ref());
    append_action_graph_node(
        &mut action_graph_nodes,
        RunActionGraphNodeRecord {
            id: root_node_id.clone(),
            label: run
                .workflow_name
                .clone()
                .unwrap_or_else(|| run.workflow_id.clone()),
            kind: ACTION_GRAPH_NODE_KIND_RUN.to_string(),
            status: run.status.clone(),
            outcome: run.status.clone(),
            trace_id: propagated_trace_id.clone(),
            stage_id: None,
            node_id: None,
            worker_id: None,
            run_id: Some(run.id.clone()),
            run_path: run.persisted_path.clone(),
            metadata: BTreeMap::from([(
                "workflow_id".to_string(),
                serde_json::json!(run.workflow_id),
            )]),
        },
    );
    let mut entry_node_id = root_node_id.clone();
    if let Some(trigger_event) = trigger_event.as_ref() {
        // If this run replays an earlier trigger, emit a "historical" node for
        // the original event and chain it to the current trigger node.
        if let Some(replay_of_event_id) = replay_of_event_id_from_run(run) {
            let replay_source_node_id = format!("trigger:{replay_of_event_id}");
            append_action_graph_node(
                &mut action_graph_nodes,
                RunActionGraphNodeRecord {
                    id: replay_source_node_id.clone(),
                    label: format!(
                        "{}:{} (original {})",
                        trigger_event.provider.as_str(),
                        trigger_event.kind,
                        replay_of_event_id
                    ),
                    kind: ACTION_GRAPH_NODE_KIND_TRIGGER.to_string(),
                    status: "historical".to_string(),
                    outcome: "replayed_from".to_string(),
                    trace_id: Some(trigger_event.trace_id.0.clone()),
                    stage_id: None,
                    node_id: None,
                    worker_id: None,
                    run_id: Some(run.id.clone()),
                    run_path: run.persisted_path.clone(),
                    metadata: trigger_node_metadata(trigger_event),
                },
            );
            action_graph_edges.push(RunActionGraphEdgeRecord {
                from_id: replay_source_node_id,
                to_id: format!("trigger:{}", trigger_event.id.0),
                kind: ACTION_GRAPH_EDGE_KIND_REPLAY_CHAIN.to_string(),
                label: Some("replay chain".to_string()),
            });
        }
        // Node for the trigger that started this run; it becomes the entry
        // point for stages with no incoming transition.
        let trigger_node_id = format!("trigger:{}", trigger_event.id.0);
        append_action_graph_node(
            &mut action_graph_nodes,
            RunActionGraphNodeRecord {
                id: trigger_node_id.clone(),
                label: format!("{}:{}", trigger_event.provider.as_str(), trigger_event.kind),
                kind: ACTION_GRAPH_NODE_KIND_TRIGGER.to_string(),
                status: "received".to_string(),
                outcome: signature_status_label(&trigger_event.signature_status).to_string(),
                trace_id: Some(trigger_event.trace_id.0.clone()),
                stage_id: None,
                node_id: None,
                worker_id: None,
                run_id: Some(run.id.clone()),
                run_path: run.persisted_path.clone(),
                metadata: trigger_node_metadata(trigger_event),
            },
        );
        action_graph_edges.push(RunActionGraphEdgeRecord {
            from_id: root_node_id.clone(),
            to_id: trigger_node_id.clone(),
            kind: ACTION_GRAPH_EDGE_KIND_ENTRY.to_string(),
            label: Some(trigger_event.id.0.clone()),
        });
        entry_node_id = trigger_node_id;
    }

    // Lookup tables: stage id -> graph node id, stage id -> stage record, and
    // workflow node id -> graph node id (for resolving transitions).
    let stage_node_ids = run
        .stages
        .iter()
        .map(|stage| (stage.id.clone(), format!("stage:{}", stage.id)))
        .collect::<BTreeMap<_, _>>();
    let stage_by_id = run
        .stages
        .iter()
        .map(|stage| (stage.id.as_str(), stage))
        .collect::<BTreeMap<_, _>>();
    let stage_by_node_id = run
        .stages
        .iter()
        .map(|stage| (stage.node_id.clone(), format!("stage:{}", stage.id)))
        .collect::<BTreeMap<_, _>>();

    // Workflow nodes that are the target of some transition; stages outside
    // this set are roots and get an edge from the entry node.
    let incoming_nodes = run
        .transitions
        .iter()
        .map(|transition| transition.to_node_id.clone())
        .collect::<BTreeSet<_>>();

    for stage in &run.stages {
        let graph_node_id = stage_node_ids
            .get(&stage.id)
            .cloned()
            .unwrap_or_else(|| format!("stage:{}", stage.id));
        append_action_graph_node(
            &mut action_graph_nodes,
            RunActionGraphNodeRecord {
                id: graph_node_id.clone(),
                label: stage.node_id.clone(),
                kind: action_graph_kind_for_stage(stage).to_string(),
                status: stage.status.clone(),
                outcome: stage.outcome.clone(),
                trace_id: propagated_trace_id.clone(),
                stage_id: Some(stage.id.clone()),
                node_id: Some(stage.node_id.clone()),
                worker_id: stage
                    .metadata
                    .get("worker_id")
                    .and_then(|value| value.as_str())
                    .map(str::to_string),
                run_id: None,
                run_path: None,
                metadata: stage_node_metadata(stage),
            },
        );
        // Root stage: connect it to the trigger (if any) or the run node.
        if !incoming_nodes.contains(&stage.node_id) {
            action_graph_edges.push(RunActionGraphEdgeRecord {
                from_id: entry_node_id.clone(),
                to_id: graph_node_id.clone(),
                kind: if trigger_event.is_some() {
                    ACTION_GRAPH_EDGE_KIND_TRIGGER_DISPATCH.to_string()
                } else {
                    ACTION_GRAPH_EDGE_KIND_ENTRY.to_string()
                },
                label: None,
            });
        }

        // Verification outcome: explicit pass/success flags win; otherwise the
        // stage's status/outcome pair decides, and ambiguous stages stay None.
        if stage.kind == "verify" || stage.verification.is_some() {
            let passed = json_bool(
                stage
                    .verification
                    .as_ref()
                    .and_then(|value| value.get("pass")),
            )
            .or_else(|| {
                json_bool(
                    stage
                        .verification
                        .as_ref()
                        .and_then(|value| value.get("success")),
                )
            })
            .or_else(|| {
                if stage.status == "completed" && stage.outcome == "success" {
                    Some(true)
                } else if stage.status == "failed" || stage.outcome == "failed" {
                    Some(false)
                } else {
                    None
                }
            });
            verification_outcomes.push(RunVerificationOutcomeRecord {
                stage_id: stage.id.clone(),
                node_id: stage.node_id.clone(),
                status: stage.status.clone(),
                passed,
                summary: stage
                    .verification
                    .as_ref()
                    .map(compact_json_value)
                    .or_else(|| {
                        stage
                            .visible_text
                            .as_ref()
                            .filter(|value| !value.trim().is_empty())
                            .cloned()
                    }),
            });
        }

        // Per-stage embedded transcript: record a pointer and harvest its
        // compaction events.
        if stage.transcript.is_some() {
            transcript_pointers.push(RunTranscriptPointerRecord {
                id: format!("stage:{}:transcript", stage.id),
                label: format!("Stage {} transcript", stage.node_id),
                kind: "embedded_transcript".to_string(),
                location: format!("run.stages[{}].transcript", stage.node_id),
                path: run.persisted_path.clone(),
                available: true,
            });
            if let Some(transcript) = stage.transcript.as_ref() {
                compaction_events.extend(compaction_events_from_transcript(
                    transcript,
                    Some(&stage.id),
                    Some(&stage.node_id),
                    &format!("run.stages[{}].transcript", stage.node_id),
                    persisted_path,
                ));
            }
        }

        // Planner round: summarize agentic activity from the stage's first
        // artifact payload; only kept when it shows any actual activity.
        if let Some(payload) = stage_result_payload(stage) {
            let trace = payload.get("trace");
            let task_ledger = payload
                .get("task_ledger")
                .and_then(task_ledger_summary_from_value);
            let research_facts = task_ledger
                .as_ref()
                .map(|ledger| ledger.observations.clone())
                .unwrap_or_default();
            research_fact_count += research_facts.len();
            let tools_payload = payload.get("tools");
            // Tool names come from `tools.calls`, falling back to the trace's
            // `tools_used` list.
            let tools_used = json_string_array(
                tools_payload
                    .and_then(|tools| tools.get("calls"))
                    .or_else(|| trace.and_then(|trace| trace.get("tools_used"))),
            );
            let successful_tools =
                json_string_array(tools_payload.and_then(|tools| tools.get("successful")));
            let planner_round = RunPlannerRoundRecord {
                stage_id: stage.id.clone(),
                node_id: stage.node_id.clone(),
                stage_kind: stage.kind.clone(),
                status: stage.status.clone(),
                outcome: stage.outcome.clone(),
                iteration_count: json_usize(trace.and_then(|trace| trace.get("iterations"))),
                llm_call_count: json_usize(trace.and_then(|trace| trace.get("llm_calls"))),
                tool_execution_count: json_usize(
                    trace.and_then(|trace| trace.get("tool_executions")),
                ),
                tool_rejection_count: json_usize(
                    trace.and_then(|trace| trace.get("tool_rejections")),
                ),
                intervention_count: json_usize(trace.and_then(|trace| trace.get("interventions"))),
                compaction_count: json_usize(trace.and_then(|trace| trace.get("compactions"))),
                native_text_tool_fallback_count: json_usize(
                    trace.and_then(|trace| trace.get("native_text_tool_fallbacks")),
                ),
                native_text_tool_fallback_rejection_count: json_usize(
                    trace.and_then(|trace| trace.get("native_text_tool_fallback_rejections")),
                ),
                empty_completion_retry_count: json_usize(
                    trace.and_then(|trace| trace.get("empty_completion_retries")),
                ),
                tools_used,
                successful_tools,
                ledger_done_rejections: json_usize(payload.get("ledger_done_rejections")),
                task_ledger,
                research_facts,
            };
            let has_agentic_detail = planner_round.iteration_count > 0
                || planner_round.llm_call_count > 0
                || planner_round.tool_execution_count > 0
                || planner_round.native_text_tool_fallback_count > 0
                || planner_round.native_text_tool_fallback_rejection_count > 0
                || planner_round.empty_completion_retry_count > 0
                || planner_round.ledger_done_rejections > 0
                || planner_round.task_ledger.is_some()
                || !planner_round.tools_used.is_empty()
                || !planner_round.successful_tools.is_empty();
            if has_agentic_detail {
                planner_rounds.push(planner_round);
            }
        }
    }

    // Transitions become edges. The source is resolved via stage id first,
    // then workflow node id, finally falling back to the run node; transitions
    // to unknown stages are dropped.
    for transition in &run.transitions {
        let Some(to_id) = stage_by_node_id.get(&transition.to_node_id).cloned() else {
            continue;
        };
        let from_stage = transition
            .from_stage_id
            .as_deref()
            .and_then(|stage_id| stage_by_id.get(stage_id).copied());
        let from_id = transition
            .from_stage_id
            .as_ref()
            .and_then(|stage_id| stage_node_ids.get(stage_id))
            .cloned()
            .or_else(|| {
                transition
                    .from_node_id
                    .as_ref()
                    .and_then(|node_id| stage_by_node_id.get(node_id))
                    .cloned()
            })
            .unwrap_or_else(|| root_node_id.clone());
        action_graph_edges.push(RunActionGraphEdgeRecord {
            from_id,
            to_id,
            // Edges out of condition stages are predicate gates.
            kind: if from_stage.is_some_and(|stage| stage.kind == "condition") {
                ACTION_GRAPH_EDGE_KIND_PREDICATE_GATE.to_string()
            } else {
                ACTION_GRAPH_EDGE_KIND_TRANSITION.to_string()
            },
            label: transition.branch.clone(),
        });
    }

    // Child runs become worker nodes. NOTE: the map closure also appends the
    // worker node and its delegation edge as a side effect while building the
    // lineage records.
    let worker_lineage = run
        .child_runs
        .iter()
        .map(|child| {
            let worker_node_id = format!("worker:{}", child.worker_id);
            append_action_graph_node(
                &mut action_graph_nodes,
                RunActionGraphNodeRecord {
                    id: worker_node_id.clone(),
                    label: child.worker_name.clone(),
                    kind: ACTION_GRAPH_NODE_KIND_WORKER.to_string(),
                    status: child.status.clone(),
                    outcome: child.status.clone(),
                    trace_id: propagated_trace_id.clone(),
                    stage_id: child.parent_stage_id.clone(),
                    node_id: None,
                    worker_id: Some(child.worker_id.clone()),
                    run_id: child.run_id.clone(),
                    run_path: child.run_path.clone(),
                    metadata: BTreeMap::from([
                        (
                            "worker_name".to_string(),
                            serde_json::json!(child.worker_name),
                        ),
                        ("task".to_string(), serde_json::json!(child.task)),
                    ]),
                },
            );
            if let Some(parent_stage_id) = child.parent_stage_id.as_ref() {
                if let Some(stage_node_id) = stage_node_ids.get(parent_stage_id) {
                    action_graph_edges.push(RunActionGraphEdgeRecord {
                        from_id: stage_node_id.clone(),
                        to_id: worker_node_id,
                        kind: ACTION_GRAPH_EDGE_KIND_DELEGATES.to_string(),
                        label: Some(child.worker_name.clone()),
                    });
                }
            }
            RunWorkerLineageRecord {
                worker_id: child.worker_id.clone(),
                worker_name: child.worker_name.clone(),
                parent_stage_id: child.parent_stage_id.clone(),
                task: child.task.clone(),
                status: child.status.clone(),
                session_id: child.session_id.clone(),
                parent_session_id: child.parent_session_id.clone(),
                run_id: child.run_id.clone(),
                run_path: child.run_path.clone(),
                snapshot_path: child.snapshot_path.clone(),
            }
        })
        .collect::<Vec<_>>();

    // Run-level embedded transcript, mirroring the per-stage handling above.
    if run.transcript.is_some() {
        transcript_pointers.push(RunTranscriptPointerRecord {
            id: "run:transcript".to_string(),
            label: "Run transcript".to_string(),
            kind: "embedded_transcript".to_string(),
            location: "run.transcript".to_string(),
            path: run.persisted_path.clone(),
            available: true,
        });
        if let Some(transcript) = run.transcript.as_ref() {
            compaction_events.extend(compaction_events_from_transcript(
                transcript,
                None,
                None,
                "run.transcript",
                persisted_path,
            ));
        }
    }

    // Sidecar artifacts require the on-disk path to locate.
    if let Some(path) = persisted_path {
        if let Some(sidecar_path) = llm_transcript_sidecar_path(path) {
            transcript_pointers.push(RunTranscriptPointerRecord {
                id: "run:llm_transcript".to_string(),
                label: "LLM transcript sidecar".to_string(),
                kind: "llm_jsonl".to_string(),
                location: "run sidecar".to_string(),
                path: Some(sidecar_path.to_string_lossy().into_owned()),
                available: sidecar_path.exists(),
            });
        }
        daemon_events.extend(daemon_events_from_sidecar(path));
    }

    RunObservabilityRecord {
        schema_version: 4,
        planner_rounds,
        research_fact_count,
        action_graph_nodes,
        action_graph_edges,
        worker_lineage,
        verification_outcomes,
        transcript_pointers,
        compaction_events,
        daemon_events,
    }
}
1813
/// Recomputes the run's derived observability record and stores it in place.
fn refresh_run_observability(run: &mut RunRecord, persisted_path: Option<&Path>) {
    run.observability = Some(derive_run_observability(run, persisted_path));
}
1817
/// Decodes a [`RunRecord`] from a VM value, backfilling defaults (type name,
/// id, start time, status, root run id, replay fixture) and enriching it with
/// HITL questions, child runs, handoffs, and — if absent — derived
/// observability data.
///
/// NOTE(review): the enrichment steps mutate `run` in sequence; observability
/// is derived last so it sees the merged state.
pub fn normalize_run_record(value: &VmValue) -> Result<RunRecord, VmError> {
    let mut run: RunRecord = parse_json_payload(vm_value_to_json(value), "run_record")?;
    if run.type_name.is_empty() {
        run.type_name = "run_record".to_string();
    }
    if run.id.is_empty() {
        run.id = new_id("run");
    }
    if run.started_at.is_empty() {
        run.started_at = now_rfc3339();
    }
    if run.status.is_empty() {
        run.status = "running".to_string();
    }
    // A run with no explicit parent is its own root.
    if run.root_run_id.is_none() {
        run.root_run_id = Some(run.id.clone());
    }
    if run.replay_fixture.is_none() {
        run.replay_fixture = Some(replay_fixture_from_run(&run));
    }
    merge_hitl_questions_from_active_log(&mut run);
    materialize_child_runs_from_stage_metadata(&mut run);
    sync_run_handoffs(&mut run);
    // Only derive observability when the record does not already carry it.
    if run.observability.is_none() {
        let persisted_path = run.persisted_path.clone();
        let persisted = persisted_path.as_deref().map(Path::new);
        refresh_run_observability(&mut run, persisted);
    }
    Ok(run)
}
1848
1849pub fn normalize_eval_suite_manifest(value: &VmValue) -> Result<EvalSuiteManifest, VmError> {
1850 let mut manifest: EvalSuiteManifest = parse_json_value(value)?;
1851 if manifest.type_name.is_empty() {
1852 manifest.type_name = "eval_suite_manifest".to_string();
1853 }
1854 if manifest.id.is_empty() {
1855 manifest.id = new_id("eval_suite");
1856 }
1857 Ok(manifest)
1858}
1859
1860pub fn load_eval_suite_manifest(path: &Path) -> Result<EvalSuiteManifest, VmError> {
1861 let content = std::fs::read_to_string(path)
1862 .map_err(|e| VmError::Runtime(format!("failed to read eval suite manifest: {e}")))?;
1863 let mut manifest: EvalSuiteManifest = serde_json::from_str(&content)
1864 .map_err(|e| VmError::Runtime(format!("failed to parse eval suite manifest: {e}")))?;
1865 if manifest.base_dir.is_none() {
1866 manifest.base_dir = path.parent().map(|parent| parent.display().to_string());
1867 }
1868 Ok(manifest)
1869}
1870
1871pub fn load_eval_pack_manifest(path: &Path) -> Result<EvalPackManifest, VmError> {
1872 let content = std::fs::read_to_string(path)
1873 .map_err(|e| VmError::Runtime(format!("failed to read eval pack manifest: {e}")))?;
1874 let mut manifest: EvalPackManifest =
1875 if path.extension().and_then(|ext| ext.to_str()) == Some("json") {
1876 serde_json::from_str(&content)
1877 .map_err(|e| VmError::Runtime(format!("failed to parse eval pack JSON: {e}")))?
1878 } else {
1879 toml::from_str(&content)
1880 .map_err(|e| VmError::Runtime(format!("failed to parse eval pack TOML: {e}")))?
1881 };
1882 normalize_eval_pack_manifest(&mut manifest);
1883 if manifest.base_dir.is_none() {
1884 manifest.base_dir = path.parent().map(|parent| parent.display().to_string());
1885 }
1886 Ok(manifest)
1887}
1888
/// Decodes an [`EvalPackManifest`] from a VM value and applies the shared
/// normalization pass (version/id backfill and ladder normalization).
pub fn normalize_eval_pack_manifest_value(value: &VmValue) -> Result<EvalPackManifest, VmError> {
    let mut manifest: EvalPackManifest = parse_json_value(value)?;
    normalize_eval_pack_manifest(&mut manifest);
    Ok(manifest)
}
1894
1895fn normalize_eval_pack_manifest(manifest: &mut EvalPackManifest) {
1896 if manifest.version == 0 {
1897 manifest.version = 1;
1898 }
1899 if manifest.id.is_empty() {
1900 manifest.id = manifest
1901 .name
1902 .clone()
1903 .filter(|name| !name.trim().is_empty())
1904 .unwrap_or_else(|| new_id("eval_pack"));
1905 }
1906 for ladder in &mut manifest.ladders {
1907 super::normalize_persona_eval_ladder_manifest(ladder);
1908 }
1909}
1910
1911fn load_replay_fixture(path: &Path) -> Result<ReplayFixture, VmError> {
1912 let content = std::fs::read_to_string(path)
1913 .map_err(|e| VmError::Runtime(format!("failed to read replay fixture: {e}")))?;
1914 serde_json::from_str(&content)
1915 .map_err(|e| VmError::Runtime(format!("failed to parse replay fixture: {e}")))
1916}
1917
1918fn load_run_record_from_fixture_ref(
1919 fixture: &EvalPackFixtureRef,
1920 base_dir: Option<&Path>,
1921) -> Result<RunRecord, VmError> {
1922 if let Some(inline) = &fixture.inline {
1923 let run: RunRecord = serde_json::from_value(inline.clone())
1924 .map_err(|e| VmError::Runtime(format!("failed to parse inline run record: {e}")))?;
1925 return Ok(run);
1926 }
1927 let path = fixture.path.as_deref().ok_or_else(|| {
1928 VmError::Runtime(format!(
1929 "fixture '{}' is missing path or inline run",
1930 fixture.id
1931 ))
1932 })?;
1933 load_run_record(&resolve_manifest_path(base_dir, path))
1934}
1935
1936fn load_replay_fixture_from_ref(
1937 fixture: &EvalPackFixtureRef,
1938 base_dir: Option<&Path>,
1939) -> Result<ReplayFixture, VmError> {
1940 if let Some(inline) = &fixture.inline {
1941 return serde_json::from_value(inline.clone())
1942 .map_err(|e| VmError::Runtime(format!("failed to parse inline replay fixture: {e}")));
1943 }
1944 let path = fixture.path.as_deref().ok_or_else(|| {
1945 VmError::Runtime(format!(
1946 "fixture '{}' is missing path or inline replay fixture",
1947 fixture.id
1948 ))
1949 })?;
1950 load_replay_fixture(&resolve_manifest_path(base_dir, path))
1951}
1952
/// Resolves a manifest-relative path: absolute paths pass through unchanged;
/// relative paths are joined onto `base_dir` when one is available.
fn resolve_manifest_path(base_dir: Option<&Path>, path: &str) -> PathBuf {
    let candidate = PathBuf::from(path);
    match base_dir {
        Some(base) if candidate.is_relative() => base.join(candidate),
        _ => candidate,
    }
}
1963
/// Evaluates every case of a replay eval suite manifest.
///
/// For each case the run record is loaded and checked against its replay
/// fixture (an explicit fixture path wins; otherwise the run's embedded
/// fixture, or one derived from the run). When the case names a baseline via
/// `compare_to`, any difference from the baseline also fails the case. The
/// suite passes only when every case passes.
///
/// # Errors
/// Fails if any referenced run, fixture, or baseline file cannot be loaded.
pub fn evaluate_run_suite_manifest(
    manifest: &EvalSuiteManifest,
) -> Result<ReplayEvalSuiteReport, VmError> {
    let base_dir = manifest.base_dir.as_deref().map(Path::new);
    let mut reports = Vec::new();
    for case in &manifest.cases {
        let run_path = resolve_manifest_path(base_dir, &case.run_path);
        let run = load_run_record(&run_path)?;
        // Fixture priority: explicit path, then embedded, then derived.
        let fixture = match &case.fixture_path {
            Some(path) => load_replay_fixture(&resolve_manifest_path(base_dir, path))?,
            None => run
                .replay_fixture
                .clone()
                .unwrap_or_else(|| replay_fixture_from_run(&run)),
        };
        let eval = evaluate_run_against_fixture(&run, &fixture);
        // A baseline mismatch below can still flip these.
        let mut pass = eval.pass;
        let mut failures = eval.failures;
        let comparison = match &case.compare_to {
            Some(path) => {
                let baseline_path = resolve_manifest_path(base_dir, path);
                let baseline = load_run_record(&baseline_path)?;
                let diff = diff_run_records(&baseline, &run);
                if !diff.identical {
                    pass = false;
                    failures.push(format!(
                        "run differs from baseline {} with {} stage changes",
                        baseline_path.display(),
                        diff.stage_diffs.len()
                    ));
                }
                Some(diff)
            }
            None => None,
        };
        reports.push(ReplayEvalCaseReport {
            run_id: run.id.clone(),
            workflow_id: run.workflow_id.clone(),
            label: case.label.clone(),
            pass,
            failures,
            stage_count: eval.stage_count,
            source_path: Some(run_path.display().to_string()),
            comparison,
        });
    }
    // Suite-level tallies derived from the per-case results.
    let total = reports.len();
    let passed = reports.iter().filter(|report| report.pass).count();
    let failed = total.saturating_sub(passed);
    Ok(ReplayEvalSuiteReport {
        pass: failed == 0,
        total,
        passed,
        failed,
        cases: reports,
    })
}
2021
/// Evaluates an eval-pack manifest: runs every case (replay-based or
/// friction-event-based), executes any persona eval ladders, and aggregates
/// results by severity.
///
/// The pack passes only when no blocking case or ladder failed; warning and
/// informational failures are surfaced in the report but do not fail the pack.
pub fn evaluate_eval_pack_manifest(manifest: &EvalPackManifest) -> Result<EvalPackReport, VmError> {
    let base_dir = manifest.base_dir.as_deref().map(Path::new);
    // Fixture references resolve against the pack's fixture_root when set,
    // otherwise against base_dir.
    let fixture_base_dir_buf = manifest
        .defaults
        .fixture_root
        .as_deref()
        .map(|root| resolve_manifest_path(base_dir, root));
    let fixture_base_dir = fixture_base_dir_buf.as_deref().or(base_dir);
    // Index fixtures and rubrics by id for case lookups; empty ids are dropped.
    let fixtures_by_id: BTreeMap<&str, &EvalPackFixtureRef> = manifest
        .fixtures
        .iter()
        .filter(|fixture| !fixture.id.is_empty())
        .map(|fixture| (fixture.id.as_str(), fixture))
        .collect();
    let rubrics_by_id: BTreeMap<&str, &EvalPackRubric> = manifest
        .rubrics
        .iter()
        .filter(|rubric| !rubric.id.is_empty())
        .map(|rubric| (rubric.id.as_str(), rubric))
        .collect();

    let mut reports = Vec::new();
    for (index, case) in manifest.cases.iter().enumerate() {
        // Cases without a usable id get a positional one ("case_1", ...).
        let case_id = case
            .id
            .clone()
            .filter(|id| !id.trim().is_empty())
            .unwrap_or_else(|| format!("case_{}", index + 1));
        let label = case
            .name
            .clone()
            .or_else(|| case.id.clone())
            .unwrap_or_else(|| case_id.clone());
        let severity = eval_pack_case_severity(manifest, case);
        let blocking = severity == "blocking";
        let mut failures = Vec::new();
        let mut warnings = Vec::new();
        let mut informational = Vec::new();

        // Friction cases are scored on context-pack suggestions, not on a run.
        if case.friction_events.is_some() {
            let report = evaluate_eval_pack_friction_case(
                manifest,
                case,
                &case_id,
                &label,
                &severity,
                blocking,
                base_dir,
                fixture_base_dir,
                &fixtures_by_id,
                &rubrics_by_id,
            )?;
            reports.push(report);
            continue;
        }

        let run = load_eval_pack_case_run(case, base_dir, fixture_base_dir, &fixtures_by_id)?;
        let fixture =
            load_eval_pack_case_fixture(case, base_dir, fixture_base_dir, &fixtures_by_id, &run)?;
        let eval = evaluate_run_against_fixture(&run, &fixture);
        failures.extend(eval.failures);
        // Pack-default thresholds apply first, then per-case overrides.
        apply_eval_pack_thresholds(&run, &manifest.defaults.thresholds, &mut failures);
        apply_eval_pack_thresholds(&run, &case.thresholds, &mut failures);

        // Baseline comparison: the case's compare_to wins over the pack-level baseline.
        let comparison = match case.compare_to.as_ref().or(manifest.baseline.as_ref()) {
            Some(path) => {
                let baseline_path = resolve_manifest_path(base_dir, path);
                let baseline = load_run_record(&baseline_path)?;
                let diff = diff_run_records(&baseline, &run);
                if !diff.identical {
                    failures.push(format!(
                        "run differs from baseline {} with {} stage changes",
                        baseline_path.display(),
                        diff.stage_diffs.len()
                    ));
                }
                Some(diff)
            }
            None => None,
        };

        for rubric_id in &case.rubrics {
            let Some(rubric) = rubrics_by_id.get(rubric_id.as_str()) else {
                failures.push(format!("case references unknown rubric '{rubric_id}'"));
                continue;
            };
            apply_eval_pack_rubric(rubric, &run, &mut failures, &mut warnings);
        }

        // Non-blocking cases still "pass"; their failures are downgraded to
        // warnings or informational notes according to severity.
        let pass = failures.is_empty() || !blocking;
        if !failures.is_empty() && !blocking {
            if severity == "warning" {
                warnings.append(&mut failures);
            } else {
                informational.append(&mut failures);
            }
        }
        reports.push(EvalPackCaseReport {
            id: case_id,
            label,
            severity,
            pass,
            blocking,
            run_id: run.id.clone(),
            workflow_id: run.workflow_id.clone(),
            source_path: eval_pack_case_source_path(
                case,
                base_dir,
                fixture_base_dir,
                &fixtures_by_id,
            ),
            stage_count: eval.stage_count,
            failures,
            warnings,
            informational,
            comparison,
        });
    }

    // Persona eval ladders inherit the pack's base_dir when they have none.
    let mut ladder_reports = Vec::new();
    for ladder in &manifest.ladders {
        let mut ladder = ladder.clone();
        if ladder.base_dir.is_none() {
            ladder.base_dir = manifest.base_dir.clone();
        }
        ladder_reports.push(run_persona_eval_ladder(&ladder)?);
    }

    // Aggregate counts across both cases and ladders.
    let case_total = reports.len();
    let ladder_total = ladder_reports.len();
    let total = case_total + ladder_total;
    let case_blocking_failed = reports
        .iter()
        .filter(|report| report.blocking && !report.failures.is_empty())
        .count();
    let ladder_blocking_failed = ladder_reports
        .iter()
        .filter(|report| report.blocking && !report.pass)
        .count();
    let blocking_failed = case_blocking_failed + ladder_blocking_failed;
    let warning_failed = reports
        .iter()
        .filter(|report| !report.warnings.is_empty())
        .count()
        + ladder_reports
            .iter()
            .filter(|report| !report.pass && report.severity == "warning")
            .count();
    let informational_failed = reports
        .iter()
        .filter(|report| !report.informational.is_empty())
        .count()
        + ladder_reports
            .iter()
            .filter(|report| !report.pass && report.severity == "informational")
            .count();
    let passed = reports.iter().filter(|report| report.pass).count()
        + ladder_reports.iter().filter(|report| report.pass).count();
    Ok(EvalPackReport {
        pack_id: manifest.id.clone(),
        // Only blocking failures fail the pack as a whole.
        pass: blocking_failed == 0,
        total,
        passed,
        failed: total.saturating_sub(passed),
        blocking_failed,
        warning_failed,
        informational_failed,
        cases: reports,
        ladders: ladder_reports,
    })
}
2193
/// Evaluates a friction-event eval-pack case: loads the referenced friction
/// events, generates context-pack suggestions from them, and scores those
/// suggestions against the case's rubrics.
#[allow(clippy::too_many_arguments)]
fn evaluate_eval_pack_friction_case(
    manifest: &EvalPackManifest,
    case: &EvalPackCase,
    case_id: &str,
    label: &str,
    severity: &str,
    blocking: bool,
    base_dir: Option<&Path>,
    fixture_base_dir: Option<&Path>,
    fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
    rubrics_by_id: &BTreeMap<&str, &EvalPackRubric>,
) -> Result<EvalPackCaseReport, VmError> {
    let mut failures = Vec::new();
    let mut warnings = Vec::new();
    let mut informational = Vec::new();
    let events =
        load_eval_pack_case_friction_events(case, base_dir, fixture_base_dir, fixtures_by_id)?;
    let options = friction_suggestion_options(case, manifest);
    let suggestions = generate_context_pack_suggestions(&events, &options);

    for rubric_id in &case.rubrics {
        let Some(rubric) = rubrics_by_id.get(rubric_id.as_str()) else {
            failures.push(format!("case references unknown rubric '{rubric_id}'"));
            continue;
        };
        apply_eval_pack_friction_rubric(rubric, &suggestions, &mut failures, &mut warnings);
    }

    // With no rubrics configured, the minimal expectation is that the fixture
    // yields at least one suggestion.
    if case.rubrics.is_empty() && suggestions.is_empty() {
        failures.push("friction fixture produced no context-pack suggestions".to_string());
    }

    // Non-blocking failures are downgraded to warnings/informational notes.
    let pass = failures.is_empty() || !blocking;
    if !failures.is_empty() && !blocking {
        if severity == "warning" {
            warnings.append(&mut failures);
        } else {
            informational.append(&mut failures);
        }
    }

    Ok(EvalPackCaseReport {
        id: case_id.to_string(),
        label: label.to_string(),
        severity: severity.to_string(),
        pass,
        blocking,
        // Friction cases have no run record; a sentinel id stands in, and the
        // event count is reported in place of a stage count.
        run_id: "friction_events".to_string(),
        workflow_id: String::new(),
        source_path: eval_pack_case_friction_source_path(
            case,
            base_dir,
            fixture_base_dir,
            fixtures_by_id,
        ),
        stage_count: events.len(),
        failures,
        warnings,
        informational,
        comparison: None,
    })
}
2257
2258fn eval_pack_case_severity(manifest: &EvalPackManifest, case: &EvalPackCase) -> String {
2259 normalize_eval_pack_severity(
2260 case.severity
2261 .as_deref()
2262 .or(case.thresholds.severity.as_deref())
2263 .or(manifest.defaults.severity.as_deref())
2264 .or(manifest.defaults.thresholds.severity.as_deref())
2265 .unwrap_or("blocking"),
2266 )
2267}
2268
/// Canonicalizes a severity label (case- and whitespace-insensitive).
/// Unknown labels deliberately collapse to the strictest level, "blocking".
fn normalize_eval_pack_severity(value: &str) -> String {
    let canonical = value.trim().to_ascii_lowercase();
    let label = if canonical == "warn" || canonical == "warning" {
        "warning"
    } else if canonical == "info" || canonical == "informational" {
        "informational"
    } else {
        "blocking"
    };
    label.to_string()
}
2276
2277fn load_eval_pack_case_run(
2278 case: &EvalPackCase,
2279 base_dir: Option<&Path>,
2280 fixture_base_dir: Option<&Path>,
2281 fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2282) -> Result<RunRecord, VmError> {
2283 if let Some(run_ref) = case.run.as_deref().or(case.run_path.as_deref()) {
2284 if let Some(fixture) = fixtures_by_id.get(run_ref) {
2285 return load_run_record_from_fixture_ref(fixture, fixture_base_dir);
2286 }
2287 return load_run_record(&resolve_manifest_path(base_dir, run_ref));
2288 }
2289 Err(VmError::Runtime(
2290 "eval pack case is missing run or run_path".to_string(),
2291 ))
2292}
2293
2294fn load_eval_pack_case_fixture(
2295 case: &EvalPackCase,
2296 base_dir: Option<&Path>,
2297 fixture_base_dir: Option<&Path>,
2298 fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2299 run: &RunRecord,
2300) -> Result<ReplayFixture, VmError> {
2301 if let Some(fixture_ref) = case.fixture.as_deref().or(case.fixture_path.as_deref()) {
2302 if let Some(fixture) = fixtures_by_id.get(fixture_ref) {
2303 return load_replay_fixture_from_ref(fixture, fixture_base_dir);
2304 }
2305 return load_replay_fixture(&resolve_manifest_path(base_dir, fixture_ref));
2306 }
2307 Ok(run
2308 .replay_fixture
2309 .clone()
2310 .unwrap_or_else(|| replay_fixture_from_run(run)))
2311}
2312
2313fn eval_pack_case_source_path(
2314 case: &EvalPackCase,
2315 base_dir: Option<&Path>,
2316 fixture_base_dir: Option<&Path>,
2317 fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2318) -> Option<String> {
2319 let run_ref = case.run.as_deref().or(case.run_path.as_deref())?;
2320 if let Some(fixture) = fixtures_by_id.get(run_ref) {
2321 return fixture.path.as_ref().map(|path| {
2322 resolve_manifest_path(fixture_base_dir, path)
2323 .display()
2324 .to_string()
2325 });
2326 }
2327 Some(
2328 resolve_manifest_path(base_dir, run_ref)
2329 .display()
2330 .to_string(),
2331 )
2332}
2333
2334fn load_eval_pack_case_friction_events(
2335 case: &EvalPackCase,
2336 base_dir: Option<&Path>,
2337 fixture_base_dir: Option<&Path>,
2338 fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2339) -> Result<Vec<FrictionEvent>, VmError> {
2340 let event_ref = case.friction_events.as_deref().ok_or_else(|| {
2341 VmError::Runtime("eval pack friction case is missing friction_events".to_string())
2342 })?;
2343 if let Some(fixture) = fixtures_by_id.get(event_ref) {
2344 return load_friction_events_from_fixture_ref(fixture, fixture_base_dir);
2345 }
2346 load_friction_events_from_path(&resolve_manifest_path(base_dir, event_ref))
2347}
2348
2349fn load_friction_events_from_fixture_ref(
2350 fixture: &EvalPackFixtureRef,
2351 base_dir: Option<&Path>,
2352) -> Result<Vec<FrictionEvent>, VmError> {
2353 if let Some(inline) = &fixture.inline {
2354 return normalize_friction_events_json(inline.clone());
2355 }
2356 let path = fixture.path.as_deref().ok_or_else(|| {
2357 VmError::Runtime(format!(
2358 "fixture '{}' is missing path or inline friction events",
2359 fixture.id
2360 ))
2361 })?;
2362 load_friction_events_from_path(&resolve_manifest_path(base_dir, path))
2363}
2364
2365fn load_friction_events_from_path(path: &Path) -> Result<Vec<FrictionEvent>, VmError> {
2366 let content = std::fs::read_to_string(path)
2367 .map_err(|e| VmError::Runtime(format!("failed to read friction events fixture: {e}")))?;
2368 let value: serde_json::Value = serde_json::from_str(&content)
2369 .map_err(|e| VmError::Runtime(format!("failed to parse friction events fixture: {e}")))?;
2370 normalize_friction_events_json(value)
2371}
2372
2373fn eval_pack_case_friction_source_path(
2374 case: &EvalPackCase,
2375 base_dir: Option<&Path>,
2376 fixture_base_dir: Option<&Path>,
2377 fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2378) -> Option<String> {
2379 let event_ref = case.friction_events.as_deref()?;
2380 if let Some(fixture) = fixtures_by_id.get(event_ref) {
2381 return fixture.path.as_ref().map(|path| {
2382 resolve_manifest_path(fixture_base_dir, path)
2383 .display()
2384 .to_string()
2385 });
2386 }
2387 Some(
2388 resolve_manifest_path(base_dir, event_ref)
2389 .display()
2390 .to_string(),
2391 )
2392}
2393
2394fn friction_suggestion_options(
2395 case: &EvalPackCase,
2396 manifest: &EvalPackManifest,
2397) -> ContextPackSuggestionOptions {
2398 let min_occurrences = case
2399 .metadata
2400 .get("min_occurrences")
2401 .or_else(|| manifest.metadata.get("min_occurrences"))
2402 .and_then(|value| value.as_u64())
2403 .unwrap_or(2) as usize;
2404 let owner = case
2405 .metadata
2406 .get("owner")
2407 .or_else(|| manifest.metadata.get("owner"))
2408 .and_then(|value| value.as_str())
2409 .map(str::to_string)
2410 .or_else(|| {
2411 manifest
2412 .package
2413 .as_ref()
2414 .and_then(|package| package.name.clone())
2415 });
2416 ContextPackSuggestionOptions {
2417 min_occurrences,
2418 owner,
2419 }
2420}
2421
2422fn apply_eval_pack_thresholds(
2423 run: &RunRecord,
2424 thresholds: &EvalPackThresholds,
2425 failures: &mut Vec<String>,
2426) {
2427 if let Some(max_stage_count) = thresholds.max_stage_count {
2428 if run.stages.len() > max_stage_count {
2429 failures.push(format!(
2430 "stage count {} exceeds threshold {}",
2431 run.stages.len(),
2432 max_stage_count
2433 ));
2434 }
2435 }
2436 if let Some(max_latency_ms) = thresholds.max_latency_ms {
2437 let actual = run
2438 .usage
2439 .as_ref()
2440 .map(|usage| usage.total_duration_ms)
2441 .unwrap_or_default();
2442 if actual > max_latency_ms {
2443 failures.push(format!(
2444 "latency {actual}ms exceeds threshold {max_latency_ms}ms"
2445 ));
2446 }
2447 }
2448 if let Some(max_cost_usd) = thresholds.max_cost_usd {
2449 let actual = run
2450 .usage
2451 .as_ref()
2452 .map(|usage| usage.total_cost)
2453 .unwrap_or_default();
2454 if actual > max_cost_usd {
2455 failures.push(format!(
2456 "cost ${actual:.6} exceeds threshold ${max_cost_usd:.6}"
2457 ));
2458 }
2459 }
2460 if let Some(max_tokens) = thresholds.max_tokens {
2461 let actual = run
2462 .usage
2463 .as_ref()
2464 .map(|usage| usage.input_tokens + usage.output_tokens)
2465 .unwrap_or_default();
2466 if actual > max_tokens {
2467 failures.push(format!(
2468 "token count {actual} exceeds threshold {max_tokens}"
2469 ));
2470 }
2471 }
2472}
2473
2474fn apply_eval_pack_rubric(
2475 rubric: &EvalPackRubric,
2476 run: &RunRecord,
2477 failures: &mut Vec<String>,
2478 warnings: &mut Vec<String>,
2479) {
2480 match rubric.kind.as_str() {
2481 "" | "deterministic" | "replay" | "budget" | "hitl" | "side-effect" => {
2482 apply_eval_pack_thresholds(run, &rubric.thresholds, failures);
2483 for assertion in &rubric.assertions {
2484 apply_eval_pack_assertion(rubric, assertion, run, failures);
2485 }
2486 }
2487 "llm-judge" | "llm_as_judge" | "judge" => {
2488 let severity = normalize_eval_pack_severity(
2489 rubric.thresholds.severity.as_deref().unwrap_or("blocking"),
2490 );
2491 let message = format!(
2492 "rubric '{}' requires an external LLM judge and was not run locally",
2493 rubric.id
2494 );
2495 if severity == "blocking" {
2496 failures.push(message);
2497 } else {
2498 warnings.push(message);
2499 }
2500 }
2501 other => warnings.push(format!(
2502 "rubric '{}' has unknown kind '{}' and was not run locally",
2503 rubric.id, other
2504 )),
2505 }
2506}
2507
2508fn apply_eval_pack_friction_rubric(
2509 rubric: &EvalPackRubric,
2510 suggestions: &[super::ContextPackSuggestion],
2511 failures: &mut Vec<String>,
2512 warnings: &mut Vec<String>,
2513) {
2514 match rubric.kind.as_str() {
2515 "" | "deterministic" | "friction" | "context-pack-suggestion" => {
2516 let mut expectations = Vec::new();
2517 for assertion in &rubric.assertions {
2518 match assertion.kind.as_str() {
2519 "context-pack-suggestion" | "context_pack_suggestion" | "suggestion" => {
2520 let expectation = context_pack_expectation_from_assertion(assertion);
2521 expectations.push(expectation);
2522 }
2523 other => failures.push(format!(
2524 "rubric '{}' has unsupported friction assertion kind '{}'",
2525 rubric.id, other
2526 )),
2527 }
2528 }
2529 failures.extend(evaluate_context_pack_suggestion_expectations(
2530 suggestions,
2531 &expectations,
2532 ));
2533 }
2534 other => warnings.push(format!(
2535 "rubric '{}' has unknown friction kind '{}' and was not run locally",
2536 rubric.id, other
2537 )),
2538 }
2539}
2540
2541fn context_pack_expectation_from_assertion(
2542 assertion: &EvalPackAssertion,
2543) -> ContextPackSuggestionExpectation {
2544 let expected = assertion
2545 .expected
2546 .as_ref()
2547 .and_then(|value| value.as_object());
2548 let expected_string = assertion.expected.as_ref().and_then(|value| value.as_str());
2549 ContextPackSuggestionExpectation {
2550 min_suggestions: expected
2551 .and_then(|map| map.get("min_suggestions"))
2552 .and_then(|value| value.as_u64())
2553 .map(|value| value as usize),
2554 recommended_artifact: expected
2555 .and_then(|map| map.get("recommended_artifact"))
2556 .and_then(|value| value.as_str())
2557 .map(str::to_string)
2558 .or_else(|| expected_string.map(str::to_string)),
2559 title_contains: assertion.contains.clone().or_else(|| {
2560 expected
2561 .and_then(|map| map.get("title_contains"))
2562 .and_then(|value| value.as_str())
2563 .map(str::to_string)
2564 }),
2565 manifest_name_contains: expected
2566 .and_then(|map| map.get("manifest_name_contains"))
2567 .and_then(|value| value.as_str())
2568 .map(str::to_string),
2569 required_capability: expected
2570 .and_then(|map| map.get("required_capability"))
2571 .and_then(|value| value.as_str())
2572 .map(str::to_string),
2573 required_output_slot: expected
2574 .and_then(|map| map.get("required_output_slot"))
2575 .and_then(|value| value.as_str())
2576 .map(str::to_string),
2577 }
2578}
2579
/// Applies a single rubric assertion against a run, appending a descriptive
/// message to `failures` for every violated expectation.
///
/// Supported kinds (hyphen/underscore aliases accepted): run status, stage
/// status, visible-text containment, and HITL question containment. An empty
/// kind is a no-op; any other kind is reported as a failure.
fn apply_eval_pack_assertion(
    rubric: &EvalPackRubric,
    assertion: &EvalPackAssertion,
    run: &RunRecord,
    failures: &mut Vec<String>,
) {
    match assertion.kind.as_str() {
        "run-status" | "run_status" | "status" => {
            // A missing `expected` string silently skips the check.
            let expected = assertion.expected.as_ref().and_then(|value| value.as_str());
            if let Some(expected) = expected {
                if run.status != expected {
                    failures.push(format!(
                        "rubric '{}' expected run status {}, got {}",
                        rubric.id, expected, run.status
                    ));
                }
            }
        }
        "stage-status" | "stage_status" => {
            // Both a stage id and an expected status string are required here.
            let Some(stage_id) = assertion.stage.as_deref() else {
                failures.push(format!(
                    "rubric '{}' stage-status assertion missing stage",
                    rubric.id
                ));
                return;
            };
            let expected = assertion.expected.as_ref().and_then(|value| value.as_str());
            let Some(expected) = expected else {
                failures.push(format!(
                    "rubric '{}' stage-status assertion missing expected string",
                    rubric.id
                ));
                return;
            };
            match run.stages.iter().find(|stage| stage.node_id == stage_id) {
                Some(stage) if stage.status == expected => {}
                Some(stage) => failures.push(format!(
                    "rubric '{}' expected stage {} status {}, got {}",
                    rubric.id, stage_id, expected, stage.status
                )),
                None => failures.push(format!(
                    "rubric '{}' expected stage {} to exist",
                    rubric.id, stage_id
                )),
            }
        }
        "visible-text-contains" | "visible_text_contains" => {
            let Some(needle) = assertion.contains.as_deref() else {
                failures.push(format!(
                    "rubric '{}' visible-text assertion missing contains",
                    rubric.id
                ));
                return;
            };
            // With a stage id, the needle must appear in that stage's visible
            // text; without one, any stage's visible text may satisfy it.
            let matched = match assertion.stage.as_deref() {
                Some(stage_id) => run
                    .stages
                    .iter()
                    .find(|stage| stage.node_id == stage_id)
                    .and_then(|stage| stage.visible_text.as_deref())
                    .is_some_and(|text| text.contains(needle)),
                None => run
                    .stages
                    .iter()
                    .filter_map(|stage| stage.visible_text.as_deref())
                    .any(|text| text.contains(needle)),
            };
            if !matched {
                failures.push(format!(
                    "rubric '{}' expected visible text to contain {:?}",
                    rubric.id, needle
                ));
            }
        }
        "hitl-question-contains" | "hitl_question_contains" => {
            let Some(needle) = assertion.contains.as_deref() else {
                failures.push(format!(
                    "rubric '{}' HITL assertion missing contains",
                    rubric.id
                ));
                return;
            };
            if !run
                .hitl_questions
                .iter()
                .any(|question| question.prompt.contains(needle))
            {
                failures.push(format!(
                    "rubric '{}' expected HITL question to contain {:?}",
                    rubric.id, needle
                ));
            }
        }
        // An unset kind is treated as "no assertion configured".
        "" => {}
        other => failures.push(format!(
            "rubric '{}' has unsupported assertion kind '{}'",
            rubric.id, other
        )),
    }
}
2680
/// Edit operation emitted by `myers_diff`. The paired `usize` in the diff
/// output indexes the left sequence for `Equal`/`Delete` and the right
/// sequence for `Insert` (see `render_unified_diff`).
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub(crate) enum DiffOp {
    // Line present in both sequences.
    Equal,
    // Line removed from the left sequence.
    Delete,
    // Line added from the right sequence.
    Insert,
}
2688
/// Computes a line-level diff of `a` against `b` using the Myers
/// shortest-edit-script algorithm.
///
/// Returns the edit operations in forward order; each op carries an index
/// into `a` (`Equal`/`Delete`) or into `b` (`Insert`).
pub(crate) fn myers_diff(a: &[&str], b: &[&str]) -> Vec<(DiffOp, usize)> {
    let n = a.len() as isize;
    let m = b.len() as isize;
    // Trivial cases: one side empty means pure inserts or pure deletes.
    if n == 0 && m == 0 {
        return Vec::new();
    }
    if n == 0 {
        return (0..m as usize).map(|j| (DiffOp::Insert, j)).collect();
    }
    if m == 0 {
        return (0..n as usize).map(|i| (DiffOp::Delete, i)).collect();
    }

    // `v` maps diagonal k (shifted by `offset` to stay non-negative) to the
    // furthest x reached on that diagonal; `trace` snapshots v per d-round
    // so the edit path can be reconstructed afterwards.
    let max_d = (n + m) as usize;
    let offset = max_d as isize;
    let v_size = 2 * max_d + 1;
    let mut v = vec![0isize; v_size];
    let mut trace: Vec<Vec<isize>> = Vec::new();

    // Forward pass: grow edit distance d until (n, m) is reached.
    'outer: for d in 0..=max_d as isize {
        trace.push(v.clone());
        let mut new_v = v.clone();
        for k in (-d..=d).step_by(2) {
            let ki = (k + offset) as usize;
            // Step down (insert) or right (delete), whichever reaches further.
            let mut x = if k == -d || (k != d && v[ki - 1] < v[ki + 1]) {
                v[ki + 1]
            } else {
                v[ki - 1] + 1
            };
            let mut y = x - k;
            // Follow the "snake": consume the run of matching lines.
            while x < n && y < m && a[x as usize] == b[y as usize] {
                x += 1;
                y += 1;
            }
            new_v[ki] = x;
            if x >= n && y >= m {
                let _ = new_v;
                break 'outer;
            }
        }
        v = new_v;
    }

    // Backtrack from (n, m) through the snapshots, emitting ops in reverse.
    let mut ops: Vec<(DiffOp, usize)> = Vec::new();
    let mut x = n;
    let mut y = m;
    for d in (1..trace.len() as isize).rev() {
        let k = x - y;
        let v_prev = &trace[d as usize];
        // Recover which neighbouring diagonal this position came from.
        let prev_k = if k == -d
            || (k != d && v_prev[(k - 1 + offset) as usize] < v_prev[(k + 1 + offset) as usize])
        {
            k + 1
        } else {
            k - 1
        };
        let prev_x = v_prev[(prev_k + offset) as usize];
        let prev_y = prev_x - prev_k;

        // Re-emit the snake as Equal ops.
        while x > prev_x && y > prev_y {
            x -= 1;
            y -= 1;
            ops.push((DiffOp::Equal, x as usize));
        }
        if prev_k < k {
            x -= 1;
            ops.push((DiffOp::Delete, x as usize));
        } else {
            y -= 1;
            ops.push((DiffOp::Insert, y as usize));
        }
    }
    // Any remaining prefix is a common run of equal lines.
    while x > 0 && y > 0 {
        x -= 1;
        y -= 1;
        ops.push((DiffOp::Equal, x as usize));
    }
    ops.reverse();
    ops
}
2773
2774pub fn render_unified_diff(path: Option<&str>, before: &str, after: &str) -> String {
2775 let before_lines: Vec<&str> = before.lines().collect();
2776 let after_lines: Vec<&str> = after.lines().collect();
2777 let ops = myers_diff(&before_lines, &after_lines);
2778
2779 let mut diff = String::new();
2780 let file = path.unwrap_or("artifact");
2781 diff.push_str(&format!("--- a/{file}\n+++ b/{file}\n"));
2782 for &(op, idx) in &ops {
2783 match op {
2784 DiffOp::Equal => diff.push_str(&format!(" {}\n", before_lines[idx])),
2785 DiffOp::Delete => diff.push_str(&format!("-{}\n", before_lines[idx])),
2786 DiffOp::Insert => diff.push_str(&format!("+{}\n", after_lines[idx])),
2787 }
2788 }
2789 diff
2790}
2791
/// Persists a run record as pretty-printed JSON, defaulting to
/// `<default_run_dir>/<run_id>.json` when no path is given.
///
/// Before writing, the record is materialized: HITL questions are merged
/// from the active event log, child runs are lifted out of stage metadata,
/// a replay fixture is backfilled when absent, handoffs are synced, and
/// observability state is refreshed. Returns the path written to.
pub fn save_run_record(run: &RunRecord, path: Option<&str>) -> Result<String, VmError> {
    let path = path
        .map(PathBuf::from)
        .unwrap_or_else(|| default_run_dir().join(format!("{}.json", run.id)));
    // Work on a clone so the caller's record is left untouched.
    let mut materialized = run.clone();
    merge_hitl_questions_from_active_log(&mut materialized);
    materialize_child_runs_from_stage_metadata(&mut materialized);
    if materialized.replay_fixture.is_none() {
        materialized.replay_fixture = Some(replay_fixture_from_run(&materialized));
    }
    materialized.persisted_path = Some(path.to_string_lossy().into_owned());
    sync_run_handoffs(&mut materialized);
    refresh_run_observability(&mut materialized, Some(&path));
    if let Some(parent) = path.parent() {
        std::fs::create_dir_all(parent)
            .map_err(|e| VmError::Runtime(format!("failed to create run directory: {e}")))?;
    }
    let json = serde_json::to_string_pretty(&materialized)
        .map_err(|e| VmError::Runtime(format!("failed to encode run record: {e}")))?;
    // Written via the crate's atomic writer so readers never see a partial file.
    crate::atomic_io::atomic_write(&path, json.as_bytes())
        .map_err(|e| VmError::Runtime(format!("failed to persist run record: {e}")))?;
    if let Some(observability) = materialized.observability.as_ref() {
        publish_action_graph_event(&materialized, observability, &path);
    }
    Ok(path.to_string_lossy().into_owned())
}
2818
/// Loads a run record from disk and normalizes it the same way
/// `save_run_record` does before persisting: child runs are materialized,
/// a replay fixture and persisted path are backfilled when absent, and
/// handoff plus observability state are refreshed.
pub fn load_run_record(path: &Path) -> Result<RunRecord, VmError> {
    let content = std::fs::read_to_string(path)
        .map_err(|e| VmError::Runtime(format!("failed to read run record: {e}")))?;
    let mut run: RunRecord = serde_json::from_str(&content)
        .map_err(|e| VmError::Runtime(format!("failed to parse run record: {e}")))?;
    materialize_child_runs_from_stage_metadata(&mut run);
    if run.replay_fixture.is_none() {
        run.replay_fixture = Some(replay_fixture_from_run(&run));
    }
    // Only fill persisted_path when the record doesn't already carry one.
    run.persisted_path
        .get_or_insert_with(|| path.to_string_lossy().into_owned());
    sync_run_handoffs(&mut run);
    refresh_run_observability(&mut run, Some(path));
    Ok(run)
}
2834
2835pub fn replay_fixture_from_run(run: &RunRecord) -> ReplayFixture {
2836 ReplayFixture {
2837 type_name: "replay_fixture".to_string(),
2838 id: new_id("fixture"),
2839 source_run_id: run.id.clone(),
2840 workflow_id: run.workflow_id.clone(),
2841 workflow_name: run.workflow_name.clone(),
2842 created_at: now_rfc3339(),
2843 eval_kind: Some("replay".to_string()),
2844 clarifying_question: None,
2845 expected_status: run.status.clone(),
2846 stage_assertions: run
2847 .stages
2848 .iter()
2849 .map(|stage| ReplayStageAssertion {
2850 node_id: stage.node_id.clone(),
2851 expected_status: stage.status.clone(),
2852 expected_outcome: stage.outcome.clone(),
2853 expected_branch: stage.branch.clone(),
2854 required_artifact_kinds: stage
2855 .artifacts
2856 .iter()
2857 .map(|artifact| artifact.kind.clone())
2858 .collect(),
2859 visible_text_contains: stage
2860 .visible_text
2861 .as_ref()
2862 .filter(|text| !text.is_empty())
2863 .map(|text| text.chars().take(80).collect()),
2864 })
2865 .collect(),
2866 }
2867}
2868
/// Replays a fixture's expectations against a run: overall status plus, per
/// stage assertion, status/outcome/branch equality, required artifact kinds,
/// and an optional visible-text snippet.
///
/// Fixtures with `eval_kind == "clarifying_question"` are routed to the
/// dedicated clarifying-question evaluator instead.
pub fn evaluate_run_against_fixture(run: &RunRecord, fixture: &ReplayFixture) -> ReplayEvalReport {
    if fixture.eval_kind.as_deref() == Some("clarifying_question") {
        return evaluate_clarifying_question(run, fixture);
    }
    let mut failures = Vec::new();
    if run.status != fixture.expected_status {
        failures.push(format!(
            "run status mismatch: expected {}, got {}",
            fixture.expected_status, run.status
        ));
    }
    // Index stages by node id so each assertion is a direct lookup.
    let stages_by_id: BTreeMap<&str, &RunStageRecord> =
        run.stages.iter().map(|s| (s.node_id.as_str(), s)).collect();
    for assertion in &fixture.stage_assertions {
        let Some(stage) = stages_by_id.get(assertion.node_id.as_str()) else {
            failures.push(format!("missing stage {}", assertion.node_id));
            continue;
        };
        if stage.status != assertion.expected_status {
            failures.push(format!(
                "stage {} status mismatch: expected {}, got {}",
                assertion.node_id, assertion.expected_status, stage.status
            ));
        }
        if stage.outcome != assertion.expected_outcome {
            failures.push(format!(
                "stage {} outcome mismatch: expected {}, got {}",
                assertion.node_id, assertion.expected_outcome, stage.outcome
            ));
        }
        if stage.branch != assertion.expected_branch {
            failures.push(format!(
                "stage {} branch mismatch: expected {:?}, got {:?}",
                assertion.node_id, assertion.expected_branch, stage.branch
            ));
        }
        // Every artifact kind recorded in the fixture must still be produced.
        for required_kind in &assertion.required_artifact_kinds {
            if !stage
                .artifacts
                .iter()
                .any(|artifact| &artifact.kind == required_kind)
            {
                failures.push(format!(
                    "stage {} missing artifact kind {}",
                    assertion.node_id, required_kind
                ));
            }
        }
        if let Some(snippet) = &assertion.visible_text_contains {
            let actual = stage.visible_text.clone().unwrap_or_default();
            if !actual.contains(snippet) {
                failures.push(format!(
                    "stage {} visible text does not contain expected snippet {:?}",
                    assertion.node_id, snippet
                ));
            }
        }
    }

    ReplayEvalReport {
        pass: failures.is_empty(),
        failures,
        stage_count: run.stages.len(),
    }
}
2934
/// Scores a run against a clarifying-question fixture.
///
/// Checks the run status, that the number of HITL questions falls within the
/// fixture's min/max bounds, and — when the fixture constrains wording — that
/// at least one question matches the expected/accepted text and the
/// required/forbidden term lists after normalization.
fn evaluate_clarifying_question(run: &RunRecord, fixture: &ReplayFixture) -> ReplayEvalReport {
    let mut failures = Vec::new();
    let spec = fixture.clarifying_question.clone().unwrap_or_default();
    let min_questions = clarifying_min_questions(&spec);
    let max_questions = clarifying_max_questions(&spec);
    let questions = &run.hitl_questions;

    if run.status != fixture.expected_status {
        failures.push(format!(
            "run status mismatch: expected {}, got {}",
            fixture.expected_status, run.status
        ));
    }
    if questions.len() < min_questions {
        failures.push(format!(
            "expected at least {min_questions} clarifying question(s), got {}",
            questions.len()
        ));
    }
    if questions.len() > max_questions {
        failures.push(format!(
            "expected at most {max_questions} clarifying question(s), got {}",
            questions.len()
        ));
    }

    // Normalize all fixture text up front so each question compares cheaply.
    let normalized_expected = spec
        .expected_question
        .as_deref()
        .map(normalize_question_text);
    let normalized_accepted = spec
        .accepted_questions
        .iter()
        .map(|question| normalize_question_text(question))
        .collect::<Vec<_>>();
    let required_terms = spec
        .required_terms
        .iter()
        .map(|term| normalize_question_text(term))
        .collect::<Vec<_>>();
    let forbidden_terms = spec
        .forbidden_terms
        .iter()
        .map(|term| normalize_question_text(term))
        .collect::<Vec<_>>();

    // A question matches when it equals the expected text (if set), is in the
    // accepted list (if non-empty), contains every required term, and avoids
    // every forbidden term.
    let matched = questions.iter().any(|question| {
        let normalized = normalize_question_text(&question.prompt);
        let matches_expected = normalized_expected
            .as_ref()
            .is_none_or(|expected| &normalized == expected)
            && (normalized_accepted.is_empty()
                || normalized_accepted
                    .iter()
                    .any(|candidate| candidate == &normalized));
        let has_required_terms = required_terms
            .iter()
            .all(|term| normalized.contains(term.as_str()));
        let avoids_forbidden_terms = forbidden_terms
            .iter()
            .all(|term| !normalized.contains(term.as_str()));
        matches_expected && has_required_terms && avoids_forbidden_terms
    });

    // Only fail on wording when the fixture actually constrains wording and
    // the run produced at least one question to compare against.
    if !questions.is_empty()
        && (!normalized_accepted.is_empty()
            || normalized_expected.is_some()
            || !required_terms.is_empty()
            || !forbidden_terms.is_empty())
        && !matched
    {
        failures.push(format!(
            "no clarifying question matched fixture; actual questions: {}",
            questions
                .iter()
                .map(|question| format!("{:?}", question.prompt))
                .collect::<Vec<_>>()
                .join(", ")
        ));
    }

    ReplayEvalReport {
        pass: failures.is_empty(),
        failures,
        stage_count: run.stages.len(),
    }
}
3022
3023pub fn evaluate_run_suite(
3024 cases: Vec<(RunRecord, ReplayFixture, Option<String>)>,
3025) -> ReplayEvalSuiteReport {
3026 let mut reports = Vec::new();
3027 for (run, fixture, source_path) in cases {
3028 let report = evaluate_run_against_fixture(&run, &fixture);
3029 reports.push(ReplayEvalCaseReport {
3030 run_id: run.id.clone(),
3031 workflow_id: run.workflow_id.clone(),
3032 label: None,
3033 pass: report.pass,
3034 failures: report.failures,
3035 stage_count: report.stage_count,
3036 source_path,
3037 comparison: None,
3038 });
3039 }
3040 let total = reports.len();
3041 let passed = reports.iter().filter(|report| report.pass).count();
3042 let failed = total.saturating_sub(passed);
3043 ReplayEvalSuiteReport {
3044 pass: failed == 0,
3045 total,
3046 passed,
3047 failed,
3048 cases: reports,
3049 }
3050}
3051
3052pub fn diff_run_records(left: &RunRecord, right: &RunRecord) -> RunDiffReport {
3053 let mut stage_diffs = Vec::new();
3054 let mut all_node_ids = BTreeSet::new();
3055 let left_by_id: BTreeMap<&str, &RunStageRecord> = left
3056 .stages
3057 .iter()
3058 .map(|s| (s.node_id.as_str(), s))
3059 .collect();
3060 let right_by_id: BTreeMap<&str, &RunStageRecord> = right
3061 .stages
3062 .iter()
3063 .map(|s| (s.node_id.as_str(), s))
3064 .collect();
3065 all_node_ids.extend(left_by_id.keys().copied());
3066 all_node_ids.extend(right_by_id.keys().copied());
3067
3068 for node_id in all_node_ids {
3069 let left_stage = left_by_id.get(node_id).copied();
3070 let right_stage = right_by_id.get(node_id).copied();
3071 match (left_stage, right_stage) {
3072 (Some(_), None) => stage_diffs.push(RunStageDiffRecord {
3073 node_id: node_id.to_string(),
3074 change: "removed".to_string(),
3075 details: vec!["stage missing from right run".to_string()],
3076 }),
3077 (None, Some(_)) => stage_diffs.push(RunStageDiffRecord {
3078 node_id: node_id.to_string(),
3079 change: "added".to_string(),
3080 details: vec!["stage missing from left run".to_string()],
3081 }),
3082 (Some(left_stage), Some(right_stage)) => {
3083 let mut details = Vec::new();
3084 if left_stage.status != right_stage.status {
3085 details.push(format!(
3086 "status: {} -> {}",
3087 left_stage.status, right_stage.status
3088 ));
3089 }
3090 if left_stage.outcome != right_stage.outcome {
3091 details.push(format!(
3092 "outcome: {} -> {}",
3093 left_stage.outcome, right_stage.outcome
3094 ));
3095 }
3096 if left_stage.branch != right_stage.branch {
3097 details.push(format!(
3098 "branch: {:?} -> {:?}",
3099 left_stage.branch, right_stage.branch
3100 ));
3101 }
3102 if left_stage.produced_artifact_ids.len() != right_stage.produced_artifact_ids.len()
3103 {
3104 details.push(format!(
3105 "produced_artifacts: {} -> {}",
3106 left_stage.produced_artifact_ids.len(),
3107 right_stage.produced_artifact_ids.len()
3108 ));
3109 }
3110 if left_stage.artifacts.len() != right_stage.artifacts.len() {
3111 details.push(format!(
3112 "artifact_records: {} -> {}",
3113 left_stage.artifacts.len(),
3114 right_stage.artifacts.len()
3115 ));
3116 }
3117 if !details.is_empty() {
3118 stage_diffs.push(RunStageDiffRecord {
3119 node_id: node_id.to_string(),
3120 change: "changed".to_string(),
3121 details,
3122 });
3123 }
3124 }
3125 (None, None) => {}
3126 }
3127 }
3128
3129 let mut tool_diffs = Vec::new();
3130 let left_tools: std::collections::BTreeMap<(String, String), &ToolCallRecord> = left
3131 .tool_recordings
3132 .iter()
3133 .map(|r| ((r.tool_name.clone(), r.args_hash.clone()), r))
3134 .collect();
3135 let right_tools: std::collections::BTreeMap<(String, String), &ToolCallRecord> = right
3136 .tool_recordings
3137 .iter()
3138 .map(|r| ((r.tool_name.clone(), r.args_hash.clone()), r))
3139 .collect();
3140 let all_tool_keys: std::collections::BTreeSet<_> = left_tools
3141 .keys()
3142 .chain(right_tools.keys())
3143 .cloned()
3144 .collect();
3145 for key in &all_tool_keys {
3146 let l = left_tools.get(key);
3147 let r = right_tools.get(key);
3148 let result_changed = match (l, r) {
3149 (Some(a), Some(b)) => a.result != b.result,
3150 _ => true,
3151 };
3152 if result_changed {
3153 tool_diffs.push(ToolCallDiffRecord {
3154 tool_name: key.0.clone(),
3155 args_hash: key.1.clone(),
3156 result_changed,
3157 left_result: l.map(|t| t.result.clone()),
3158 right_result: r.map(|t| t.result.clone()),
3159 });
3160 }
3161 }
3162
3163 let left_observability = left.observability.clone().unwrap_or_else(|| {
3164 derive_run_observability(left, left.persisted_path.as_deref().map(Path::new))
3165 });
3166 let right_observability = right.observability.clone().unwrap_or_else(|| {
3167 derive_run_observability(right, right.persisted_path.as_deref().map(Path::new))
3168 });
3169 let mut observability_diffs = Vec::new();
3170
3171 let left_workers = left_observability
3172 .worker_lineage
3173 .iter()
3174 .map(|worker| {
3175 (
3176 worker.worker_id.clone(),
3177 (
3178 worker.status.clone(),
3179 worker.run_id.clone(),
3180 worker.run_path.clone(),
3181 ),
3182 )
3183 })
3184 .collect::<BTreeMap<_, _>>();
3185 let right_workers = right_observability
3186 .worker_lineage
3187 .iter()
3188 .map(|worker| {
3189 (
3190 worker.worker_id.clone(),
3191 (
3192 worker.status.clone(),
3193 worker.run_id.clone(),
3194 worker.run_path.clone(),
3195 ),
3196 )
3197 })
3198 .collect::<BTreeMap<_, _>>();
3199 let worker_ids = left_workers
3200 .keys()
3201 .chain(right_workers.keys())
3202 .cloned()
3203 .collect::<BTreeSet<_>>();
3204 for worker_id in worker_ids {
3205 match (left_workers.get(&worker_id), right_workers.get(&worker_id)) {
3206 (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3207 section: "worker_lineage".to_string(),
3208 label: worker_id,
3209 details: vec!["worker missing from right run".to_string()],
3210 }),
3211 (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3212 section: "worker_lineage".to_string(),
3213 label: worker_id,
3214 details: vec!["worker missing from left run".to_string()],
3215 }),
3216 (Some(left_worker), Some(right_worker)) if left_worker != right_worker => {
3217 let mut details = Vec::new();
3218 if left_worker.0 != right_worker.0 {
3219 details.push(format!("status: {} -> {}", left_worker.0, right_worker.0));
3220 }
3221 if left_worker.1 != right_worker.1 {
3222 details.push(format!(
3223 "run_id: {:?} -> {:?}",
3224 left_worker.1, right_worker.1
3225 ));
3226 }
3227 if left_worker.2 != right_worker.2 {
3228 details.push(format!(
3229 "run_path: {:?} -> {:?}",
3230 left_worker.2, right_worker.2
3231 ));
3232 }
3233 observability_diffs.push(RunObservabilityDiffRecord {
3234 section: "worker_lineage".to_string(),
3235 label: worker_id,
3236 details,
3237 });
3238 }
3239 _ => {}
3240 }
3241 }
3242
3243 let left_rounds = left_observability
3244 .planner_rounds
3245 .iter()
3246 .map(|round| (round.stage_id.clone(), round))
3247 .collect::<BTreeMap<_, _>>();
3248 let right_rounds = right_observability
3249 .planner_rounds
3250 .iter()
3251 .map(|round| (round.stage_id.clone(), round))
3252 .collect::<BTreeMap<_, _>>();
3253 let round_ids = left_rounds
3254 .keys()
3255 .chain(right_rounds.keys())
3256 .cloned()
3257 .collect::<BTreeSet<_>>();
3258 for stage_id in round_ids {
3259 match (left_rounds.get(&stage_id), right_rounds.get(&stage_id)) {
3260 (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3261 section: "planner_rounds".to_string(),
3262 label: stage_id,
3263 details: vec!["planner summary missing from right run".to_string()],
3264 }),
3265 (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3266 section: "planner_rounds".to_string(),
3267 label: stage_id,
3268 details: vec!["planner summary missing from left run".to_string()],
3269 }),
3270 (Some(left_round), Some(right_round)) => {
3271 let mut details = Vec::new();
3272 if left_round.iteration_count != right_round.iteration_count {
3273 details.push(format!(
3274 "iterations: {} -> {}",
3275 left_round.iteration_count, right_round.iteration_count
3276 ));
3277 }
3278 if left_round.tool_execution_count != right_round.tool_execution_count {
3279 details.push(format!(
3280 "tool_executions: {} -> {}",
3281 left_round.tool_execution_count, right_round.tool_execution_count
3282 ));
3283 }
3284 if left_round.native_text_tool_fallback_count
3285 != right_round.native_text_tool_fallback_count
3286 {
3287 details.push(format!(
3288 "native_text_tool_fallbacks: {} -> {}",
3289 left_round.native_text_tool_fallback_count,
3290 right_round.native_text_tool_fallback_count
3291 ));
3292 }
3293 if left_round.native_text_tool_fallback_rejection_count
3294 != right_round.native_text_tool_fallback_rejection_count
3295 {
3296 details.push(format!(
3297 "native_text_tool_fallback_rejections: {} -> {}",
3298 left_round.native_text_tool_fallback_rejection_count,
3299 right_round.native_text_tool_fallback_rejection_count
3300 ));
3301 }
3302 if left_round.empty_completion_retry_count
3303 != right_round.empty_completion_retry_count
3304 {
3305 details.push(format!(
3306 "empty_completion_retries: {} -> {}",
3307 left_round.empty_completion_retry_count,
3308 right_round.empty_completion_retry_count
3309 ));
3310 }
3311 if left_round.research_facts != right_round.research_facts {
3312 details.push(format!(
3313 "research_facts: {:?} -> {:?}",
3314 left_round.research_facts, right_round.research_facts
3315 ));
3316 }
3317 let left_deliverables = left_round
3318 .task_ledger
3319 .as_ref()
3320 .map(|ledger| {
3321 ledger
3322 .deliverables
3323 .iter()
3324 .map(|item| format!("{}:{}", item.id, item.status))
3325 .collect::<Vec<_>>()
3326 })
3327 .unwrap_or_default();
3328 let right_deliverables = right_round
3329 .task_ledger
3330 .as_ref()
3331 .map(|ledger| {
3332 ledger
3333 .deliverables
3334 .iter()
3335 .map(|item| format!("{}:{}", item.id, item.status))
3336 .collect::<Vec<_>>()
3337 })
3338 .unwrap_or_default();
3339 if left_deliverables != right_deliverables {
3340 details.push(format!(
3341 "deliverables: {:?} -> {:?}",
3342 left_deliverables, right_deliverables
3343 ));
3344 }
3345 if left_round.successful_tools != right_round.successful_tools {
3346 details.push(format!(
3347 "successful_tools: {:?} -> {:?}",
3348 left_round.successful_tools, right_round.successful_tools
3349 ));
3350 }
3351 if !details.is_empty() {
3352 observability_diffs.push(RunObservabilityDiffRecord {
3353 section: "planner_rounds".to_string(),
3354 label: left_round.node_id.clone(),
3355 details,
3356 });
3357 }
3358 }
3359 _ => {}
3360 }
3361 }
3362
3363 let left_pointers = left_observability
3364 .transcript_pointers
3365 .iter()
3366 .map(|pointer| {
3367 (
3368 pointer.id.clone(),
3369 (
3370 pointer.available,
3371 pointer.path.clone(),
3372 pointer.location.clone(),
3373 ),
3374 )
3375 })
3376 .collect::<BTreeMap<_, _>>();
3377 let right_pointers = right_observability
3378 .transcript_pointers
3379 .iter()
3380 .map(|pointer| {
3381 (
3382 pointer.id.clone(),
3383 (
3384 pointer.available,
3385 pointer.path.clone(),
3386 pointer.location.clone(),
3387 ),
3388 )
3389 })
3390 .collect::<BTreeMap<_, _>>();
3391 let pointer_ids = left_pointers
3392 .keys()
3393 .chain(right_pointers.keys())
3394 .cloned()
3395 .collect::<BTreeSet<_>>();
3396 for pointer_id in pointer_ids {
3397 match (
3398 left_pointers.get(&pointer_id),
3399 right_pointers.get(&pointer_id),
3400 ) {
3401 (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3402 section: "transcript_pointers".to_string(),
3403 label: pointer_id,
3404 details: vec!["pointer missing from right run".to_string()],
3405 }),
3406 (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3407 section: "transcript_pointers".to_string(),
3408 label: pointer_id,
3409 details: vec!["pointer missing from left run".to_string()],
3410 }),
3411 (Some(left_pointer), Some(right_pointer)) if left_pointer != right_pointer => {
3412 observability_diffs.push(RunObservabilityDiffRecord {
3413 section: "transcript_pointers".to_string(),
3414 label: pointer_id,
3415 details: vec![format!(
3416 "pointer: {:?} -> {:?}",
3417 left_pointer, right_pointer
3418 )],
3419 });
3420 }
3421 _ => {}
3422 }
3423 }
3424
3425 let left_compactions = left_observability
3426 .compaction_events
3427 .iter()
3428 .map(|event| {
3429 (
3430 event.id.clone(),
3431 (
3432 event.strategy.clone(),
3433 event.archived_messages,
3434 event.snapshot_asset_id.clone(),
3435 event.available,
3436 ),
3437 )
3438 })
3439 .collect::<BTreeMap<_, _>>();
3440 let right_compactions = right_observability
3441 .compaction_events
3442 .iter()
3443 .map(|event| {
3444 (
3445 event.id.clone(),
3446 (
3447 event.strategy.clone(),
3448 event.archived_messages,
3449 event.snapshot_asset_id.clone(),
3450 event.available,
3451 ),
3452 )
3453 })
3454 .collect::<BTreeMap<_, _>>();
3455 let compaction_ids = left_compactions
3456 .keys()
3457 .chain(right_compactions.keys())
3458 .cloned()
3459 .collect::<BTreeSet<_>>();
3460 for compaction_id in compaction_ids {
3461 match (
3462 left_compactions.get(&compaction_id),
3463 right_compactions.get(&compaction_id),
3464 ) {
3465 (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3466 section: "compaction_events".to_string(),
3467 label: compaction_id,
3468 details: vec!["compaction event missing from right run".to_string()],
3469 }),
3470 (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3471 section: "compaction_events".to_string(),
3472 label: compaction_id,
3473 details: vec!["compaction event missing from left run".to_string()],
3474 }),
3475 (Some(left_event), Some(right_event)) if left_event != right_event => {
3476 observability_diffs.push(RunObservabilityDiffRecord {
3477 section: "compaction_events".to_string(),
3478 label: compaction_id,
3479 details: vec![format!("event: {:?} -> {:?}", left_event, right_event)],
3480 });
3481 }
3482 _ => {}
3483 }
3484 }
3485
3486 let left_daemons = left_observability
3487 .daemon_events
3488 .iter()
3489 .map(|event| {
3490 (
3491 (event.daemon_id.clone(), event.kind, event.timestamp.clone()),
3492 (
3493 event.name.clone(),
3494 event.persist_path.clone(),
3495 event.payload_summary.clone(),
3496 ),
3497 )
3498 })
3499 .collect::<BTreeMap<_, _>>();
3500 let right_daemons = right_observability
3501 .daemon_events
3502 .iter()
3503 .map(|event| {
3504 (
3505 (event.daemon_id.clone(), event.kind, event.timestamp.clone()),
3506 (
3507 event.name.clone(),
3508 event.persist_path.clone(),
3509 event.payload_summary.clone(),
3510 ),
3511 )
3512 })
3513 .collect::<BTreeMap<_, _>>();
3514 let daemon_keys = left_daemons
3515 .keys()
3516 .chain(right_daemons.keys())
3517 .cloned()
3518 .collect::<BTreeSet<_>>();
3519 for daemon_key in daemon_keys {
3520 let label = format!("{}:{:?}:{}", daemon_key.0, daemon_key.1, daemon_key.2);
3521 match (
3522 left_daemons.get(&daemon_key),
3523 right_daemons.get(&daemon_key),
3524 ) {
3525 (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3526 section: "daemon_events".to_string(),
3527 label,
3528 details: vec!["daemon event missing from right run".to_string()],
3529 }),
3530 (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3531 section: "daemon_events".to_string(),
3532 label,
3533 details: vec!["daemon event missing from left run".to_string()],
3534 }),
3535 (Some(left_event), Some(right_event)) if left_event != right_event => {
3536 observability_diffs.push(RunObservabilityDiffRecord {
3537 section: "daemon_events".to_string(),
3538 label,
3539 details: vec![format!("event: {:?} -> {:?}", left_event, right_event)],
3540 });
3541 }
3542 _ => {}
3543 }
3544 }
3545
3546 let left_verification = left_observability
3547 .verification_outcomes
3548 .iter()
3549 .map(|item| (item.stage_id.clone(), item))
3550 .collect::<BTreeMap<_, _>>();
3551 let right_verification = right_observability
3552 .verification_outcomes
3553 .iter()
3554 .map(|item| (item.stage_id.clone(), item))
3555 .collect::<BTreeMap<_, _>>();
3556 let verification_ids = left_verification
3557 .keys()
3558 .chain(right_verification.keys())
3559 .cloned()
3560 .collect::<BTreeSet<_>>();
3561 for stage_id in verification_ids {
3562 match (
3563 left_verification.get(&stage_id),
3564 right_verification.get(&stage_id),
3565 ) {
3566 (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3567 section: "verification".to_string(),
3568 label: stage_id,
3569 details: vec!["verification missing from right run".to_string()],
3570 }),
3571 (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3572 section: "verification".to_string(),
3573 label: stage_id,
3574 details: vec!["verification missing from left run".to_string()],
3575 }),
3576 (Some(left_item), Some(right_item)) if left_item != right_item => {
3577 let mut details = Vec::new();
3578 if left_item.passed != right_item.passed {
3579 details.push(format!(
3580 "passed: {:?} -> {:?}",
3581 left_item.passed, right_item.passed
3582 ));
3583 }
3584 if left_item.summary != right_item.summary {
3585 details.push(format!(
3586 "summary: {:?} -> {:?}",
3587 left_item.summary, right_item.summary
3588 ));
3589 }
3590 observability_diffs.push(RunObservabilityDiffRecord {
3591 section: "verification".to_string(),
3592 label: left_item.node_id.clone(),
3593 details,
3594 });
3595 }
3596 _ => {}
3597 }
3598 }
3599
3600 let left_graph = (
3601 left_observability.action_graph_nodes.len(),
3602 left_observability.action_graph_edges.len(),
3603 );
3604 let right_graph = (
3605 right_observability.action_graph_nodes.len(),
3606 right_observability.action_graph_edges.len(),
3607 );
3608 if left_graph != right_graph {
3609 observability_diffs.push(RunObservabilityDiffRecord {
3610 section: "action_graph".to_string(),
3611 label: "shape".to_string(),
3612 details: vec![format!(
3613 "nodes/edges: {}/{} -> {}/{}",
3614 left_graph.0, left_graph.1, right_graph.0, right_graph.1
3615 )],
3616 });
3617 }
3618
3619 let status_changed = left.status != right.status;
3620 let identical = !status_changed
3621 && stage_diffs.is_empty()
3622 && tool_diffs.is_empty()
3623 && observability_diffs.is_empty()
3624 && left.transitions.len() == right.transitions.len()
3625 && left.artifacts.len() == right.artifacts.len()
3626 && left.checkpoints.len() == right.checkpoints.len();
3627
3628 RunDiffReport {
3629 left_run_id: left.id.clone(),
3630 right_run_id: right.id.clone(),
3631 identical,
3632 status_changed,
3633 left_status: left.status.clone(),
3634 right_status: right.status.clone(),
3635 stage_diffs,
3636 tool_diffs,
3637 observability_diffs,
3638 transition_count_delta: right.transitions.len() as isize - left.transitions.len() as isize,
3639 artifact_count_delta: right.artifacts.len() as isize - left.artifacts.len() as isize,
3640 checkpoint_count_delta: right.checkpoints.len() as isize - left.checkpoints.len() as isize,
3641 }
3642}
3643
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    // Walks two directories up from this crate's manifest dir to reach the
    // workspace root; used to resolve example transcript paths in fixtures.
    fn repo_root() -> PathBuf {
        PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .parent()
            .unwrap()
            .parent()
            .unwrap()
            .to_path_buf()
    }

    // Builds the smallest RunRecord the eval-pack machinery will accept:
    // the given status, one mock LLM usage entry, and an embedded replay
    // fixture expecting "completed". Everything else comes from Default.
    fn minimal_run(status: &str) -> RunRecord {
        RunRecord {
            type_name: "workflow_run".to_string(),
            id: "run_1".to_string(),
            workflow_id: "workflow_1".to_string(),
            status: status.to_string(),
            usage: Some(LlmUsageRecord {
                total_duration_ms: 12,
                total_cost: 0.01,
                input_tokens: 3,
                output_tokens: 4,
                call_count: 1,
                models: vec!["mock".to_string()],
            }),
            replay_fixture: Some(ReplayFixture {
                type_name: "replay_fixture".to_string(),
                expected_status: "completed".to_string(),
                ..ReplayFixture::default()
            }),
            ..RunRecord::default()
        }
    }

    // A TOML eval pack with one replay case and a deterministic run-status
    // rubric should load, evaluate, and pass against a "completed" run.
    #[test]
    fn eval_pack_manifest_toml_runs_replay_case() {
        let temp = tempfile::tempdir().unwrap();
        let run_path = temp.path().join("run.json");
        fs::write(
            &run_path,
            serde_json::to_string(&minimal_run("completed")).unwrap(),
        )
        .unwrap();
        let pack_path = temp.path().join("harn.eval.toml");
        fs::write(
            &pack_path,
            r#"
version = 1
id = "connector-regressions"
name = "Connector regressions"

[[cases]]
id = "webhook"
name = "Webhook normalization"
run = "run.json"
rubrics = ["status"]

[[rubrics]]
id = "status"
kind = "deterministic"

[[rubrics.assertions]]
kind = "run-status"
expected = "completed"
"#,
        )
        .unwrap();

        let manifest = load_eval_pack_manifest(&pack_path).unwrap();
        let report = evaluate_eval_pack_manifest(&manifest).unwrap();

        assert!(report.pass);
        assert_eq!(report.total, 1);
        assert_eq!(report.cases[0].label, "Webhook normalization");
    }

    // A case with severity = "warning" that trips its latency threshold
    // (run took 12ms against a 1ms budget) should surface a warning but
    // still leave the overall report passing.
    #[test]
    fn eval_pack_warning_case_does_not_block() {
        let temp = tempfile::tempdir().unwrap();
        let run_path = temp.path().join("run.json");
        fs::write(
            &run_path,
            serde_json::to_string(&minimal_run("completed")).unwrap(),
        )
        .unwrap();
        let pack_path = temp.path().join("harn.eval.toml");
        fs::write(
            &pack_path,
            r#"
version = 1
id = "budgets"

[[cases]]
id = "latency-budget"
run = "run.json"
severity = "warning"

[cases.thresholds]
max-latency-ms = 1
"#,
        )
        .unwrap();

        let manifest = load_eval_pack_manifest(&pack_path).unwrap();
        let report = evaluate_eval_pack_manifest(&manifest).unwrap();

        assert!(report.pass);
        assert_eq!(report.warning_failed, 1);
        assert!(report.cases[0].warnings[0].contains("latency"));
    }

    // Drives a persona timeout ladder from a TOML manifest over a recorded
    // replay transcript: the "tiny" tier (1 tool call) should degrade and
    // the "balanced" tier should be the first to come back correct.
    #[test]
    fn eval_pack_manifest_runs_persona_ladder() {
        let temp = tempfile::tempdir().unwrap();
        let pack_path = temp.path().join("harn.eval.toml");
        // {:?} on the String Debug-quotes and escapes it, yielding a valid
        // TOML string literal when spliced into the manifest below.
        let base_dir = format!("{:?}", repo_root().display().to_string());
        let artifact_root = format!("{:?}", temp.path().join("artifacts").display().to_string());
        fs::write(
            &pack_path,
            format!(
                r#"
version = 1
id = "merge-captain-ladders"
base_dir = {}

[[ladders]]
id = "merge-captain-timeout"
persona = "merge_captain"
artifact-root = {}

[ladders.backend]
kind = "replay"
path = "examples/personas/merge_captain/transcripts/green_pr.jsonl"

[[ladders.model-routes]]
id = "gemma-value"
route = "local/gemma-value"
provider = "llama.cpp"
model = "gemma"
profile = "value"

[[ladders.timeout-tiers]]
id = "tiny"
max-tool-calls = 1

[[ladders.timeout-tiers]]
id = "balanced"
max-tool-calls = 4
max-model-calls = 1
"#,
                base_dir, artifact_root
            ),
        )
        .unwrap();

        let manifest = load_eval_pack_manifest(&pack_path).unwrap();
        let report = evaluate_eval_pack_manifest(&manifest).unwrap();

        assert!(report.pass);
        assert_eq!(report.total, 1);
        assert_eq!(report.ladders.len(), 1);
        assert_eq!(
            report.ladders[0].first_correct_tier.as_deref(),
            Some("balanced")
        );
        assert_eq!(report.ladders[0].tiers[0].outcome, "degraded");
        assert_eq!(report.ladders[0].tiers[1].outcome, "correct");
    }

    // Feeds two near-duplicate friction events through a friction-events
    // fixture and asserts the context-pack-suggestion rubric passes; the
    // case's run_id is the synthetic "friction_events" marker and
    // stage_count reflects the two input events.
    #[test]
    fn eval_pack_manifest_runs_friction_context_pack_case() {
        let temp = tempfile::tempdir().unwrap();
        let events_path = temp.path().join("incident-friction.json");
        fs::write(
            &events_path,
            r#"
{
  "events": [
    {
      "kind": "repeated_query",
      "source": "incident-triage",
      "actor": "sre",
      "tool": "splunk",
      "provider": "splunk",
      "redacted_summary": "Checkout incidents need the same Splunk search",
      "recurrence_hints": ["checkout incident queries"],
      "estimated_time_ms": 300000,
      "metadata": {
        "query": "index=checkout service=api error",
        "capability": "splunk.search",
        "secret_ref": "SPLUNK_READ_TOKEN",
        "output_slot": "splunk_errors"
      }
    },
    {
      "kind": "repeated_query",
      "source": "incident-triage",
      "actor": "sre",
      "tool": "splunk",
      "provider": "splunk",
      "redacted_summary": "Checkout incident triage repeated the Splunk search",
      "recurrence_hints": ["checkout incident queries"],
      "estimated_time_ms": 240000,
      "metadata": {
        "query": "index=checkout service=api error",
        "capability": "splunk.search",
        "secret_ref": "SPLUNK_READ_TOKEN",
        "output_slot": "splunk_errors"
      }
    }
  ]
}
"#,
        )
        .unwrap();
        let pack_path = temp.path().join("harn.eval.toml");
        fs::write(
            &pack_path,
            r#"
version = 1
id = "team-learning"
name = "Team learning evals"

[[fixtures]]
id = "incident-friction"
kind = "friction-events"
path = "incident-friction.json"

[[cases]]
id = "incident-context-pack"
name = "Incident context pack suggestion"
friction_events = "incident-friction"
rubrics = ["context-pack"]

[[rubrics]]
id = "context-pack"
kind = "friction"

[[rubrics.assertions]]
kind = "context-pack-suggestion"
contains = "incident"
expected = { min_suggestions = 1, recommended_artifact = "context_pack", required_capability = "splunk.search", required_output_slot = "splunk_errors" }
"#,
        )
        .unwrap();

        let manifest = load_eval_pack_manifest(&pack_path).unwrap();
        let report = evaluate_eval_pack_manifest(&manifest).unwrap();

        assert!(report.pass);
        assert_eq!(report.total, 1);
        assert_eq!(report.cases[0].run_id, "friction_events");
        assert_eq!(report.cases[0].stage_count, 2);
    }
}