1use std::collections::{BTreeMap, BTreeSet};
4use std::path::{Path, PathBuf};
5
6use serde::{Deserialize, Serialize};
7
8use super::{
9 default_run_dir, evaluate_context_pack_suggestion_expectations,
10 generate_context_pack_suggestions, new_id, normalize_friction_events_json, now_rfc3339,
11 parse_json_payload, parse_json_value, run_persona_eval_ladder, sync_run_handoffs,
12 ArtifactRecord, CapabilityPolicy, ContextPackSuggestionExpectation,
13 ContextPackSuggestionOptions, FrictionEvent, HandoffArtifact, PersonaEvalLadderManifest,
14 PersonaEvalLadderReport,
15};
16use crate::agent_events::AgentEvent;
17use crate::event_log::{
18 active_event_log, sanitize_topic_component, AnyEventLog, EventId, EventLog,
19 LogEvent as EventLogRecord, Topic,
20};
21use crate::llm::vm_value_to_json;
22use crate::triggers::{SignatureStatus, TriggerEvent};
23use crate::value::{VmError, VmValue};
24
// Node kinds used when materializing a run's action graph. Each constant is
// the serialized `kind` label for one node category (run roots, triggers,
// predicate gates, stages, workers, dispatch/a2a hops, queueing, retries,
// and dead-letter-queue entries).
pub const ACTION_GRAPH_NODE_KIND_RUN: &str = "run";
pub const ACTION_GRAPH_NODE_KIND_TRIGGER: &str = "trigger";
pub const ACTION_GRAPH_NODE_KIND_PREDICATE: &str = "predicate";
pub const ACTION_GRAPH_NODE_KIND_TRIGGER_PREDICATE: &str = "trigger_predicate";
pub const ACTION_GRAPH_NODE_KIND_STAGE: &str = "stage";
pub const ACTION_GRAPH_NODE_KIND_WORKER: &str = "worker";
pub const ACTION_GRAPH_NODE_KIND_DISPATCH: &str = "dispatch";
pub const ACTION_GRAPH_NODE_KIND_A2A_HOP: &str = "a2a_hop";
pub const ACTION_GRAPH_NODE_KIND_WORKER_ENQUEUE: &str = "worker_enqueue";
pub const ACTION_GRAPH_NODE_KIND_RETRY: &str = "retry";
pub const ACTION_GRAPH_NODE_KIND_DLQ: &str = "dlq";

// Edge kinds connecting action-graph nodes (entry edges, trigger/a2a
// dispatches, predicate gating, replay chaining, stage transitions,
// delegation, retries, and DLQ moves).
pub const ACTION_GRAPH_EDGE_KIND_ENTRY: &str = "entry";
pub const ACTION_GRAPH_EDGE_KIND_TRIGGER_DISPATCH: &str = "trigger_dispatch";
pub const ACTION_GRAPH_EDGE_KIND_A2A_DISPATCH: &str = "a2a_dispatch";
pub const ACTION_GRAPH_EDGE_KIND_PREDICATE_GATE: &str = "predicate_gate";
pub const ACTION_GRAPH_EDGE_KIND_REPLAY_CHAIN: &str = "replay_chain";
pub const ACTION_GRAPH_EDGE_KIND_TRANSITION: &str = "transition";
pub const ACTION_GRAPH_EDGE_KIND_DELEGATES: &str = "delegates";
pub const ACTION_GRAPH_EDGE_KIND_RETRY: &str = "retry";
pub const ACTION_GRAPH_EDGE_KIND_DLQ_MOVE: &str = "dlq_move";
46
/// Aggregated LLM usage counters (tokens, wall time, call count, cost, and
/// the model names involved). `#[serde(default)]` lets missing fields
/// deserialize to their `Default` values.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct LlmUsageRecord {
    pub input_tokens: i64,
    pub output_tokens: i64,
    pub total_duration_ms: i64,
    pub call_count: i64,
    pub total_cost: f64,
    pub models: Vec<String>,
}
57
/// Persisted record of one executed stage within a run: status/outcome,
/// timing, optional transcript/verification payloads, artifact linkage, and
/// the per-attempt history.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunStageRecord {
    pub id: String,
    pub node_id: String,
    pub kind: String,
    pub status: String,
    pub outcome: String,
    pub branch: Option<String>,
    pub started_at: String,
    pub finished_at: Option<String>,
    pub visible_text: Option<String>,
    pub private_reasoning: Option<String>,
    pub transcript: Option<serde_json::Value>,
    pub verification: Option<serde_json::Value>,
    pub usage: Option<LlmUsageRecord>,
    pub artifacts: Vec<ArtifactRecord>,
    pub consumed_artifact_ids: Vec<String>,
    pub produced_artifact_ids: Vec<String>,
    pub attempts: Vec<RunStageAttemptRecord>,
    // Free-form extension data; e.g. a "worker" entry is read by
    // run_child_from_stage_metadata to reconstruct child runs.
    pub metadata: BTreeMap<String, serde_json::Value>,
}
80
/// A single attempt at executing a stage (stages may retry, so a
/// [`RunStageRecord`] holds a list of these).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunStageAttemptRecord {
    pub attempt: usize,
    pub status: String,
    pub outcome: String,
    pub branch: Option<String>,
    pub error: Option<String>,
    pub verification: Option<serde_json::Value>,
    pub started_at: String,
    pub finished_at: Option<String>,
}
93
/// A recorded transition between workflow nodes, including which artifacts
/// were consumed/produced along the edge.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunTransitionRecord {
    pub id: String,
    pub from_stage_id: Option<String>,
    pub from_node_id: Option<String>,
    pub to_node_id: String,
    pub branch: Option<String>,
    pub timestamp: String,
    pub consumed_artifact_ids: Vec<String>,
    pub produced_artifact_ids: Vec<String>,
}
106
/// Snapshot of scheduler progress (ready vs. completed nodes) persisted at a
/// point in time, with a human-readable `reason` for the checkpoint.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunCheckpointRecord {
    pub id: String,
    pub ready_nodes: Vec<String>,
    pub completed_nodes: Vec<String>,
    pub last_stage_id: Option<String>,
    pub persisted_at: String,
    pub reason: String,
}
117
/// Expectations captured from a source run so it can be replayed and checked
/// (overall status plus per-stage assertions). Serialized with a `_type`
/// discriminator field.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayFixture {
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub source_run_id: String,
    pub workflow_id: String,
    pub workflow_name: Option<String>,
    pub created_at: String,
    pub eval_kind: Option<String>,
    pub clarifying_question: Option<ClarifyingQuestionEvalSpec>,
    pub expected_status: String,
    pub stage_assertions: Vec<ReplayStageAssertion>,
}
133
/// Expectations for a clarifying-question eval: accepted/required/forbidden
/// question content and question-count bounds. Consumers clamp the bounds to
/// at least 1 (see `clarifying_min_questions` / `clarifying_max_questions`).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ClarifyingQuestionEvalSpec {
    pub expected_question: Option<String>,
    pub accepted_questions: Vec<String>,
    pub required_terms: Vec<String>,
    pub forbidden_terms: Vec<String>,
    pub min_questions: usize,
    pub max_questions: Option<usize>,
}
144
/// Per-stage assertion inside a [`ReplayFixture`]: expected status/outcome/
/// branch, required artifact kinds, and an optional visible-text substring.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayStageAssertion {
    pub node_id: String,
    pub expected_status: String,
    pub expected_outcome: String,
    pub expected_branch: Option<String>,
    pub required_artifact_kinds: Vec<String>,
    pub visible_text_contains: Option<String>,
}
155
/// Result of evaluating a single replay: pass flag, failure messages, and
/// how many stages were inspected.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayEvalReport {
    pub pass: bool,
    pub failures: Vec<String>,
    pub stage_count: usize,
}
163
/// Per-case result inside a replay eval suite, optionally carrying a
/// [`RunDiffReport`] when the case compared two runs.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayEvalCaseReport {
    pub run_id: String,
    pub workflow_id: String,
    pub label: Option<String>,
    pub pass: bool,
    pub failures: Vec<String>,
    pub stage_count: usize,
    pub source_path: Option<String>,
    pub comparison: Option<RunDiffReport>,
}
176
/// Roll-up of a replay eval suite: totals plus the individual case reports.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayEvalSuiteReport {
    pub pass: bool,
    pub total: usize,
    pub passed: usize,
    pub failed: usize,
    pub cases: Vec<ReplayEvalCaseReport>,
}
186
/// Summary of a single deliverable tracked in a run's task ledger.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunDeliverableSummaryRecord {
    pub id: String,
    pub text: String,
    pub status: String,
    pub note: Option<String>,
}
195
/// Condensed view of a run's task ledger: the root task, its rationale,
/// deliverables, observations, and a count of blocking items.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunTaskLedgerSummaryRecord {
    pub root_task: String,
    pub rationale: String,
    pub deliverables: Vec<RunDeliverableSummaryRecord>,
    pub observations: Vec<String>,
    pub blocking_count: usize,
}
205
/// Observability counters for one planner round of a stage: iteration, LLM
/// call, tool, intervention, compaction and retry tallies, plus tool usage
/// lists and an optional task-ledger snapshot.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunPlannerRoundRecord {
    pub stage_id: String,
    pub node_id: String,
    pub stage_kind: String,
    pub status: String,
    pub outcome: String,
    pub iteration_count: usize,
    pub llm_call_count: usize,
    pub tool_execution_count: usize,
    pub tool_rejection_count: usize,
    pub intervention_count: usize,
    pub compaction_count: usize,
    pub native_text_tool_fallback_count: usize,
    pub native_text_tool_fallback_rejection_count: usize,
    pub empty_completion_retry_count: usize,
    pub tools_used: Vec<String>,
    pub successful_tools: Vec<String>,
    pub ledger_done_rejections: usize,
    pub task_ledger: Option<RunTaskLedgerSummaryRecord>,
    pub research_facts: Vec<String>,
}
229
/// Lineage entry tying a spawned worker back to its parent stage/session and
/// to the child run/snapshot it produced.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunWorkerLineageRecord {
    pub worker_id: String,
    pub worker_name: String,
    pub parent_stage_id: Option<String>,
    pub task: String,
    pub status: String,
    pub session_id: Option<String>,
    pub parent_session_id: Option<String>,
    pub run_id: Option<String>,
    pub run_path: Option<String>,
    pub snapshot_path: Option<String>,
}
244
/// A node in the serialized action graph. `kind` is one of the
/// `ACTION_GRAPH_NODE_KIND_*` constants; the optional ids link the node back
/// to traces, stages, workers, and runs.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunActionGraphNodeRecord {
    pub id: String,
    pub label: String,
    pub kind: String,
    pub status: String,
    pub outcome: String,
    pub trace_id: Option<String>,
    pub stage_id: Option<String>,
    pub node_id: Option<String>,
    pub worker_id: Option<String>,
    pub run_id: Option<String>,
    pub run_path: Option<String>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
261
/// A directed edge in the action graph; `kind` is one of the
/// `ACTION_GRAPH_EDGE_KIND_*` constants.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunActionGraphEdgeRecord {
    pub from_id: String,
    pub to_id: String,
    pub kind: String,
    pub label: Option<String>,
}
270
/// Verification result for one stage; `passed` is `None` when verification
/// did not produce a definitive verdict.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunVerificationOutcomeRecord {
    pub stage_id: String,
    pub node_id: String,
    pub status: String,
    pub passed: Option<bool>,
    pub summary: Option<String>,
}
280
/// Pointer to a transcript stored elsewhere (`location`/`path`), with an
/// `available` flag indicating whether it can currently be resolved.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunTranscriptPointerRecord {
    pub id: String,
    pub label: String,
    pub kind: String,
    pub location: String,
    pub path: Option<String>,
    pub available: bool,
}
291
/// Record of a transcript-compaction event: what was archived, the estimated
/// token counts before/after, and where the pre-compaction snapshot lives.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct CompactionEventRecord {
    pub id: String,
    pub transcript_id: Option<String>,
    pub stage_id: Option<String>,
    pub node_id: Option<String>,
    pub mode: String,
    pub strategy: String,
    pub archived_messages: usize,
    pub estimated_tokens_before: usize,
    pub estimated_tokens_after: usize,
    pub snapshot_asset_id: Option<String>,
    pub snapshot_location: String,
    pub snapshot_path: Option<String>,
    pub available: bool,
}
309
/// Lifecycle event kinds for a daemon, serialized in snake_case
/// (e.g. `spawned`). Defaults to `Spawned`; the derived `Ord` follows
/// declaration order.
#[derive(Clone, Copy, Debug, Default, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
#[serde(rename_all = "snake_case")]
pub enum DaemonEventKindRecord {
    #[default]
    Spawned,
    Triggered,
    Snapshotted,
    Resumed,
    Stopped,
}
320
/// One daemon lifecycle event (see [`DaemonEventKindRecord`]) with its
/// timestamp, persistence path, and an optional payload summary.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct DaemonEventRecord {
    pub daemon_id: String,
    pub name: String,
    pub kind: DaemonEventKindRecord,
    pub timestamp: String,
    pub persist_path: String,
    pub payload_summary: Option<String>,
}
331
/// Versioned observability bundle attached to a run: planner rounds, the
/// action graph, worker lineage, verification outcomes, transcript pointers,
/// compaction events, and daemon events.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunObservabilityRecord {
    pub schema_version: usize,
    pub planner_rounds: Vec<RunPlannerRoundRecord>,
    pub research_fact_count: usize,
    pub action_graph_nodes: Vec<RunActionGraphNodeRecord>,
    pub action_graph_edges: Vec<RunActionGraphEdgeRecord>,
    pub worker_lineage: Vec<RunWorkerLineageRecord>,
    pub verification_outcomes: Vec<RunVerificationOutcomeRecord>,
    pub transcript_pointers: Vec<RunTranscriptPointerRecord>,
    pub compaction_events: Vec<CompactionEventRecord>,
    pub daemon_events: Vec<DaemonEventRecord>,
}
346
/// Difference found for one node when comparing two runs.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunStageDiffRecord {
    pub node_id: String,
    pub change: String,
    pub details: Vec<String>,
}
354
/// Difference between matching tool calls (matched by name + args hash) in
/// two runs, keeping both results when they diverge.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ToolCallDiffRecord {
    pub tool_name: String,
    pub args_hash: String,
    pub result_changed: bool,
    pub left_result: Option<String>,
    pub right_result: Option<String>,
}
364
/// Difference found within a named section of two runs' observability data.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunObservabilityDiffRecord {
    pub section: String,
    pub label: String,
    pub details: Vec<String>,
}
372
/// Full comparison of two runs: status change, per-stage and per-tool-call
/// diffs, observability diffs, and signed count deltas (right minus left —
/// TODO confirm sign convention with the diff builder).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunDiffReport {
    pub left_run_id: String,
    pub right_run_id: String,
    pub identical: bool,
    pub status_changed: bool,
    pub left_status: String,
    pub right_status: String,
    pub stage_diffs: Vec<RunStageDiffRecord>,
    pub tool_diffs: Vec<ToolCallDiffRecord>,
    pub observability_diffs: Vec<RunObservabilityDiffRecord>,
    pub transition_count_delta: isize,
    pub artifact_count_delta: isize,
    pub checkpoint_count_delta: isize,
}
389
/// Manifest describing an eval suite: an id/name, an optional base directory
/// for resolving case paths, and the cases themselves. Serialized with a
/// `_type` discriminator field.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalSuiteManifest {
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub name: Option<String>,
    pub base_dir: Option<String>,
    pub cases: Vec<EvalSuiteCase>,
}
400
/// One case in an eval suite: the run to load, an optional fixture, and an
/// optional second run to compare against.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalSuiteCase {
    pub label: Option<String>,
    pub run_path: String,
    pub fixture_path: Option<String>,
    pub compare_to: Option<String>,
}
409
/// Top-level eval-pack manifest: packaging info, defaults, fixtures, rubrics,
/// an optional judge configuration, cases, and persona eval ladders.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackManifest {
    pub version: u32,
    pub id: String,
    pub name: Option<String>,
    pub description: Option<String>,
    pub base_dir: Option<String>,
    pub baseline: Option<String>,
    pub package: Option<EvalPackPackage>,
    pub defaults: EvalPackDefaults,
    pub fixtures: Vec<EvalPackFixtureRef>,
    pub rubrics: Vec<EvalPackRubric>,
    pub judge: Option<EvalPackJudgeConfig>,
    pub cases: Vec<EvalPackCase>,
    pub ladders: Vec<PersonaEvalLadderManifest>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
428
/// Optional packaging metadata for an eval pack (name/version/source and the
/// templates it ships).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalPackPackage {
    pub name: Option<String>,
    pub version: Option<String>,
    pub source: Option<String>,
    pub templates: Vec<String>,
}
437
/// Pack-wide defaults applied to cases/rubrics unless overridden locally.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackDefaults {
    pub severity: Option<String>,
    pub fixture_root: Option<String>,
    pub thresholds: EvalPackThresholds,
    pub judge: Option<EvalPackJudgeConfig>,
}
446
/// Reference to a fixture used by an eval pack, either by path or inline.
/// The serde aliases accept kebab-case keys (`trace-id`, `event-kind`) from
/// hand-written manifests.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackFixtureRef {
    pub id: String,
    pub kind: String,
    pub path: Option<String>,
    #[serde(default, alias = "trace-id")]
    pub trace_id: Option<String>,
    pub provider: Option<String>,
    #[serde(default, alias = "event-kind")]
    pub event_kind: Option<String>,
    pub inline: Option<serde_json::Value>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
461
/// A rubric an eval case can be scored against: mechanical assertions and/or
/// a judge prompt, with calibration examples and threshold overrides.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackRubric {
    pub id: String,
    pub kind: String,
    pub description: Option<String>,
    pub prompt: Option<String>,
    pub assertions: Vec<EvalPackAssertion>,
    pub judge: Option<EvalPackJudgeConfig>,
    pub calibration: Vec<EvalPackGoldenExample>,
    pub thresholds: EvalPackThresholds,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
475
/// A single mechanical assertion within a rubric: a `kind`, an optional
/// target stage/path, an operator with expected value, or a substring check.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackAssertion {
    pub kind: String,
    pub stage: Option<String>,
    pub path: Option<String>,
    pub op: Option<String>,
    pub expected: Option<serde_json::Value>,
    pub contains: Option<String>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
487
/// LLM-judge configuration (model, prompt version, tie-break rule,
/// confidence floor, temperature, rubric). Kebab-case aliases are accepted
/// on deserialize (`prompt-version`, `tie-break`, `confidence-min`).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackJudgeConfig {
    pub model: Option<String>,
    #[serde(default, alias = "prompt-version")]
    pub prompt_version: Option<String>,
    #[serde(default, alias = "tie-break")]
    pub tie_break: Option<String>,
    #[serde(default, alias = "confidence-min")]
    pub confidence_min: Option<f64>,
    pub temperature: Option<f64>,
    pub rubric: Option<String>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
502
/// Golden input/output pair used to calibrate a judge, with an optional
/// reference score and explanation.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackGoldenExample {
    pub input: serde_json::Value,
    pub output: serde_json::Value,
    pub score: Option<f64>,
    pub explanation: Option<String>,
}
511
/// Pass/fail thresholds for an eval (score, confidence, cost, latency,
/// tokens, stage count). All optional; kebab-case aliases are accepted on
/// deserialize.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackThresholds {
    pub severity: Option<String>,
    #[serde(default, alias = "min-score")]
    pub min_score: Option<f64>,
    #[serde(default, alias = "min-confidence")]
    pub min_confidence: Option<f64>,
    #[serde(default, alias = "max-cost-usd")]
    pub max_cost_usd: Option<f64>,
    #[serde(default, alias = "max-latency-ms")]
    pub max_latency_ms: Option<i64>,
    #[serde(default, alias = "max-tokens")]
    pub max_tokens: Option<i64>,
    #[serde(default, alias = "max-stage-count")]
    pub max_stage_count: Option<usize>,
}
529
/// One eval-pack case: the run (or run path) to evaluate, optional friction
/// events, fixture, comparison target, rubric ids, and threshold overrides.
/// Kebab-case aliases are accepted on deserialize.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackCase {
    pub id: Option<String>,
    pub name: Option<String>,
    pub description: Option<String>,
    pub run: Option<String>,
    #[serde(default, alias = "run-path")]
    pub run_path: Option<String>,
    #[serde(default, alias = "friction-events", alias = "friction_events")]
    pub friction_events: Option<String>,
    pub fixture: Option<String>,
    #[serde(default, alias = "fixture-path")]
    pub fixture_path: Option<String>,
    #[serde(default, alias = "compare-to")]
    pub compare_to: Option<String>,
    pub rubrics: Vec<String>,
    pub severity: Option<String>,
    pub thresholds: EvalPackThresholds,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
551
/// Roll-up of an eval-pack run: totals broken down by severity, plus the
/// per-case and per-ladder reports.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackReport {
    pub pack_id: String,
    pub pass: bool,
    pub total: usize,
    pub passed: usize,
    pub failed: usize,
    pub blocking_failed: usize,
    pub warning_failed: usize,
    pub informational_failed: usize,
    pub cases: Vec<EvalPackCaseReport>,
    pub ladders: Vec<PersonaEvalLadderReport>,
}
566
/// Result of a single eval-pack case, with findings split by severity
/// (failures / warnings / informational) and an optional run comparison.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalPackCaseReport {
    pub id: String,
    pub label: String,
    pub severity: String,
    pub pass: bool,
    pub blocking: bool,
    pub run_id: String,
    pub workflow_id: String,
    pub source_path: Option<String>,
    pub stage_count: usize,
    pub failures: Vec<String>,
    pub warnings: Vec<String>,
    pub informational: Vec<String>,
    pub comparison: Option<RunDiffReport>,
}
584
/// A human-in-the-loop question asked during a run, keyed by `request_id`
/// (see `merge_hitl_questions_from_active_log`, which dedupes on it).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunHitlQuestionRecord {
    pub request_id: String,
    pub prompt: String,
    pub agent: String,
    pub trace_id: Option<String>,
    pub asked_at: String,
}
594
/// The persisted record of a complete workflow run: identity and lineage,
/// status and timing, stages/transitions/checkpoints, scheduler node state,
/// child runs, artifacts and handoffs, policy, execution environment, usage,
/// replay/observability data, trace spans, tool recordings, HITL questions,
/// and free-form metadata. Serialized with a `_type` discriminator field.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunRecord {
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub workflow_id: String,
    pub workflow_name: Option<String>,
    pub task: String,
    pub status: String,
    pub started_at: String,
    pub finished_at: Option<String>,
    pub parent_run_id: Option<String>,
    pub root_run_id: Option<String>,
    pub stages: Vec<RunStageRecord>,
    pub transitions: Vec<RunTransitionRecord>,
    pub checkpoints: Vec<RunCheckpointRecord>,
    pub pending_nodes: Vec<String>,
    pub completed_nodes: Vec<String>,
    pub child_runs: Vec<RunChildRecord>,
    pub artifacts: Vec<ArtifactRecord>,
    pub handoffs: Vec<HandoffArtifact>,
    pub policy: CapabilityPolicy,
    pub execution: Option<RunExecutionRecord>,
    pub transcript: Option<serde_json::Value>,
    pub usage: Option<LlmUsageRecord>,
    pub replay_fixture: Option<ReplayFixture>,
    pub observability: Option<RunObservabilityRecord>,
    pub trace_spans: Vec<RunTraceSpanRecord>,
    pub tool_recordings: Vec<ToolCallRecord>,
    pub hitl_questions: Vec<RunHitlQuestionRecord>,
    // Free-form extension data; known keys read elsewhere in this module
    // include "trigger_event", "trace_id", and "replay_of_event_id".
    pub metadata: BTreeMap<String, serde_json::Value>,
    pub persisted_path: Option<String>,
}
629
/// Recording of a single tool invocation; `args_hash` is produced by
/// [`tool_fixture_hash`] so calls can be matched across runs.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ToolCallRecord {
    pub tool_name: String,
    pub tool_use_id: String,
    pub args_hash: String,
    pub result: String,
    pub is_rejected: bool,
    pub duration_ms: u64,
    pub iteration: usize,
    pub timestamp: String,
}
642
643pub fn tool_fixture_hash(tool_name: &str, args: &serde_json::Value) -> String {
645 use std::hash::{Hash, Hasher};
646 let mut hasher = std::collections::hash_map::DefaultHasher::new();
647 tool_name.hash(&mut hasher);
648 let args_str = serde_json::to_string(args).unwrap_or_default();
649 args_str.hash(&mut hasher);
650 format!("{:016x}", hasher.finish())
651}
652
/// One span in a run's trace tree: parent linkage, kind/name, start offset
/// and duration in milliseconds, plus free-form metadata.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunTraceSpanRecord {
    pub span_id: u64,
    pub parent_id: Option<u64>,
    pub kind: String,
    pub name: String,
    pub start_ms: u64,
    pub duration_ms: u64,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
664
/// Record of a child run spawned by a worker: worker identity, session and
/// stage lineage, audit fields (mutation scope, approval policy), task,
/// timing/status, and pointers to the child run and its snapshot. Built from
/// stage metadata by `run_child_record_from_worker_metadata`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunChildRecord {
    pub worker_id: String,
    pub worker_name: String,
    pub parent_stage_id: Option<String>,
    pub session_id: Option<String>,
    pub parent_session_id: Option<String>,
    pub mutation_scope: Option<String>,
    pub approval_policy: Option<super::ToolApprovalPolicy>,
    pub task: String,
    pub request: Option<serde_json::Value>,
    pub provenance: Option<serde_json::Value>,
    pub status: String,
    pub started_at: String,
    pub finished_at: Option<String>,
    pub run_id: Option<String>,
    pub run_path: Option<String>,
    pub snapshot_path: Option<String>,
    pub execution: Option<RunExecutionRecord>,
}
686
687pub(crate) fn run_child_record_from_worker_metadata(
688 parent_stage_id: Option<String>,
689 worker: &serde_json::Value,
690) -> Option<RunChildRecord> {
691 let worker_id = worker.get("id").and_then(|value| value.as_str())?;
692 if worker_id.is_empty() {
693 return None;
694 }
695 Some(RunChildRecord {
696 worker_id: worker_id.to_string(),
697 worker_name: worker
698 .get("name")
699 .and_then(|value| value.as_str())
700 .unwrap_or("worker")
701 .to_string(),
702 parent_stage_id,
703 session_id: worker
704 .get("audit")
705 .and_then(|value| value.get("session_id"))
706 .and_then(|value| value.as_str())
707 .map(str::to_string),
708 parent_session_id: worker
709 .get("audit")
710 .and_then(|value| value.get("parent_session_id"))
711 .and_then(|value| value.as_str())
712 .map(str::to_string),
713 mutation_scope: worker
714 .get("audit")
715 .and_then(|value| value.get("mutation_scope"))
716 .and_then(|value| value.as_str())
717 .map(str::to_string),
718 approval_policy: worker
719 .get("audit")
720 .and_then(|value| value.get("approval_policy"))
721 .and_then(|value| {
722 serde_json::from_value::<super::ToolApprovalPolicy>(value.clone()).ok()
723 }),
724 task: worker
725 .get("task")
726 .and_then(|value| value.as_str())
727 .unwrap_or_default()
728 .to_string(),
729 request: worker.get("request").cloned(),
730 provenance: worker.get("provenance").cloned(),
731 status: worker
732 .get("status")
733 .and_then(|value| value.as_str())
734 .unwrap_or("completed")
735 .to_string(),
736 started_at: worker
737 .get("started_at")
738 .and_then(|value| value.as_str())
739 .unwrap_or_default()
740 .to_string(),
741 finished_at: worker
742 .get("finished_at")
743 .and_then(|value| value.as_str())
744 .map(str::to_string),
745 run_id: worker
746 .get("child_run_id")
747 .and_then(|value| value.as_str())
748 .map(str::to_string),
749 run_path: worker
750 .get("child_run_path")
751 .and_then(|value| value.as_str())
752 .map(str::to_string),
753 snapshot_path: worker
754 .get("snapshot_path")
755 .and_then(|value| value.as_str())
756 .map(str::to_string),
757 execution: worker
758 .get("execution")
759 .cloned()
760 .and_then(|value| serde_json::from_value(value).ok()),
761 })
762}
763
764fn run_child_from_stage_metadata(stage: &RunStageRecord) -> Option<RunChildRecord> {
765 let parent_stage_id = if stage.id.is_empty() {
766 None
767 } else {
768 Some(stage.id.clone())
769 };
770 run_child_record_from_worker_metadata(parent_stage_id, stage.metadata.get("worker")?)
771}
772
773fn fill_missing_child_run_fields(existing: &mut RunChildRecord, child: RunChildRecord) {
774 if existing.worker_name.is_empty() {
775 existing.worker_name = child.worker_name;
776 }
777 if existing.parent_stage_id.is_none() {
778 existing.parent_stage_id = child.parent_stage_id;
779 }
780 if existing.session_id.is_none() {
781 existing.session_id = child.session_id;
782 }
783 if existing.parent_session_id.is_none() {
784 existing.parent_session_id = child.parent_session_id;
785 }
786 if existing.mutation_scope.is_none() {
787 existing.mutation_scope = child.mutation_scope;
788 }
789 if existing.approval_policy.is_none() {
790 existing.approval_policy = child.approval_policy;
791 }
792 if existing.task.is_empty() {
793 existing.task = child.task;
794 }
795 if existing.request.is_none() {
796 existing.request = child.request;
797 }
798 if existing.provenance.is_none() {
799 existing.provenance = child.provenance;
800 }
801 if existing.status.is_empty() {
802 existing.status = child.status;
803 }
804 if existing.started_at.is_empty() {
805 existing.started_at = child.started_at;
806 }
807 if existing.finished_at.is_none() {
808 existing.finished_at = child.finished_at;
809 }
810 if existing.run_id.is_none() {
811 existing.run_id = child.run_id;
812 }
813 if existing.run_path.is_none() {
814 existing.run_path = child.run_path;
815 }
816 if existing.snapshot_path.is_none() {
817 existing.snapshot_path = child.snapshot_path;
818 }
819 if existing.execution.is_none() {
820 existing.execution = child.execution;
821 }
822}
823
824fn materialize_child_runs_from_stage_metadata(run: &mut RunRecord) {
825 for child in run
826 .stages
827 .iter()
828 .filter_map(run_child_from_stage_metadata)
829 .collect::<Vec<_>>()
830 {
831 match run
832 .child_runs
833 .iter_mut()
834 .find(|existing| existing.worker_id == child.worker_id)
835 {
836 Some(existing) => fill_missing_child_run_fields(existing, child),
837 None => run.child_runs.push(child),
838 }
839 }
840}
841
/// Execution environment of a run (or child run): working directory, env
/// vars, adapter, and git worktree details (repo/worktree paths, branch,
/// base ref, cleanup mode).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunExecutionRecord {
    pub cwd: Option<String>,
    pub source_dir: Option<String>,
    pub env: BTreeMap<String, String>,
    pub adapter: Option<String>,
    pub repo_path: Option<String>,
    pub worktree_path: Option<String>,
    pub branch: Option<String>,
    pub base_ref: Option<String>,
    pub cleanup: Option<String>,
}
855
856fn compact_json_value(value: &serde_json::Value) -> String {
857 serde_json::to_string(value).unwrap_or_else(|_| value.to_string())
858}
859
/// Canonicalizes question text for comparison: ASCII letters/digits are
/// lowercased, every other non-whitespace character becomes a space, and
/// whitespace runs collapse to single spaces (leading/trailing removed).
fn normalize_question_text(text: &str) -> String {
    let mut cleaned = String::with_capacity(text.len());
    for ch in text.chars() {
        if ch.is_ascii_alphanumeric() {
            cleaned.push(ch.to_ascii_lowercase());
        } else if ch.is_whitespace() {
            // Preserved here; collapsed by split_whitespace below.
            cleaned.push(ch);
        } else {
            cleaned.push(' ');
        }
    }
    cleaned.split_whitespace().collect::<Vec<_>>().join(" ")
}
874
875fn clarifying_min_questions(spec: &ClarifyingQuestionEvalSpec) -> usize {
876 spec.min_questions.max(1)
877}
878
879fn clarifying_max_questions(spec: &ClarifyingQuestionEvalSpec) -> usize {
880 spec.max_questions.unwrap_or(1).max(1)
881}
882
883fn read_topic_records(
884 log: &AnyEventLog,
885 topic: &Topic,
886) -> Vec<(crate::event_log::EventId, EventLogRecord)> {
887 let mut from = None;
888 let mut records = Vec::new();
889 loop {
890 let batch =
891 futures::executor::block_on(log.read_range(topic, from, 256)).unwrap_or_default();
892 if batch.is_empty() {
893 break;
894 }
895 from = batch.last().map(|(event_id, _)| *event_id);
896 records.extend(batch);
897 }
898 records
899}
900
/// An agent event paired with its event-log id, as yielded by
/// [`load_agent_session_replay_events`].
#[derive(Clone, Debug)]
pub struct AgentSessionReplayEvent {
    pub event_id: EventId,
    pub event: AgentEvent,
}
906
/// Loads all agent events recorded for `session_id` from the active event
/// log, in log order.
///
/// Returns an empty vector when no event log is active. The topic name is
/// derived from the (sanitized) session id; records are additionally filtered
/// by both the `session_id` header and the decoded event's own session id as
/// defense in depth. Reads page in batches of 1024 and stop on the first
/// short batch; `from` is advanced per record, so the cursor is correct even
/// for partially consumed batches.
///
/// # Errors
/// Returns `VmError::Runtime` when the topic cannot be built, a page read
/// fails, or a record's payload cannot be decoded as an [`AgentEvent`].
pub async fn load_agent_session_replay_events(
    session_id: &str,
) -> Result<Vec<AgentSessionReplayEvent>, VmError> {
    let Some(log) = active_event_log() else {
        return Ok(Vec::new());
    };
    let topic = Topic::new(format!(
        "observability.agent_events.{}",
        sanitize_topic_component(session_id)
    ))
    .map_err(|error| VmError::Runtime(format!("failed to build agent event topic: {error}")))?;

    let mut events = Vec::new();
    let mut from = None;
    loop {
        let batch = log.read_range(&topic, from, 1024).await.map_err(|error| {
            VmError::Runtime(format!(
                "failed to read agent event replay topic {}: {error}",
                topic.as_str()
            ))
        })?;
        let batch_len = batch.len();
        for (event_id, record) in batch {
            from = Some(event_id);
            // Header must match the requested session; skip foreign records.
            if record.headers.get("session_id").map(String::as_str) != Some(session_id) {
                continue;
            }
            // Records without an "event" payload entry are skipped silently.
            let Some(event_value) = record.payload.get("event").cloned() else {
                continue;
            };
            let event = serde_json::from_value::<AgentEvent>(event_value).map_err(|error| {
                VmError::Runtime(format!(
                    "failed to decode agent event replay record {event_id}: {error}"
                ))
            })?;
            // Second check against the decoded event's own session id.
            if event.session_id() == session_id {
                events.push(AgentSessionReplayEvent { event_id, event });
            }
        }
        // A short batch means the topic has been exhausted.
        if batch_len < 1024 {
            break;
        }
    }
    Ok(events)
}
952
/// Merges HITL questions from the active event log into `run.hitl_questions`.
///
/// No-op when no event log is active. Scans the HITL questions topic for
/// `hitl.question_asked` events belonging to this run (matched via the
/// `run_id` header or a `run_id` field in the payload), dedupes against the
/// run's existing questions by `request_id` (log entries overwrite existing
/// records with the same id), and finally sorts by `(asked_at, request_id)`.
/// Events missing a request id or prompt are skipped.
fn merge_hitl_questions_from_active_log(run: &mut RunRecord) {
    let Some(log) = active_event_log() else {
        return;
    };
    let topic = Topic::new(crate::HITL_QUESTIONS_TOPIC)
        .expect("static hitl.questions topic should always be valid");
    // Seed the merge map with the questions already on the run, keyed by
    // request id so log records can overwrite/extend them.
    let mut merged = run
        .hitl_questions
        .iter()
        .cloned()
        .map(|question| (question.request_id.clone(), question))
        .collect::<BTreeMap<_, _>>();

    for (_, event) in read_topic_records(log.as_ref(), &topic) {
        if event.kind != "hitl.question_asked" {
            continue;
        }
        let payload = &event.payload;
        // The run id may arrive either as an event header or in the payload.
        let matches_run = event
            .headers
            .get("run_id")
            .is_some_and(|value| value == &run.id)
            || payload
                .get("run_id")
                .and_then(|value| value.as_str())
                .is_some_and(|value| value == run.id);
        if !matches_run {
            continue;
        }
        // Likewise, request_id may be in the payload or fall back to headers.
        let request_id = payload
            .get("request_id")
            .and_then(|value| value.as_str())
            .or_else(|| event.headers.get("request_id").map(String::as_str))
            .unwrap_or_default();
        let prompt = payload
            .get("payload")
            .and_then(|value| value.get("prompt"))
            .and_then(|value| value.as_str())
            .unwrap_or_default();
        if request_id.is_empty() || prompt.is_empty() {
            continue;
        }
        merged.insert(
            request_id.to_string(),
            RunHitlQuestionRecord {
                request_id: request_id.to_string(),
                prompt: prompt.to_string(),
                agent: payload
                    .get("agent")
                    .and_then(|value| value.as_str())
                    .unwrap_or_default()
                    .to_string(),
                trace_id: payload
                    .get("trace_id")
                    .and_then(|value| value.as_str())
                    .map(str::to_string),
                asked_at: payload
                    .get("requested_at")
                    .and_then(|value| value.as_str())
                    .unwrap_or_default()
                    .to_string(),
            },
        );
    }

    run.hitl_questions = merged.into_values().collect();
    // Stable, deterministic ordering: by ask time, then request id.
    run.hitl_questions.sort_by(|left, right| {
        (left.asked_at.as_str(), left.request_id.as_str())
            .cmp(&(right.asked_at.as_str(), right.request_id.as_str()))
    });
}
1024
1025fn signature_status_label(status: &SignatureStatus) -> &'static str {
1026 match status {
1027 SignatureStatus::Verified => "verified",
1028 SignatureStatus::Unsigned => "unsigned",
1029 SignatureStatus::Failed { .. } => "failed",
1030 }
1031}
1032
1033fn trigger_event_from_run(run: &RunRecord) -> Option<TriggerEvent> {
1034 run.metadata
1035 .get("trigger_event")
1036 .cloned()
1037 .and_then(|value| serde_json::from_value(value).ok())
1038}
1039
1040fn run_trace_id(run: &RunRecord, trigger_event: Option<&TriggerEvent>) -> Option<String> {
1041 trigger_event
1042 .map(|event| event.trace_id.0.clone())
1043 .or_else(|| {
1044 run.metadata
1045 .get("trace_id")
1046 .and_then(|value| value.as_str())
1047 .map(str::to_string)
1048 })
1049}
1050
1051fn replay_of_event_id_from_run(run: &RunRecord) -> Option<String> {
1052 run.metadata
1053 .get("replay_of_event_id")
1054 .and_then(|value| value.as_str())
1055 .map(str::to_string)
1056}
1057
1058fn action_graph_kind_for_stage(stage: &RunStageRecord) -> &'static str {
1059 if stage.kind == "condition" {
1060 ACTION_GRAPH_NODE_KIND_PREDICATE
1061 } else {
1062 ACTION_GRAPH_NODE_KIND_STAGE
1063 }
1064}
1065
1066fn trigger_node_metadata(trigger_event: &TriggerEvent) -> BTreeMap<String, serde_json::Value> {
1067 let mut metadata = BTreeMap::new();
1068 metadata.insert(
1069 "provider".to_string(),
1070 serde_json::json!(trigger_event.provider.as_str()),
1071 );
1072 metadata.insert(
1073 "event_kind".to_string(),
1074 serde_json::json!(trigger_event.kind),
1075 );
1076 metadata.insert(
1077 "dedupe_key".to_string(),
1078 serde_json::json!(trigger_event.dedupe_key),
1079 );
1080 metadata.insert(
1081 "signature_status".to_string(),
1082 serde_json::json!(signature_status_label(&trigger_event.signature_status)),
1083 );
1084 metadata
1085}
1086
1087fn stage_node_metadata(stage: &RunStageRecord) -> BTreeMap<String, serde_json::Value> {
1088 let mut metadata = BTreeMap::new();
1089 metadata.insert("stage_kind".to_string(), serde_json::json!(stage.kind));
1090 if let Some(branch) = stage.branch.as_ref() {
1091 metadata.insert("branch".to_string(), serde_json::json!(branch));
1092 }
1093 if let Some(worker_id) = stage
1094 .metadata
1095 .get("worker_id")
1096 .and_then(|value| value.as_str())
1097 {
1098 metadata.insert("worker_id".to_string(), serde_json::json!(worker_id));
1099 }
1100 metadata
1101}
1102
/// Appends an action-graph node record to the accumulator.
///
/// Currently a plain `push`; presumably kept as a named single call
/// site so node insertion can later grow dedup/validation — TODO
/// confirm intent before inlining.
fn append_action_graph_node(
    nodes: &mut Vec<RunActionGraphNodeRecord>,
    record: RunActionGraphNodeRecord,
) {
    nodes.push(record);
}
1109
1110pub async fn append_action_graph_update(
1111 headers: BTreeMap<String, String>,
1112 payload: serde_json::Value,
1113) -> Result<(), crate::event_log::LogError> {
1114 let Some(log) = active_event_log() else {
1115 return Ok(());
1116 };
1117 let topic = Topic::new("observability.action_graph")
1118 .expect("static observability.action_graph topic should always be valid");
1119 let record = EventLogRecord::new("action_graph_update", payload).with_headers(headers);
1120 log.append(&topic, record).await.map(|_| ())
1121}
1122
1123fn publish_action_graph_event(
1124 run: &RunRecord,
1125 observability: &RunObservabilityRecord,
1126 path: &Path,
1127) {
1128 let trigger_event = trigger_event_from_run(run);
1129 let mut headers = BTreeMap::new();
1130 headers.insert("run_id".to_string(), run.id.clone());
1131 headers.insert("workflow_id".to_string(), run.workflow_id.clone());
1132 if let Some(trace_id) = run_trace_id(run, trigger_event.as_ref()) {
1133 headers.insert("trace_id".to_string(), trace_id);
1134 }
1135 let payload = serde_json::json!({
1136 "run_id": run.id,
1137 "workflow_id": run.workflow_id,
1138 "persisted_path": path.to_string_lossy(),
1139 "status": run.status,
1140 "observability": observability,
1141 });
1142 if let Ok(handle) = tokio::runtime::Handle::try_current() {
1143 handle.spawn(async move {
1144 let _ = append_action_graph_update(headers, payload).await;
1145 });
1146 } else {
1147 let _ = futures::executor::block_on(append_action_graph_update(headers, payload));
1148 }
1149}
1150
/// Derives the LLM transcript sidecar location for a persisted run
/// file: `<dir>/<stem>-llm/llm_transcript.jsonl` next to the run file.
///
/// Returns `None` when the run path has no file stem or the stem is
/// not valid UTF-8.
fn llm_transcript_sidecar_path(run_path: &Path) -> Option<PathBuf> {
    let stem = run_path.file_stem()?.to_str()?;
    let directory = match run_path.parent() {
        Some(parent) => parent,
        None => Path::new("."),
    };
    let sidecar = directory
        .join(format!("{stem}-llm"))
        .join("llm_transcript.jsonl");
    Some(sidecar)
}
1156
1157fn json_string_array(value: Option<&serde_json::Value>) -> Vec<String> {
1158 value
1159 .and_then(|value| value.as_array())
1160 .map(|items| {
1161 items
1162 .iter()
1163 .filter_map(|item| item.as_str().map(str::to_string))
1164 .collect::<Vec<_>>()
1165 })
1166 .unwrap_or_default()
1167}
1168
1169fn json_usize(value: Option<&serde_json::Value>) -> usize {
1170 value.and_then(|value| value.as_u64()).unwrap_or_default() as usize
1171}
1172
1173fn json_bool(value: Option<&serde_json::Value>) -> Option<bool> {
1174 value.and_then(|value| value.as_bool())
1175}
1176
1177fn stage_result_payload(stage: &RunStageRecord) -> Option<&serde_json::Value> {
1178 stage
1179 .artifacts
1180 .iter()
1181 .find_map(|artifact| artifact.data.as_ref())
1182}
1183
1184fn task_ledger_summary_from_value(value: &serde_json::Value) -> Option<RunTaskLedgerSummaryRecord> {
1185 let deliverables = value
1186 .get("deliverables")
1187 .and_then(|raw| raw.as_array())
1188 .map(|items| {
1189 items
1190 .iter()
1191 .map(|item| RunDeliverableSummaryRecord {
1192 id: item
1193 .get("id")
1194 .and_then(|value| value.as_str())
1195 .unwrap_or_default()
1196 .to_string(),
1197 text: item
1198 .get("text")
1199 .and_then(|value| value.as_str())
1200 .unwrap_or_default()
1201 .to_string(),
1202 status: item
1203 .get("status")
1204 .and_then(|value| value.as_str())
1205 .unwrap_or_default()
1206 .to_string(),
1207 note: item
1208 .get("note")
1209 .and_then(|value| value.as_str())
1210 .map(str::to_string),
1211 })
1212 .collect::<Vec<_>>()
1213 })
1214 .unwrap_or_default();
1215 let observations = json_string_array(value.get("observations"));
1216 let root_task = value
1217 .get("root_task")
1218 .and_then(|value| value.as_str())
1219 .unwrap_or_default()
1220 .to_string();
1221 let rationale = value
1222 .get("rationale")
1223 .and_then(|value| value.as_str())
1224 .unwrap_or_default()
1225 .to_string();
1226 if root_task.is_empty()
1227 && rationale.is_empty()
1228 && deliverables.is_empty()
1229 && observations.is_empty()
1230 {
1231 return None;
1232 }
1233 let blocking_count = deliverables
1234 .iter()
1235 .filter(|deliverable| matches!(deliverable.status.as_str(), "open" | "blocked"))
1236 .count();
1237 Some(RunTaskLedgerSummaryRecord {
1238 root_task,
1239 rationale,
1240 deliverables,
1241 observations,
1242 blocking_count,
1243 })
1244}
1245
/// Extracts compaction events from an embedded transcript JSON value.
///
/// `stage_id` / `node_id` tag the resulting records with their origin
/// (both `None` for the run-level transcript). `location_prefix` is the
/// JSON-path-like location of the transcript within the run record
/// (e.g. `run.transcript`), used to describe where each snapshot asset
/// lives. `persisted_path` is the on-disk run file recorded as the
/// snapshot path when known. Missing `events` or `assets` arrays are
/// treated as empty.
fn compaction_events_from_transcript(
    transcript: &serde_json::Value,
    stage_id: Option<&str>,
    node_id: Option<&str>,
    location_prefix: &str,
    persisted_path: Option<&Path>,
) -> Vec<CompactionEventRecord> {
    let transcript_id = transcript
        .get("id")
        .and_then(|value| value.as_str())
        .map(str::to_string);
    // Collect the ids of assets present in the transcript so each
    // compaction's snapshot can be marked available (or not) below.
    let asset_ids = transcript
        .get("assets")
        .and_then(|value| value.as_array())
        .map(|assets| {
            assets
                .iter()
                .filter_map(|asset| {
                    asset
                        .get("id")
                        .and_then(|value| value.as_str())
                        .map(str::to_string)
                })
                .collect::<BTreeSet<_>>()
        })
        .unwrap_or_default();
    // Only events with kind == "compaction" become records; all other
    // transcript events are ignored here.
    transcript
        .get("events")
        .and_then(|value| value.as_array())
        .map(|events| {
            events
                .iter()
                .filter(|event| {
                    event.get("kind").and_then(|value| value.as_str()) == Some("compaction")
                })
                .map(|event| {
                    let metadata = event.get("metadata");
                    let snapshot_asset_id = metadata
                        .and_then(|value| value.get("snapshot_asset_id"))
                        .and_then(|value| value.as_str())
                        .map(str::to_string);
                    // Available only when the referenced asset actually
                    // exists in the transcript's asset list.
                    let available = snapshot_asset_id
                        .as_ref()
                        .is_some_and(|asset_id| asset_ids.contains(asset_id));
                    // Without an asset id, fall back to the transcript's
                    // own location as the snapshot location.
                    let snapshot_location = snapshot_asset_id
                        .as_ref()
                        .map(|asset_id| format!("{location_prefix}.assets[{asset_id}]"))
                        .unwrap_or_else(|| location_prefix.to_string())
                    ;
                    CompactionEventRecord {
                        id: event
                            .get("id")
                            .and_then(|value| value.as_str())
                            .unwrap_or_default()
                            .to_string(),
                        transcript_id: transcript_id.clone(),
                        stage_id: stage_id.map(str::to_string),
                        node_id: node_id.map(str::to_string),
                        mode: metadata
                            .and_then(|value| value.get("mode"))
                            .and_then(|value| value.as_str())
                            .unwrap_or_default()
                            .to_string(),
                        strategy: metadata
                            .and_then(|value| value.get("strategy"))
                            .and_then(|value| value.as_str())
                            .unwrap_or_default()
                            .to_string(),
                        archived_messages: json_usize(
                            metadata.and_then(|value| value.get("archived_messages")),
                        ),
                        estimated_tokens_before: json_usize(
                            metadata.and_then(|value| value.get("estimated_tokens_before")),
                        ),
                        estimated_tokens_after: json_usize(
                            metadata.and_then(|value| value.get("estimated_tokens_after")),
                        ),
                        snapshot_asset_id,
                        snapshot_location,
                        snapshot_path: persisted_path
                            .map(|path| path.to_string_lossy().into_owned()),
                        available,
                    }
                })
                .collect()
        })
        .unwrap_or_default()
}
1333
1334fn daemon_events_from_sidecar(run_path: &Path) -> Vec<DaemonEventRecord> {
1335 let Some(sidecar_path) = llm_transcript_sidecar_path(run_path) else {
1336 return Vec::new();
1337 };
1338 let Ok(content) = std::fs::read_to_string(sidecar_path) else {
1339 return Vec::new();
1340 };
1341
1342 content
1343 .lines()
1344 .filter(|line| !line.trim().is_empty())
1345 .filter_map(|line| serde_json::from_str::<serde_json::Value>(line).ok())
1346 .filter(|event| event.get("type").and_then(|value| value.as_str()) == Some("daemon_event"))
1347 .filter_map(|event| serde_json::from_value::<DaemonEventRecord>(event).ok())
1348 .collect()
1349}
1350
/// Derives the full observability record (schema version 4) for a run:
/// the action graph (run/trigger/stage/worker nodes and their edges),
/// worker lineage, verification outcomes, planner-round summaries,
/// transcript pointers, compaction events, and daemon events.
///
/// `persisted_path` is the on-disk run file, used to locate the LLM
/// transcript sidecar and to tag compaction snapshots; pass `None`
/// for runs that have not been persisted.
pub fn derive_run_observability(
    run: &RunRecord,
    persisted_path: Option<&Path>,
) -> RunObservabilityRecord {
    // Accumulators for every section of the record; filled in order below.
    let mut action_graph_nodes = Vec::new();
    let mut action_graph_edges = Vec::new();
    let mut verification_outcomes = Vec::new();
    let mut planner_rounds = Vec::new();
    let mut transcript_pointers = Vec::new();
    let mut compaction_events = Vec::new();
    let mut daemon_events = Vec::new();
    let mut research_fact_count = 0usize;

    // Root node: the run itself anchors the whole graph.
    let root_node_id = format!("run:{}", run.id);
    let trigger_event = trigger_event_from_run(run);
    let propagated_trace_id = run_trace_id(run, trigger_event.as_ref());
    append_action_graph_node(
        &mut action_graph_nodes,
        RunActionGraphNodeRecord {
            id: root_node_id.clone(),
            label: run
                .workflow_name
                .clone()
                .unwrap_or_else(|| run.workflow_id.clone()),
            kind: ACTION_GRAPH_NODE_KIND_RUN.to_string(),
            status: run.status.clone(),
            outcome: run.status.clone(),
            trace_id: propagated_trace_id.clone(),
            stage_id: None,
            node_id: None,
            worker_id: None,
            run_id: Some(run.id.clone()),
            run_path: run.persisted_path.clone(),
            metadata: BTreeMap::from([(
                "workflow_id".to_string(),
                serde_json::json!(run.workflow_id),
            )]),
        },
    );
    // Stages hang off the run by default; when a trigger is present it
    // becomes the entry point instead (see below).
    let mut entry_node_id = root_node_id.clone();
    if let Some(trigger_event) = trigger_event.as_ref() {
        // For replayed runs, also emit a "historical" node for the
        // original trigger event and chain it to the current one.
        if let Some(replay_of_event_id) = replay_of_event_id_from_run(run) {
            let replay_source_node_id = format!("trigger:{replay_of_event_id}");
            append_action_graph_node(
                &mut action_graph_nodes,
                RunActionGraphNodeRecord {
                    id: replay_source_node_id.clone(),
                    label: format!(
                        "{}:{} (original {})",
                        trigger_event.provider.as_str(),
                        trigger_event.kind,
                        replay_of_event_id
                    ),
                    kind: ACTION_GRAPH_NODE_KIND_TRIGGER.to_string(),
                    status: "historical".to_string(),
                    outcome: "replayed_from".to_string(),
                    trace_id: Some(trigger_event.trace_id.0.clone()),
                    stage_id: None,
                    node_id: None,
                    worker_id: None,
                    run_id: Some(run.id.clone()),
                    run_path: run.persisted_path.clone(),
                    metadata: trigger_node_metadata(trigger_event),
                },
            );
            action_graph_edges.push(RunActionGraphEdgeRecord {
                from_id: replay_source_node_id,
                to_id: format!("trigger:{}", trigger_event.id.0),
                kind: ACTION_GRAPH_EDGE_KIND_REPLAY_CHAIN.to_string(),
                label: Some("replay chain".to_string()),
            });
        }
        // Current trigger node plus the run -> trigger entry edge.
        let trigger_node_id = format!("trigger:{}", trigger_event.id.0);
        append_action_graph_node(
            &mut action_graph_nodes,
            RunActionGraphNodeRecord {
                id: trigger_node_id.clone(),
                label: format!("{}:{}", trigger_event.provider.as_str(), trigger_event.kind),
                kind: ACTION_GRAPH_NODE_KIND_TRIGGER.to_string(),
                status: "received".to_string(),
                outcome: signature_status_label(&trigger_event.signature_status).to_string(),
                trace_id: Some(trigger_event.trace_id.0.clone()),
                stage_id: None,
                node_id: None,
                worker_id: None,
                run_id: Some(run.id.clone()),
                run_path: run.persisted_path.clone(),
                metadata: trigger_node_metadata(trigger_event),
            },
        );
        action_graph_edges.push(RunActionGraphEdgeRecord {
            from_id: root_node_id.clone(),
            to_id: trigger_node_id.clone(),
            kind: ACTION_GRAPH_EDGE_KIND_ENTRY.to_string(),
            label: Some(trigger_event.id.0.clone()),
        });
        entry_node_id = trigger_node_id;
    }

    // Lookup tables: stage id -> graph node id, stage id -> stage, and
    // workflow node id -> graph node id, used when wiring edges.
    let stage_node_ids = run
        .stages
        .iter()
        .map(|stage| (stage.id.clone(), format!("stage:{}", stage.id)))
        .collect::<BTreeMap<_, _>>();
    let stage_by_id = run
        .stages
        .iter()
        .map(|stage| (stage.id.as_str(), stage))
        .collect::<BTreeMap<_, _>>();
    let stage_by_node_id = run
        .stages
        .iter()
        .map(|stage| (stage.node_id.clone(), format!("stage:{}", stage.id)))
        .collect::<BTreeMap<_, _>>();

    // Workflow nodes that are the target of some transition; any stage
    // NOT in this set has no predecessor and gets an entry edge.
    let incoming_nodes = run
        .transitions
        .iter()
        .map(|transition| transition.to_node_id.clone())
        .collect::<BTreeSet<_>>();

    for stage in &run.stages {
        let graph_node_id = stage_node_ids
            .get(&stage.id)
            .cloned()
            .unwrap_or_else(|| format!("stage:{}", stage.id));
        append_action_graph_node(
            &mut action_graph_nodes,
            RunActionGraphNodeRecord {
                id: graph_node_id.clone(),
                label: stage.node_id.clone(),
                kind: action_graph_kind_for_stage(stage).to_string(),
                status: stage.status.clone(),
                outcome: stage.outcome.clone(),
                trace_id: propagated_trace_id.clone(),
                stage_id: Some(stage.id.clone()),
                node_id: Some(stage.node_id.clone()),
                worker_id: stage
                    .metadata
                    .get("worker_id")
                    .and_then(|value| value.as_str())
                    .map(str::to_string),
                run_id: None,
                run_path: None,
                metadata: stage_node_metadata(stage),
            },
        );
        // Root stages (no incoming transition) connect to the entry
        // point: the trigger when one exists, else the run node.
        if !incoming_nodes.contains(&stage.node_id) {
            action_graph_edges.push(RunActionGraphEdgeRecord {
                from_id: entry_node_id.clone(),
                to_id: graph_node_id.clone(),
                kind: if trigger_event.is_some() {
                    ACTION_GRAPH_EDGE_KIND_TRIGGER_DISPATCH.to_string()
                } else {
                    ACTION_GRAPH_EDGE_KIND_ENTRY.to_string()
                },
                label: None,
            });
        }

        // Verification outcomes: explicit `pass`/`success` flags in the
        // verification payload win; otherwise infer from stage status.
        if stage.kind == "verify" || stage.verification.is_some() {
            let passed = json_bool(
                stage
                    .verification
                    .as_ref()
                    .and_then(|value| value.get("pass")),
            )
            .or_else(|| {
                json_bool(
                    stage
                        .verification
                        .as_ref()
                        .and_then(|value| value.get("success")),
                )
            })
            .or_else(|| {
                if stage.status == "completed" && stage.outcome == "success" {
                    Some(true)
                } else if stage.status == "failed" || stage.outcome == "failed" {
                    Some(false)
                } else {
                    None
                }
            });
            verification_outcomes.push(RunVerificationOutcomeRecord {
                stage_id: stage.id.clone(),
                node_id: stage.node_id.clone(),
                status: stage.status.clone(),
                passed,
                summary: stage
                    .verification
                    .as_ref()
                    .map(compact_json_value)
                    .or_else(|| {
                        stage
                            .visible_text
                            .as_ref()
                            .filter(|value| !value.trim().is_empty())
                            .cloned()
                    }),
            });
        }

        // Embedded stage transcripts: record a pointer and harvest any
        // compaction events from the transcript body.
        if stage.transcript.is_some() {
            transcript_pointers.push(RunTranscriptPointerRecord {
                id: format!("stage:{}:transcript", stage.id),
                label: format!("Stage {} transcript", stage.node_id),
                kind: "embedded_transcript".to_string(),
                location: format!("run.stages[{}].transcript", stage.node_id),
                path: run.persisted_path.clone(),
                available: true,
            });
            if let Some(transcript) = stage.transcript.as_ref() {
                compaction_events.extend(compaction_events_from_transcript(
                    transcript,
                    Some(&stage.id),
                    Some(&stage.node_id),
                    &format!("run.stages[{}].transcript", stage.node_id),
                    persisted_path,
                ));
            }
        }

        // Planner-round summary from the stage's result payload, drawing
        // counters from the `trace` object and tools from `tools`.
        if let Some(payload) = stage_result_payload(stage) {
            let trace = payload.get("trace");
            let task_ledger = payload
                .get("task_ledger")
                .and_then(task_ledger_summary_from_value);
            let research_facts = task_ledger
                .as_ref()
                .map(|ledger| ledger.observations.clone())
                .unwrap_or_default();
            research_fact_count += research_facts.len();
            let tools_payload = payload.get("tools");
            // Prefer `tools.calls`; fall back to `trace.tools_used`.
            let tools_used = json_string_array(
                tools_payload
                    .and_then(|tools| tools.get("calls"))
                    .or_else(|| trace.and_then(|trace| trace.get("tools_used"))),
            );
            let successful_tools =
                json_string_array(tools_payload.and_then(|tools| tools.get("successful")));
            let planner_round = RunPlannerRoundRecord {
                stage_id: stage.id.clone(),
                node_id: stage.node_id.clone(),
                stage_kind: stage.kind.clone(),
                status: stage.status.clone(),
                outcome: stage.outcome.clone(),
                iteration_count: json_usize(trace.and_then(|trace| trace.get("iterations"))),
                llm_call_count: json_usize(trace.and_then(|trace| trace.get("llm_calls"))),
                tool_execution_count: json_usize(
                    trace.and_then(|trace| trace.get("tool_executions")),
                ),
                tool_rejection_count: json_usize(
                    trace.and_then(|trace| trace.get("tool_rejections")),
                ),
                intervention_count: json_usize(trace.and_then(|trace| trace.get("interventions"))),
                compaction_count: json_usize(trace.and_then(|trace| trace.get("compactions"))),
                native_text_tool_fallback_count: json_usize(
                    trace.and_then(|trace| trace.get("native_text_tool_fallbacks")),
                ),
                native_text_tool_fallback_rejection_count: json_usize(
                    trace.and_then(|trace| trace.get("native_text_tool_fallback_rejections")),
                ),
                empty_completion_retry_count: json_usize(
                    trace.and_then(|trace| trace.get("empty_completion_retries")),
                ),
                tools_used,
                successful_tools,
                ledger_done_rejections: json_usize(payload.get("ledger_done_rejections")),
                task_ledger,
                research_facts,
            };
            // Only keep rounds that actually show agentic activity;
            // an all-zero, tool-less, ledger-less round is noise.
            let has_agentic_detail = planner_round.iteration_count > 0
                || planner_round.llm_call_count > 0
                || planner_round.tool_execution_count > 0
                || planner_round.native_text_tool_fallback_count > 0
                || planner_round.native_text_tool_fallback_rejection_count > 0
                || planner_round.empty_completion_retry_count > 0
                || planner_round.ledger_done_rejections > 0
                || planner_round.task_ledger.is_some()
                || !planner_round.tools_used.is_empty()
                || !planner_round.successful_tools.is_empty();
            if has_agentic_detail {
                planner_rounds.push(planner_round);
            }
        }
    }

    // Transition edges between stages. Transitions whose target node has
    // no stage are skipped; a missing source falls back to the run node.
    for transition in &run.transitions {
        let Some(to_id) = stage_by_node_id.get(&transition.to_node_id).cloned() else {
            continue;
        };
        let from_stage = transition
            .from_stage_id
            .as_deref()
            .and_then(|stage_id| stage_by_id.get(stage_id).copied());
        let from_id = transition
            .from_stage_id
            .as_ref()
            .and_then(|stage_id| stage_node_ids.get(stage_id))
            .cloned()
            .or_else(|| {
                transition
                    .from_node_id
                    .as_ref()
                    .and_then(|node_id| stage_by_node_id.get(node_id))
                    .cloned()
            })
            .unwrap_or_else(|| root_node_id.clone());
        action_graph_edges.push(RunActionGraphEdgeRecord {
            from_id,
            to_id,
            // Edges leaving a condition stage are predicate gates.
            kind: if from_stage.is_some_and(|stage| stage.kind == "condition") {
                ACTION_GRAPH_EDGE_KIND_PREDICATE_GATE.to_string()
            } else {
                ACTION_GRAPH_EDGE_KIND_TRANSITION.to_string()
            },
            label: transition.branch.clone(),
        });
    }

    // Child runs become worker nodes (with a delegation edge from their
    // parent stage, when known) plus a lineage record.
    let worker_lineage = run
        .child_runs
        .iter()
        .map(|child| {
            let worker_node_id = format!("worker:{}", child.worker_id);
            append_action_graph_node(
                &mut action_graph_nodes,
                RunActionGraphNodeRecord {
                    id: worker_node_id.clone(),
                    label: child.worker_name.clone(),
                    kind: ACTION_GRAPH_NODE_KIND_WORKER.to_string(),
                    status: child.status.clone(),
                    outcome: child.status.clone(),
                    trace_id: propagated_trace_id.clone(),
                    stage_id: child.parent_stage_id.clone(),
                    node_id: None,
                    worker_id: Some(child.worker_id.clone()),
                    run_id: child.run_id.clone(),
                    run_path: child.run_path.clone(),
                    metadata: BTreeMap::from([
                        (
                            "worker_name".to_string(),
                            serde_json::json!(child.worker_name),
                        ),
                        ("task".to_string(), serde_json::json!(child.task)),
                    ]),
                },
            );
            if let Some(parent_stage_id) = child.parent_stage_id.as_ref() {
                if let Some(stage_node_id) = stage_node_ids.get(parent_stage_id) {
                    action_graph_edges.push(RunActionGraphEdgeRecord {
                        from_id: stage_node_id.clone(),
                        to_id: worker_node_id,
                        kind: ACTION_GRAPH_EDGE_KIND_DELEGATES.to_string(),
                        label: Some(child.worker_name.clone()),
                    });
                }
            }
            RunWorkerLineageRecord {
                worker_id: child.worker_id.clone(),
                worker_name: child.worker_name.clone(),
                parent_stage_id: child.parent_stage_id.clone(),
                task: child.task.clone(),
                status: child.status.clone(),
                session_id: child.session_id.clone(),
                parent_session_id: child.parent_session_id.clone(),
                run_id: child.run_id.clone(),
                run_path: child.run_path.clone(),
                snapshot_path: child.snapshot_path.clone(),
            }
        })
        .collect::<Vec<_>>();

    // Run-level embedded transcript: pointer + compaction events.
    if run.transcript.is_some() {
        transcript_pointers.push(RunTranscriptPointerRecord {
            id: "run:transcript".to_string(),
            label: "Run transcript".to_string(),
            kind: "embedded_transcript".to_string(),
            location: "run.transcript".to_string(),
            path: run.persisted_path.clone(),
            available: true,
        });
        if let Some(transcript) = run.transcript.as_ref() {
            compaction_events.extend(compaction_events_from_transcript(
                transcript,
                None,
                None,
                "run.transcript",
                persisted_path,
            ));
        }
    }

    // On-disk sidecar: pointer (available only if the file exists) and
    // any daemon events it contains.
    if let Some(path) = persisted_path {
        if let Some(sidecar_path) = llm_transcript_sidecar_path(path) {
            transcript_pointers.push(RunTranscriptPointerRecord {
                id: "run:llm_transcript".to_string(),
                label: "LLM transcript sidecar".to_string(),
                kind: "llm_jsonl".to_string(),
                location: "run sidecar".to_string(),
                path: Some(sidecar_path.to_string_lossy().into_owned()),
                available: sidecar_path.exists(),
            });
        }
        daemon_events.extend(daemon_events_from_sidecar(path));
    }

    RunObservabilityRecord {
        schema_version: 4,
        planner_rounds,
        research_fact_count,
        action_graph_nodes,
        action_graph_edges,
        worker_lineage,
        verification_outcomes,
        transcript_pointers,
        compaction_events,
        daemon_events,
    }
}
1772
1773fn refresh_run_observability(run: &mut RunRecord, persisted_path: Option<&Path>) {
1774 run.observability = Some(derive_run_observability(run, persisted_path));
1775}
1776
/// Parses a VM value into a [`RunRecord`] and fills in every
/// defaultable field so downstream consumers see a fully-formed run.
///
/// Defaults applied when blank/missing: type name `run_record`, a
/// fresh run id, the current time as `started_at`, status `running`,
/// the run's own id as `root_run_id`, and a replay fixture derived
/// from the run. HITL questions from the active event log, child runs
/// from stage metadata, and handoffs are then merged in, and an
/// observability record is derived when absent.
pub fn normalize_run_record(value: &VmValue) -> Result<RunRecord, VmError> {
    let mut run: RunRecord = parse_json_payload(vm_value_to_json(value), "run_record")?;
    if run.type_name.is_empty() {
        run.type_name = "run_record".to_string();
    }
    if run.id.is_empty() {
        run.id = new_id("run");
    }
    if run.started_at.is_empty() {
        run.started_at = now_rfc3339();
    }
    if run.status.is_empty() {
        run.status = "running".to_string();
    }
    // Must run after the id default so a generated id propagates here.
    if run.root_run_id.is_none() {
        run.root_run_id = Some(run.id.clone());
    }
    if run.replay_fixture.is_none() {
        run.replay_fixture = Some(replay_fixture_from_run(&run));
    }
    merge_hitl_questions_from_active_log(&mut run);
    materialize_child_runs_from_stage_metadata(&mut run);
    sync_run_handoffs(&mut run);
    if run.observability.is_none() {
        // Clone first: the refresh takes `&mut run`, so no borrow of
        // `run.persisted_path` may be held across the call.
        let persisted_path = run.persisted_path.clone();
        let persisted = persisted_path.as_deref().map(Path::new);
        refresh_run_observability(&mut run, persisted);
    }
    Ok(run)
}
1807
1808pub fn normalize_eval_suite_manifest(value: &VmValue) -> Result<EvalSuiteManifest, VmError> {
1809 let mut manifest: EvalSuiteManifest = parse_json_value(value)?;
1810 if manifest.type_name.is_empty() {
1811 manifest.type_name = "eval_suite_manifest".to_string();
1812 }
1813 if manifest.id.is_empty() {
1814 manifest.id = new_id("eval_suite");
1815 }
1816 Ok(manifest)
1817}
1818
1819pub fn load_eval_suite_manifest(path: &Path) -> Result<EvalSuiteManifest, VmError> {
1820 let content = std::fs::read_to_string(path)
1821 .map_err(|e| VmError::Runtime(format!("failed to read eval suite manifest: {e}")))?;
1822 let mut manifest: EvalSuiteManifest = serde_json::from_str(&content)
1823 .map_err(|e| VmError::Runtime(format!("failed to parse eval suite manifest: {e}")))?;
1824 if manifest.base_dir.is_none() {
1825 manifest.base_dir = path.parent().map(|parent| parent.display().to_string());
1826 }
1827 Ok(manifest)
1828}
1829
1830pub fn load_eval_pack_manifest(path: &Path) -> Result<EvalPackManifest, VmError> {
1831 let content = std::fs::read_to_string(path)
1832 .map_err(|e| VmError::Runtime(format!("failed to read eval pack manifest: {e}")))?;
1833 let mut manifest: EvalPackManifest =
1834 if path.extension().and_then(|ext| ext.to_str()) == Some("json") {
1835 serde_json::from_str(&content)
1836 .map_err(|e| VmError::Runtime(format!("failed to parse eval pack JSON: {e}")))?
1837 } else {
1838 toml::from_str(&content)
1839 .map_err(|e| VmError::Runtime(format!("failed to parse eval pack TOML: {e}")))?
1840 };
1841 normalize_eval_pack_manifest(&mut manifest);
1842 if manifest.base_dir.is_none() {
1843 manifest.base_dir = path.parent().map(|parent| parent.display().to_string());
1844 }
1845 Ok(manifest)
1846}
1847
/// Parses a VM value directly into an [`EvalPackManifest`] and applies
/// the same normalization defaults as the file-based loader (but does
/// not set `base_dir`, since there is no originating file).
pub fn normalize_eval_pack_manifest_value(value: &VmValue) -> Result<EvalPackManifest, VmError> {
    let mut manifest: EvalPackManifest = parse_json_value(value)?;
    normalize_eval_pack_manifest(&mut manifest);
    Ok(manifest)
}
1853
1854fn normalize_eval_pack_manifest(manifest: &mut EvalPackManifest) {
1855 if manifest.version == 0 {
1856 manifest.version = 1;
1857 }
1858 if manifest.id.is_empty() {
1859 manifest.id = manifest
1860 .name
1861 .clone()
1862 .filter(|name| !name.trim().is_empty())
1863 .unwrap_or_else(|| new_id("eval_pack"));
1864 }
1865 for ladder in &mut manifest.ladders {
1866 super::normalize_persona_eval_ladder_manifest(ladder);
1867 }
1868}
1869
1870fn load_replay_fixture(path: &Path) -> Result<ReplayFixture, VmError> {
1871 let content = std::fs::read_to_string(path)
1872 .map_err(|e| VmError::Runtime(format!("failed to read replay fixture: {e}")))?;
1873 serde_json::from_str(&content)
1874 .map_err(|e| VmError::Runtime(format!("failed to parse replay fixture: {e}")))
1875}
1876
1877fn load_run_record_from_fixture_ref(
1878 fixture: &EvalPackFixtureRef,
1879 base_dir: Option<&Path>,
1880) -> Result<RunRecord, VmError> {
1881 if let Some(inline) = &fixture.inline {
1882 let run: RunRecord = serde_json::from_value(inline.clone())
1883 .map_err(|e| VmError::Runtime(format!("failed to parse inline run record: {e}")))?;
1884 return Ok(run);
1885 }
1886 let path = fixture.path.as_deref().ok_or_else(|| {
1887 VmError::Runtime(format!(
1888 "fixture '{}' is missing path or inline run",
1889 fixture.id
1890 ))
1891 })?;
1892 load_run_record(&resolve_manifest_path(base_dir, path))
1893}
1894
1895fn load_replay_fixture_from_ref(
1896 fixture: &EvalPackFixtureRef,
1897 base_dir: Option<&Path>,
1898) -> Result<ReplayFixture, VmError> {
1899 if let Some(inline) = &fixture.inline {
1900 return serde_json::from_value(inline.clone())
1901 .map_err(|e| VmError::Runtime(format!("failed to parse inline replay fixture: {e}")));
1902 }
1903 let path = fixture.path.as_deref().ok_or_else(|| {
1904 VmError::Runtime(format!(
1905 "fixture '{}' is missing path or inline replay fixture",
1906 fixture.id
1907 ))
1908 })?;
1909 load_replay_fixture(&resolve_manifest_path(base_dir, path))
1910}
1911
/// Resolves a manifest-relative path string: absolute paths pass
/// through untouched, relative paths are joined onto `base_dir` when
/// one is provided, and are otherwise returned as-is.
fn resolve_manifest_path(base_dir: Option<&Path>, path: &str) -> PathBuf {
    let candidate = PathBuf::from(path);
    match base_dir {
        Some(base) if !candidate.is_absolute() => base.join(candidate),
        _ => candidate,
    }
}
1922
/// Evaluates every case in a replay eval suite manifest and rolls the
/// per-case results up into a suite report.
///
/// For each case: the run record is loaded from `run_path` (resolved
/// against the manifest's `base_dir`), the replay fixture comes from
/// `fixture_path` when given, else the run's embedded fixture, else
/// one derived from the run itself. When `compare_to` names a baseline
/// run, any structural difference marks the case failed and records
/// the diff. The suite passes only when every case passes.
///
/// # Errors
/// Returns the underlying [`VmError`] when any run record, fixture, or
/// baseline file fails to load or parse.
pub fn evaluate_run_suite_manifest(
    manifest: &EvalSuiteManifest,
) -> Result<ReplayEvalSuiteReport, VmError> {
    let base_dir = manifest.base_dir.as_deref().map(Path::new);
    let mut reports = Vec::new();
    for case in &manifest.cases {
        let run_path = resolve_manifest_path(base_dir, &case.run_path);
        let run = load_run_record(&run_path)?;
        // Fixture precedence: explicit path > embedded > derived.
        let fixture = match &case.fixture_path {
            Some(path) => load_replay_fixture(&resolve_manifest_path(base_dir, path))?,
            None => run
                .replay_fixture
                .clone()
                .unwrap_or_else(|| replay_fixture_from_run(&run)),
        };
        let eval = evaluate_run_against_fixture(&run, &fixture);
        // pass/failures start from the fixture evaluation and may be
        // further degraded by the baseline comparison below.
        let mut pass = eval.pass;
        let mut failures = eval.failures;
        let comparison = match &case.compare_to {
            Some(path) => {
                let baseline_path = resolve_manifest_path(base_dir, path);
                let baseline = load_run_record(&baseline_path)?;
                let diff = diff_run_records(&baseline, &run);
                if !diff.identical {
                    pass = false;
                    failures.push(format!(
                        "run differs from baseline {} with {} stage changes",
                        baseline_path.display(),
                        diff.stage_diffs.len()
                    ));
                }
                Some(diff)
            }
            None => None,
        };
        reports.push(ReplayEvalCaseReport {
            run_id: run.id.clone(),
            workflow_id: run.workflow_id.clone(),
            label: case.label.clone(),
            pass,
            failures,
            stage_count: eval.stage_count,
            source_path: Some(run_path.display().to_string()),
            comparison,
        });
    }
    let total = reports.len();
    let passed = reports.iter().filter(|report| report.pass).count();
    let failed = total.saturating_sub(passed);
    Ok(ReplayEvalSuiteReport {
        pass: failed == 0,
        total,
        passed,
        failed,
        cases: reports,
    })
}
1980
/// Evaluate every case and persona ladder declared in an eval pack manifest.
///
/// Each case is replayed against its fixture, checked against manifest- and
/// case-level thresholds, optionally diffed against a baseline run, and scored
/// by its rubrics. Cases that declare `friction_events` take a separate path
/// (`evaluate_eval_pack_friction_case`). The aggregate `pass` flag reflects
/// only blocking failures; warning/informational cases never fail the pack.
///
/// # Errors
/// Returns `VmError` when a referenced run, fixture, baseline, or ladder
/// cannot be loaded or evaluated.
pub fn evaluate_eval_pack_manifest(manifest: &EvalPackManifest) -> Result<EvalPackReport, VmError> {
    let base_dir = manifest.base_dir.as_deref().map(Path::new);
    // defaults.fixture_root (if set) is itself resolved against base_dir and
    // then used as the root for fixture-relative paths; otherwise fixtures
    // resolve against base_dir directly.
    let fixture_base_dir_buf = manifest
        .defaults
        .fixture_root
        .as_deref()
        .map(|root| resolve_manifest_path(base_dir, root));
    let fixture_base_dir = fixture_base_dir_buf.as_deref().or(base_dir);
    // Index named fixtures and rubrics for id lookup; empty ids are skipped so
    // they can never be referenced by cases.
    let fixtures_by_id: BTreeMap<&str, &EvalPackFixtureRef> = manifest
        .fixtures
        .iter()
        .filter(|fixture| !fixture.id.is_empty())
        .map(|fixture| (fixture.id.as_str(), fixture))
        .collect();
    let rubrics_by_id: BTreeMap<&str, &EvalPackRubric> = manifest
        .rubrics
        .iter()
        .filter(|rubric| !rubric.id.is_empty())
        .map(|rubric| (rubric.id.as_str(), rubric))
        .collect();

    let mut reports = Vec::new();
    for (index, case) in manifest.cases.iter().enumerate() {
        // Synthesize a stable id ("case_N", 1-based) when the case id is
        // missing or blank; the label falls back to id, then case_id.
        let case_id = case
            .id
            .clone()
            .filter(|id| !id.trim().is_empty())
            .unwrap_or_else(|| format!("case_{}", index + 1));
        let label = case
            .name
            .clone()
            .or_else(|| case.id.clone())
            .unwrap_or_else(|| case_id.clone());
        let severity = eval_pack_case_severity(manifest, case);
        let blocking = severity == "blocking";
        let mut failures = Vec::new();
        let mut warnings = Vec::new();
        let mut informational = Vec::new();

        // Friction cases evaluate context-pack suggestions instead of a run.
        if case.friction_events.is_some() {
            let report = evaluate_eval_pack_friction_case(
                manifest,
                case,
                &case_id,
                &label,
                &severity,
                blocking,
                base_dir,
                fixture_base_dir,
                &fixtures_by_id,
                &rubrics_by_id,
            )?;
            reports.push(report);
            continue;
        }

        let run = load_eval_pack_case_run(case, base_dir, fixture_base_dir, &fixtures_by_id)?;
        let fixture =
            load_eval_pack_case_fixture(case, base_dir, fixture_base_dir, &fixtures_by_id, &run)?;
        let eval = evaluate_run_against_fixture(&run, &fixture);
        failures.extend(eval.failures);
        // Manifest-wide thresholds apply first, then case-specific ones; both
        // append to the same failure list.
        apply_eval_pack_thresholds(&run, &manifest.defaults.thresholds, &mut failures);
        apply_eval_pack_thresholds(&run, &case.thresholds, &mut failures);

        // Case-level compare_to wins over the manifest-wide baseline.
        let comparison = match case.compare_to.as_ref().or(manifest.baseline.as_ref()) {
            Some(path) => {
                let baseline_path = resolve_manifest_path(base_dir, path);
                let baseline = load_run_record(&baseline_path)?;
                let diff = diff_run_records(&baseline, &run);
                if !diff.identical {
                    failures.push(format!(
                        "run differs from baseline {} with {} stage changes",
                        baseline_path.display(),
                        diff.stage_diffs.len()
                    ));
                }
                Some(diff)
            }
            None => None,
        };

        for rubric_id in &case.rubrics {
            let Some(rubric) = rubrics_by_id.get(rubric_id.as_str()) else {
                failures.push(format!("case references unknown rubric '{rubric_id}'"));
                continue;
            };
            apply_eval_pack_rubric(rubric, &run, &mut failures, &mut warnings);
        }

        // Non-blocking cases always "pass"; their failures are downgraded into
        // the warnings or informational buckets based on severity.
        let pass = failures.is_empty() || !blocking;
        if !failures.is_empty() && !blocking {
            if severity == "warning" {
                warnings.append(&mut failures);
            } else {
                informational.append(&mut failures);
            }
        }
        reports.push(EvalPackCaseReport {
            id: case_id,
            label,
            severity,
            pass,
            blocking,
            run_id: run.id.clone(),
            workflow_id: run.workflow_id.clone(),
            source_path: eval_pack_case_source_path(
                case,
                base_dir,
                fixture_base_dir,
                &fixtures_by_id,
            ),
            stage_count: eval.stage_count,
            failures,
            warnings,
            informational,
            comparison,
        });
    }

    // Ladders inherit the manifest base_dir unless they set their own.
    let mut ladder_reports = Vec::new();
    for ladder in &manifest.ladders {
        let mut ladder = ladder.clone();
        if ladder.base_dir.is_none() {
            ladder.base_dir = manifest.base_dir.clone();
        }
        ladder_reports.push(run_persona_eval_ladder(&ladder)?);
    }

    // Aggregate counts across both cases and ladders. Note the asymmetry:
    // case failure buckets are counted from their warning/informational lists,
    // while ladder reports are bucketed by their declared severity string.
    let case_total = reports.len();
    let ladder_total = ladder_reports.len();
    let total = case_total + ladder_total;
    let case_blocking_failed = reports
        .iter()
        .filter(|report| report.blocking && !report.failures.is_empty())
        .count();
    let ladder_blocking_failed = ladder_reports
        .iter()
        .filter(|report| report.blocking && !report.pass)
        .count();
    let blocking_failed = case_blocking_failed + ladder_blocking_failed;
    let warning_failed = reports
        .iter()
        .filter(|report| !report.warnings.is_empty())
        .count()
        + ladder_reports
            .iter()
            .filter(|report| !report.pass && report.severity == "warning")
            .count();
    let informational_failed = reports
        .iter()
        .filter(|report| !report.informational.is_empty())
        .count()
        + ladder_reports
            .iter()
            .filter(|report| !report.pass && report.severity == "informational")
            .count();
    let passed = reports.iter().filter(|report| report.pass).count()
        + ladder_reports.iter().filter(|report| report.pass).count();
    Ok(EvalPackReport {
        pack_id: manifest.id.clone(),
        // Only blocking failures fail the pack as a whole.
        pass: blocking_failed == 0,
        total,
        passed,
        failed: total.saturating_sub(passed),
        blocking_failed,
        warning_failed,
        informational_failed,
        cases: reports,
        ladders: ladder_reports,
    })
}
2152
/// Evaluate a friction-events case: context-pack suggestions are generated
/// from the recorded friction events and checked against the case's rubrics.
///
/// With no rubrics, the case fails only when no suggestions are produced at
/// all. Like regular cases, failures on non-blocking severities are demoted
/// to warnings or informational notes. The report uses the synthetic run id
/// "friction_events" and reports the event count as `stage_count`.
#[allow(clippy::too_many_arguments)]
fn evaluate_eval_pack_friction_case(
    manifest: &EvalPackManifest,
    case: &EvalPackCase,
    case_id: &str,
    label: &str,
    severity: &str,
    blocking: bool,
    base_dir: Option<&Path>,
    fixture_base_dir: Option<&Path>,
    fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
    rubrics_by_id: &BTreeMap<&str, &EvalPackRubric>,
) -> Result<EvalPackCaseReport, VmError> {
    let mut failures = Vec::new();
    let mut warnings = Vec::new();
    let mut informational = Vec::new();
    let events =
        load_eval_pack_case_friction_events(case, base_dir, fixture_base_dir, fixtures_by_id)?;
    // Suggestion knobs come from case/manifest metadata (see
    // friction_suggestion_options).
    let options = friction_suggestion_options(case, manifest);
    let suggestions = generate_context_pack_suggestions(&events, &options);

    for rubric_id in &case.rubrics {
        let Some(rubric) = rubrics_by_id.get(rubric_id.as_str()) else {
            failures.push(format!("case references unknown rubric '{rubric_id}'"));
            continue;
        };
        apply_eval_pack_friction_rubric(rubric, &suggestions, &mut failures, &mut warnings);
    }

    // Rubric-less friction cases still assert that the pipeline produced
    // something.
    if case.rubrics.is_empty() && suggestions.is_empty() {
        failures.push("friction fixture produced no context-pack suggestions".to_string());
    }

    // Same severity routing as regular cases: non-blocking failures are moved
    // into warnings/informational and the case is considered passing.
    let pass = failures.is_empty() || !blocking;
    if !failures.is_empty() && !blocking {
        if severity == "warning" {
            warnings.append(&mut failures);
        } else {
            informational.append(&mut failures);
        }
    }

    Ok(EvalPackCaseReport {
        id: case_id.to_string(),
        label: label.to_string(),
        severity: severity.to_string(),
        pass,
        blocking,
        run_id: "friction_events".to_string(),
        workflow_id: String::new(),
        source_path: eval_pack_case_friction_source_path(
            case,
            base_dir,
            fixture_base_dir,
            fixtures_by_id,
        ),
        stage_count: events.len(),
        failures,
        warnings,
        informational,
        comparison: None,
    })
}
2216
2217fn eval_pack_case_severity(manifest: &EvalPackManifest, case: &EvalPackCase) -> String {
2218 normalize_eval_pack_severity(
2219 case.severity
2220 .as_deref()
2221 .or(case.thresholds.severity.as_deref())
2222 .or(manifest.defaults.severity.as_deref())
2223 .or(manifest.defaults.thresholds.severity.as_deref())
2224 .unwrap_or("blocking"),
2225 )
2226}
2227
/// Canonicalize a severity string to "warning", "informational", or
/// "blocking". Matching is case-insensitive and ignores surrounding
/// whitespace; anything unrecognized is treated as "blocking".
fn normalize_eval_pack_severity(value: &str) -> String {
    let canonical = value.trim().to_ascii_lowercase();
    let normalized = if canonical == "warn" || canonical == "warning" {
        "warning"
    } else if canonical == "info" || canonical == "informational" {
        "informational"
    } else {
        // Unknown severities are treated as the strictest option.
        "blocking"
    };
    normalized.to_string()
}
2235
2236fn load_eval_pack_case_run(
2237 case: &EvalPackCase,
2238 base_dir: Option<&Path>,
2239 fixture_base_dir: Option<&Path>,
2240 fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2241) -> Result<RunRecord, VmError> {
2242 if let Some(run_ref) = case.run.as_deref().or(case.run_path.as_deref()) {
2243 if let Some(fixture) = fixtures_by_id.get(run_ref) {
2244 return load_run_record_from_fixture_ref(fixture, fixture_base_dir);
2245 }
2246 return load_run_record(&resolve_manifest_path(base_dir, run_ref));
2247 }
2248 Err(VmError::Runtime(
2249 "eval pack case is missing run or run_path".to_string(),
2250 ))
2251}
2252
2253fn load_eval_pack_case_fixture(
2254 case: &EvalPackCase,
2255 base_dir: Option<&Path>,
2256 fixture_base_dir: Option<&Path>,
2257 fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2258 run: &RunRecord,
2259) -> Result<ReplayFixture, VmError> {
2260 if let Some(fixture_ref) = case.fixture.as_deref().or(case.fixture_path.as_deref()) {
2261 if let Some(fixture) = fixtures_by_id.get(fixture_ref) {
2262 return load_replay_fixture_from_ref(fixture, fixture_base_dir);
2263 }
2264 return load_replay_fixture(&resolve_manifest_path(base_dir, fixture_ref));
2265 }
2266 Ok(run
2267 .replay_fixture
2268 .clone()
2269 .unwrap_or_else(|| replay_fixture_from_run(run)))
2270}
2271
2272fn eval_pack_case_source_path(
2273 case: &EvalPackCase,
2274 base_dir: Option<&Path>,
2275 fixture_base_dir: Option<&Path>,
2276 fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2277) -> Option<String> {
2278 let run_ref = case.run.as_deref().or(case.run_path.as_deref())?;
2279 if let Some(fixture) = fixtures_by_id.get(run_ref) {
2280 return fixture.path.as_ref().map(|path| {
2281 resolve_manifest_path(fixture_base_dir, path)
2282 .display()
2283 .to_string()
2284 });
2285 }
2286 Some(
2287 resolve_manifest_path(base_dir, run_ref)
2288 .display()
2289 .to_string(),
2290 )
2291}
2292
2293fn load_eval_pack_case_friction_events(
2294 case: &EvalPackCase,
2295 base_dir: Option<&Path>,
2296 fixture_base_dir: Option<&Path>,
2297 fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2298) -> Result<Vec<FrictionEvent>, VmError> {
2299 let event_ref = case.friction_events.as_deref().ok_or_else(|| {
2300 VmError::Runtime("eval pack friction case is missing friction_events".to_string())
2301 })?;
2302 if let Some(fixture) = fixtures_by_id.get(event_ref) {
2303 return load_friction_events_from_fixture_ref(fixture, fixture_base_dir);
2304 }
2305 load_friction_events_from_path(&resolve_manifest_path(base_dir, event_ref))
2306}
2307
2308fn load_friction_events_from_fixture_ref(
2309 fixture: &EvalPackFixtureRef,
2310 base_dir: Option<&Path>,
2311) -> Result<Vec<FrictionEvent>, VmError> {
2312 if let Some(inline) = &fixture.inline {
2313 return normalize_friction_events_json(inline.clone());
2314 }
2315 let path = fixture.path.as_deref().ok_or_else(|| {
2316 VmError::Runtime(format!(
2317 "fixture '{}' is missing path or inline friction events",
2318 fixture.id
2319 ))
2320 })?;
2321 load_friction_events_from_path(&resolve_manifest_path(base_dir, path))
2322}
2323
2324fn load_friction_events_from_path(path: &Path) -> Result<Vec<FrictionEvent>, VmError> {
2325 let content = std::fs::read_to_string(path)
2326 .map_err(|e| VmError::Runtime(format!("failed to read friction events fixture: {e}")))?;
2327 let value: serde_json::Value = serde_json::from_str(&content)
2328 .map_err(|e| VmError::Runtime(format!("failed to parse friction events fixture: {e}")))?;
2329 normalize_friction_events_json(value)
2330}
2331
2332fn eval_pack_case_friction_source_path(
2333 case: &EvalPackCase,
2334 base_dir: Option<&Path>,
2335 fixture_base_dir: Option<&Path>,
2336 fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2337) -> Option<String> {
2338 let event_ref = case.friction_events.as_deref()?;
2339 if let Some(fixture) = fixtures_by_id.get(event_ref) {
2340 return fixture.path.as_ref().map(|path| {
2341 resolve_manifest_path(fixture_base_dir, path)
2342 .display()
2343 .to_string()
2344 });
2345 }
2346 Some(
2347 resolve_manifest_path(base_dir, event_ref)
2348 .display()
2349 .to_string(),
2350 )
2351}
2352
2353fn friction_suggestion_options(
2354 case: &EvalPackCase,
2355 manifest: &EvalPackManifest,
2356) -> ContextPackSuggestionOptions {
2357 let min_occurrences = case
2358 .metadata
2359 .get("min_occurrences")
2360 .or_else(|| manifest.metadata.get("min_occurrences"))
2361 .and_then(|value| value.as_u64())
2362 .unwrap_or(2) as usize;
2363 let owner = case
2364 .metadata
2365 .get("owner")
2366 .or_else(|| manifest.metadata.get("owner"))
2367 .and_then(|value| value.as_str())
2368 .map(str::to_string)
2369 .or_else(|| {
2370 manifest
2371 .package
2372 .as_ref()
2373 .and_then(|package| package.name.clone())
2374 });
2375 ContextPackSuggestionOptions {
2376 min_occurrences,
2377 owner,
2378 }
2379}
2380
2381fn apply_eval_pack_thresholds(
2382 run: &RunRecord,
2383 thresholds: &EvalPackThresholds,
2384 failures: &mut Vec<String>,
2385) {
2386 if let Some(max_stage_count) = thresholds.max_stage_count {
2387 if run.stages.len() > max_stage_count {
2388 failures.push(format!(
2389 "stage count {} exceeds threshold {}",
2390 run.stages.len(),
2391 max_stage_count
2392 ));
2393 }
2394 }
2395 if let Some(max_latency_ms) = thresholds.max_latency_ms {
2396 let actual = run
2397 .usage
2398 .as_ref()
2399 .map(|usage| usage.total_duration_ms)
2400 .unwrap_or_default();
2401 if actual > max_latency_ms {
2402 failures.push(format!(
2403 "latency {actual}ms exceeds threshold {max_latency_ms}ms"
2404 ));
2405 }
2406 }
2407 if let Some(max_cost_usd) = thresholds.max_cost_usd {
2408 let actual = run
2409 .usage
2410 .as_ref()
2411 .map(|usage| usage.total_cost)
2412 .unwrap_or_default();
2413 if actual > max_cost_usd {
2414 failures.push(format!(
2415 "cost ${actual:.6} exceeds threshold ${max_cost_usd:.6}"
2416 ));
2417 }
2418 }
2419 if let Some(max_tokens) = thresholds.max_tokens {
2420 let actual = run
2421 .usage
2422 .as_ref()
2423 .map(|usage| usage.input_tokens + usage.output_tokens)
2424 .unwrap_or_default();
2425 if actual > max_tokens {
2426 failures.push(format!(
2427 "token count {actual} exceeds threshold {max_tokens}"
2428 ));
2429 }
2430 }
2431}
2432
2433fn apply_eval_pack_rubric(
2434 rubric: &EvalPackRubric,
2435 run: &RunRecord,
2436 failures: &mut Vec<String>,
2437 warnings: &mut Vec<String>,
2438) {
2439 match rubric.kind.as_str() {
2440 "" | "deterministic" | "replay" | "budget" | "hitl" | "side-effect" => {
2441 apply_eval_pack_thresholds(run, &rubric.thresholds, failures);
2442 for assertion in &rubric.assertions {
2443 apply_eval_pack_assertion(rubric, assertion, run, failures);
2444 }
2445 }
2446 "llm-judge" | "llm_as_judge" | "judge" => {
2447 let severity = normalize_eval_pack_severity(
2448 rubric.thresholds.severity.as_deref().unwrap_or("blocking"),
2449 );
2450 let message = format!(
2451 "rubric '{}' requires an external LLM judge and was not run locally",
2452 rubric.id
2453 );
2454 if severity == "blocking" {
2455 failures.push(message);
2456 } else {
2457 warnings.push(message);
2458 }
2459 }
2460 other => warnings.push(format!(
2461 "rubric '{}' has unknown kind '{}' and was not run locally",
2462 rubric.id, other
2463 )),
2464 }
2465}
2466
2467fn apply_eval_pack_friction_rubric(
2468 rubric: &EvalPackRubric,
2469 suggestions: &[super::ContextPackSuggestion],
2470 failures: &mut Vec<String>,
2471 warnings: &mut Vec<String>,
2472) {
2473 match rubric.kind.as_str() {
2474 "" | "deterministic" | "friction" | "context-pack-suggestion" => {
2475 let mut expectations = Vec::new();
2476 for assertion in &rubric.assertions {
2477 match assertion.kind.as_str() {
2478 "context-pack-suggestion" | "context_pack_suggestion" | "suggestion" => {
2479 let expectation = context_pack_expectation_from_assertion(assertion);
2480 expectations.push(expectation);
2481 }
2482 other => failures.push(format!(
2483 "rubric '{}' has unsupported friction assertion kind '{}'",
2484 rubric.id, other
2485 )),
2486 }
2487 }
2488 failures.extend(evaluate_context_pack_suggestion_expectations(
2489 suggestions,
2490 &expectations,
2491 ));
2492 }
2493 other => warnings.push(format!(
2494 "rubric '{}' has unknown friction kind '{}' and was not run locally",
2495 rubric.id, other
2496 )),
2497 }
2498}
2499
2500fn context_pack_expectation_from_assertion(
2501 assertion: &EvalPackAssertion,
2502) -> ContextPackSuggestionExpectation {
2503 let expected = assertion
2504 .expected
2505 .as_ref()
2506 .and_then(|value| value.as_object());
2507 let expected_string = assertion.expected.as_ref().and_then(|value| value.as_str());
2508 ContextPackSuggestionExpectation {
2509 min_suggestions: expected
2510 .and_then(|map| map.get("min_suggestions"))
2511 .and_then(|value| value.as_u64())
2512 .map(|value| value as usize),
2513 recommended_artifact: expected
2514 .and_then(|map| map.get("recommended_artifact"))
2515 .and_then(|value| value.as_str())
2516 .map(str::to_string)
2517 .or_else(|| expected_string.map(str::to_string)),
2518 title_contains: assertion.contains.clone().or_else(|| {
2519 expected
2520 .and_then(|map| map.get("title_contains"))
2521 .and_then(|value| value.as_str())
2522 .map(str::to_string)
2523 }),
2524 manifest_name_contains: expected
2525 .and_then(|map| map.get("manifest_name_contains"))
2526 .and_then(|value| value.as_str())
2527 .map(str::to_string),
2528 required_capability: expected
2529 .and_then(|map| map.get("required_capability"))
2530 .and_then(|value| value.as_str())
2531 .map(str::to_string),
2532 required_output_slot: expected
2533 .and_then(|map| map.get("required_output_slot"))
2534 .and_then(|value| value.as_str())
2535 .map(str::to_string),
2536 }
2537}
2538
/// Evaluate a single deterministic rubric assertion against a run, appending
/// a failure message for each violated expectation.
///
/// Supported kinds (with hyphen/underscore aliases): run status equality,
/// per-stage status equality, visible-text substring search (scoped to one
/// stage or across all stages), and HITL question substring search. An empty
/// kind is a no-op; any other kind is reported as unsupported.
fn apply_eval_pack_assertion(
    rubric: &EvalPackRubric,
    assertion: &EvalPackAssertion,
    run: &RunRecord,
    failures: &mut Vec<String>,
) {
    match assertion.kind.as_str() {
        "run-status" | "run_status" | "status" => {
            // A non-string (or absent) expected value silently skips the check.
            let expected = assertion.expected.as_ref().and_then(|value| value.as_str());
            if let Some(expected) = expected {
                if run.status != expected {
                    failures.push(format!(
                        "rubric '{}' expected run status {}, got {}",
                        rubric.id, expected, run.status
                    ));
                }
            }
        }
        "stage-status" | "stage_status" => {
            // Both a stage id and an expected string are mandatory here;
            // missing either is itself a failure.
            let Some(stage_id) = assertion.stage.as_deref() else {
                failures.push(format!(
                    "rubric '{}' stage-status assertion missing stage",
                    rubric.id
                ));
                return;
            };
            let expected = assertion.expected.as_ref().and_then(|value| value.as_str());
            let Some(expected) = expected else {
                failures.push(format!(
                    "rubric '{}' stage-status assertion missing expected string",
                    rubric.id
                ));
                return;
            };
            match run.stages.iter().find(|stage| stage.node_id == stage_id) {
                Some(stage) if stage.status == expected => {}
                Some(stage) => failures.push(format!(
                    "rubric '{}' expected stage {} status {}, got {}",
                    rubric.id, stage_id, expected, stage.status
                )),
                None => failures.push(format!(
                    "rubric '{}' expected stage {} to exist",
                    rubric.id, stage_id
                )),
            }
        }
        "visible-text-contains" | "visible_text_contains" => {
            let Some(needle) = assertion.contains.as_deref() else {
                failures.push(format!(
                    "rubric '{}' visible-text assertion missing contains",
                    rubric.id
                ));
                return;
            };
            // With a stage id, only that stage's visible text is searched;
            // otherwise any stage's visible text may match.
            let matched = match assertion.stage.as_deref() {
                Some(stage_id) => run
                    .stages
                    .iter()
                    .find(|stage| stage.node_id == stage_id)
                    .and_then(|stage| stage.visible_text.as_deref())
                    .is_some_and(|text| text.contains(needle)),
                None => run
                    .stages
                    .iter()
                    .filter_map(|stage| stage.visible_text.as_deref())
                    .any(|text| text.contains(needle)),
            };
            if !matched {
                failures.push(format!(
                    "rubric '{}' expected visible text to contain {:?}",
                    rubric.id, needle
                ));
            }
        }
        "hitl-question-contains" | "hitl_question_contains" => {
            let Some(needle) = assertion.contains.as_deref() else {
                failures.push(format!(
                    "rubric '{}' HITL assertion missing contains",
                    rubric.id
                ));
                return;
            };
            if !run
                .hitl_questions
                .iter()
                .any(|question| question.prompt.contains(needle))
            {
                failures.push(format!(
                    "rubric '{}' expected HITL question to contain {:?}",
                    rubric.id, needle
                ));
            }
        }
        // An empty kind is deliberately a no-op (unset assertion).
        "" => {}
        other => failures.push(format!(
            "rubric '{}' has unsupported assertion kind '{}'",
            rubric.id, other
        )),
    }
}
2639
/// Edit operation emitted by `myers_diff`. The paired index refers to the
/// left (old) sequence for `Equal` and `Delete`, and to the right (new)
/// sequence for `Insert`.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub(crate) enum DiffOp {
    Equal,
    Delete,
    Insert,
}
2647
/// Compute an edit script between `a` and `b` using Myers' O(ND) diff
/// algorithm, returning `(op, index)` pairs in left-to-right order.
///
/// Indices refer to `a` for `Equal`/`Delete` and to `b` for `Insert`
/// (see `DiffOp`). The forward pass records a snapshot of the V array per
/// depth `d`; the backward pass walks those snapshots to reconstruct the
/// path.
pub(crate) fn myers_diff(a: &[&str], b: &[&str]) -> Vec<(DiffOp, usize)> {
    let n = a.len() as isize;
    let m = b.len() as isize;
    // Degenerate inputs: all-insert / all-delete / empty.
    if n == 0 && m == 0 {
        return Vec::new();
    }
    if n == 0 {
        return (0..m as usize).map(|j| (DiffOp::Insert, j)).collect();
    }
    if m == 0 {
        return (0..n as usize).map(|i| (DiffOp::Delete, i)).collect();
    }

    // V is indexed by diagonal k in [-max_d, max_d], shifted by `offset`.
    let max_d = (n + m) as usize;
    let offset = max_d as isize;
    let v_size = 2 * max_d + 1;
    let mut v = vec![0isize; v_size];
    // trace[d] holds the V state *entering* depth d (i.e. after depth d-1),
    // which is exactly what the backtrack below needs.
    let mut trace: Vec<Vec<isize>> = Vec::new();

    'outer: for d in 0..=max_d as isize {
        trace.push(v.clone());
        // Writes go to a copy; reads of v[ki±1] come from depth d-1, which is
        // correct because neighboring diagonals have opposite parity.
        let mut new_v = v.clone();
        for k in (-d..=d).step_by(2) {
            let ki = (k + offset) as usize;
            // Choose whether to step down (from k+1) or right (from k-1).
            let mut x = if k == -d || (k != d && v[ki - 1] < v[ki + 1]) {
                v[ki + 1]
            } else {
                v[ki - 1] + 1
            };
            let mut y = x - k;
            // Follow the "snake": consume matching lines along the diagonal.
            while x < n && y < m && a[x as usize] == b[y as usize] {
                x += 1;
                y += 1;
            }
            new_v[ki] = x;
            if x >= n && y >= m {
                // Reached the end; new_v is intentionally discarded (the
                // backtrack only needs snapshots up to this depth).
                let _ = new_v;
                break 'outer;
            }
        }
        v = new_v;
    }

    // Backtrack from (n, m) through the recorded snapshots, emitting ops in
    // reverse order.
    let mut ops: Vec<(DiffOp, usize)> = Vec::new();
    let mut x = n;
    let mut y = m;
    for d in (1..trace.len() as isize).rev() {
        let k = x - y;
        let v_prev = &trace[d as usize];
        // Mirror of the forward-pass choice: which neighbor diagonal did we
        // come from at depth d?
        let prev_k = if k == -d
            || (k != d && v_prev[(k - 1 + offset) as usize] < v_prev[(k + 1 + offset) as usize])
        {
            k + 1
        } else {
            k - 1
        };
        let prev_x = v_prev[(prev_k + offset) as usize];
        let prev_y = prev_x - prev_k;

        // Retrace the snake (equal run) for this depth.
        while x > prev_x && y > prev_y {
            x -= 1;
            y -= 1;
            ops.push((DiffOp::Equal, x as usize));
        }
        // prev_k < k means we stepped right (delete from a); otherwise down
        // (insert from b).
        if prev_k < k {
            x -= 1;
            ops.push((DiffOp::Delete, x as usize));
        } else {
            y -= 1;
            ops.push((DiffOp::Insert, y as usize));
        }
    }
    // Any remainder is the leading common prefix (the d = 0 snake).
    while x > 0 && y > 0 {
        x -= 1;
        y -= 1;
        ops.push((DiffOp::Equal, x as usize));
    }
    ops.reverse();
    ops
}
2732
2733pub fn render_unified_diff(path: Option<&str>, before: &str, after: &str) -> String {
2734 let before_lines: Vec<&str> = before.lines().collect();
2735 let after_lines: Vec<&str> = after.lines().collect();
2736 let ops = myers_diff(&before_lines, &after_lines);
2737
2738 let mut diff = String::new();
2739 let file = path.unwrap_or("artifact");
2740 diff.push_str(&format!("--- a/{file}\n+++ b/{file}\n"));
2741 for &(op, idx) in &ops {
2742 match op {
2743 DiffOp::Equal => diff.push_str(&format!(" {}\n", before_lines[idx])),
2744 DiffOp::Delete => diff.push_str(&format!("-{}\n", before_lines[idx])),
2745 DiffOp::Insert => diff.push_str(&format!("+{}\n", after_lines[idx])),
2746 }
2747 }
2748 diff
2749}
2750
/// Persist a run record as pretty-printed JSON, returning the written path.
///
/// When `path` is `None`, the record is written to `<default_run_dir>/<run id>.json`.
/// Before writing, a clone of the run is "materialized": HITL questions are
/// merged from the active event log, child runs are rebuilt from stage
/// metadata, a replay fixture is derived if absent, the persisted path is
/// recorded on the record itself, handoffs are synced, and observability
/// data is refreshed. The write is atomic, and an action-graph event is
/// published afterwards when observability data is present.
///
/// # Errors
/// Returns `VmError::Runtime` when the directory cannot be created, the
/// record cannot be encoded, or the atomic write fails.
pub fn save_run_record(run: &RunRecord, path: Option<&str>) -> Result<String, VmError> {
    let path = path
        .map(PathBuf::from)
        .unwrap_or_else(|| default_run_dir().join(format!("{}.json", run.id)));
    // Work on a clone so the caller's record is left untouched.
    let mut materialized = run.clone();
    merge_hitl_questions_from_active_log(&mut materialized);
    materialize_child_runs_from_stage_metadata(&mut materialized);
    if materialized.replay_fixture.is_none() {
        materialized.replay_fixture = Some(replay_fixture_from_run(&materialized));
    }
    // The record stores its own on-disk location for later lookups.
    materialized.persisted_path = Some(path.to_string_lossy().into_owned());
    sync_run_handoffs(&mut materialized);
    refresh_run_observability(&mut materialized, Some(&path));
    if let Some(parent) = path.parent() {
        std::fs::create_dir_all(parent)
            .map_err(|e| VmError::Runtime(format!("failed to create run directory: {e}")))?;
    }
    let json = serde_json::to_string_pretty(&materialized)
        .map_err(|e| VmError::Runtime(format!("failed to encode run record: {e}")))?;
    // Atomic write avoids torn/partial records on crash.
    crate::atomic_io::atomic_write(&path, json.as_bytes())
        .map_err(|e| VmError::Runtime(format!("failed to persist run record: {e}")))?;
    if let Some(observability) = materialized.observability.as_ref() {
        publish_action_graph_event(&materialized, observability, &path);
    }
    Ok(path.to_string_lossy().into_owned())
}
2777
/// Load a run record from disk and normalize it for use.
///
/// After parsing, the record is brought up to the same shape `save_run_record`
/// produces: child runs are rebuilt from stage metadata, a replay fixture is
/// derived if absent, the persisted path is backfilled, handoffs are synced,
/// and observability data is refreshed.
///
/// # Errors
/// Returns `VmError::Runtime` when the file cannot be read or parsed.
pub fn load_run_record(path: &Path) -> Result<RunRecord, VmError> {
    let content = std::fs::read_to_string(path)
        .map_err(|e| VmError::Runtime(format!("failed to read run record: {e}")))?;
    let mut run: RunRecord = serde_json::from_str(&content)
        .map_err(|e| VmError::Runtime(format!("failed to parse run record: {e}")))?;
    materialize_child_runs_from_stage_metadata(&mut run);
    if run.replay_fixture.is_none() {
        run.replay_fixture = Some(replay_fixture_from_run(&run));
    }
    // Keep any path already stored on the record; only backfill when missing.
    run.persisted_path
        .get_or_insert_with(|| path.to_string_lossy().into_owned());
    sync_run_handoffs(&mut run);
    refresh_run_observability(&mut run, Some(path));
    Ok(run)
}
2793
2794pub fn replay_fixture_from_run(run: &RunRecord) -> ReplayFixture {
2795 ReplayFixture {
2796 type_name: "replay_fixture".to_string(),
2797 id: new_id("fixture"),
2798 source_run_id: run.id.clone(),
2799 workflow_id: run.workflow_id.clone(),
2800 workflow_name: run.workflow_name.clone(),
2801 created_at: now_rfc3339(),
2802 eval_kind: Some("replay".to_string()),
2803 clarifying_question: None,
2804 expected_status: run.status.clone(),
2805 stage_assertions: run
2806 .stages
2807 .iter()
2808 .map(|stage| ReplayStageAssertion {
2809 node_id: stage.node_id.clone(),
2810 expected_status: stage.status.clone(),
2811 expected_outcome: stage.outcome.clone(),
2812 expected_branch: stage.branch.clone(),
2813 required_artifact_kinds: stage
2814 .artifacts
2815 .iter()
2816 .map(|artifact| artifact.kind.clone())
2817 .collect(),
2818 visible_text_contains: stage
2819 .visible_text
2820 .as_ref()
2821 .filter(|text| !text.is_empty())
2822 .map(|text| text.chars().take(80).collect()),
2823 })
2824 .collect(),
2825 }
2826}
2827
/// Compare a run against a replay fixture and collect all mismatches.
///
/// Fixtures with `eval_kind == "clarifying_question"` are delegated to
/// `evaluate_clarifying_question`. Otherwise the run status and every stage
/// assertion (status, outcome, branch, required artifact kinds, visible-text
/// snippet) are checked; each mismatch appends one failure message and the
/// report passes only when none accumulate.
pub fn evaluate_run_against_fixture(run: &RunRecord, fixture: &ReplayFixture) -> ReplayEvalReport {
    if fixture.eval_kind.as_deref() == Some("clarifying_question") {
        return evaluate_clarifying_question(run, fixture);
    }
    let mut failures = Vec::new();
    if run.status != fixture.expected_status {
        failures.push(format!(
            "run status mismatch: expected {}, got {}",
            fixture.expected_status, run.status
        ));
    }
    // Index stages by node id so each assertion is a single lookup.
    let stages_by_id: BTreeMap<&str, &RunStageRecord> =
        run.stages.iter().map(|s| (s.node_id.as_str(), s)).collect();
    for assertion in &fixture.stage_assertions {
        let Some(stage) = stages_by_id.get(assertion.node_id.as_str()) else {
            // A missing stage skips the remaining checks for this assertion.
            failures.push(format!("missing stage {}", assertion.node_id));
            continue;
        };
        if stage.status != assertion.expected_status {
            failures.push(format!(
                "stage {} status mismatch: expected {}, got {}",
                assertion.node_id, assertion.expected_status, stage.status
            ));
        }
        if stage.outcome != assertion.expected_outcome {
            failures.push(format!(
                "stage {} outcome mismatch: expected {}, got {}",
                assertion.node_id, assertion.expected_outcome, stage.outcome
            ));
        }
        if stage.branch != assertion.expected_branch {
            failures.push(format!(
                "stage {} branch mismatch: expected {:?}, got {:?}",
                assertion.node_id, assertion.expected_branch, stage.branch
            ));
        }
        // Every required artifact kind must be present among the stage's
        // artifacts (extra artifacts are fine).
        for required_kind in &assertion.required_artifact_kinds {
            if !stage
                .artifacts
                .iter()
                .any(|artifact| &artifact.kind == required_kind)
            {
                failures.push(format!(
                    "stage {} missing artifact kind {}",
                    assertion.node_id, required_kind
                ));
            }
        }
        if let Some(snippet) = &assertion.visible_text_contains {
            // Absent visible text is treated as empty, so any non-empty
            // snippet will fail to match.
            let actual = stage.visible_text.clone().unwrap_or_default();
            if !actual.contains(snippet) {
                failures.push(format!(
                    "stage {} visible text does not contain expected snippet {:?}",
                    assertion.node_id, snippet
                ));
            }
        }
    }

    ReplayEvalReport {
        pass: failures.is_empty(),
        failures,
        stage_count: run.stages.len(),
    }
}
2893
/// Evaluate a run against a clarifying-question fixture.
///
/// Checks the run status, that the number of HITL questions falls within the
/// fixture's min/max bounds, and — when the fixture constrains question
/// content — that at least one question satisfies ALL of: matching the exact
/// expected question, matching one of the accepted questions, containing
/// every required term, and containing no forbidden term. All text
/// comparisons are done on normalized question text.
fn evaluate_clarifying_question(run: &RunRecord, fixture: &ReplayFixture) -> ReplayEvalReport {
    let mut failures = Vec::new();
    let spec = fixture.clarifying_question.clone().unwrap_or_default();
    let min_questions = clarifying_min_questions(&spec);
    let max_questions = clarifying_max_questions(&spec);
    let questions = &run.hitl_questions;

    if run.status != fixture.expected_status {
        failures.push(format!(
            "run status mismatch: expected {}, got {}",
            fixture.expected_status, run.status
        ));
    }
    if questions.len() < min_questions {
        failures.push(format!(
            "expected at least {min_questions} clarifying question(s), got {}",
            questions.len()
        ));
    }
    if questions.len() > max_questions {
        failures.push(format!(
            "expected at most {max_questions} clarifying question(s), got {}",
            questions.len()
        ));
    }

    // Normalize the fixture's expectations once, up front, so each question
    // is compared in normalized form.
    let normalized_expected = spec
        .expected_question
        .as_deref()
        .map(normalize_question_text);
    let normalized_accepted = spec
        .accepted_questions
        .iter()
        .map(|question| normalize_question_text(question))
        .collect::<Vec<_>>();
    let required_terms = spec
        .required_terms
        .iter()
        .map(|term| normalize_question_text(term))
        .collect::<Vec<_>>();
    let forbidden_terms = spec
        .forbidden_terms
        .iter()
        .map(|term| normalize_question_text(term))
        .collect::<Vec<_>>();

    // A question "matches" only when it satisfies every configured
    // constraint; unset constraints are vacuously satisfied.
    let matched = questions.iter().any(|question| {
        let normalized = normalize_question_text(&question.prompt);
        let matches_expected = normalized_expected
            .as_ref()
            .is_none_or(|expected| &normalized == expected)
            && (normalized_accepted.is_empty()
                || normalized_accepted
                    .iter()
                    .any(|candidate| candidate == &normalized));
        let has_required_terms = required_terms
            .iter()
            .all(|term| normalized.contains(term.as_str()));
        let avoids_forbidden_terms = forbidden_terms
            .iter()
            .all(|term| !normalized.contains(term.as_str()));
        matches_expected && has_required_terms && avoids_forbidden_terms
    });

    // Only report a content mismatch when there were questions to match AND
    // the fixture actually constrains content; count bounds are handled above.
    if !questions.is_empty()
        && (!normalized_accepted.is_empty()
            || normalized_expected.is_some()
            || !required_terms.is_empty()
            || !forbidden_terms.is_empty())
        && !matched
    {
        failures.push(format!(
            "no clarifying question matched fixture; actual questions: {}",
            questions
                .iter()
                .map(|question| format!("{:?}", question.prompt))
                .collect::<Vec<_>>()
                .join(", ")
        ));
    }

    ReplayEvalReport {
        pass: failures.is_empty(),
        failures,
        stage_count: run.stages.len(),
    }
}
2981
2982pub fn evaluate_run_suite(
2983 cases: Vec<(RunRecord, ReplayFixture, Option<String>)>,
2984) -> ReplayEvalSuiteReport {
2985 let mut reports = Vec::new();
2986 for (run, fixture, source_path) in cases {
2987 let report = evaluate_run_against_fixture(&run, &fixture);
2988 reports.push(ReplayEvalCaseReport {
2989 run_id: run.id.clone(),
2990 workflow_id: run.workflow_id.clone(),
2991 label: None,
2992 pass: report.pass,
2993 failures: report.failures,
2994 stage_count: report.stage_count,
2995 source_path,
2996 comparison: None,
2997 });
2998 }
2999 let total = reports.len();
3000 let passed = reports.iter().filter(|report| report.pass).count();
3001 let failed = total.saturating_sub(passed);
3002 ReplayEvalSuiteReport {
3003 pass: failed == 0,
3004 total,
3005 passed,
3006 failed,
3007 cases: reports,
3008 }
3009}
3010
3011pub fn diff_run_records(left: &RunRecord, right: &RunRecord) -> RunDiffReport {
3012 let mut stage_diffs = Vec::new();
3013 let mut all_node_ids = BTreeSet::new();
3014 let left_by_id: BTreeMap<&str, &RunStageRecord> = left
3015 .stages
3016 .iter()
3017 .map(|s| (s.node_id.as_str(), s))
3018 .collect();
3019 let right_by_id: BTreeMap<&str, &RunStageRecord> = right
3020 .stages
3021 .iter()
3022 .map(|s| (s.node_id.as_str(), s))
3023 .collect();
3024 all_node_ids.extend(left_by_id.keys().copied());
3025 all_node_ids.extend(right_by_id.keys().copied());
3026
3027 for node_id in all_node_ids {
3028 let left_stage = left_by_id.get(node_id).copied();
3029 let right_stage = right_by_id.get(node_id).copied();
3030 match (left_stage, right_stage) {
3031 (Some(_), None) => stage_diffs.push(RunStageDiffRecord {
3032 node_id: node_id.to_string(),
3033 change: "removed".to_string(),
3034 details: vec!["stage missing from right run".to_string()],
3035 }),
3036 (None, Some(_)) => stage_diffs.push(RunStageDiffRecord {
3037 node_id: node_id.to_string(),
3038 change: "added".to_string(),
3039 details: vec!["stage missing from left run".to_string()],
3040 }),
3041 (Some(left_stage), Some(right_stage)) => {
3042 let mut details = Vec::new();
3043 if left_stage.status != right_stage.status {
3044 details.push(format!(
3045 "status: {} -> {}",
3046 left_stage.status, right_stage.status
3047 ));
3048 }
3049 if left_stage.outcome != right_stage.outcome {
3050 details.push(format!(
3051 "outcome: {} -> {}",
3052 left_stage.outcome, right_stage.outcome
3053 ));
3054 }
3055 if left_stage.branch != right_stage.branch {
3056 details.push(format!(
3057 "branch: {:?} -> {:?}",
3058 left_stage.branch, right_stage.branch
3059 ));
3060 }
3061 if left_stage.produced_artifact_ids.len() != right_stage.produced_artifact_ids.len()
3062 {
3063 details.push(format!(
3064 "produced_artifacts: {} -> {}",
3065 left_stage.produced_artifact_ids.len(),
3066 right_stage.produced_artifact_ids.len()
3067 ));
3068 }
3069 if left_stage.artifacts.len() != right_stage.artifacts.len() {
3070 details.push(format!(
3071 "artifact_records: {} -> {}",
3072 left_stage.artifacts.len(),
3073 right_stage.artifacts.len()
3074 ));
3075 }
3076 if !details.is_empty() {
3077 stage_diffs.push(RunStageDiffRecord {
3078 node_id: node_id.to_string(),
3079 change: "changed".to_string(),
3080 details,
3081 });
3082 }
3083 }
3084 (None, None) => {}
3085 }
3086 }
3087
3088 let mut tool_diffs = Vec::new();
3089 let left_tools: std::collections::BTreeMap<(String, String), &ToolCallRecord> = left
3090 .tool_recordings
3091 .iter()
3092 .map(|r| ((r.tool_name.clone(), r.args_hash.clone()), r))
3093 .collect();
3094 let right_tools: std::collections::BTreeMap<(String, String), &ToolCallRecord> = right
3095 .tool_recordings
3096 .iter()
3097 .map(|r| ((r.tool_name.clone(), r.args_hash.clone()), r))
3098 .collect();
3099 let all_tool_keys: std::collections::BTreeSet<_> = left_tools
3100 .keys()
3101 .chain(right_tools.keys())
3102 .cloned()
3103 .collect();
3104 for key in &all_tool_keys {
3105 let l = left_tools.get(key);
3106 let r = right_tools.get(key);
3107 let result_changed = match (l, r) {
3108 (Some(a), Some(b)) => a.result != b.result,
3109 _ => true,
3110 };
3111 if result_changed {
3112 tool_diffs.push(ToolCallDiffRecord {
3113 tool_name: key.0.clone(),
3114 args_hash: key.1.clone(),
3115 result_changed,
3116 left_result: l.map(|t| t.result.clone()),
3117 right_result: r.map(|t| t.result.clone()),
3118 });
3119 }
3120 }
3121
3122 let left_observability = left.observability.clone().unwrap_or_else(|| {
3123 derive_run_observability(left, left.persisted_path.as_deref().map(Path::new))
3124 });
3125 let right_observability = right.observability.clone().unwrap_or_else(|| {
3126 derive_run_observability(right, right.persisted_path.as_deref().map(Path::new))
3127 });
3128 let mut observability_diffs = Vec::new();
3129
3130 let left_workers = left_observability
3131 .worker_lineage
3132 .iter()
3133 .map(|worker| {
3134 (
3135 worker.worker_id.clone(),
3136 (
3137 worker.status.clone(),
3138 worker.run_id.clone(),
3139 worker.run_path.clone(),
3140 ),
3141 )
3142 })
3143 .collect::<BTreeMap<_, _>>();
3144 let right_workers = right_observability
3145 .worker_lineage
3146 .iter()
3147 .map(|worker| {
3148 (
3149 worker.worker_id.clone(),
3150 (
3151 worker.status.clone(),
3152 worker.run_id.clone(),
3153 worker.run_path.clone(),
3154 ),
3155 )
3156 })
3157 .collect::<BTreeMap<_, _>>();
3158 let worker_ids = left_workers
3159 .keys()
3160 .chain(right_workers.keys())
3161 .cloned()
3162 .collect::<BTreeSet<_>>();
3163 for worker_id in worker_ids {
3164 match (left_workers.get(&worker_id), right_workers.get(&worker_id)) {
3165 (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3166 section: "worker_lineage".to_string(),
3167 label: worker_id,
3168 details: vec!["worker missing from right run".to_string()],
3169 }),
3170 (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3171 section: "worker_lineage".to_string(),
3172 label: worker_id,
3173 details: vec!["worker missing from left run".to_string()],
3174 }),
3175 (Some(left_worker), Some(right_worker)) if left_worker != right_worker => {
3176 let mut details = Vec::new();
3177 if left_worker.0 != right_worker.0 {
3178 details.push(format!("status: {} -> {}", left_worker.0, right_worker.0));
3179 }
3180 if left_worker.1 != right_worker.1 {
3181 details.push(format!(
3182 "run_id: {:?} -> {:?}",
3183 left_worker.1, right_worker.1
3184 ));
3185 }
3186 if left_worker.2 != right_worker.2 {
3187 details.push(format!(
3188 "run_path: {:?} -> {:?}",
3189 left_worker.2, right_worker.2
3190 ));
3191 }
3192 observability_diffs.push(RunObservabilityDiffRecord {
3193 section: "worker_lineage".to_string(),
3194 label: worker_id,
3195 details,
3196 });
3197 }
3198 _ => {}
3199 }
3200 }
3201
3202 let left_rounds = left_observability
3203 .planner_rounds
3204 .iter()
3205 .map(|round| (round.stage_id.clone(), round))
3206 .collect::<BTreeMap<_, _>>();
3207 let right_rounds = right_observability
3208 .planner_rounds
3209 .iter()
3210 .map(|round| (round.stage_id.clone(), round))
3211 .collect::<BTreeMap<_, _>>();
3212 let round_ids = left_rounds
3213 .keys()
3214 .chain(right_rounds.keys())
3215 .cloned()
3216 .collect::<BTreeSet<_>>();
3217 for stage_id in round_ids {
3218 match (left_rounds.get(&stage_id), right_rounds.get(&stage_id)) {
3219 (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3220 section: "planner_rounds".to_string(),
3221 label: stage_id,
3222 details: vec!["planner summary missing from right run".to_string()],
3223 }),
3224 (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3225 section: "planner_rounds".to_string(),
3226 label: stage_id,
3227 details: vec!["planner summary missing from left run".to_string()],
3228 }),
3229 (Some(left_round), Some(right_round)) => {
3230 let mut details = Vec::new();
3231 if left_round.iteration_count != right_round.iteration_count {
3232 details.push(format!(
3233 "iterations: {} -> {}",
3234 left_round.iteration_count, right_round.iteration_count
3235 ));
3236 }
3237 if left_round.tool_execution_count != right_round.tool_execution_count {
3238 details.push(format!(
3239 "tool_executions: {} -> {}",
3240 left_round.tool_execution_count, right_round.tool_execution_count
3241 ));
3242 }
3243 if left_round.native_text_tool_fallback_count
3244 != right_round.native_text_tool_fallback_count
3245 {
3246 details.push(format!(
3247 "native_text_tool_fallbacks: {} -> {}",
3248 left_round.native_text_tool_fallback_count,
3249 right_round.native_text_tool_fallback_count
3250 ));
3251 }
3252 if left_round.native_text_tool_fallback_rejection_count
3253 != right_round.native_text_tool_fallback_rejection_count
3254 {
3255 details.push(format!(
3256 "native_text_tool_fallback_rejections: {} -> {}",
3257 left_round.native_text_tool_fallback_rejection_count,
3258 right_round.native_text_tool_fallback_rejection_count
3259 ));
3260 }
3261 if left_round.empty_completion_retry_count
3262 != right_round.empty_completion_retry_count
3263 {
3264 details.push(format!(
3265 "empty_completion_retries: {} -> {}",
3266 left_round.empty_completion_retry_count,
3267 right_round.empty_completion_retry_count
3268 ));
3269 }
3270 if left_round.research_facts != right_round.research_facts {
3271 details.push(format!(
3272 "research_facts: {:?} -> {:?}",
3273 left_round.research_facts, right_round.research_facts
3274 ));
3275 }
3276 let left_deliverables = left_round
3277 .task_ledger
3278 .as_ref()
3279 .map(|ledger| {
3280 ledger
3281 .deliverables
3282 .iter()
3283 .map(|item| format!("{}:{}", item.id, item.status))
3284 .collect::<Vec<_>>()
3285 })
3286 .unwrap_or_default();
3287 let right_deliverables = right_round
3288 .task_ledger
3289 .as_ref()
3290 .map(|ledger| {
3291 ledger
3292 .deliverables
3293 .iter()
3294 .map(|item| format!("{}:{}", item.id, item.status))
3295 .collect::<Vec<_>>()
3296 })
3297 .unwrap_or_default();
3298 if left_deliverables != right_deliverables {
3299 details.push(format!(
3300 "deliverables: {:?} -> {:?}",
3301 left_deliverables, right_deliverables
3302 ));
3303 }
3304 if left_round.successful_tools != right_round.successful_tools {
3305 details.push(format!(
3306 "successful_tools: {:?} -> {:?}",
3307 left_round.successful_tools, right_round.successful_tools
3308 ));
3309 }
3310 if !details.is_empty() {
3311 observability_diffs.push(RunObservabilityDiffRecord {
3312 section: "planner_rounds".to_string(),
3313 label: left_round.node_id.clone(),
3314 details,
3315 });
3316 }
3317 }
3318 _ => {}
3319 }
3320 }
3321
3322 let left_pointers = left_observability
3323 .transcript_pointers
3324 .iter()
3325 .map(|pointer| {
3326 (
3327 pointer.id.clone(),
3328 (
3329 pointer.available,
3330 pointer.path.clone(),
3331 pointer.location.clone(),
3332 ),
3333 )
3334 })
3335 .collect::<BTreeMap<_, _>>();
3336 let right_pointers = right_observability
3337 .transcript_pointers
3338 .iter()
3339 .map(|pointer| {
3340 (
3341 pointer.id.clone(),
3342 (
3343 pointer.available,
3344 pointer.path.clone(),
3345 pointer.location.clone(),
3346 ),
3347 )
3348 })
3349 .collect::<BTreeMap<_, _>>();
3350 let pointer_ids = left_pointers
3351 .keys()
3352 .chain(right_pointers.keys())
3353 .cloned()
3354 .collect::<BTreeSet<_>>();
3355 for pointer_id in pointer_ids {
3356 match (
3357 left_pointers.get(&pointer_id),
3358 right_pointers.get(&pointer_id),
3359 ) {
3360 (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3361 section: "transcript_pointers".to_string(),
3362 label: pointer_id,
3363 details: vec!["pointer missing from right run".to_string()],
3364 }),
3365 (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3366 section: "transcript_pointers".to_string(),
3367 label: pointer_id,
3368 details: vec!["pointer missing from left run".to_string()],
3369 }),
3370 (Some(left_pointer), Some(right_pointer)) if left_pointer != right_pointer => {
3371 observability_diffs.push(RunObservabilityDiffRecord {
3372 section: "transcript_pointers".to_string(),
3373 label: pointer_id,
3374 details: vec![format!(
3375 "pointer: {:?} -> {:?}",
3376 left_pointer, right_pointer
3377 )],
3378 });
3379 }
3380 _ => {}
3381 }
3382 }
3383
3384 let left_compactions = left_observability
3385 .compaction_events
3386 .iter()
3387 .map(|event| {
3388 (
3389 event.id.clone(),
3390 (
3391 event.strategy.clone(),
3392 event.archived_messages,
3393 event.snapshot_asset_id.clone(),
3394 event.available,
3395 ),
3396 )
3397 })
3398 .collect::<BTreeMap<_, _>>();
3399 let right_compactions = right_observability
3400 .compaction_events
3401 .iter()
3402 .map(|event| {
3403 (
3404 event.id.clone(),
3405 (
3406 event.strategy.clone(),
3407 event.archived_messages,
3408 event.snapshot_asset_id.clone(),
3409 event.available,
3410 ),
3411 )
3412 })
3413 .collect::<BTreeMap<_, _>>();
3414 let compaction_ids = left_compactions
3415 .keys()
3416 .chain(right_compactions.keys())
3417 .cloned()
3418 .collect::<BTreeSet<_>>();
3419 for compaction_id in compaction_ids {
3420 match (
3421 left_compactions.get(&compaction_id),
3422 right_compactions.get(&compaction_id),
3423 ) {
3424 (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3425 section: "compaction_events".to_string(),
3426 label: compaction_id,
3427 details: vec!["compaction event missing from right run".to_string()],
3428 }),
3429 (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3430 section: "compaction_events".to_string(),
3431 label: compaction_id,
3432 details: vec!["compaction event missing from left run".to_string()],
3433 }),
3434 (Some(left_event), Some(right_event)) if left_event != right_event => {
3435 observability_diffs.push(RunObservabilityDiffRecord {
3436 section: "compaction_events".to_string(),
3437 label: compaction_id,
3438 details: vec![format!("event: {:?} -> {:?}", left_event, right_event)],
3439 });
3440 }
3441 _ => {}
3442 }
3443 }
3444
3445 let left_daemons = left_observability
3446 .daemon_events
3447 .iter()
3448 .map(|event| {
3449 (
3450 (event.daemon_id.clone(), event.kind, event.timestamp.clone()),
3451 (
3452 event.name.clone(),
3453 event.persist_path.clone(),
3454 event.payload_summary.clone(),
3455 ),
3456 )
3457 })
3458 .collect::<BTreeMap<_, _>>();
3459 let right_daemons = right_observability
3460 .daemon_events
3461 .iter()
3462 .map(|event| {
3463 (
3464 (event.daemon_id.clone(), event.kind, event.timestamp.clone()),
3465 (
3466 event.name.clone(),
3467 event.persist_path.clone(),
3468 event.payload_summary.clone(),
3469 ),
3470 )
3471 })
3472 .collect::<BTreeMap<_, _>>();
3473 let daemon_keys = left_daemons
3474 .keys()
3475 .chain(right_daemons.keys())
3476 .cloned()
3477 .collect::<BTreeSet<_>>();
3478 for daemon_key in daemon_keys {
3479 let label = format!("{}:{:?}:{}", daemon_key.0, daemon_key.1, daemon_key.2);
3480 match (
3481 left_daemons.get(&daemon_key),
3482 right_daemons.get(&daemon_key),
3483 ) {
3484 (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3485 section: "daemon_events".to_string(),
3486 label,
3487 details: vec!["daemon event missing from right run".to_string()],
3488 }),
3489 (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3490 section: "daemon_events".to_string(),
3491 label,
3492 details: vec!["daemon event missing from left run".to_string()],
3493 }),
3494 (Some(left_event), Some(right_event)) if left_event != right_event => {
3495 observability_diffs.push(RunObservabilityDiffRecord {
3496 section: "daemon_events".to_string(),
3497 label,
3498 details: vec![format!("event: {:?} -> {:?}", left_event, right_event)],
3499 });
3500 }
3501 _ => {}
3502 }
3503 }
3504
3505 let left_verification = left_observability
3506 .verification_outcomes
3507 .iter()
3508 .map(|item| (item.stage_id.clone(), item))
3509 .collect::<BTreeMap<_, _>>();
3510 let right_verification = right_observability
3511 .verification_outcomes
3512 .iter()
3513 .map(|item| (item.stage_id.clone(), item))
3514 .collect::<BTreeMap<_, _>>();
3515 let verification_ids = left_verification
3516 .keys()
3517 .chain(right_verification.keys())
3518 .cloned()
3519 .collect::<BTreeSet<_>>();
3520 for stage_id in verification_ids {
3521 match (
3522 left_verification.get(&stage_id),
3523 right_verification.get(&stage_id),
3524 ) {
3525 (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3526 section: "verification".to_string(),
3527 label: stage_id,
3528 details: vec!["verification missing from right run".to_string()],
3529 }),
3530 (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3531 section: "verification".to_string(),
3532 label: stage_id,
3533 details: vec!["verification missing from left run".to_string()],
3534 }),
3535 (Some(left_item), Some(right_item)) if left_item != right_item => {
3536 let mut details = Vec::new();
3537 if left_item.passed != right_item.passed {
3538 details.push(format!(
3539 "passed: {:?} -> {:?}",
3540 left_item.passed, right_item.passed
3541 ));
3542 }
3543 if left_item.summary != right_item.summary {
3544 details.push(format!(
3545 "summary: {:?} -> {:?}",
3546 left_item.summary, right_item.summary
3547 ));
3548 }
3549 observability_diffs.push(RunObservabilityDiffRecord {
3550 section: "verification".to_string(),
3551 label: left_item.node_id.clone(),
3552 details,
3553 });
3554 }
3555 _ => {}
3556 }
3557 }
3558
3559 let left_graph = (
3560 left_observability.action_graph_nodes.len(),
3561 left_observability.action_graph_edges.len(),
3562 );
3563 let right_graph = (
3564 right_observability.action_graph_nodes.len(),
3565 right_observability.action_graph_edges.len(),
3566 );
3567 if left_graph != right_graph {
3568 observability_diffs.push(RunObservabilityDiffRecord {
3569 section: "action_graph".to_string(),
3570 label: "shape".to_string(),
3571 details: vec![format!(
3572 "nodes/edges: {}/{} -> {}/{}",
3573 left_graph.0, left_graph.1, right_graph.0, right_graph.1
3574 )],
3575 });
3576 }
3577
3578 let status_changed = left.status != right.status;
3579 let identical = !status_changed
3580 && stage_diffs.is_empty()
3581 && tool_diffs.is_empty()
3582 && observability_diffs.is_empty()
3583 && left.transitions.len() == right.transitions.len()
3584 && left.artifacts.len() == right.artifacts.len()
3585 && left.checkpoints.len() == right.checkpoints.len();
3586
3587 RunDiffReport {
3588 left_run_id: left.id.clone(),
3589 right_run_id: right.id.clone(),
3590 identical,
3591 status_changed,
3592 left_status: left.status.clone(),
3593 right_status: right.status.clone(),
3594 stage_diffs,
3595 tool_diffs,
3596 observability_diffs,
3597 transition_count_delta: right.transitions.len() as isize - left.transitions.len() as isize,
3598 artifact_count_delta: right.artifacts.len() as isize - left.artifacts.len() as isize,
3599 checkpoint_count_delta: right.checkpoints.len() as isize - left.checkpoints.len() as isize,
3600 }
3601}
3602
3603#[cfg(test)]
3604mod tests {
3605 use super::*;
3606 use std::fs;
3607
3608 fn repo_root() -> PathBuf {
3609 PathBuf::from(env!("CARGO_MANIFEST_DIR"))
3610 .parent()
3611 .unwrap()
3612 .parent()
3613 .unwrap()
3614 .to_path_buf()
3615 }
3616
3617 fn minimal_run(status: &str) -> RunRecord {
3618 RunRecord {
3619 type_name: "workflow_run".to_string(),
3620 id: "run_1".to_string(),
3621 workflow_id: "workflow_1".to_string(),
3622 status: status.to_string(),
3623 usage: Some(LlmUsageRecord {
3624 total_duration_ms: 12,
3625 total_cost: 0.01,
3626 input_tokens: 3,
3627 output_tokens: 4,
3628 call_count: 1,
3629 models: vec!["mock".to_string()],
3630 }),
3631 replay_fixture: Some(ReplayFixture {
3632 type_name: "replay_fixture".to_string(),
3633 expected_status: "completed".to_string(),
3634 ..ReplayFixture::default()
3635 }),
3636 ..RunRecord::default()
3637 }
3638 }
3639
    #[test]
    fn eval_pack_manifest_toml_runs_replay_case() {
        // Persist a minimal completed run for the pack to reference by
        // relative path.
        let temp = tempfile::tempdir().unwrap();
        let run_path = temp.path().join("run.json");
        fs::write(
            &run_path,
            serde_json::to_string(&minimal_run("completed")).unwrap(),
        )
        .unwrap();
        // One replay case gated by a deterministic run-status rubric.
        let pack_path = temp.path().join("harn.eval.toml");
        fs::write(
            &pack_path,
            r#"
version = 1
id = "connector-regressions"
name = "Connector regressions"

[[cases]]
id = "webhook"
name = "Webhook normalization"
run = "run.json"
rubrics = ["status"]

[[rubrics]]
id = "status"
kind = "deterministic"

[[rubrics.assertions]]
kind = "run-status"
expected = "completed"
"#,
        )
        .unwrap();

        let manifest = load_eval_pack_manifest(&pack_path).unwrap();
        let report = evaluate_eval_pack_manifest(&manifest).unwrap();

        // The run's status matches the rubric's expectation, so the
        // single case passes and carries the case's display name.
        assert!(report.pass);
        assert_eq!(report.total, 1);
        assert_eq!(report.cases[0].label, "Webhook normalization");
    }
3681
    #[test]
    fn eval_pack_warning_case_does_not_block() {
        // Persist a minimal completed run for the pack to reference.
        let temp = tempfile::tempdir().unwrap();
        let run_path = temp.path().join("run.json");
        fs::write(
            &run_path,
            serde_json::to_string(&minimal_run("completed")).unwrap(),
        )
        .unwrap();
        // A warning-severity case with a 1ms latency budget that the
        // recorded run (12ms total) is sure to exceed.
        let pack_path = temp.path().join("harn.eval.toml");
        fs::write(
            &pack_path,
            r#"
version = 1
id = "budgets"

[[cases]]
id = "latency-budget"
run = "run.json"
severity = "warning"

[cases.thresholds]
max-latency-ms = 1
"#,
        )
        .unwrap();

        let manifest = load_eval_pack_manifest(&pack_path).unwrap();
        let report = evaluate_eval_pack_manifest(&manifest).unwrap();

        // Exceeding a warning-severity budget is surfaced via
        // warning_failed and a per-case warning string, but the suite
        // still reports pass.
        assert!(report.pass);
        assert_eq!(report.warning_failed, 1);
        assert!(report.cases[0].warnings[0].contains("latency"));
    }
3716
    #[test]
    fn eval_pack_manifest_runs_persona_ladder() {
        let temp = tempfile::tempdir().unwrap();
        let pack_path = temp.path().join("harn.eval.toml");
        // Debug-formatting a String ("{:?}") yields a quoted, escaped
        // literal — exactly the form the TOML value positions below need.
        let base_dir = format!("{:?}", repo_root().display().to_string());
        let artifact_root = format!("{:?}", temp.path().join("artifacts").display().to_string());
        // A single ladder over a replay-backend transcript with two
        // timeout tiers: "tiny" (1 tool call) and "balanced" (4 tool
        // calls, 1 model call).
        fs::write(
            &pack_path,
            format!(
                r#"
version = 1
id = "merge-captain-ladders"
base_dir = {}

[[ladders]]
id = "merge-captain-timeout"
persona = "merge_captain"
artifact-root = {}

[ladders.backend]
kind = "replay"
path = "examples/personas/merge_captain/transcripts/green_pr.jsonl"

[[ladders.model-routes]]
id = "gemma-value"
route = "local/gemma-value"
provider = "llama.cpp"
model = "gemma"
profile = "value"

[[ladders.timeout-tiers]]
id = "tiny"
max-tool-calls = 1

[[ladders.timeout-tiers]]
id = "balanced"
max-tool-calls = 4
max-model-calls = 1
"#,
                base_dir, artifact_root
            ),
        )
        .unwrap();

        let manifest = load_eval_pack_manifest(&pack_path).unwrap();
        let report = evaluate_eval_pack_manifest(&manifest).unwrap();

        // The tight "tiny" tier degrades; "balanced" is the first tier
        // that completes correctly.
        assert!(report.pass);
        assert_eq!(report.total, 1);
        assert_eq!(report.ladders.len(), 1);
        assert_eq!(
            report.ladders[0].first_correct_tier.as_deref(),
            Some("balanced")
        );
        assert_eq!(report.ladders[0].tiers[0].outcome, "degraded");
        assert_eq!(report.ladders[0].tiers[1].outcome, "correct");
    }
3774
3775 #[test]
3776 fn eval_pack_manifest_runs_friction_context_pack_case() {
3777 let temp = tempfile::tempdir().unwrap();
3778 let events_path = temp.path().join("incident-friction.json");
3779 fs::write(
3780 &events_path,
3781 r#"
3782{
3783 "events": [
3784 {
3785 "kind": "repeated_query",
3786 "source": "incident-triage",
3787 "actor": "sre",
3788 "tool": "splunk",
3789 "provider": "splunk",
3790 "redacted_summary": "Checkout incidents need the same Splunk search",
3791 "recurrence_hints": ["checkout incident queries"],
3792 "estimated_time_ms": 300000,
3793 "metadata": {
3794 "query": "index=checkout service=api error",
3795 "capability": "splunk.search",
3796 "secret_ref": "SPLUNK_READ_TOKEN",
3797 "output_slot": "splunk_errors"
3798 }
3799 },
3800 {
3801 "kind": "repeated_query",
3802 "source": "incident-triage",
3803 "actor": "sre",
3804 "tool": "splunk",
3805 "provider": "splunk",
3806 "redacted_summary": "Checkout incident triage repeated the Splunk search",
3807 "recurrence_hints": ["checkout incident queries"],
3808 "estimated_time_ms": 240000,
3809 "metadata": {
3810 "query": "index=checkout service=api error",
3811 "capability": "splunk.search",
3812 "secret_ref": "SPLUNK_READ_TOKEN",
3813 "output_slot": "splunk_errors"
3814 }
3815 }
3816 ]
3817}
3818"#,
3819 )
3820 .unwrap();
3821 let pack_path = temp.path().join("harn.eval.toml");
3822 fs::write(
3823 &pack_path,
3824 r#"
3825version = 1
3826id = "team-learning"
3827name = "Team learning evals"
3828
3829[[fixtures]]
3830id = "incident-friction"
3831kind = "friction-events"
3832path = "incident-friction.json"
3833
3834[[cases]]
3835id = "incident-context-pack"
3836name = "Incident context pack suggestion"
3837friction_events = "incident-friction"
3838rubrics = ["context-pack"]
3839
3840[[rubrics]]
3841id = "context-pack"
3842kind = "friction"
3843
3844[[rubrics.assertions]]
3845kind = "context-pack-suggestion"
3846contains = "incident"
3847expected = { min_suggestions = 1, recommended_artifact = "context_pack", required_capability = "splunk.search", required_output_slot = "splunk_errors" }
3848"#,
3849 )
3850 .unwrap();
3851
3852 let manifest = load_eval_pack_manifest(&pack_path).unwrap();
3853 let report = evaluate_eval_pack_manifest(&manifest).unwrap();
3854
3855 assert!(report.pass);
3856 assert_eq!(report.total, 1);
3857 assert_eq!(report.cases[0].run_id, "friction_events");
3858 assert_eq!(report.cases[0].stage_count, 2);
3859 }
3860}