1use std::collections::{BTreeMap, BTreeSet};
4use std::path::{Path, PathBuf};
5
6use serde::{Deserialize, Serialize};
7
8use super::{
9 default_run_dir, evaluate_context_pack_suggestion_expectations,
10 generate_context_pack_suggestions, new_id, normalize_friction_events_json, now_rfc3339,
11 parse_json_payload, parse_json_value, sync_run_handoffs, ArtifactRecord, CapabilityPolicy,
12 ContextPackSuggestionExpectation, ContextPackSuggestionOptions, FrictionEvent, HandoffArtifact,
13};
14use crate::event_log::{
15 active_event_log, AnyEventLog, EventLog, LogEvent as EventLogRecord, Topic,
16};
17use crate::llm::vm_value_to_json;
18use crate::triggers::{SignatureStatus, TriggerEvent};
19use crate::value::{VmError, VmValue};
20
// Node kinds for the persisted action graph (see `RunActionGraphNodeRecord.kind`).
// Stored as plain strings in serialized run records, so these values are part of
// the on-disk format — do not rename without a migration.
pub const ACTION_GRAPH_NODE_KIND_RUN: &str = "run";
pub const ACTION_GRAPH_NODE_KIND_TRIGGER: &str = "trigger";
pub const ACTION_GRAPH_NODE_KIND_PREDICATE: &str = "predicate";
pub const ACTION_GRAPH_NODE_KIND_TRIGGER_PREDICATE: &str = "trigger_predicate";
pub const ACTION_GRAPH_NODE_KIND_STAGE: &str = "stage";
pub const ACTION_GRAPH_NODE_KIND_WORKER: &str = "worker";
pub const ACTION_GRAPH_NODE_KIND_DISPATCH: &str = "dispatch";
pub const ACTION_GRAPH_NODE_KIND_A2A_HOP: &str = "a2a_hop";
pub const ACTION_GRAPH_NODE_KIND_WORKER_ENQUEUE: &str = "worker_enqueue";
pub const ACTION_GRAPH_NODE_KIND_RETRY: &str = "retry";
pub const ACTION_GRAPH_NODE_KIND_DLQ: &str = "dlq";

// Edge kinds for the persisted action graph (see `RunActionGraphEdgeRecord.kind`).
// Same stability caveat as the node kinds above.
pub const ACTION_GRAPH_EDGE_KIND_ENTRY: &str = "entry";
pub const ACTION_GRAPH_EDGE_KIND_TRIGGER_DISPATCH: &str = "trigger_dispatch";
pub const ACTION_GRAPH_EDGE_KIND_A2A_DISPATCH: &str = "a2a_dispatch";
pub const ACTION_GRAPH_EDGE_KIND_PREDICATE_GATE: &str = "predicate_gate";
pub const ACTION_GRAPH_EDGE_KIND_REPLAY_CHAIN: &str = "replay_chain";
pub const ACTION_GRAPH_EDGE_KIND_TRANSITION: &str = "transition";
pub const ACTION_GRAPH_EDGE_KIND_DELEGATES: &str = "delegates";
pub const ACTION_GRAPH_EDGE_KIND_RETRY: &str = "retry";
pub const ACTION_GRAPH_EDGE_KIND_DLQ_MOVE: &str = "dlq_move";
42
/// Aggregated LLM usage for one scope (a stage or a whole run).
///
/// `#[serde(default)]` on these record types lets older persisted documents
/// (missing newer fields) still deserialize.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct LlmUsageRecord {
    pub input_tokens: i64,
    pub output_tokens: i64,
    pub total_duration_ms: i64,
    pub call_count: i64,
    // f64 cost is why this record cannot derive `Eq`.
    pub total_cost: f64,
    // Distinct model identifiers observed in this scope.
    pub models: Vec<String>,
}

/// One executed stage of a run, including per-attempt history and the
/// artifacts it consumed/produced.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunStageRecord {
    pub id: String,
    // Workflow graph node this stage executed.
    pub node_id: String,
    pub kind: String,
    pub status: String,
    pub outcome: String,
    pub branch: Option<String>,
    // Timestamps are strings; presumably RFC3339 (this module imports
    // `now_rfc3339`) — confirm at the producer.
    pub started_at: String,
    pub finished_at: Option<String>,
    pub visible_text: Option<String>,
    pub private_reasoning: Option<String>,
    pub transcript: Option<serde_json::Value>,
    pub verification: Option<serde_json::Value>,
    pub usage: Option<LlmUsageRecord>,
    pub artifacts: Vec<ArtifactRecord>,
    pub consumed_artifact_ids: Vec<String>,
    pub produced_artifact_ids: Vec<String>,
    pub attempts: Vec<RunStageAttemptRecord>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}

/// A single (possibly retried) attempt within a stage.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunStageAttemptRecord {
    pub attempt: usize,
    pub status: String,
    pub outcome: String,
    pub branch: Option<String>,
    pub error: Option<String>,
    pub verification: Option<serde_json::Value>,
    pub started_at: String,
    pub finished_at: Option<String>,
}

/// An edge in the executed run history: movement from one stage/node to the
/// next, with the artifacts flowing across it.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunTransitionRecord {
    pub id: String,
    // `None` for the entry transition into the first node.
    pub from_stage_id: Option<String>,
    pub from_node_id: Option<String>,
    pub to_node_id: String,
    pub branch: Option<String>,
    pub timestamp: String,
    pub consumed_artifact_ids: Vec<String>,
    pub produced_artifact_ids: Vec<String>,
}

/// Scheduler snapshot persisted mid-run so execution can resume.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunCheckpointRecord {
    pub id: String,
    pub ready_nodes: Vec<String>,
    pub completed_nodes: Vec<String>,
    pub last_stage_id: Option<String>,
    pub persisted_at: String,
    // Why this checkpoint was taken (free-form label).
    pub reason: String,
}
113
/// Expectations captured from a source run so a replay can be evaluated
/// against it. Serialized with a `_type` discriminator field.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayFixture {
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub source_run_id: String,
    pub workflow_id: String,
    pub workflow_name: Option<String>,
    pub created_at: String,
    pub eval_kind: Option<String>,
    // Present only for clarifying-question style evals.
    pub clarifying_question: Option<ClarifyingQuestionEvalSpec>,
    pub expected_status: String,
    pub stage_assertions: Vec<ReplayStageAssertion>,
}

/// Spec for grading a clarifying-question eval: what the question(s) must
/// contain/avoid and how many are allowed.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ClarifyingQuestionEvalSpec {
    pub expected_question: Option<String>,
    pub accepted_questions: Vec<String>,
    pub required_terms: Vec<String>,
    pub forbidden_terms: Vec<String>,
    // Clamped to at least 1 by `clarifying_min_questions`.
    pub min_questions: usize,
    // Defaults to 1 when absent; see `clarifying_max_questions`.
    pub max_questions: Option<usize>,
}

/// Per-stage expectation inside a `ReplayFixture`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayStageAssertion {
    pub node_id: String,
    pub expected_status: String,
    pub expected_outcome: String,
    pub expected_branch: Option<String>,
    pub required_artifact_kinds: Vec<String>,
    pub visible_text_contains: Option<String>,
}

/// Result of evaluating a single replay against its fixture.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayEvalReport {
    pub pass: bool,
    pub failures: Vec<String>,
    pub stage_count: usize,
}
159
/// One case's outcome inside a replay eval suite.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayEvalCaseReport {
    pub run_id: String,
    pub workflow_id: String,
    pub label: Option<String>,
    pub pass: bool,
    pub failures: Vec<String>,
    pub stage_count: usize,
    pub source_path: Option<String>,
    // Populated when the case was compared against another run.
    pub comparison: Option<RunDiffReport>,
}

/// Roll-up over all cases in a replay eval suite.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ReplayEvalSuiteReport {
    pub pass: bool,
    pub total: usize,
    pub passed: usize,
    pub failed: usize,
    pub cases: Vec<ReplayEvalCaseReport>,
}

/// One deliverable line item summarized from a task ledger.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunDeliverableSummaryRecord {
    pub id: String,
    pub text: String,
    // "open"/"blocked" count toward `blocking_count` in the ledger summary.
    pub status: String,
    pub note: Option<String>,
}

/// Compact view of a planner task ledger; built by
/// `task_ledger_summary_from_value`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunTaskLedgerSummaryRecord {
    pub root_task: String,
    pub rationale: String,
    pub deliverables: Vec<RunDeliverableSummaryRecord>,
    pub observations: Vec<String>,
    // Number of deliverables whose status is "open" or "blocked".
    pub blocking_count: usize,
}
201
/// Per-stage planner telemetry: iteration, tool, and intervention counters
/// collected for observability.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunPlannerRoundRecord {
    pub stage_id: String,
    pub node_id: String,
    pub stage_kind: String,
    pub status: String,
    pub outcome: String,
    pub iteration_count: usize,
    pub llm_call_count: usize,
    pub tool_execution_count: usize,
    pub tool_rejection_count: usize,
    pub intervention_count: usize,
    pub compaction_count: usize,
    // Counters for the native-text fallback path when structured tool calls
    // are unavailable — assumed semantics; confirm at the producer.
    pub native_text_tool_fallback_count: usize,
    pub native_text_tool_fallback_rejection_count: usize,
    pub empty_completion_retry_count: usize,
    pub tools_used: Vec<String>,
    pub successful_tools: Vec<String>,
    pub ledger_done_rejections: usize,
    pub task_ledger: Option<RunTaskLedgerSummaryRecord>,
    pub research_facts: Vec<String>,
}

/// Parent/child lineage for a delegated worker, with pointers to its
/// session, run record, and snapshot on disk.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunWorkerLineageRecord {
    pub worker_id: String,
    pub worker_name: String,
    pub parent_stage_id: Option<String>,
    pub task: String,
    pub status: String,
    pub session_id: Option<String>,
    pub parent_session_id: Option<String>,
    pub run_id: Option<String>,
    pub run_path: Option<String>,
    pub snapshot_path: Option<String>,
}

/// A node in the rendered action graph; `kind` is one of the
/// `ACTION_GRAPH_NODE_KIND_*` constants.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunActionGraphNodeRecord {
    pub id: String,
    pub label: String,
    pub kind: String,
    pub status: String,
    pub outcome: String,
    pub trace_id: Option<String>,
    // Cross-references back to the stage/node/worker/run this graph node
    // represents; all optional because kinds differ in what they carry.
    pub stage_id: Option<String>,
    pub node_id: Option<String>,
    pub worker_id: Option<String>,
    pub run_id: Option<String>,
    pub run_path: Option<String>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
257
/// A directed edge in the action graph; `kind` is one of the
/// `ACTION_GRAPH_EDGE_KIND_*` constants.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunActionGraphEdgeRecord {
    pub from_id: String,
    pub to_id: String,
    pub kind: String,
    pub label: Option<String>,
}

/// Outcome of a stage's verification step.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunVerificationOutcomeRecord {
    pub stage_id: String,
    pub node_id: String,
    pub status: String,
    // `None` when verification did not produce a pass/fail verdict.
    pub passed: Option<bool>,
    pub summary: Option<String>,
}

/// Pointer to a transcript (inline or on disk) so tooling can locate it
/// without loading the full run.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunTranscriptPointerRecord {
    pub id: String,
    pub label: String,
    pub kind: String,
    pub location: String,
    pub path: Option<String>,
    // Whether the referenced transcript data could actually be resolved.
    pub available: bool,
}

/// One context-compaction event extracted from a transcript (see
/// `compaction_events_from_transcript`).
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct CompactionEventRecord {
    pub id: String,
    pub transcript_id: Option<String>,
    pub stage_id: Option<String>,
    pub node_id: Option<String>,
    pub mode: String,
    pub strategy: String,
    pub archived_messages: usize,
    pub estimated_tokens_before: usize,
    pub estimated_tokens_after: usize,
    // Asset holding the pre-compaction snapshot, when one was recorded.
    pub snapshot_asset_id: Option<String>,
    pub snapshot_location: String,
    pub snapshot_path: Option<String>,
    // True when the snapshot asset id resolves to an asset in the transcript.
    pub available: bool,
}
305
/// Lifecycle events a daemon can emit; serialized in snake_case
/// ("spawned", "triggered", …).
#[derive(Clone, Copy, Debug, Default, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
#[serde(rename_all = "snake_case")]
pub enum DaemonEventKindRecord {
    #[default]
    Spawned,
    Triggered,
    Snapshotted,
    Resumed,
    Stopped,
}

/// A single daemon lifecycle event with its persistence location.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct DaemonEventRecord {
    pub daemon_id: String,
    pub name: String,
    pub kind: DaemonEventKindRecord,
    pub timestamp: String,
    pub persist_path: String,
    pub payload_summary: Option<String>,
}

/// Versioned observability bundle attached to a run: planner telemetry,
/// action graph, lineage, verification, transcripts, compaction, and daemon
/// events.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunObservabilityRecord {
    // Bump when the shape of this record changes.
    pub schema_version: usize,
    pub planner_rounds: Vec<RunPlannerRoundRecord>,
    pub research_fact_count: usize,
    pub action_graph_nodes: Vec<RunActionGraphNodeRecord>,
    pub action_graph_edges: Vec<RunActionGraphEdgeRecord>,
    pub worker_lineage: Vec<RunWorkerLineageRecord>,
    pub verification_outcomes: Vec<RunVerificationOutcomeRecord>,
    pub transcript_pointers: Vec<RunTranscriptPointerRecord>,
    pub compaction_events: Vec<CompactionEventRecord>,
    pub daemon_events: Vec<DaemonEventRecord>,
}
342
/// A per-stage difference found when comparing two runs.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunStageDiffRecord {
    pub node_id: String,
    // Kind of change (free-form label, e.g. added/removed/changed —
    // confirm exact vocabulary at the diff producer).
    pub change: String,
    pub details: Vec<String>,
}

/// A tool-call level difference between two runs, keyed by tool name and
/// argument hash.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ToolCallDiffRecord {
    pub tool_name: String,
    pub args_hash: String,
    pub result_changed: bool,
    pub left_result: Option<String>,
    pub right_result: Option<String>,
}

/// A difference within the observability records of two runs.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunObservabilityDiffRecord {
    pub section: String,
    pub label: String,
    pub details: Vec<String>,
}

/// Full comparison between two runs ("left" vs "right").
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunDiffReport {
    pub left_run_id: String,
    pub right_run_id: String,
    pub identical: bool,
    pub status_changed: bool,
    pub left_status: String,
    pub right_status: String,
    pub stage_diffs: Vec<RunStageDiffRecord>,
    pub tool_diffs: Vec<ToolCallDiffRecord>,
    pub observability_diffs: Vec<RunObservabilityDiffRecord>,
    // Signed count deltas (right minus left — assumed; confirm at producer).
    pub transition_count_delta: isize,
    pub artifact_count_delta: isize,
    pub checkpoint_count_delta: isize,
}
385
/// Manifest listing replay-eval cases; serialized with a `_type`
/// discriminator field.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalSuiteManifest {
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub name: Option<String>,
    // Base directory against which relative case paths are resolved —
    // assumed; confirm at the loader.
    pub base_dir: Option<String>,
    pub cases: Vec<EvalSuiteCase>,
}

/// One case in an eval suite: the run to replay plus optional fixture and
/// comparison target.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalSuiteCase {
    pub label: Option<String>,
    pub run_path: String,
    pub fixture_path: Option<String>,
    pub compare_to: Option<String>,
}
405
/// Top-level eval-pack manifest: fixtures, rubrics, judge configuration, and
/// cases, plus pack-wide defaults.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackManifest {
    pub version: u32,
    pub id: String,
    pub name: Option<String>,
    pub description: Option<String>,
    pub base_dir: Option<String>,
    pub package: Option<EvalPackPackage>,
    pub defaults: EvalPackDefaults,
    pub fixtures: Vec<EvalPackFixtureRef>,
    pub rubrics: Vec<EvalPackRubric>,
    // Pack-level judge config; rubrics and defaults may override.
    pub judge: Option<EvalPackJudgeConfig>,
    pub cases: Vec<EvalPackCase>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}

/// Packaging/provenance info for an eval pack.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalPackPackage {
    pub name: Option<String>,
    pub version: Option<String>,
    pub source: Option<String>,
    pub templates: Vec<String>,
}

/// Pack-wide defaults applied to cases that do not override them.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackDefaults {
    pub severity: Option<String>,
    pub fixture_root: Option<String>,
    pub thresholds: EvalPackThresholds,
    pub judge: Option<EvalPackJudgeConfig>,
}

/// Reference to a fixture used by eval cases; kebab-case aliases let
/// manifests written in either key style deserialize.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackFixtureRef {
    pub id: String,
    pub kind: String,
    pub path: Option<String>,
    #[serde(default, alias = "trace-id")]
    pub trace_id: Option<String>,
    pub provider: Option<String>,
    #[serde(default, alias = "event-kind")]
    pub event_kind: Option<String>,
    // Inline fixture payload as an alternative to `path`.
    pub inline: Option<serde_json::Value>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}
455
/// A grading rubric: either assertion-driven, judge-driven (via `prompt` /
/// `judge`), or both.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackRubric {
    pub id: String,
    pub kind: String,
    pub description: Option<String>,
    pub prompt: Option<String>,
    pub assertions: Vec<EvalPackAssertion>,
    pub judge: Option<EvalPackJudgeConfig>,
    // Golden examples used to calibrate the judge.
    pub calibration: Vec<EvalPackGoldenExample>,
    pub thresholds: EvalPackThresholds,
    pub metadata: BTreeMap<String, serde_json::Value>,
}

/// A single declarative check inside a rubric.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackAssertion {
    pub kind: String,
    pub stage: Option<String>,
    // Path into the inspected value; `op` + `expected` or `contains`
    // describe the comparison — exact semantics live in the evaluator.
    pub path: Option<String>,
    pub op: Option<String>,
    pub expected: Option<serde_json::Value>,
    pub contains: Option<String>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}

/// LLM-judge configuration; kebab-case aliases accepted on deserialize.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackJudgeConfig {
    pub model: Option<String>,
    #[serde(default, alias = "prompt-version")]
    pub prompt_version: Option<String>,
    #[serde(default, alias = "tie-break")]
    pub tie_break: Option<String>,
    #[serde(default, alias = "confidence-min")]
    pub confidence_min: Option<f64>,
    pub temperature: Option<f64>,
    pub rubric: Option<String>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}

/// A calibration example: input, expected output, and optional score.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackGoldenExample {
    pub input: serde_json::Value,
    pub output: serde_json::Value,
    pub score: Option<f64>,
    pub explanation: Option<String>,
}

/// Pass/fail and budget thresholds; unset fields mean "no limit".
/// Kebab-case aliases accepted on deserialize.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackThresholds {
    pub severity: Option<String>,
    #[serde(default, alias = "min-score")]
    pub min_score: Option<f64>,
    #[serde(default, alias = "min-confidence")]
    pub min_confidence: Option<f64>,
    #[serde(default, alias = "max-cost-usd")]
    pub max_cost_usd: Option<f64>,
    #[serde(default, alias = "max-latency-ms")]
    pub max_latency_ms: Option<i64>,
    #[serde(default, alias = "max-tokens")]
    pub max_tokens: Option<i64>,
    #[serde(default, alias = "max-stage-count")]
    pub max_stage_count: Option<usize>,
}
523
/// One eval-pack case: which run/fixture to evaluate and with which rubrics.
/// Kebab-case aliases accepted on deserialize.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct EvalPackCase {
    pub id: Option<String>,
    pub name: Option<String>,
    pub description: Option<String>,
    // `run` refers to a fixture id, `run_path` to a file — assumed split;
    // confirm at the loader.
    pub run: Option<String>,
    #[serde(default, alias = "run-path")]
    pub run_path: Option<String>,
    #[serde(default, alias = "friction-events", alias = "friction_events")]
    pub friction_events: Option<String>,
    pub fixture: Option<String>,
    #[serde(default, alias = "fixture-path")]
    pub fixture_path: Option<String>,
    #[serde(default, alias = "compare-to")]
    pub compare_to: Option<String>,
    pub rubrics: Vec<String>,
    pub severity: Option<String>,
    pub thresholds: EvalPackThresholds,
    pub metadata: BTreeMap<String, serde_json::Value>,
}

/// Roll-up over all cases in an eval pack, with failure counts bucketed by
/// severity.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalPackReport {
    pub pack_id: String,
    pub pass: bool,
    pub total: usize,
    pub passed: usize,
    pub failed: usize,
    pub blocking_failed: usize,
    pub warning_failed: usize,
    pub informational_failed: usize,
    pub cases: Vec<EvalPackCaseReport>,
}

/// One eval-pack case's result.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct EvalPackCaseReport {
    pub id: String,
    pub label: String,
    pub severity: String,
    pub pass: bool,
    // Whether a failure of this case should fail the whole pack.
    pub blocking: bool,
    pub run_id: String,
    pub workflow_id: String,
    pub source_path: Option<String>,
    pub stage_count: usize,
    pub failures: Vec<String>,
    pub warnings: Vec<String>,
    pub informational: Vec<String>,
    pub comparison: Option<RunDiffReport>,
}

/// A human-in-the-loop question asked during a run; merged from the event
/// log by `merge_hitl_questions_from_active_log` and deduplicated by
/// `request_id`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunHitlQuestionRecord {
    pub request_id: String,
    pub prompt: String,
    pub agent: String,
    pub trace_id: Option<String>,
    pub asked_at: String,
}
587
/// Top-level persisted record of a workflow run: status, stage history,
/// artifacts, lineage, and observability data. Serialized with a `_type`
/// discriminator so persisted documents are self-describing.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
pub struct RunRecord {
    #[serde(rename = "_type")]
    pub type_name: String,
    pub id: String,
    pub workflow_id: String,
    pub workflow_name: Option<String>,
    pub task: String,
    pub status: String,
    pub started_at: String,
    pub finished_at: Option<String>,
    // Lineage when this run was spawned by another run.
    pub parent_run_id: Option<String>,
    pub root_run_id: Option<String>,
    pub stages: Vec<RunStageRecord>,
    pub transitions: Vec<RunTransitionRecord>,
    pub checkpoints: Vec<RunCheckpointRecord>,
    pub pending_nodes: Vec<String>,
    pub completed_nodes: Vec<String>,
    pub child_runs: Vec<RunChildRecord>,
    pub artifacts: Vec<ArtifactRecord>,
    pub handoffs: Vec<HandoffArtifact>,
    pub policy: CapabilityPolicy,
    pub execution: Option<RunExecutionRecord>,
    pub transcript: Option<serde_json::Value>,
    pub usage: Option<LlmUsageRecord>,
    pub replay_fixture: Option<ReplayFixture>,
    pub observability: Option<RunObservabilityRecord>,
    pub trace_spans: Vec<RunTraceSpanRecord>,
    pub tool_recordings: Vec<ToolCallRecord>,
    // Kept sorted by (asked_at, request_id); see
    // `merge_hitl_questions_from_active_log`.
    pub hitl_questions: Vec<RunHitlQuestionRecord>,
    // Free-form metadata; keys read in this module include "trigger_event",
    // "trace_id", and "replay_of_event_id".
    pub metadata: BTreeMap<String, serde_json::Value>,
    pub persisted_path: Option<String>,
}

/// Recorded outcome of one tool invocation, keyed by an argument hash
/// (presumably `tool_fixture_hash` — confirm at call sites) for replay
/// matching.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct ToolCallRecord {
    pub tool_name: String,
    pub tool_use_id: String,
    pub args_hash: String,
    pub result: String,
    pub is_rejected: bool,
    pub duration_ms: u64,
    pub iteration: usize,
    pub timestamp: String,
}
635
636pub fn tool_fixture_hash(tool_name: &str, args: &serde_json::Value) -> String {
638 use std::hash::{Hash, Hasher};
639 let mut hasher = std::collections::hash_map::DefaultHasher::new();
640 tool_name.hash(&mut hasher);
641 let args_str = serde_json::to_string(args).unwrap_or_default();
642 args_str.hash(&mut hasher);
643 format!("{:016x}", hasher.finish())
644}
645
/// A timing span captured during a run, forming a tree via `parent_id`.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunTraceSpanRecord {
    pub span_id: u64,
    // `None` for root spans.
    pub parent_id: Option<u64>,
    pub kind: String,
    pub name: String,
    // Offsets/durations in milliseconds; epoch of `start_ms` is not shown
    // here — presumably relative to run start; confirm at the producer.
    pub start_ms: u64,
    pub duration_ms: u64,
    pub metadata: BTreeMap<String, serde_json::Value>,
}

/// A delegated child run (worker) with its request, policy, and pointers to
/// its own persisted run record.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunChildRecord {
    pub worker_id: String,
    pub worker_name: String,
    pub parent_stage_id: Option<String>,
    pub session_id: Option<String>,
    pub parent_session_id: Option<String>,
    pub mutation_scope: Option<String>,
    pub approval_policy: Option<super::ToolApprovalPolicy>,
    pub task: String,
    pub request: Option<serde_json::Value>,
    pub provenance: Option<serde_json::Value>,
    pub status: String,
    pub started_at: String,
    pub finished_at: Option<String>,
    pub run_id: Option<String>,
    pub run_path: Option<String>,
    pub snapshot_path: Option<String>,
    pub execution: Option<RunExecutionRecord>,
}

/// Execution environment a run (or child run) was launched with.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct RunExecutionRecord {
    pub cwd: Option<String>,
    pub source_dir: Option<String>,
    pub env: BTreeMap<String, String>,
    pub adapter: Option<String>,
    // Git worktree details when the run executed inside a repository.
    pub repo_path: Option<String>,
    pub worktree_path: Option<String>,
    pub branch: Option<String>,
    pub base_ref: Option<String>,
    pub cleanup: Option<String>,
}
693
694fn compact_json_value(value: &serde_json::Value) -> String {
695 serde_json::to_string(value).unwrap_or_else(|_| value.to_string())
696}
697
/// Canonicalizes question text for comparison: ASCII-lowercases letters and
/// digits, turns every other character into a word break, and collapses all
/// runs of whitespace to single spaces.
fn normalize_question_text(text: &str) -> String {
    let mut cleaned = String::with_capacity(text.len());
    for ch in text.chars() {
        if ch.is_ascii_alphanumeric() || ch.is_whitespace() {
            cleaned.push(ch.to_ascii_lowercase());
        } else {
            // Punctuation (and any non-ASCII letter) acts as a separator.
            cleaned.push(' ');
        }
    }
    cleaned.split_whitespace().collect::<Vec<_>>().join(" ")
}
712
713fn clarifying_min_questions(spec: &ClarifyingQuestionEvalSpec) -> usize {
714 spec.min_questions.max(1)
715}
716
717fn clarifying_max_questions(spec: &ClarifyingQuestionEvalSpec) -> usize {
718 spec.max_questions.unwrap_or(1).max(1)
719}
720
721fn read_topic_records(
722 log: &AnyEventLog,
723 topic: &Topic,
724) -> Vec<(crate::event_log::EventId, EventLogRecord)> {
725 let mut from = None;
726 let mut records = Vec::new();
727 loop {
728 let batch =
729 futures::executor::block_on(log.read_range(topic, from, 256)).unwrap_or_default();
730 if batch.is_empty() {
731 break;
732 }
733 from = batch.last().map(|(event_id, _)| *event_id);
734 records.extend(batch);
735 }
736 records
737}
738
/// Merges human-in-the-loop questions recorded in the active event log into
/// `run.hitl_questions`, deduplicating by `request_id` (log entries win) and
/// leaving the list sorted by (asked_at, request_id).
///
/// No-op when no event log is active. Only `hitl.question_asked` events whose
/// `run_id` header or payload field matches this run are considered.
fn merge_hitl_questions_from_active_log(run: &mut RunRecord) {
    let Some(log) = active_event_log() else {
        return;
    };
    let topic = Topic::new(crate::HITL_QUESTIONS_TOPIC)
        .expect("static hitl.questions topic should always be valid");
    // Seed the map with the questions already on the run; log entries with
    // the same request_id overwrite these below.
    let mut merged = run
        .hitl_questions
        .iter()
        .cloned()
        .map(|question| (question.request_id.clone(), question))
        .collect::<BTreeMap<_, _>>();

    for (_, event) in read_topic_records(log.as_ref(), &topic) {
        if event.kind != "hitl.question_asked" {
            continue;
        }
        let payload = &event.payload;
        // The run id may live in the headers or in the payload depending on
        // the emitter — accept either.
        let matches_run = event
            .headers
            .get("run_id")
            .is_some_and(|value| value == &run.id)
            || payload
                .get("run_id")
                .and_then(|value| value.as_str())
                .is_some_and(|value| value == run.id);
        if !matches_run {
            continue;
        }
        let request_id = payload
            .get("request_id")
            .and_then(|value| value.as_str())
            .or_else(|| event.headers.get("request_id").map(String::as_str))
            .unwrap_or_default();
        // The prompt is nested under the event's inner "payload" object —
        // schema assumed from this reader; confirm against the emitter.
        let prompt = payload
            .get("payload")
            .and_then(|value| value.get("prompt"))
            .and_then(|value| value.as_str())
            .unwrap_or_default();
        // Skip malformed events rather than inserting empty records.
        if request_id.is_empty() || prompt.is_empty() {
            continue;
        }
        merged.insert(
            request_id.to_string(),
            RunHitlQuestionRecord {
                request_id: request_id.to_string(),
                prompt: prompt.to_string(),
                agent: payload
                    .get("agent")
                    .and_then(|value| value.as_str())
                    .unwrap_or_default()
                    .to_string(),
                trace_id: payload
                    .get("trace_id")
                    .and_then(|value| value.as_str())
                    .map(str::to_string),
                asked_at: payload
                    .get("requested_at")
                    .and_then(|value| value.as_str())
                    .unwrap_or_default()
                    .to_string(),
            },
        );
    }

    run.hitl_questions = merged.into_values().collect();
    // Lexicographic sort on the timestamp string; chronological only if the
    // timestamps are fixed-width RFC3339 — presumed, confirm at the emitter.
    run.hitl_questions.sort_by(|left, right| {
        (left.asked_at.as_str(), left.request_id.as_str())
            .cmp(&(right.asked_at.as_str(), right.request_id.as_str()))
    });
}
810
811fn signature_status_label(status: &SignatureStatus) -> &'static str {
812 match status {
813 SignatureStatus::Verified => "verified",
814 SignatureStatus::Unsigned => "unsigned",
815 SignatureStatus::Failed { .. } => "failed",
816 }
817}
818
819fn trigger_event_from_run(run: &RunRecord) -> Option<TriggerEvent> {
820 run.metadata
821 .get("trigger_event")
822 .cloned()
823 .and_then(|value| serde_json::from_value(value).ok())
824}
825
826fn run_trace_id(run: &RunRecord, trigger_event: Option<&TriggerEvent>) -> Option<String> {
827 trigger_event
828 .map(|event| event.trace_id.0.clone())
829 .or_else(|| {
830 run.metadata
831 .get("trace_id")
832 .and_then(|value| value.as_str())
833 .map(str::to_string)
834 })
835}
836
837fn replay_of_event_id_from_run(run: &RunRecord) -> Option<String> {
838 run.metadata
839 .get("replay_of_event_id")
840 .and_then(|value| value.as_str())
841 .map(str::to_string)
842}
843
844fn action_graph_kind_for_stage(stage: &RunStageRecord) -> &'static str {
845 if stage.kind == "condition" {
846 ACTION_GRAPH_NODE_KIND_PREDICATE
847 } else {
848 ACTION_GRAPH_NODE_KIND_STAGE
849 }
850}
851
852fn trigger_node_metadata(trigger_event: &TriggerEvent) -> BTreeMap<String, serde_json::Value> {
853 let mut metadata = BTreeMap::new();
854 metadata.insert(
855 "provider".to_string(),
856 serde_json::json!(trigger_event.provider.as_str()),
857 );
858 metadata.insert(
859 "event_kind".to_string(),
860 serde_json::json!(trigger_event.kind),
861 );
862 metadata.insert(
863 "dedupe_key".to_string(),
864 serde_json::json!(trigger_event.dedupe_key),
865 );
866 metadata.insert(
867 "signature_status".to_string(),
868 serde_json::json!(signature_status_label(&trigger_event.signature_status)),
869 );
870 metadata
871}
872
873fn stage_node_metadata(stage: &RunStageRecord) -> BTreeMap<String, serde_json::Value> {
874 let mut metadata = BTreeMap::new();
875 metadata.insert("stage_kind".to_string(), serde_json::json!(stage.kind));
876 if let Some(branch) = stage.branch.as_ref() {
877 metadata.insert("branch".to_string(), serde_json::json!(branch));
878 }
879 if let Some(worker_id) = stage
880 .metadata
881 .get("worker_id")
882 .and_then(|value| value.as_str())
883 {
884 metadata.insert("worker_id".to_string(), serde_json::json!(worker_id));
885 }
886 metadata
887}
888
/// Appends a node record to the action-graph node list.
///
/// Deliberately a trivial wrapper: it gives graph construction a single
/// named insertion point (e.g. for future dedup/validation) instead of
/// scattering raw `push` calls.
fn append_action_graph_node(
    nodes: &mut Vec<RunActionGraphNodeRecord>,
    record: RunActionGraphNodeRecord,
) {
    nodes.push(record);
}
895
896pub async fn append_action_graph_update(
897 headers: BTreeMap<String, String>,
898 payload: serde_json::Value,
899) -> Result<(), crate::event_log::LogError> {
900 let Some(log) = active_event_log() else {
901 return Ok(());
902 };
903 let topic = Topic::new("observability.action_graph")
904 .expect("static observability.action_graph topic should always be valid");
905 let record = EventLogRecord::new("action_graph_update", payload).with_headers(headers);
906 log.append(&topic, record).await.map(|_| ())
907}
908
/// Fire-and-forget publication of a run's observability snapshot to the
/// event log (run/workflow ids in the headers, full observability record in
/// the payload).
///
/// Works from both sync and async contexts: inside a Tokio runtime the
/// append is spawned as a detached task (errors are dropped); outside one it
/// blocks on the append. Either way failures are intentionally ignored —
/// observability publication must never fail the run.
fn publish_action_graph_event(
    run: &RunRecord,
    observability: &RunObservabilityRecord,
    path: &Path,
) {
    let trigger_event = trigger_event_from_run(run);
    let mut headers = BTreeMap::new();
    headers.insert("run_id".to_string(), run.id.clone());
    headers.insert("workflow_id".to_string(), run.workflow_id.clone());
    if let Some(trace_id) = run_trace_id(run, trigger_event.as_ref()) {
        headers.insert("trace_id".to_string(), trace_id);
    }
    let payload = serde_json::json!({
        "run_id": run.id,
        "workflow_id": run.workflow_id,
        "persisted_path": path.to_string_lossy(),
        "status": run.status,
        "observability": observability,
    });
    // Prefer spawning on the current runtime so callers already inside
    // async code are not blocked; fall back to a blocking append otherwise.
    if let Ok(handle) = tokio::runtime::Handle::try_current() {
        handle.spawn(async move {
            let _ = append_action_graph_update(headers, payload).await;
        });
    } else {
        let _ = futures::executor::block_on(append_action_graph_update(headers, payload));
    }
}
936
/// Derives the LLM transcript sidecar location for a persisted run file:
/// `<dir>/<stem>-llm/llm_transcript.jsonl` next to the run file.
///
/// Returns `None` when the run path has no UTF-8 file stem.
fn llm_transcript_sidecar_path(run_path: &Path) -> Option<PathBuf> {
    let stem = run_path.file_stem().and_then(|stem| stem.to_str())?;
    // A bare filename still has `Some("")` as its parent, so this fallback
    // only fires for paths like `/`.
    let dir = run_path.parent().unwrap_or_else(|| Path::new("."));
    let mut sidecar = dir.to_path_buf();
    sidecar.push(format!("{stem}-llm"));
    sidecar.push("llm_transcript.jsonl");
    Some(sidecar)
}
942
943fn json_string_array(value: Option<&serde_json::Value>) -> Vec<String> {
944 value
945 .and_then(|value| value.as_array())
946 .map(|items| {
947 items
948 .iter()
949 .filter_map(|item| item.as_str().map(str::to_string))
950 .collect::<Vec<_>>()
951 })
952 .unwrap_or_default()
953}
954
955fn json_usize(value: Option<&serde_json::Value>) -> usize {
956 value.and_then(|value| value.as_u64()).unwrap_or_default() as usize
957}
958
959fn json_bool(value: Option<&serde_json::Value>) -> Option<bool> {
960 value.and_then(|value| value.as_bool())
961}
962
963fn stage_result_payload(stage: &RunStageRecord) -> Option<&serde_json::Value> {
964 stage
965 .artifacts
966 .iter()
967 .find_map(|artifact| artifact.data.as_ref())
968}
969
/// Builds a `RunTaskLedgerSummaryRecord` from a raw ledger JSON value.
///
/// Expected shape (assumed from this reader — confirm against the producer):
/// `{ root_task, rationale, deliverables: [{id, text, status, note}],
///   observations: [string] }`. Missing/mistyped fields degrade to empty
/// values. Returns `None` when every extracted field is empty, i.e. the
/// value carried no usable ledger at all.
fn task_ledger_summary_from_value(value: &serde_json::Value) -> Option<RunTaskLedgerSummaryRecord> {
    let deliverables = value
        .get("deliverables")
        .and_then(|raw| raw.as_array())
        .map(|items| {
            items
                .iter()
                .map(|item| RunDeliverableSummaryRecord {
                    id: item
                        .get("id")
                        .and_then(|value| value.as_str())
                        .unwrap_or_default()
                        .to_string(),
                    text: item
                        .get("text")
                        .and_then(|value| value.as_str())
                        .unwrap_or_default()
                        .to_string(),
                    status: item
                        .get("status")
                        .and_then(|value| value.as_str())
                        .unwrap_or_default()
                        .to_string(),
                    note: item
                        .get("note")
                        .and_then(|value| value.as_str())
                        .map(str::to_string),
                })
                .collect::<Vec<_>>()
        })
        .unwrap_or_default();
    let observations = json_string_array(value.get("observations"));
    let root_task = value
        .get("root_task")
        .and_then(|value| value.as_str())
        .unwrap_or_default()
        .to_string();
    let rationale = value
        .get("rationale")
        .and_then(|value| value.as_str())
        .unwrap_or_default()
        .to_string();
    // Entirely empty ledger → treat as absent rather than emitting a
    // meaningless all-default summary.
    if root_task.is_empty()
        && rationale.is_empty()
        && deliverables.is_empty()
        && observations.is_empty()
    {
        return None;
    }
    // "open" and "blocked" deliverables are the ones still gating completion.
    let blocking_count = deliverables
        .iter()
        .filter(|deliverable| matches!(deliverable.status.as_str(), "open" | "blocked"))
        .count();
    Some(RunTaskLedgerSummaryRecord {
        root_task,
        rationale,
        deliverables,
        observations,
        blocking_count,
    })
}
1031
/// Extracts the `kind == "compaction"` events from an embedded
/// transcript JSON object, resolving each event's snapshot asset
/// against the transcript's asset list.
///
/// `stage_id`/`node_id` tag the records with where the transcript came
/// from (both `None` for the run-level transcript). `location_prefix`
/// names the transcript's location inside the run document (e.g.
/// `run.transcript`), and `persisted_path` is the run file on disk, if
/// known. Returns an empty vec when the transcript has no `events`
/// array.
fn compaction_events_from_transcript(
    transcript: &serde_json::Value,
    stage_id: Option<&str>,
    node_id: Option<&str>,
    location_prefix: &str,
    persisted_path: Option<&Path>,
) -> Vec<CompactionEventRecord> {
    let transcript_id = transcript
        .get("id")
        .and_then(|value| value.as_str())
        .map(str::to_string);
    // Ids of the assets actually present in the transcript; used below
    // to decide whether a referenced snapshot is still available.
    let asset_ids = transcript
        .get("assets")
        .and_then(|value| value.as_array())
        .map(|assets| {
            assets
                .iter()
                .filter_map(|asset| {
                    asset
                        .get("id")
                        .and_then(|value| value.as_str())
                        .map(str::to_string)
                })
                .collect::<BTreeSet<_>>()
        })
        .unwrap_or_default();
    transcript
        .get("events")
        .and_then(|value| value.as_array())
        .map(|events| {
            events
                .iter()
                .filter(|event| {
                    event.get("kind").and_then(|value| value.as_str()) == Some("compaction")
                })
                .map(|event| {
                    let metadata = event.get("metadata");
                    let snapshot_asset_id = metadata
                        .and_then(|value| value.get("snapshot_asset_id"))
                        .and_then(|value| value.as_str())
                        .map(str::to_string);
                    // A snapshot is "available" only when the referenced
                    // asset id exists in the transcript's asset list.
                    let available = snapshot_asset_id
                        .as_ref()
                        .is_some_and(|asset_id| asset_ids.contains(asset_id));
                    // Without an asset id, fall back to pointing at the
                    // transcript itself.
                    let snapshot_location = snapshot_asset_id
                        .as_ref()
                        .map(|asset_id| format!("{location_prefix}.assets[{asset_id}]"))
                        .unwrap_or_else(|| location_prefix.to_string());
                    CompactionEventRecord {
                        id: event
                            .get("id")
                            .and_then(|value| value.as_str())
                            .unwrap_or_default()
                            .to_string(),
                        transcript_id: transcript_id.clone(),
                        stage_id: stage_id.map(str::to_string),
                        node_id: node_id.map(str::to_string),
                        mode: metadata
                            .and_then(|value| value.get("mode"))
                            .and_then(|value| value.as_str())
                            .unwrap_or_default()
                            .to_string(),
                        strategy: metadata
                            .and_then(|value| value.get("strategy"))
                            .and_then(|value| value.as_str())
                            .unwrap_or_default()
                            .to_string(),
                        archived_messages: json_usize(
                            metadata.and_then(|value| value.get("archived_messages")),
                        ),
                        estimated_tokens_before: json_usize(
                            metadata.and_then(|value| value.get("estimated_tokens_before")),
                        ),
                        estimated_tokens_after: json_usize(
                            metadata.and_then(|value| value.get("estimated_tokens_after")),
                        ),
                        snapshot_asset_id,
                        snapshot_location,
                        snapshot_path: persisted_path
                            .map(|path| path.to_string_lossy().into_owned()),
                        available,
                    }
                })
                .collect()
        })
        .unwrap_or_default()
}
1119
1120fn daemon_events_from_sidecar(run_path: &Path) -> Vec<DaemonEventRecord> {
1121 let Some(sidecar_path) = llm_transcript_sidecar_path(run_path) else {
1122 return Vec::new();
1123 };
1124 let Ok(content) = std::fs::read_to_string(sidecar_path) else {
1125 return Vec::new();
1126 };
1127
1128 content
1129 .lines()
1130 .filter(|line| !line.trim().is_empty())
1131 .filter_map(|line| serde_json::from_str::<serde_json::Value>(line).ok())
1132 .filter(|event| event.get("type").and_then(|value| value.as_str()) == Some("daemon_event"))
1133 .filter_map(|event| serde_json::from_value::<DaemonEventRecord>(event).ok())
1134 .collect()
1135}
1136
/// Derives the observability record for a run: the action graph of
/// run/trigger/stage/worker nodes and their edges, planner round
/// summaries, verification outcomes, transcript pointers, compaction
/// events, worker lineage, and daemon events pulled from the LLM
/// transcript sidecar.
///
/// `persisted_path` is the run file's on-disk location, if known; it is
/// used to locate the sidecar and to stamp snapshot paths onto
/// compaction events.
pub fn derive_run_observability(
    run: &RunRecord,
    persisted_path: Option<&Path>,
) -> RunObservabilityRecord {
    let mut action_graph_nodes = Vec::new();
    let mut action_graph_edges = Vec::new();
    let mut verification_outcomes = Vec::new();
    let mut planner_rounds = Vec::new();
    let mut transcript_pointers = Vec::new();
    let mut compaction_events = Vec::new();
    let mut daemon_events = Vec::new();
    let mut research_fact_count = 0usize;

    // Root graph node representing the run itself.
    let root_node_id = format!("run:{}", run.id);
    let trigger_event = trigger_event_from_run(run);
    let propagated_trace_id = run_trace_id(run, trigger_event.as_ref());
    append_action_graph_node(
        &mut action_graph_nodes,
        RunActionGraphNodeRecord {
            id: root_node_id.clone(),
            label: run
                .workflow_name
                .clone()
                .unwrap_or_else(|| run.workflow_id.clone()),
            kind: ACTION_GRAPH_NODE_KIND_RUN.to_string(),
            status: run.status.clone(),
            outcome: run.status.clone(),
            trace_id: propagated_trace_id.clone(),
            stage_id: None,
            node_id: None,
            worker_id: None,
            run_id: Some(run.id.clone()),
            run_path: run.persisted_path.clone(),
            metadata: BTreeMap::from([(
                "workflow_id".to_string(),
                serde_json::json!(run.workflow_id),
            )]),
        },
    );
    // Entry edges for stages with no incoming transition hang off the
    // trigger node when one exists, otherwise off the run node.
    let mut entry_node_id = root_node_id.clone();
    if let Some(trigger_event) = trigger_event.as_ref() {
        // Replayed runs get an extra historical node for the original
        // trigger event, chained to the current trigger node.
        if let Some(replay_of_event_id) = replay_of_event_id_from_run(run) {
            let replay_source_node_id = format!("trigger:{replay_of_event_id}");
            append_action_graph_node(
                &mut action_graph_nodes,
                RunActionGraphNodeRecord {
                    id: replay_source_node_id.clone(),
                    label: format!(
                        "{}:{} (original {})",
                        trigger_event.provider.as_str(),
                        trigger_event.kind,
                        replay_of_event_id
                    ),
                    kind: ACTION_GRAPH_NODE_KIND_TRIGGER.to_string(),
                    status: "historical".to_string(),
                    outcome: "replayed_from".to_string(),
                    trace_id: Some(trigger_event.trace_id.0.clone()),
                    stage_id: None,
                    node_id: None,
                    worker_id: None,
                    run_id: Some(run.id.clone()),
                    run_path: run.persisted_path.clone(),
                    metadata: trigger_node_metadata(trigger_event),
                },
            );
            action_graph_edges.push(RunActionGraphEdgeRecord {
                from_id: replay_source_node_id,
                to_id: format!("trigger:{}", trigger_event.id.0),
                kind: ACTION_GRAPH_EDGE_KIND_REPLAY_CHAIN.to_string(),
                label: Some("replay chain".to_string()),
            });
        }
        let trigger_node_id = format!("trigger:{}", trigger_event.id.0);
        append_action_graph_node(
            &mut action_graph_nodes,
            RunActionGraphNodeRecord {
                id: trigger_node_id.clone(),
                label: format!("{}:{}", trigger_event.provider.as_str(), trigger_event.kind),
                kind: ACTION_GRAPH_NODE_KIND_TRIGGER.to_string(),
                status: "received".to_string(),
                outcome: signature_status_label(&trigger_event.signature_status).to_string(),
                trace_id: Some(trigger_event.trace_id.0.clone()),
                stage_id: None,
                node_id: None,
                worker_id: None,
                run_id: Some(run.id.clone()),
                run_path: run.persisted_path.clone(),
                metadata: trigger_node_metadata(trigger_event),
            },
        );
        action_graph_edges.push(RunActionGraphEdgeRecord {
            from_id: root_node_id.clone(),
            to_id: trigger_node_id.clone(),
            kind: ACTION_GRAPH_EDGE_KIND_ENTRY.to_string(),
            label: Some(trigger_event.id.0.clone()),
        });
        entry_node_id = trigger_node_id;
    }

    // Lookup tables: stage id -> graph node id, stage id -> stage record,
    // and workflow node id -> graph node id.
    let stage_node_ids = run
        .stages
        .iter()
        .map(|stage| (stage.id.clone(), format!("stage:{}", stage.id)))
        .collect::<BTreeMap<_, _>>();
    let stage_by_id = run
        .stages
        .iter()
        .map(|stage| (stage.id.as_str(), stage))
        .collect::<BTreeMap<_, _>>();
    let stage_by_node_id = run
        .stages
        .iter()
        .map(|stage| (stage.node_id.clone(), format!("stage:{}", stage.id)))
        .collect::<BTreeMap<_, _>>();

    // Workflow nodes with at least one incoming transition; the rest are
    // treated as entry stages below.
    let incoming_nodes = run
        .transitions
        .iter()
        .map(|transition| transition.to_node_id.clone())
        .collect::<BTreeSet<_>>();

    for stage in &run.stages {
        let graph_node_id = stage_node_ids
            .get(&stage.id)
            .cloned()
            .unwrap_or_else(|| format!("stage:{}", stage.id));
        append_action_graph_node(
            &mut action_graph_nodes,
            RunActionGraphNodeRecord {
                id: graph_node_id.clone(),
                label: stage.node_id.clone(),
                kind: action_graph_kind_for_stage(stage).to_string(),
                status: stage.status.clone(),
                outcome: stage.outcome.clone(),
                trace_id: propagated_trace_id.clone(),
                stage_id: Some(stage.id.clone()),
                node_id: Some(stage.node_id.clone()),
                worker_id: stage
                    .metadata
                    .get("worker_id")
                    .and_then(|value| value.as_str())
                    .map(str::to_string),
                run_id: None,
                run_path: None,
                metadata: stage_node_metadata(stage),
            },
        );
        // Stages nothing transitions into are wired to the entry node.
        if !incoming_nodes.contains(&stage.node_id) {
            action_graph_edges.push(RunActionGraphEdgeRecord {
                from_id: entry_node_id.clone(),
                to_id: graph_node_id.clone(),
                kind: if trigger_event.is_some() {
                    ACTION_GRAPH_EDGE_KIND_TRIGGER_DISPATCH.to_string()
                } else {
                    ACTION_GRAPH_EDGE_KIND_ENTRY.to_string()
                },
                label: None,
            });
        }

        // Verification stages (or any stage carrying a verification
        // payload) contribute an outcome record; pass/fail is read from
        // the payload's "pass" then "success" keys, and finally inferred
        // from the stage's own status/outcome.
        if stage.kind == "verify" || stage.verification.is_some() {
            let passed = json_bool(
                stage
                    .verification
                    .as_ref()
                    .and_then(|value| value.get("pass")),
            )
            .or_else(|| {
                json_bool(
                    stage
                        .verification
                        .as_ref()
                        .and_then(|value| value.get("success")),
                )
            })
            .or_else(|| {
                if stage.status == "completed" && stage.outcome == "success" {
                    Some(true)
                } else if stage.status == "failed" || stage.outcome == "failed" {
                    Some(false)
                } else {
                    None
                }
            });
            verification_outcomes.push(RunVerificationOutcomeRecord {
                stage_id: stage.id.clone(),
                node_id: stage.node_id.clone(),
                status: stage.status.clone(),
                passed,
                // Prefer the verification payload; fall back to the
                // stage's non-blank visible text.
                summary: stage
                    .verification
                    .as_ref()
                    .map(compact_json_value)
                    .or_else(|| {
                        stage
                            .visible_text
                            .as_ref()
                            .filter(|value| !value.trim().is_empty())
                            .cloned()
                    }),
            });
        }

        // Per-stage embedded transcript: record a pointer and harvest its
        // compaction events.
        if stage.transcript.is_some() {
            transcript_pointers.push(RunTranscriptPointerRecord {
                id: format!("stage:{}:transcript", stage.id),
                label: format!("Stage {} transcript", stage.node_id),
                kind: "embedded_transcript".to_string(),
                location: format!("run.stages[{}].transcript", stage.node_id),
                path: run.persisted_path.clone(),
                available: true,
            });
            if let Some(transcript) = stage.transcript.as_ref() {
                compaction_events.extend(compaction_events_from_transcript(
                    transcript,
                    Some(&stage.id),
                    Some(&stage.node_id),
                    &format!("run.stages[{}].transcript", stage.node_id),
                    persisted_path,
                ));
            }
        }

        // Agentic stages expose planner metrics through their first
        // artifact payload.
        if let Some(payload) = stage_result_payload(stage) {
            let trace = payload.get("trace");
            let task_ledger = payload
                .get("task_ledger")
                .and_then(task_ledger_summary_from_value);
            let research_facts = task_ledger
                .as_ref()
                .map(|ledger| ledger.observations.clone())
                .unwrap_or_default();
            research_fact_count += research_facts.len();
            let tools_payload = payload.get("tools");
            // Tool names come from tools.calls, with trace.tools_used as
            // a legacy fallback.
            let tools_used = json_string_array(
                tools_payload
                    .and_then(|tools| tools.get("calls"))
                    .or_else(|| trace.and_then(|trace| trace.get("tools_used"))),
            );
            let successful_tools =
                json_string_array(tools_payload.and_then(|tools| tools.get("successful")));
            let planner_round = RunPlannerRoundRecord {
                stage_id: stage.id.clone(),
                node_id: stage.node_id.clone(),
                stage_kind: stage.kind.clone(),
                status: stage.status.clone(),
                outcome: stage.outcome.clone(),
                iteration_count: json_usize(trace.and_then(|trace| trace.get("iterations"))),
                llm_call_count: json_usize(trace.and_then(|trace| trace.get("llm_calls"))),
                tool_execution_count: json_usize(
                    trace.and_then(|trace| trace.get("tool_executions")),
                ),
                tool_rejection_count: json_usize(
                    trace.and_then(|trace| trace.get("tool_rejections")),
                ),
                intervention_count: json_usize(trace.and_then(|trace| trace.get("interventions"))),
                compaction_count: json_usize(trace.and_then(|trace| trace.get("compactions"))),
                native_text_tool_fallback_count: json_usize(
                    trace.and_then(|trace| trace.get("native_text_tool_fallbacks")),
                ),
                native_text_tool_fallback_rejection_count: json_usize(
                    trace.and_then(|trace| trace.get("native_text_tool_fallback_rejections")),
                ),
                empty_completion_retry_count: json_usize(
                    trace.and_then(|trace| trace.get("empty_completion_retries")),
                ),
                tools_used,
                successful_tools,
                ledger_done_rejections: json_usize(payload.get("ledger_done_rejections")),
                task_ledger,
                research_facts,
            };
            // Only keep rounds that carry real agentic signal, so plain
            // stages don't produce empty planner records.
            let has_agentic_detail = planner_round.iteration_count > 0
                || planner_round.llm_call_count > 0
                || planner_round.tool_execution_count > 0
                || planner_round.native_text_tool_fallback_count > 0
                || planner_round.native_text_tool_fallback_rejection_count > 0
                || planner_round.empty_completion_retry_count > 0
                || planner_round.ledger_done_rejections > 0
                || planner_round.task_ledger.is_some()
                || !planner_round.tools_used.is_empty()
                || !planner_round.successful_tools.is_empty();
            if has_agentic_detail {
                planner_rounds.push(planner_round);
            }
        }
    }

    // Transition edges between stage nodes; transitions out of a
    // condition stage are marked as predicate gates.
    for transition in &run.transitions {
        let Some(to_id) = stage_by_node_id.get(&transition.to_node_id).cloned() else {
            continue;
        };
        let from_stage = transition
            .from_stage_id
            .as_deref()
            .and_then(|stage_id| stage_by_id.get(stage_id).copied());
        // Source resolution: stage id, then workflow node id, then the
        // run's root node as a last resort.
        let from_id = transition
            .from_stage_id
            .as_ref()
            .and_then(|stage_id| stage_node_ids.get(stage_id))
            .cloned()
            .or_else(|| {
                transition
                    .from_node_id
                    .as_ref()
                    .and_then(|node_id| stage_by_node_id.get(node_id))
                    .cloned()
            })
            .unwrap_or_else(|| root_node_id.clone());
        action_graph_edges.push(RunActionGraphEdgeRecord {
            from_id,
            to_id,
            kind: if from_stage.is_some_and(|stage| stage.kind == "condition") {
                ACTION_GRAPH_EDGE_KIND_PREDICATE_GATE.to_string()
            } else {
                ACTION_GRAPH_EDGE_KIND_TRANSITION.to_string()
            },
            label: transition.branch.clone(),
        });
    }

    // Child runs become worker nodes (with delegates edges from their
    // parent stage, when known) plus lineage records.
    let worker_lineage = run
        .child_runs
        .iter()
        .map(|child| {
            let worker_node_id = format!("worker:{}", child.worker_id);
            append_action_graph_node(
                &mut action_graph_nodes,
                RunActionGraphNodeRecord {
                    id: worker_node_id.clone(),
                    label: child.worker_name.clone(),
                    kind: ACTION_GRAPH_NODE_KIND_WORKER.to_string(),
                    status: child.status.clone(),
                    outcome: child.status.clone(),
                    trace_id: propagated_trace_id.clone(),
                    stage_id: child.parent_stage_id.clone(),
                    node_id: None,
                    worker_id: Some(child.worker_id.clone()),
                    run_id: child.run_id.clone(),
                    run_path: child.run_path.clone(),
                    metadata: BTreeMap::from([
                        (
                            "worker_name".to_string(),
                            serde_json::json!(child.worker_name),
                        ),
                        ("task".to_string(), serde_json::json!(child.task)),
                    ]),
                },
            );
            if let Some(parent_stage_id) = child.parent_stage_id.as_ref() {
                if let Some(stage_node_id) = stage_node_ids.get(parent_stage_id) {
                    action_graph_edges.push(RunActionGraphEdgeRecord {
                        from_id: stage_node_id.clone(),
                        to_id: worker_node_id,
                        kind: ACTION_GRAPH_EDGE_KIND_DELEGATES.to_string(),
                        label: Some(child.worker_name.clone()),
                    });
                }
            }
            RunWorkerLineageRecord {
                worker_id: child.worker_id.clone(),
                worker_name: child.worker_name.clone(),
                parent_stage_id: child.parent_stage_id.clone(),
                task: child.task.clone(),
                status: child.status.clone(),
                session_id: child.session_id.clone(),
                parent_session_id: child.parent_session_id.clone(),
                run_id: child.run_id.clone(),
                run_path: child.run_path.clone(),
                snapshot_path: child.snapshot_path.clone(),
            }
        })
        .collect::<Vec<_>>();

    // Run-level embedded transcript pointer and its compaction events.
    if run.transcript.is_some() {
        transcript_pointers.push(RunTranscriptPointerRecord {
            id: "run:transcript".to_string(),
            label: "Run transcript".to_string(),
            kind: "embedded_transcript".to_string(),
            location: "run.transcript".to_string(),
            path: run.persisted_path.clone(),
            available: true,
        });
        if let Some(transcript) = run.transcript.as_ref() {
            compaction_events.extend(compaction_events_from_transcript(
                transcript,
                None,
                None,
                "run.transcript",
                persisted_path,
            ));
        }
    }

    // LLM sidecar pointer (availability checked on disk) and the daemon
    // events it contains.
    if let Some(path) = persisted_path {
        if let Some(sidecar_path) = llm_transcript_sidecar_path(path) {
            transcript_pointers.push(RunTranscriptPointerRecord {
                id: "run:llm_transcript".to_string(),
                label: "LLM transcript sidecar".to_string(),
                kind: "llm_jsonl".to_string(),
                location: "run sidecar".to_string(),
                path: Some(sidecar_path.to_string_lossy().into_owned()),
                available: sidecar_path.exists(),
            });
        }
        daemon_events.extend(daemon_events_from_sidecar(path));
    }

    RunObservabilityRecord {
        schema_version: 4,
        planner_rounds,
        research_fact_count,
        action_graph_nodes,
        action_graph_edges,
        worker_lineage,
        verification_outcomes,
        transcript_pointers,
        compaction_events,
        daemon_events,
    }
}
1558
/// Recomputes the derived observability record and stores it on the run.
fn refresh_run_observability(run: &mut RunRecord, persisted_path: Option<&Path>) {
    run.observability = Some(derive_run_observability(run, persisted_path));
}
1562
/// Parses a VM value into a `RunRecord` and backfills defaults: type
/// name `run_record`, a fresh id, a start timestamp, status `running`,
/// the run's own id as `root_run_id`, and a replay fixture derived from
/// the run. Also merges HITL questions from the active event log, syncs
/// handoffs, and derives observability data when none was persisted.
///
/// # Errors
/// Returns an error when the value does not deserialize as a run record.
pub fn normalize_run_record(value: &VmValue) -> Result<RunRecord, VmError> {
    let mut run: RunRecord = parse_json_payload(vm_value_to_json(value), "run_record")?;
    if run.type_name.is_empty() {
        run.type_name = "run_record".to_string();
    }
    if run.id.is_empty() {
        run.id = new_id("run");
    }
    if run.started_at.is_empty() {
        run.started_at = now_rfc3339();
    }
    if run.status.is_empty() {
        run.status = "running".to_string();
    }
    if run.root_run_id.is_none() {
        run.root_run_id = Some(run.id.clone());
    }
    // Fixture derivation must happen after the id/status defaults above
    // so the fixture reflects the normalized run.
    if run.replay_fixture.is_none() {
        run.replay_fixture = Some(replay_fixture_from_run(&run));
    }
    merge_hitl_questions_from_active_log(&mut run);
    sync_run_handoffs(&mut run);
    if run.observability.is_none() {
        // Clone first: refresh takes &mut run while borrowing the path.
        let persisted_path = run.persisted_path.clone();
        let persisted = persisted_path.as_deref().map(Path::new);
        refresh_run_observability(&mut run, persisted);
    }
    Ok(run)
}
1592
1593pub fn normalize_eval_suite_manifest(value: &VmValue) -> Result<EvalSuiteManifest, VmError> {
1594 let mut manifest: EvalSuiteManifest = parse_json_value(value)?;
1595 if manifest.type_name.is_empty() {
1596 manifest.type_name = "eval_suite_manifest".to_string();
1597 }
1598 if manifest.id.is_empty() {
1599 manifest.id = new_id("eval_suite");
1600 }
1601 Ok(manifest)
1602}
1603
1604pub fn load_eval_suite_manifest(path: &Path) -> Result<EvalSuiteManifest, VmError> {
1605 let content = std::fs::read_to_string(path)
1606 .map_err(|e| VmError::Runtime(format!("failed to read eval suite manifest: {e}")))?;
1607 let mut manifest: EvalSuiteManifest = serde_json::from_str(&content)
1608 .map_err(|e| VmError::Runtime(format!("failed to parse eval suite manifest: {e}")))?;
1609 if manifest.base_dir.is_none() {
1610 manifest.base_dir = path.parent().map(|parent| parent.display().to_string());
1611 }
1612 Ok(manifest)
1613}
1614
1615pub fn load_eval_pack_manifest(path: &Path) -> Result<EvalPackManifest, VmError> {
1616 let content = std::fs::read_to_string(path)
1617 .map_err(|e| VmError::Runtime(format!("failed to read eval pack manifest: {e}")))?;
1618 let mut manifest: EvalPackManifest =
1619 if path.extension().and_then(|ext| ext.to_str()) == Some("json") {
1620 serde_json::from_str(&content)
1621 .map_err(|e| VmError::Runtime(format!("failed to parse eval pack JSON: {e}")))?
1622 } else {
1623 toml::from_str(&content)
1624 .map_err(|e| VmError::Runtime(format!("failed to parse eval pack TOML: {e}")))?
1625 };
1626 normalize_eval_pack_manifest(&mut manifest);
1627 if manifest.base_dir.is_none() {
1628 manifest.base_dir = path.parent().map(|parent| parent.display().to_string());
1629 }
1630 Ok(manifest)
1631}
1632
1633fn normalize_eval_pack_manifest(manifest: &mut EvalPackManifest) {
1634 if manifest.version == 0 {
1635 manifest.version = 1;
1636 }
1637 if manifest.id.is_empty() {
1638 manifest.id = manifest
1639 .name
1640 .clone()
1641 .filter(|name| !name.trim().is_empty())
1642 .unwrap_or_else(|| new_id("eval_pack"));
1643 }
1644}
1645
1646fn load_replay_fixture(path: &Path) -> Result<ReplayFixture, VmError> {
1647 let content = std::fs::read_to_string(path)
1648 .map_err(|e| VmError::Runtime(format!("failed to read replay fixture: {e}")))?;
1649 serde_json::from_str(&content)
1650 .map_err(|e| VmError::Runtime(format!("failed to parse replay fixture: {e}")))
1651}
1652
1653fn load_run_record_from_fixture_ref(
1654 fixture: &EvalPackFixtureRef,
1655 base_dir: Option<&Path>,
1656) -> Result<RunRecord, VmError> {
1657 if let Some(inline) = &fixture.inline {
1658 let run: RunRecord = serde_json::from_value(inline.clone())
1659 .map_err(|e| VmError::Runtime(format!("failed to parse inline run record: {e}")))?;
1660 return Ok(run);
1661 }
1662 let path = fixture.path.as_deref().ok_or_else(|| {
1663 VmError::Runtime(format!(
1664 "fixture '{}' is missing path or inline run",
1665 fixture.id
1666 ))
1667 })?;
1668 load_run_record(&resolve_manifest_path(base_dir, path))
1669}
1670
1671fn load_replay_fixture_from_ref(
1672 fixture: &EvalPackFixtureRef,
1673 base_dir: Option<&Path>,
1674) -> Result<ReplayFixture, VmError> {
1675 if let Some(inline) = &fixture.inline {
1676 return serde_json::from_value(inline.clone())
1677 .map_err(|e| VmError::Runtime(format!("failed to parse inline replay fixture: {e}")));
1678 }
1679 let path = fixture.path.as_deref().ok_or_else(|| {
1680 VmError::Runtime(format!(
1681 "fixture '{}' is missing path or inline replay fixture",
1682 fixture.id
1683 ))
1684 })?;
1685 load_replay_fixture(&resolve_manifest_path(base_dir, path))
1686}
1687
/// Resolves a manifest-relative path string: absolute paths pass
/// through unchanged, relative ones are joined onto `base_dir` when a
/// base directory is known.
fn resolve_manifest_path(base_dir: Option<&Path>, path: &str) -> PathBuf {
    let candidate = PathBuf::from(path);
    if candidate.is_absolute() {
        return candidate;
    }
    match base_dir {
        Some(base) => base.join(candidate),
        None => candidate,
    }
}
1698
/// Evaluates every case in an eval-suite manifest.
///
/// For each case the run is loaded and checked against its replay
/// fixture (an explicit `fixture_path`, the run's embedded fixture, or
/// one derived from the run). An optional `compare_to` baseline run is
/// diffed against the case's run; any difference fails the case.
///
/// # Errors
/// Fails when a referenced run, fixture, or baseline cannot be loaded.
pub fn evaluate_run_suite_manifest(
    manifest: &EvalSuiteManifest,
) -> Result<ReplayEvalSuiteReport, VmError> {
    let base_dir = manifest.base_dir.as_deref().map(Path::new);
    let mut reports = Vec::new();
    for case in &manifest.cases {
        let run_path = resolve_manifest_path(base_dir, &case.run_path);
        let run = load_run_record(&run_path)?;
        // Fixture preference: explicit path, then the fixture embedded
        // in the run, then one derived from the run itself.
        let fixture = match &case.fixture_path {
            Some(path) => load_replay_fixture(&resolve_manifest_path(base_dir, path))?,
            None => run
                .replay_fixture
                .clone()
                .unwrap_or_else(|| replay_fixture_from_run(&run)),
        };
        let eval = evaluate_run_against_fixture(&run, &fixture);
        let mut pass = eval.pass;
        let mut failures = eval.failures;
        let comparison = match &case.compare_to {
            Some(path) => {
                let baseline_path = resolve_manifest_path(base_dir, path);
                let baseline = load_run_record(&baseline_path)?;
                let diff = diff_run_records(&baseline, &run);
                // Any divergence from the baseline fails the case.
                if !diff.identical {
                    pass = false;
                    failures.push(format!(
                        "run differs from baseline {} with {} stage changes",
                        baseline_path.display(),
                        diff.stage_diffs.len()
                    ));
                }
                Some(diff)
            }
            None => None,
        };
        reports.push(ReplayEvalCaseReport {
            run_id: run.id.clone(),
            workflow_id: run.workflow_id.clone(),
            label: case.label.clone(),
            pass,
            failures,
            stage_count: eval.stage_count,
            source_path: Some(run_path.display().to_string()),
            comparison,
        });
    }
    let total = reports.len();
    let passed = reports.iter().filter(|report| report.pass).count();
    let failed = total.saturating_sub(passed);
    Ok(ReplayEvalSuiteReport {
        pass: failed == 0,
        total,
        passed,
        failed,
        cases: reports,
    })
}
1756
/// Evaluates every case in an eval-pack manifest and aggregates the
/// results into a pack-level report.
///
/// Fixture/rubric references are resolved through the pack's declared
/// `fixtures`/`rubrics` tables (entries with empty ids are ignored).
/// Relative fixture paths resolve against `defaults.fixture_root` when
/// set, otherwise against the manifest directory. Cases carrying
/// `friction_events` are routed to the dedicated friction evaluator.
/// Failures on non-blocking cases are demoted to warnings or
/// informational notes per the case's severity, so only blocking cases
/// can fail the pack overall.
///
/// # Errors
/// Fails when a referenced run, fixture, or baseline cannot be loaded.
pub fn evaluate_eval_pack_manifest(manifest: &EvalPackManifest) -> Result<EvalPackReport, VmError> {
    let base_dir = manifest.base_dir.as_deref().map(Path::new);
    let fixture_base_dir_buf = manifest
        .defaults
        .fixture_root
        .as_deref()
        .map(|root| resolve_manifest_path(base_dir, root));
    let fixture_base_dir = fixture_base_dir_buf.as_deref().or(base_dir);
    let fixtures_by_id: BTreeMap<&str, &EvalPackFixtureRef> = manifest
        .fixtures
        .iter()
        .filter(|fixture| !fixture.id.is_empty())
        .map(|fixture| (fixture.id.as_str(), fixture))
        .collect();
    let rubrics_by_id: BTreeMap<&str, &EvalPackRubric> = manifest
        .rubrics
        .iter()
        .filter(|rubric| !rubric.id.is_empty())
        .map(|rubric| (rubric.id.as_str(), rubric))
        .collect();

    let mut reports = Vec::new();
    for (index, case) in manifest.cases.iter().enumerate() {
        // Cases without an explicit id get a positional one (case_1, ...).
        let case_id = case
            .id
            .clone()
            .filter(|id| !id.trim().is_empty())
            .unwrap_or_else(|| format!("case_{}", index + 1));
        let label = case
            .name
            .clone()
            .or_else(|| case.id.clone())
            .unwrap_or_else(|| case_id.clone());
        let severity = eval_pack_case_severity(manifest, case);
        let blocking = severity == "blocking";
        let mut failures = Vec::new();
        let mut warnings = Vec::new();
        let mut informational = Vec::new();

        // Friction-event cases are evaluated by a dedicated path.
        if case.friction_events.is_some() {
            let report = evaluate_eval_pack_friction_case(
                manifest,
                case,
                &case_id,
                &label,
                &severity,
                blocking,
                base_dir,
                fixture_base_dir,
                &fixtures_by_id,
                &rubrics_by_id,
            )?;
            reports.push(report);
            continue;
        }

        let run = load_eval_pack_case_run(case, base_dir, fixture_base_dir, &fixtures_by_id)?;
        let fixture =
            load_eval_pack_case_fixture(case, base_dir, fixture_base_dir, &fixtures_by_id, &run)?;
        let eval = evaluate_run_against_fixture(&run, &fixture);
        failures.extend(eval.failures);
        // Pack-level thresholds apply first, then case-level ones.
        apply_eval_pack_thresholds(&run, &manifest.defaults.thresholds, &mut failures);
        apply_eval_pack_thresholds(&run, &case.thresholds, &mut failures);

        let comparison = match &case.compare_to {
            Some(path) => {
                let baseline_path = resolve_manifest_path(base_dir, path);
                let baseline = load_run_record(&baseline_path)?;
                let diff = diff_run_records(&baseline, &run);
                if !diff.identical {
                    failures.push(format!(
                        "run differs from baseline {} with {} stage changes",
                        baseline_path.display(),
                        diff.stage_diffs.len()
                    ));
                }
                Some(diff)
            }
            None => None,
        };

        for rubric_id in &case.rubrics {
            let Some(rubric) = rubrics_by_id.get(rubric_id.as_str()) else {
                failures.push(format!("case references unknown rubric '{rubric_id}'"));
                continue;
            };
            apply_eval_pack_rubric(rubric, &run, &mut failures, &mut warnings);
        }

        // Non-blocking cases always pass; their failures are demoted to
        // warnings (severity "warning") or informational notes.
        let pass = failures.is_empty() || !blocking;
        if !failures.is_empty() && !blocking {
            if severity == "warning" {
                warnings.append(&mut failures);
            } else {
                informational.append(&mut failures);
            }
        }
        reports.push(EvalPackCaseReport {
            id: case_id,
            label,
            severity,
            pass,
            blocking,
            run_id: run.id.clone(),
            workflow_id: run.workflow_id.clone(),
            source_path: eval_pack_case_source_path(
                case,
                base_dir,
                fixture_base_dir,
                &fixtures_by_id,
            ),
            stage_count: eval.stage_count,
            failures,
            warnings,
            informational,
            comparison,
        });
    }

    // Aggregate counters for the pack report; only blocking failures can
    // fail the pack as a whole.
    let total = reports.len();
    let blocking_failed = reports
        .iter()
        .filter(|report| report.blocking && !report.failures.is_empty())
        .count();
    let warning_failed = reports
        .iter()
        .filter(|report| !report.warnings.is_empty())
        .count();
    let informational_failed = reports
        .iter()
        .filter(|report| !report.informational.is_empty())
        .count();
    let passed = reports.iter().filter(|report| report.pass).count();
    Ok(EvalPackReport {
        pack_id: manifest.id.clone(),
        pass: blocking_failed == 0,
        total,
        passed,
        failed: total.saturating_sub(passed),
        blocking_failed,
        warning_failed,
        informational_failed,
        cases: reports,
    })
}
1902
/// Evaluates a friction-event case: loads the referenced friction
/// events, generates context-pack suggestions from them, and scores the
/// suggestions against the case's rubrics.
///
/// A case with no rubrics fails when no suggestions were produced at
/// all. As with regular cases, failures on a non-blocking case are
/// demoted to warnings or informational notes. The report uses the
/// synthetic run id "friction_events" to mark it as fixture-driven, and
/// `stage_count` carries the number of loaded events.
#[allow(clippy::too_many_arguments)]
fn evaluate_eval_pack_friction_case(
    manifest: &EvalPackManifest,
    case: &EvalPackCase,
    case_id: &str,
    label: &str,
    severity: &str,
    blocking: bool,
    base_dir: Option<&Path>,
    fixture_base_dir: Option<&Path>,
    fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
    rubrics_by_id: &BTreeMap<&str, &EvalPackRubric>,
) -> Result<EvalPackCaseReport, VmError> {
    let mut failures = Vec::new();
    let mut warnings = Vec::new();
    let mut informational = Vec::new();
    let events =
        load_eval_pack_case_friction_events(case, base_dir, fixture_base_dir, fixtures_by_id)?;
    let options = friction_suggestion_options(case, manifest);
    let suggestions = generate_context_pack_suggestions(&events, &options);

    for rubric_id in &case.rubrics {
        let Some(rubric) = rubrics_by_id.get(rubric_id.as_str()) else {
            failures.push(format!("case references unknown rubric '{rubric_id}'"));
            continue;
        };
        apply_eval_pack_friction_rubric(rubric, &suggestions, &mut failures, &mut warnings);
    }

    // With no rubrics to apply, at least one suggestion must come out.
    if case.rubrics.is_empty() && suggestions.is_empty() {
        failures.push("friction fixture produced no context-pack suggestions".to_string());
    }

    // Non-blocking failures are demoted per severity, mirroring the
    // regular case path.
    let pass = failures.is_empty() || !blocking;
    if !failures.is_empty() && !blocking {
        if severity == "warning" {
            warnings.append(&mut failures);
        } else {
            informational.append(&mut failures);
        }
    }

    Ok(EvalPackCaseReport {
        id: case_id.to_string(),
        label: label.to_string(),
        severity: severity.to_string(),
        pass,
        blocking,
        run_id: "friction_events".to_string(),
        workflow_id: String::new(),
        source_path: eval_pack_case_friction_source_path(
            case,
            base_dir,
            fixture_base_dir,
            fixtures_by_id,
        ),
        stage_count: events.len(),
        failures,
        warnings,
        informational,
        comparison: None,
    })
}
1966
1967fn eval_pack_case_severity(manifest: &EvalPackManifest, case: &EvalPackCase) -> String {
1968 normalize_eval_pack_severity(
1969 case.severity
1970 .as_deref()
1971 .or(case.thresholds.severity.as_deref())
1972 .or(manifest.defaults.severity.as_deref())
1973 .or(manifest.defaults.thresholds.severity.as_deref())
1974 .unwrap_or("blocking"),
1975 )
1976}
1977
/// Canonicalizes a severity label (trimmed, case-insensitive) to one of
/// "blocking", "warning", or "informational"; unrecognized values fall
/// back to "blocking".
fn normalize_eval_pack_severity(value: &str) -> String {
    let normalized = value.trim().to_ascii_lowercase();
    let canonical = if normalized == "warn" || normalized == "warning" {
        "warning"
    } else if normalized == "info" || normalized == "informational" {
        "informational"
    } else {
        "blocking"
    };
    canonical.to_string()
}
1985
1986fn load_eval_pack_case_run(
1987 case: &EvalPackCase,
1988 base_dir: Option<&Path>,
1989 fixture_base_dir: Option<&Path>,
1990 fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
1991) -> Result<RunRecord, VmError> {
1992 if let Some(run_ref) = case.run.as_deref().or(case.run_path.as_deref()) {
1993 if let Some(fixture) = fixtures_by_id.get(run_ref) {
1994 return load_run_record_from_fixture_ref(fixture, fixture_base_dir);
1995 }
1996 return load_run_record(&resolve_manifest_path(base_dir, run_ref));
1997 }
1998 Err(VmError::Runtime(
1999 "eval pack case is missing run or run_path".to_string(),
2000 ))
2001}
2002
2003fn load_eval_pack_case_fixture(
2004 case: &EvalPackCase,
2005 base_dir: Option<&Path>,
2006 fixture_base_dir: Option<&Path>,
2007 fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2008 run: &RunRecord,
2009) -> Result<ReplayFixture, VmError> {
2010 if let Some(fixture_ref) = case.fixture.as_deref().or(case.fixture_path.as_deref()) {
2011 if let Some(fixture) = fixtures_by_id.get(fixture_ref) {
2012 return load_replay_fixture_from_ref(fixture, fixture_base_dir);
2013 }
2014 return load_replay_fixture(&resolve_manifest_path(base_dir, fixture_ref));
2015 }
2016 Ok(run
2017 .replay_fixture
2018 .clone()
2019 .unwrap_or_else(|| replay_fixture_from_run(run)))
2020}
2021
2022fn eval_pack_case_source_path(
2023 case: &EvalPackCase,
2024 base_dir: Option<&Path>,
2025 fixture_base_dir: Option<&Path>,
2026 fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2027) -> Option<String> {
2028 let run_ref = case.run.as_deref().or(case.run_path.as_deref())?;
2029 if let Some(fixture) = fixtures_by_id.get(run_ref) {
2030 return fixture.path.as_ref().map(|path| {
2031 resolve_manifest_path(fixture_base_dir, path)
2032 .display()
2033 .to_string()
2034 });
2035 }
2036 Some(
2037 resolve_manifest_path(base_dir, run_ref)
2038 .display()
2039 .to_string(),
2040 )
2041}
2042
2043fn load_eval_pack_case_friction_events(
2044 case: &EvalPackCase,
2045 base_dir: Option<&Path>,
2046 fixture_base_dir: Option<&Path>,
2047 fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2048) -> Result<Vec<FrictionEvent>, VmError> {
2049 let event_ref = case.friction_events.as_deref().ok_or_else(|| {
2050 VmError::Runtime("eval pack friction case is missing friction_events".to_string())
2051 })?;
2052 if let Some(fixture) = fixtures_by_id.get(event_ref) {
2053 return load_friction_events_from_fixture_ref(fixture, fixture_base_dir);
2054 }
2055 load_friction_events_from_path(&resolve_manifest_path(base_dir, event_ref))
2056}
2057
2058fn load_friction_events_from_fixture_ref(
2059 fixture: &EvalPackFixtureRef,
2060 base_dir: Option<&Path>,
2061) -> Result<Vec<FrictionEvent>, VmError> {
2062 if let Some(inline) = &fixture.inline {
2063 return normalize_friction_events_json(inline.clone());
2064 }
2065 let path = fixture.path.as_deref().ok_or_else(|| {
2066 VmError::Runtime(format!(
2067 "fixture '{}' is missing path or inline friction events",
2068 fixture.id
2069 ))
2070 })?;
2071 load_friction_events_from_path(&resolve_manifest_path(base_dir, path))
2072}
2073
2074fn load_friction_events_from_path(path: &Path) -> Result<Vec<FrictionEvent>, VmError> {
2075 let content = std::fs::read_to_string(path)
2076 .map_err(|e| VmError::Runtime(format!("failed to read friction events fixture: {e}")))?;
2077 let value: serde_json::Value = serde_json::from_str(&content)
2078 .map_err(|e| VmError::Runtime(format!("failed to parse friction events fixture: {e}")))?;
2079 normalize_friction_events_json(value)
2080}
2081
2082fn eval_pack_case_friction_source_path(
2083 case: &EvalPackCase,
2084 base_dir: Option<&Path>,
2085 fixture_base_dir: Option<&Path>,
2086 fixtures_by_id: &BTreeMap<&str, &EvalPackFixtureRef>,
2087) -> Option<String> {
2088 let event_ref = case.friction_events.as_deref()?;
2089 if let Some(fixture) = fixtures_by_id.get(event_ref) {
2090 return fixture.path.as_ref().map(|path| {
2091 resolve_manifest_path(fixture_base_dir, path)
2092 .display()
2093 .to_string()
2094 });
2095 }
2096 Some(
2097 resolve_manifest_path(base_dir, event_ref)
2098 .display()
2099 .to_string(),
2100 )
2101}
2102
2103fn friction_suggestion_options(
2104 case: &EvalPackCase,
2105 manifest: &EvalPackManifest,
2106) -> ContextPackSuggestionOptions {
2107 let min_occurrences = case
2108 .metadata
2109 .get("min_occurrences")
2110 .or_else(|| manifest.metadata.get("min_occurrences"))
2111 .and_then(|value| value.as_u64())
2112 .unwrap_or(2) as usize;
2113 let owner = case
2114 .metadata
2115 .get("owner")
2116 .or_else(|| manifest.metadata.get("owner"))
2117 .and_then(|value| value.as_str())
2118 .map(str::to_string)
2119 .or_else(|| {
2120 manifest
2121 .package
2122 .as_ref()
2123 .and_then(|package| package.name.clone())
2124 });
2125 ContextPackSuggestionOptions {
2126 min_occurrences,
2127 owner,
2128 }
2129}
2130
2131fn apply_eval_pack_thresholds(
2132 run: &RunRecord,
2133 thresholds: &EvalPackThresholds,
2134 failures: &mut Vec<String>,
2135) {
2136 if let Some(max_stage_count) = thresholds.max_stage_count {
2137 if run.stages.len() > max_stage_count {
2138 failures.push(format!(
2139 "stage count {} exceeds threshold {}",
2140 run.stages.len(),
2141 max_stage_count
2142 ));
2143 }
2144 }
2145 if let Some(max_latency_ms) = thresholds.max_latency_ms {
2146 let actual = run
2147 .usage
2148 .as_ref()
2149 .map(|usage| usage.total_duration_ms)
2150 .unwrap_or_default();
2151 if actual > max_latency_ms {
2152 failures.push(format!(
2153 "latency {actual}ms exceeds threshold {max_latency_ms}ms"
2154 ));
2155 }
2156 }
2157 if let Some(max_cost_usd) = thresholds.max_cost_usd {
2158 let actual = run
2159 .usage
2160 .as_ref()
2161 .map(|usage| usage.total_cost)
2162 .unwrap_or_default();
2163 if actual > max_cost_usd {
2164 failures.push(format!(
2165 "cost ${actual:.6} exceeds threshold ${max_cost_usd:.6}"
2166 ));
2167 }
2168 }
2169 if let Some(max_tokens) = thresholds.max_tokens {
2170 let actual = run
2171 .usage
2172 .as_ref()
2173 .map(|usage| usage.input_tokens + usage.output_tokens)
2174 .unwrap_or_default();
2175 if actual > max_tokens {
2176 failures.push(format!(
2177 "token count {actual} exceeds threshold {max_tokens}"
2178 ));
2179 }
2180 }
2181}
2182
2183fn apply_eval_pack_rubric(
2184 rubric: &EvalPackRubric,
2185 run: &RunRecord,
2186 failures: &mut Vec<String>,
2187 warnings: &mut Vec<String>,
2188) {
2189 match rubric.kind.as_str() {
2190 "" | "deterministic" | "replay" | "budget" | "hitl" | "side-effect" => {
2191 apply_eval_pack_thresholds(run, &rubric.thresholds, failures);
2192 for assertion in &rubric.assertions {
2193 apply_eval_pack_assertion(rubric, assertion, run, failures);
2194 }
2195 }
2196 "llm-judge" | "llm_as_judge" | "judge" => {
2197 let severity = normalize_eval_pack_severity(
2198 rubric.thresholds.severity.as_deref().unwrap_or("blocking"),
2199 );
2200 let message = format!(
2201 "rubric '{}' requires an external LLM judge and was not run locally",
2202 rubric.id
2203 );
2204 if severity == "blocking" {
2205 failures.push(message);
2206 } else {
2207 warnings.push(message);
2208 }
2209 }
2210 other => warnings.push(format!(
2211 "rubric '{}' has unknown kind '{}' and was not run locally",
2212 rubric.id, other
2213 )),
2214 }
2215}
2216
2217fn apply_eval_pack_friction_rubric(
2218 rubric: &EvalPackRubric,
2219 suggestions: &[super::ContextPackSuggestion],
2220 failures: &mut Vec<String>,
2221 warnings: &mut Vec<String>,
2222) {
2223 match rubric.kind.as_str() {
2224 "" | "deterministic" | "friction" | "context-pack-suggestion" => {
2225 let mut expectations = Vec::new();
2226 for assertion in &rubric.assertions {
2227 match assertion.kind.as_str() {
2228 "context-pack-suggestion" | "context_pack_suggestion" | "suggestion" => {
2229 let expectation = context_pack_expectation_from_assertion(assertion);
2230 expectations.push(expectation);
2231 }
2232 other => failures.push(format!(
2233 "rubric '{}' has unsupported friction assertion kind '{}'",
2234 rubric.id, other
2235 )),
2236 }
2237 }
2238 failures.extend(evaluate_context_pack_suggestion_expectations(
2239 suggestions,
2240 &expectations,
2241 ));
2242 }
2243 other => warnings.push(format!(
2244 "rubric '{}' has unknown friction kind '{}' and was not run locally",
2245 rubric.id, other
2246 )),
2247 }
2248}
2249
2250fn context_pack_expectation_from_assertion(
2251 assertion: &EvalPackAssertion,
2252) -> ContextPackSuggestionExpectation {
2253 let expected = assertion
2254 .expected
2255 .as_ref()
2256 .and_then(|value| value.as_object());
2257 let expected_string = assertion.expected.as_ref().and_then(|value| value.as_str());
2258 ContextPackSuggestionExpectation {
2259 min_suggestions: expected
2260 .and_then(|map| map.get("min_suggestions"))
2261 .and_then(|value| value.as_u64())
2262 .map(|value| value as usize),
2263 recommended_artifact: expected
2264 .and_then(|map| map.get("recommended_artifact"))
2265 .and_then(|value| value.as_str())
2266 .map(str::to_string)
2267 .or_else(|| expected_string.map(str::to_string)),
2268 title_contains: assertion.contains.clone().or_else(|| {
2269 expected
2270 .and_then(|map| map.get("title_contains"))
2271 .and_then(|value| value.as_str())
2272 .map(str::to_string)
2273 }),
2274 manifest_name_contains: expected
2275 .and_then(|map| map.get("manifest_name_contains"))
2276 .and_then(|value| value.as_str())
2277 .map(str::to_string),
2278 required_capability: expected
2279 .and_then(|map| map.get("required_capability"))
2280 .and_then(|value| value.as_str())
2281 .map(str::to_string),
2282 required_output_slot: expected
2283 .and_then(|map| map.get("required_output_slot"))
2284 .and_then(|value| value.as_str())
2285 .map(str::to_string),
2286 }
2287}
2288
/// Apply a single deterministic rubric assertion to `run`, pushing a
/// human-readable message onto `failures` for every expectation that does
/// not hold.
///
/// Assertion kinds accept both hyphenated and underscored spellings. An
/// empty kind is a deliberate no-op; any other unrecognized kind is itself
/// reported as a failure.
fn apply_eval_pack_assertion(
    rubric: &EvalPackRubric,
    assertion: &EvalPackAssertion,
    run: &RunRecord,
    failures: &mut Vec<String>,
) {
    match assertion.kind.as_str() {
        // Overall run status must equal the expected string; a missing or
        // non-string `expected` makes the assertion vacuously pass.
        "run-status" | "run_status" | "status" => {
            let expected = assertion.expected.as_ref().and_then(|value| value.as_str());
            if let Some(expected) = expected {
                if run.status != expected {
                    failures.push(format!(
                        "rubric '{}' expected run status {}, got {}",
                        rubric.id, expected, run.status
                    ));
                }
            }
        }
        // A named stage must exist and carry the expected status; both the
        // stage id and the expected string are required here.
        "stage-status" | "stage_status" => {
            let Some(stage_id) = assertion.stage.as_deref() else {
                failures.push(format!(
                    "rubric '{}' stage-status assertion missing stage",
                    rubric.id
                ));
                return;
            };
            let expected = assertion.expected.as_ref().and_then(|value| value.as_str());
            let Some(expected) = expected else {
                failures.push(format!(
                    "rubric '{}' stage-status assertion missing expected string",
                    rubric.id
                ));
                return;
            };
            match run.stages.iter().find(|stage| stage.node_id == stage_id) {
                Some(stage) if stage.status == expected => {}
                Some(stage) => failures.push(format!(
                    "rubric '{}' expected stage {} status {}, got {}",
                    rubric.id, stage_id, expected, stage.status
                )),
                None => failures.push(format!(
                    "rubric '{}' expected stage {} to exist",
                    rubric.id, stage_id
                )),
            }
        }
        // The needle must appear in the visible text of the named stage, or
        // of any stage when no stage id is given.
        "visible-text-contains" | "visible_text_contains" => {
            let Some(needle) = assertion.contains.as_deref() else {
                failures.push(format!(
                    "rubric '{}' visible-text assertion missing contains",
                    rubric.id
                ));
                return;
            };
            let matched = match assertion.stage.as_deref() {
                Some(stage_id) => run
                    .stages
                    .iter()
                    .find(|stage| stage.node_id == stage_id)
                    .and_then(|stage| stage.visible_text.as_deref())
                    .is_some_and(|text| text.contains(needle)),
                None => run
                    .stages
                    .iter()
                    .filter_map(|stage| stage.visible_text.as_deref())
                    .any(|text| text.contains(needle)),
            };
            if !matched {
                failures.push(format!(
                    "rubric '{}' expected visible text to contain {:?}",
                    rubric.id, needle
                ));
            }
        }
        // At least one HITL question prompt must contain the needle.
        "hitl-question-contains" | "hitl_question_contains" => {
            let Some(needle) = assertion.contains.as_deref() else {
                failures.push(format!(
                    "rubric '{}' HITL assertion missing contains",
                    rubric.id
                ));
                return;
            };
            if !run
                .hitl_questions
                .iter()
                .any(|question| question.prompt.contains(needle))
            {
                failures.push(format!(
                    "rubric '{}' expected HITL question to contain {:?}",
                    rubric.id, needle
                ));
            }
        }
        // Empty kind: intentionally ignored.
        "" => {}
        other => failures.push(format!(
            "rubric '{}' has unsupported assertion kind '{}'",
            rubric.id, other
        )),
    }
}
2389
/// A single Myers-diff operation over two line sequences.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub(crate) enum DiffOp {
    /// Line present in both sequences; index refers to the left ("a") side.
    Equal,
    /// Line removed from the left sequence; index refers to the left side.
    Delete,
    /// Line inserted from the right sequence; index refers to the right
    /// ("b") side.
    Insert,
}

/// Compute a line-level shortest edit script between `a` and `b` using
/// Myers' greedy O((N+M)·D) diff algorithm.
///
/// Returns the script in order as `(op, index)` pairs, where the index
/// points into `a` for `Equal`/`Delete` and into `b` for `Insert`.
pub(crate) fn myers_diff(a: &[&str], b: &[&str]) -> Vec<(DiffOp, usize)> {
    let n = a.len() as isize;
    let m = b.len() as isize;
    // Trivial cases: one side empty means the diff is pure inserts/deletes.
    if n == 0 && m == 0 {
        return Vec::new();
    }
    if n == 0 {
        return (0..m as usize).map(|j| (DiffOp::Insert, j)).collect();
    }
    if m == 0 {
        return (0..n as usize).map(|i| (DiffOp::Delete, i)).collect();
    }

    // `v[k + offset]` holds the furthest x reached on diagonal k = x - y.
    let max_d = (n + m) as usize;
    let offset = max_d as isize;
    let v_size = 2 * max_d + 1;
    let mut v = vec![0isize; v_size];
    // Snapshot of `v` taken before each depth d, used for backtracking.
    let mut trace: Vec<Vec<isize>> = Vec::new();

    'outer: for d in 0..=max_d as isize {
        trace.push(v.clone());
        let mut new_v = v.clone();
        for k in (-d..=d).step_by(2) {
            let ki = (k + offset) as usize;
            // Step from whichever neighboring diagonal reached further:
            // take from k+1 (move down / insert) or k-1 (move right / delete).
            let mut x = if k == -d || (k != d && v[ki - 1] < v[ki + 1]) {
                v[ki + 1]
            } else {
                v[ki - 1] + 1
            };
            let mut y = x - k;
            // Follow the "snake" of matching lines as far as possible.
            while x < n && y < m && a[x as usize] == b[y as usize] {
                x += 1;
                y += 1;
            }
            new_v[ki] = x;
            if x >= n && y >= m {
                break 'outer;
            }
        }
        v = new_v;
    }

    // Backtrack through the recorded snapshots to reconstruct the script
    // (emitted in reverse, then flipped at the end).
    let mut ops: Vec<(DiffOp, usize)> = Vec::new();
    let mut x = n;
    let mut y = m;
    for d in (1..trace.len() as isize).rev() {
        let k = x - y;
        let v_prev = &trace[d as usize];
        let prev_k = if k == -d
            || (k != d && v_prev[(k - 1 + offset) as usize] < v_prev[(k + 1 + offset) as usize])
        {
            k + 1
        } else {
            k - 1
        };
        let prev_x = v_prev[(prev_k + offset) as usize];
        let prev_y = prev_x - prev_k;

        // Matching lines along the snake between the two depths.
        while x > prev_x && y > prev_y {
            x -= 1;
            y -= 1;
            ops.push((DiffOp::Equal, x as usize));
        }
        if prev_k < k {
            x -= 1;
            ops.push((DiffOp::Delete, x as usize));
        } else {
            y -= 1;
            ops.push((DiffOp::Insert, y as usize));
        }
    }
    // Any remaining prefix is a common snake reached at depth 0.
    while x > 0 && y > 0 {
        x -= 1;
        y -= 1;
        ops.push((DiffOp::Equal, x as usize));
    }
    ops.reverse();
    ops
}
2482
2483pub fn render_unified_diff(path: Option<&str>, before: &str, after: &str) -> String {
2484 let before_lines: Vec<&str> = before.lines().collect();
2485 let after_lines: Vec<&str> = after.lines().collect();
2486 let ops = myers_diff(&before_lines, &after_lines);
2487
2488 let mut diff = String::new();
2489 let file = path.unwrap_or("artifact");
2490 diff.push_str(&format!("--- a/{file}\n+++ b/{file}\n"));
2491 for &(op, idx) in &ops {
2492 match op {
2493 DiffOp::Equal => diff.push_str(&format!(" {}\n", before_lines[idx])),
2494 DiffOp::Delete => diff.push_str(&format!("-{}\n", before_lines[idx])),
2495 DiffOp::Insert => diff.push_str(&format!("+{}\n", after_lines[idx])),
2496 }
2497 }
2498 diff
2499}
2500
2501pub fn save_run_record(run: &RunRecord, path: Option<&str>) -> Result<String, VmError> {
2502 let path = path
2503 .map(PathBuf::from)
2504 .unwrap_or_else(|| default_run_dir().join(format!("{}.json", run.id)));
2505 let mut materialized = run.clone();
2506 merge_hitl_questions_from_active_log(&mut materialized);
2507 if materialized.replay_fixture.is_none() {
2508 materialized.replay_fixture = Some(replay_fixture_from_run(&materialized));
2509 }
2510 materialized.persisted_path = Some(path.to_string_lossy().into_owned());
2511 sync_run_handoffs(&mut materialized);
2512 refresh_run_observability(&mut materialized, Some(&path));
2513 if let Some(parent) = path.parent() {
2514 std::fs::create_dir_all(parent)
2515 .map_err(|e| VmError::Runtime(format!("failed to create run directory: {e}")))?;
2516 }
2517 let json = serde_json::to_string_pretty(&materialized)
2518 .map_err(|e| VmError::Runtime(format!("failed to encode run record: {e}")))?;
2519 let tmp_path = path.with_extension("json.tmp");
2521 std::fs::write(&tmp_path, &json)
2522 .map_err(|e| VmError::Runtime(format!("failed to persist run record: {e}")))?;
2523 std::fs::rename(&tmp_path, &path).map_err(|e| {
2524 let _ = std::fs::write(&path, &json);
2526 VmError::Runtime(format!("failed to finalize run record: {e}"))
2527 })?;
2528 if let Some(observability) = materialized.observability.as_ref() {
2529 publish_action_graph_event(&materialized, observability, &path);
2530 }
2531 Ok(path.to_string_lossy().into_owned())
2532}
2533
2534pub fn load_run_record(path: &Path) -> Result<RunRecord, VmError> {
2535 let content = std::fs::read_to_string(path)
2536 .map_err(|e| VmError::Runtime(format!("failed to read run record: {e}")))?;
2537 let mut run: RunRecord = serde_json::from_str(&content)
2538 .map_err(|e| VmError::Runtime(format!("failed to parse run record: {e}")))?;
2539 if run.replay_fixture.is_none() {
2540 run.replay_fixture = Some(replay_fixture_from_run(&run));
2541 }
2542 run.persisted_path
2543 .get_or_insert_with(|| path.to_string_lossy().into_owned());
2544 sync_run_handoffs(&mut run);
2545 refresh_run_observability(&mut run, Some(path));
2546 Ok(run)
2547}
2548
2549pub fn replay_fixture_from_run(run: &RunRecord) -> ReplayFixture {
2550 ReplayFixture {
2551 type_name: "replay_fixture".to_string(),
2552 id: new_id("fixture"),
2553 source_run_id: run.id.clone(),
2554 workflow_id: run.workflow_id.clone(),
2555 workflow_name: run.workflow_name.clone(),
2556 created_at: now_rfc3339(),
2557 eval_kind: Some("replay".to_string()),
2558 clarifying_question: None,
2559 expected_status: run.status.clone(),
2560 stage_assertions: run
2561 .stages
2562 .iter()
2563 .map(|stage| ReplayStageAssertion {
2564 node_id: stage.node_id.clone(),
2565 expected_status: stage.status.clone(),
2566 expected_outcome: stage.outcome.clone(),
2567 expected_branch: stage.branch.clone(),
2568 required_artifact_kinds: stage
2569 .artifacts
2570 .iter()
2571 .map(|artifact| artifact.kind.clone())
2572 .collect(),
2573 visible_text_contains: stage
2574 .visible_text
2575 .as_ref()
2576 .filter(|text| !text.is_empty())
2577 .map(|text| text.chars().take(80).collect()),
2578 })
2579 .collect(),
2580 }
2581}
2582
/// Evaluate a run against a replay fixture, collecting every mismatch into
/// the returned report's `failures` (the report passes when that list is
/// empty).
///
/// Fixtures with `eval_kind == "clarifying_question"` are delegated to the
/// dedicated clarifying-question evaluator. Otherwise the run status and
/// every stage assertion (status, outcome, branch, required artifact kinds,
/// visible-text snippet) are checked; a stage referenced by an assertion but
/// absent from the run is itself a failure.
pub fn evaluate_run_against_fixture(run: &RunRecord, fixture: &ReplayFixture) -> ReplayEvalReport {
    if fixture.eval_kind.as_deref() == Some("clarifying_question") {
        return evaluate_clarifying_question(run, fixture);
    }
    let mut failures = Vec::new();
    if run.status != fixture.expected_status {
        failures.push(format!(
            "run status mismatch: expected {}, got {}",
            fixture.expected_status, run.status
        ));
    }
    // Index stages by node id so each assertion is a single lookup.
    let stages_by_id: BTreeMap<&str, &RunStageRecord> =
        run.stages.iter().map(|s| (s.node_id.as_str(), s)).collect();
    for assertion in &fixture.stage_assertions {
        let Some(stage) = stages_by_id.get(assertion.node_id.as_str()) else {
            failures.push(format!("missing stage {}", assertion.node_id));
            continue;
        };
        if stage.status != assertion.expected_status {
            failures.push(format!(
                "stage {} status mismatch: expected {}, got {}",
                assertion.node_id, assertion.expected_status, stage.status
            ));
        }
        if stage.outcome != assertion.expected_outcome {
            failures.push(format!(
                "stage {} outcome mismatch: expected {}, got {}",
                assertion.node_id, assertion.expected_outcome, stage.outcome
            ));
        }
        if stage.branch != assertion.expected_branch {
            failures.push(format!(
                "stage {} branch mismatch: expected {:?}, got {:?}",
                assertion.node_id, assertion.expected_branch, stage.branch
            ));
        }
        // Every required artifact kind must appear among the stage's artifacts.
        for required_kind in &assertion.required_artifact_kinds {
            if !stage
                .artifacts
                .iter()
                .any(|artifact| &artifact.kind == required_kind)
            {
                failures.push(format!(
                    "stage {} missing artifact kind {}",
                    assertion.node_id, required_kind
                ));
            }
        }
        if let Some(snippet) = &assertion.visible_text_contains {
            let actual = stage.visible_text.clone().unwrap_or_default();
            if !actual.contains(snippet) {
                failures.push(format!(
                    "stage {} visible text does not contain expected snippet {:?}",
                    assertion.node_id, snippet
                ));
            }
        }
    }

    ReplayEvalReport {
        pass: failures.is_empty(),
        failures,
        stage_count: run.stages.len(),
    }
}
2648
/// Evaluate a run against a clarifying-question fixture.
///
/// Checks, in order: the run status, that the number of HITL questions falls
/// within the fixture's min/max bounds (derived via
/// `clarifying_min_questions` / `clarifying_max_questions` — defaults not
/// visible here), and — when the spec constrains question content at all —
/// that at least one question matches. A question matches when its
/// normalized text equals the expected question (if set), appears in the
/// accepted list (if non-empty), contains every required term, and contains
/// no forbidden term. All text comparison goes through
/// `normalize_question_text`.
fn evaluate_clarifying_question(run: &RunRecord, fixture: &ReplayFixture) -> ReplayEvalReport {
    let mut failures = Vec::new();
    let spec = fixture.clarifying_question.clone().unwrap_or_default();
    let min_questions = clarifying_min_questions(&spec);
    let max_questions = clarifying_max_questions(&spec);
    let questions = &run.hitl_questions;

    if run.status != fixture.expected_status {
        failures.push(format!(
            "run status mismatch: expected {}, got {}",
            fixture.expected_status, run.status
        ));
    }
    if questions.len() < min_questions {
        failures.push(format!(
            "expected at least {min_questions} clarifying question(s), got {}",
            questions.len()
        ));
    }
    if questions.len() > max_questions {
        failures.push(format!(
            "expected at most {max_questions} clarifying question(s), got {}",
            questions.len()
        ));
    }

    // Pre-normalize every spec string once so the per-question closure below
    // only does containment/equality checks.
    let normalized_expected = spec
        .expected_question
        .as_deref()
        .map(normalize_question_text);
    let normalized_accepted = spec
        .accepted_questions
        .iter()
        .map(|question| normalize_question_text(question))
        .collect::<Vec<_>>();
    let required_terms = spec
        .required_terms
        .iter()
        .map(|term| normalize_question_text(term))
        .collect::<Vec<_>>();
    let forbidden_terms = spec
        .forbidden_terms
        .iter()
        .map(|term| normalize_question_text(term))
        .collect::<Vec<_>>();

    let matched = questions.iter().any(|question| {
        let normalized = normalize_question_text(&question.prompt);
        // Expected/accepted constraints are skipped when unset/empty.
        let matches_expected = normalized_expected
            .as_ref()
            .is_none_or(|expected| &normalized == expected)
            && (normalized_accepted.is_empty()
                || normalized_accepted
                    .iter()
                    .any(|candidate| candidate == &normalized));
        let has_required_terms = required_terms
            .iter()
            .all(|term| normalized.contains(term.as_str()));
        let avoids_forbidden_terms = forbidden_terms
            .iter()
            .all(|term| !normalized.contains(term.as_str()));
        matches_expected && has_required_terms && avoids_forbidden_terms
    });

    // Only fail on "no match" when there are questions to match AND the spec
    // actually constrains content; an unconstrained spec never fails here.
    if !questions.is_empty()
        && (!normalized_accepted.is_empty()
            || normalized_expected.is_some()
            || !required_terms.is_empty()
            || !forbidden_terms.is_empty())
        && !matched
    {
        failures.push(format!(
            "no clarifying question matched fixture; actual questions: {}",
            questions
                .iter()
                .map(|question| format!("{:?}", question.prompt))
                .collect::<Vec<_>>()
                .join(", ")
        ));
    }

    ReplayEvalReport {
        pass: failures.is_empty(),
        failures,
        stage_count: run.stages.len(),
    }
}
2736
2737pub fn evaluate_run_suite(
2738 cases: Vec<(RunRecord, ReplayFixture, Option<String>)>,
2739) -> ReplayEvalSuiteReport {
2740 let mut reports = Vec::new();
2741 for (run, fixture, source_path) in cases {
2742 let report = evaluate_run_against_fixture(&run, &fixture);
2743 reports.push(ReplayEvalCaseReport {
2744 run_id: run.id.clone(),
2745 workflow_id: run.workflow_id.clone(),
2746 label: None,
2747 pass: report.pass,
2748 failures: report.failures,
2749 stage_count: report.stage_count,
2750 source_path,
2751 comparison: None,
2752 });
2753 }
2754 let total = reports.len();
2755 let passed = reports.iter().filter(|report| report.pass).count();
2756 let failed = total.saturating_sub(passed);
2757 ReplayEvalSuiteReport {
2758 pass: failed == 0,
2759 total,
2760 passed,
2761 failed,
2762 cases: reports,
2763 }
2764}
2765
2766pub fn diff_run_records(left: &RunRecord, right: &RunRecord) -> RunDiffReport {
2767 let mut stage_diffs = Vec::new();
2768 let mut all_node_ids = BTreeSet::new();
2769 let left_by_id: BTreeMap<&str, &RunStageRecord> = left
2770 .stages
2771 .iter()
2772 .map(|s| (s.node_id.as_str(), s))
2773 .collect();
2774 let right_by_id: BTreeMap<&str, &RunStageRecord> = right
2775 .stages
2776 .iter()
2777 .map(|s| (s.node_id.as_str(), s))
2778 .collect();
2779 all_node_ids.extend(left_by_id.keys().copied());
2780 all_node_ids.extend(right_by_id.keys().copied());
2781
2782 for node_id in all_node_ids {
2783 let left_stage = left_by_id.get(node_id).copied();
2784 let right_stage = right_by_id.get(node_id).copied();
2785 match (left_stage, right_stage) {
2786 (Some(_), None) => stage_diffs.push(RunStageDiffRecord {
2787 node_id: node_id.to_string(),
2788 change: "removed".to_string(),
2789 details: vec!["stage missing from right run".to_string()],
2790 }),
2791 (None, Some(_)) => stage_diffs.push(RunStageDiffRecord {
2792 node_id: node_id.to_string(),
2793 change: "added".to_string(),
2794 details: vec!["stage missing from left run".to_string()],
2795 }),
2796 (Some(left_stage), Some(right_stage)) => {
2797 let mut details = Vec::new();
2798 if left_stage.status != right_stage.status {
2799 details.push(format!(
2800 "status: {} -> {}",
2801 left_stage.status, right_stage.status
2802 ));
2803 }
2804 if left_stage.outcome != right_stage.outcome {
2805 details.push(format!(
2806 "outcome: {} -> {}",
2807 left_stage.outcome, right_stage.outcome
2808 ));
2809 }
2810 if left_stage.branch != right_stage.branch {
2811 details.push(format!(
2812 "branch: {:?} -> {:?}",
2813 left_stage.branch, right_stage.branch
2814 ));
2815 }
2816 if left_stage.produced_artifact_ids.len() != right_stage.produced_artifact_ids.len()
2817 {
2818 details.push(format!(
2819 "produced_artifacts: {} -> {}",
2820 left_stage.produced_artifact_ids.len(),
2821 right_stage.produced_artifact_ids.len()
2822 ));
2823 }
2824 if left_stage.artifacts.len() != right_stage.artifacts.len() {
2825 details.push(format!(
2826 "artifact_records: {} -> {}",
2827 left_stage.artifacts.len(),
2828 right_stage.artifacts.len()
2829 ));
2830 }
2831 if !details.is_empty() {
2832 stage_diffs.push(RunStageDiffRecord {
2833 node_id: node_id.to_string(),
2834 change: "changed".to_string(),
2835 details,
2836 });
2837 }
2838 }
2839 (None, None) => {}
2840 }
2841 }
2842
2843 let mut tool_diffs = Vec::new();
2844 let left_tools: std::collections::BTreeMap<(String, String), &ToolCallRecord> = left
2845 .tool_recordings
2846 .iter()
2847 .map(|r| ((r.tool_name.clone(), r.args_hash.clone()), r))
2848 .collect();
2849 let right_tools: std::collections::BTreeMap<(String, String), &ToolCallRecord> = right
2850 .tool_recordings
2851 .iter()
2852 .map(|r| ((r.tool_name.clone(), r.args_hash.clone()), r))
2853 .collect();
2854 let all_tool_keys: std::collections::BTreeSet<_> = left_tools
2855 .keys()
2856 .chain(right_tools.keys())
2857 .cloned()
2858 .collect();
2859 for key in &all_tool_keys {
2860 let l = left_tools.get(key);
2861 let r = right_tools.get(key);
2862 let result_changed = match (l, r) {
2863 (Some(a), Some(b)) => a.result != b.result,
2864 _ => true,
2865 };
2866 if result_changed {
2867 tool_diffs.push(ToolCallDiffRecord {
2868 tool_name: key.0.clone(),
2869 args_hash: key.1.clone(),
2870 result_changed,
2871 left_result: l.map(|t| t.result.clone()),
2872 right_result: r.map(|t| t.result.clone()),
2873 });
2874 }
2875 }
2876
2877 let left_observability = left.observability.clone().unwrap_or_else(|| {
2878 derive_run_observability(left, left.persisted_path.as_deref().map(Path::new))
2879 });
2880 let right_observability = right.observability.clone().unwrap_or_else(|| {
2881 derive_run_observability(right, right.persisted_path.as_deref().map(Path::new))
2882 });
2883 let mut observability_diffs = Vec::new();
2884
2885 let left_workers = left_observability
2886 .worker_lineage
2887 .iter()
2888 .map(|worker| {
2889 (
2890 worker.worker_id.clone(),
2891 (
2892 worker.status.clone(),
2893 worker.run_id.clone(),
2894 worker.run_path.clone(),
2895 ),
2896 )
2897 })
2898 .collect::<BTreeMap<_, _>>();
2899 let right_workers = right_observability
2900 .worker_lineage
2901 .iter()
2902 .map(|worker| {
2903 (
2904 worker.worker_id.clone(),
2905 (
2906 worker.status.clone(),
2907 worker.run_id.clone(),
2908 worker.run_path.clone(),
2909 ),
2910 )
2911 })
2912 .collect::<BTreeMap<_, _>>();
2913 let worker_ids = left_workers
2914 .keys()
2915 .chain(right_workers.keys())
2916 .cloned()
2917 .collect::<BTreeSet<_>>();
2918 for worker_id in worker_ids {
2919 match (left_workers.get(&worker_id), right_workers.get(&worker_id)) {
2920 (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
2921 section: "worker_lineage".to_string(),
2922 label: worker_id,
2923 details: vec!["worker missing from right run".to_string()],
2924 }),
2925 (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
2926 section: "worker_lineage".to_string(),
2927 label: worker_id,
2928 details: vec!["worker missing from left run".to_string()],
2929 }),
2930 (Some(left_worker), Some(right_worker)) if left_worker != right_worker => {
2931 let mut details = Vec::new();
2932 if left_worker.0 != right_worker.0 {
2933 details.push(format!("status: {} -> {}", left_worker.0, right_worker.0));
2934 }
2935 if left_worker.1 != right_worker.1 {
2936 details.push(format!(
2937 "run_id: {:?} -> {:?}",
2938 left_worker.1, right_worker.1
2939 ));
2940 }
2941 if left_worker.2 != right_worker.2 {
2942 details.push(format!(
2943 "run_path: {:?} -> {:?}",
2944 left_worker.2, right_worker.2
2945 ));
2946 }
2947 observability_diffs.push(RunObservabilityDiffRecord {
2948 section: "worker_lineage".to_string(),
2949 label: worker_id,
2950 details,
2951 });
2952 }
2953 _ => {}
2954 }
2955 }
2956
2957 let left_rounds = left_observability
2958 .planner_rounds
2959 .iter()
2960 .map(|round| (round.stage_id.clone(), round))
2961 .collect::<BTreeMap<_, _>>();
2962 let right_rounds = right_observability
2963 .planner_rounds
2964 .iter()
2965 .map(|round| (round.stage_id.clone(), round))
2966 .collect::<BTreeMap<_, _>>();
2967 let round_ids = left_rounds
2968 .keys()
2969 .chain(right_rounds.keys())
2970 .cloned()
2971 .collect::<BTreeSet<_>>();
2972 for stage_id in round_ids {
2973 match (left_rounds.get(&stage_id), right_rounds.get(&stage_id)) {
2974 (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
2975 section: "planner_rounds".to_string(),
2976 label: stage_id,
2977 details: vec!["planner summary missing from right run".to_string()],
2978 }),
2979 (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
2980 section: "planner_rounds".to_string(),
2981 label: stage_id,
2982 details: vec!["planner summary missing from left run".to_string()],
2983 }),
2984 (Some(left_round), Some(right_round)) => {
2985 let mut details = Vec::new();
2986 if left_round.iteration_count != right_round.iteration_count {
2987 details.push(format!(
2988 "iterations: {} -> {}",
2989 left_round.iteration_count, right_round.iteration_count
2990 ));
2991 }
2992 if left_round.tool_execution_count != right_round.tool_execution_count {
2993 details.push(format!(
2994 "tool_executions: {} -> {}",
2995 left_round.tool_execution_count, right_round.tool_execution_count
2996 ));
2997 }
2998 if left_round.native_text_tool_fallback_count
2999 != right_round.native_text_tool_fallback_count
3000 {
3001 details.push(format!(
3002 "native_text_tool_fallbacks: {} -> {}",
3003 left_round.native_text_tool_fallback_count,
3004 right_round.native_text_tool_fallback_count
3005 ));
3006 }
3007 if left_round.native_text_tool_fallback_rejection_count
3008 != right_round.native_text_tool_fallback_rejection_count
3009 {
3010 details.push(format!(
3011 "native_text_tool_fallback_rejections: {} -> {}",
3012 left_round.native_text_tool_fallback_rejection_count,
3013 right_round.native_text_tool_fallback_rejection_count
3014 ));
3015 }
3016 if left_round.empty_completion_retry_count
3017 != right_round.empty_completion_retry_count
3018 {
3019 details.push(format!(
3020 "empty_completion_retries: {} -> {}",
3021 left_round.empty_completion_retry_count,
3022 right_round.empty_completion_retry_count
3023 ));
3024 }
3025 if left_round.research_facts != right_round.research_facts {
3026 details.push(format!(
3027 "research_facts: {:?} -> {:?}",
3028 left_round.research_facts, right_round.research_facts
3029 ));
3030 }
3031 let left_deliverables = left_round
3032 .task_ledger
3033 .as_ref()
3034 .map(|ledger| {
3035 ledger
3036 .deliverables
3037 .iter()
3038 .map(|item| format!("{}:{}", item.id, item.status))
3039 .collect::<Vec<_>>()
3040 })
3041 .unwrap_or_default();
3042 let right_deliverables = right_round
3043 .task_ledger
3044 .as_ref()
3045 .map(|ledger| {
3046 ledger
3047 .deliverables
3048 .iter()
3049 .map(|item| format!("{}:{}", item.id, item.status))
3050 .collect::<Vec<_>>()
3051 })
3052 .unwrap_or_default();
3053 if left_deliverables != right_deliverables {
3054 details.push(format!(
3055 "deliverables: {:?} -> {:?}",
3056 left_deliverables, right_deliverables
3057 ));
3058 }
3059 if left_round.successful_tools != right_round.successful_tools {
3060 details.push(format!(
3061 "successful_tools: {:?} -> {:?}",
3062 left_round.successful_tools, right_round.successful_tools
3063 ));
3064 }
3065 if !details.is_empty() {
3066 observability_diffs.push(RunObservabilityDiffRecord {
3067 section: "planner_rounds".to_string(),
3068 label: left_round.node_id.clone(),
3069 details,
3070 });
3071 }
3072 }
3073 _ => {}
3074 }
3075 }
3076
3077 let left_pointers = left_observability
3078 .transcript_pointers
3079 .iter()
3080 .map(|pointer| {
3081 (
3082 pointer.id.clone(),
3083 (
3084 pointer.available,
3085 pointer.path.clone(),
3086 pointer.location.clone(),
3087 ),
3088 )
3089 })
3090 .collect::<BTreeMap<_, _>>();
3091 let right_pointers = right_observability
3092 .transcript_pointers
3093 .iter()
3094 .map(|pointer| {
3095 (
3096 pointer.id.clone(),
3097 (
3098 pointer.available,
3099 pointer.path.clone(),
3100 pointer.location.clone(),
3101 ),
3102 )
3103 })
3104 .collect::<BTreeMap<_, _>>();
3105 let pointer_ids = left_pointers
3106 .keys()
3107 .chain(right_pointers.keys())
3108 .cloned()
3109 .collect::<BTreeSet<_>>();
3110 for pointer_id in pointer_ids {
3111 match (
3112 left_pointers.get(&pointer_id),
3113 right_pointers.get(&pointer_id),
3114 ) {
3115 (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3116 section: "transcript_pointers".to_string(),
3117 label: pointer_id,
3118 details: vec!["pointer missing from right run".to_string()],
3119 }),
3120 (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3121 section: "transcript_pointers".to_string(),
3122 label: pointer_id,
3123 details: vec!["pointer missing from left run".to_string()],
3124 }),
3125 (Some(left_pointer), Some(right_pointer)) if left_pointer != right_pointer => {
3126 observability_diffs.push(RunObservabilityDiffRecord {
3127 section: "transcript_pointers".to_string(),
3128 label: pointer_id,
3129 details: vec![format!(
3130 "pointer: {:?} -> {:?}",
3131 left_pointer, right_pointer
3132 )],
3133 });
3134 }
3135 _ => {}
3136 }
3137 }
3138
3139 let left_compactions = left_observability
3140 .compaction_events
3141 .iter()
3142 .map(|event| {
3143 (
3144 event.id.clone(),
3145 (
3146 event.strategy.clone(),
3147 event.archived_messages,
3148 event.snapshot_asset_id.clone(),
3149 event.available,
3150 ),
3151 )
3152 })
3153 .collect::<BTreeMap<_, _>>();
3154 let right_compactions = right_observability
3155 .compaction_events
3156 .iter()
3157 .map(|event| {
3158 (
3159 event.id.clone(),
3160 (
3161 event.strategy.clone(),
3162 event.archived_messages,
3163 event.snapshot_asset_id.clone(),
3164 event.available,
3165 ),
3166 )
3167 })
3168 .collect::<BTreeMap<_, _>>();
3169 let compaction_ids = left_compactions
3170 .keys()
3171 .chain(right_compactions.keys())
3172 .cloned()
3173 .collect::<BTreeSet<_>>();
3174 for compaction_id in compaction_ids {
3175 match (
3176 left_compactions.get(&compaction_id),
3177 right_compactions.get(&compaction_id),
3178 ) {
3179 (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3180 section: "compaction_events".to_string(),
3181 label: compaction_id,
3182 details: vec!["compaction event missing from right run".to_string()],
3183 }),
3184 (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3185 section: "compaction_events".to_string(),
3186 label: compaction_id,
3187 details: vec!["compaction event missing from left run".to_string()],
3188 }),
3189 (Some(left_event), Some(right_event)) if left_event != right_event => {
3190 observability_diffs.push(RunObservabilityDiffRecord {
3191 section: "compaction_events".to_string(),
3192 label: compaction_id,
3193 details: vec![format!("event: {:?} -> {:?}", left_event, right_event)],
3194 });
3195 }
3196 _ => {}
3197 }
3198 }
3199
3200 let left_daemons = left_observability
3201 .daemon_events
3202 .iter()
3203 .map(|event| {
3204 (
3205 (event.daemon_id.clone(), event.kind, event.timestamp.clone()),
3206 (
3207 event.name.clone(),
3208 event.persist_path.clone(),
3209 event.payload_summary.clone(),
3210 ),
3211 )
3212 })
3213 .collect::<BTreeMap<_, _>>();
3214 let right_daemons = right_observability
3215 .daemon_events
3216 .iter()
3217 .map(|event| {
3218 (
3219 (event.daemon_id.clone(), event.kind, event.timestamp.clone()),
3220 (
3221 event.name.clone(),
3222 event.persist_path.clone(),
3223 event.payload_summary.clone(),
3224 ),
3225 )
3226 })
3227 .collect::<BTreeMap<_, _>>();
3228 let daemon_keys = left_daemons
3229 .keys()
3230 .chain(right_daemons.keys())
3231 .cloned()
3232 .collect::<BTreeSet<_>>();
3233 for daemon_key in daemon_keys {
3234 let label = format!("{}:{:?}:{}", daemon_key.0, daemon_key.1, daemon_key.2);
3235 match (
3236 left_daemons.get(&daemon_key),
3237 right_daemons.get(&daemon_key),
3238 ) {
3239 (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3240 section: "daemon_events".to_string(),
3241 label,
3242 details: vec!["daemon event missing from right run".to_string()],
3243 }),
3244 (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3245 section: "daemon_events".to_string(),
3246 label,
3247 details: vec!["daemon event missing from left run".to_string()],
3248 }),
3249 (Some(left_event), Some(right_event)) if left_event != right_event => {
3250 observability_diffs.push(RunObservabilityDiffRecord {
3251 section: "daemon_events".to_string(),
3252 label,
3253 details: vec![format!("event: {:?} -> {:?}", left_event, right_event)],
3254 });
3255 }
3256 _ => {}
3257 }
3258 }
3259
3260 let left_verification = left_observability
3261 .verification_outcomes
3262 .iter()
3263 .map(|item| (item.stage_id.clone(), item))
3264 .collect::<BTreeMap<_, _>>();
3265 let right_verification = right_observability
3266 .verification_outcomes
3267 .iter()
3268 .map(|item| (item.stage_id.clone(), item))
3269 .collect::<BTreeMap<_, _>>();
3270 let verification_ids = left_verification
3271 .keys()
3272 .chain(right_verification.keys())
3273 .cloned()
3274 .collect::<BTreeSet<_>>();
3275 for stage_id in verification_ids {
3276 match (
3277 left_verification.get(&stage_id),
3278 right_verification.get(&stage_id),
3279 ) {
3280 (Some(_), None) => observability_diffs.push(RunObservabilityDiffRecord {
3281 section: "verification".to_string(),
3282 label: stage_id,
3283 details: vec!["verification missing from right run".to_string()],
3284 }),
3285 (None, Some(_)) => observability_diffs.push(RunObservabilityDiffRecord {
3286 section: "verification".to_string(),
3287 label: stage_id,
3288 details: vec!["verification missing from left run".to_string()],
3289 }),
3290 (Some(left_item), Some(right_item)) if left_item != right_item => {
3291 let mut details = Vec::new();
3292 if left_item.passed != right_item.passed {
3293 details.push(format!(
3294 "passed: {:?} -> {:?}",
3295 left_item.passed, right_item.passed
3296 ));
3297 }
3298 if left_item.summary != right_item.summary {
3299 details.push(format!(
3300 "summary: {:?} -> {:?}",
3301 left_item.summary, right_item.summary
3302 ));
3303 }
3304 observability_diffs.push(RunObservabilityDiffRecord {
3305 section: "verification".to_string(),
3306 label: left_item.node_id.clone(),
3307 details,
3308 });
3309 }
3310 _ => {}
3311 }
3312 }
3313
3314 let left_graph = (
3315 left_observability.action_graph_nodes.len(),
3316 left_observability.action_graph_edges.len(),
3317 );
3318 let right_graph = (
3319 right_observability.action_graph_nodes.len(),
3320 right_observability.action_graph_edges.len(),
3321 );
3322 if left_graph != right_graph {
3323 observability_diffs.push(RunObservabilityDiffRecord {
3324 section: "action_graph".to_string(),
3325 label: "shape".to_string(),
3326 details: vec![format!(
3327 "nodes/edges: {}/{} -> {}/{}",
3328 left_graph.0, left_graph.1, right_graph.0, right_graph.1
3329 )],
3330 });
3331 }
3332
3333 let status_changed = left.status != right.status;
3334 let identical = !status_changed
3335 && stage_diffs.is_empty()
3336 && tool_diffs.is_empty()
3337 && observability_diffs.is_empty()
3338 && left.transitions.len() == right.transitions.len()
3339 && left.artifacts.len() == right.artifacts.len()
3340 && left.checkpoints.len() == right.checkpoints.len();
3341
3342 RunDiffReport {
3343 left_run_id: left.id.clone(),
3344 right_run_id: right.id.clone(),
3345 identical,
3346 status_changed,
3347 left_status: left.status.clone(),
3348 right_status: right.status.clone(),
3349 stage_diffs,
3350 tool_diffs,
3351 observability_diffs,
3352 transition_count_delta: right.transitions.len() as isize - left.transitions.len() as isize,
3353 artifact_count_delta: right.artifacts.len() as isize - left.artifacts.len() as isize,
3354 checkpoint_count_delta: right.checkpoints.len() as isize - left.checkpoints.len() as isize,
3355 }
3356}
3357
3358#[cfg(test)]
3359mod tests {
3360 use super::*;
3361 use std::fs;
3362
3363 fn minimal_run(status: &str) -> RunRecord {
3364 RunRecord {
3365 type_name: "workflow_run".to_string(),
3366 id: "run_1".to_string(),
3367 workflow_id: "workflow_1".to_string(),
3368 status: status.to_string(),
3369 usage: Some(LlmUsageRecord {
3370 total_duration_ms: 12,
3371 total_cost: 0.01,
3372 input_tokens: 3,
3373 output_tokens: 4,
3374 call_count: 1,
3375 models: vec!["mock".to_string()],
3376 }),
3377 replay_fixture: Some(ReplayFixture {
3378 type_name: "replay_fixture".to_string(),
3379 expected_status: "completed".to_string(),
3380 ..ReplayFixture::default()
3381 }),
3382 ..RunRecord::default()
3383 }
3384 }
3385
3386 #[test]
3387 fn eval_pack_manifest_toml_runs_replay_case() {
3388 let temp = tempfile::tempdir().unwrap();
3389 let run_path = temp.path().join("run.json");
3390 fs::write(
3391 &run_path,
3392 serde_json::to_string(&minimal_run("completed")).unwrap(),
3393 )
3394 .unwrap();
3395 let pack_path = temp.path().join("harn.eval.toml");
3396 fs::write(
3397 &pack_path,
3398 r#"
3399version = 1
3400id = "connector-regressions"
3401name = "Connector regressions"
3402
3403[[cases]]
3404id = "webhook"
3405name = "Webhook normalization"
3406run = "run.json"
3407rubrics = ["status"]
3408
3409[[rubrics]]
3410id = "status"
3411kind = "deterministic"
3412
3413[[rubrics.assertions]]
3414kind = "run-status"
3415expected = "completed"
3416"#,
3417 )
3418 .unwrap();
3419
3420 let manifest = load_eval_pack_manifest(&pack_path).unwrap();
3421 let report = evaluate_eval_pack_manifest(&manifest).unwrap();
3422
3423 assert!(report.pass);
3424 assert_eq!(report.total, 1);
3425 assert_eq!(report.cases[0].label, "Webhook normalization");
3426 }
3427
3428 #[test]
3429 fn eval_pack_warning_case_does_not_block() {
3430 let temp = tempfile::tempdir().unwrap();
3431 let run_path = temp.path().join("run.json");
3432 fs::write(
3433 &run_path,
3434 serde_json::to_string(&minimal_run("completed")).unwrap(),
3435 )
3436 .unwrap();
3437 let pack_path = temp.path().join("harn.eval.toml");
3438 fs::write(
3439 &pack_path,
3440 r#"
3441version = 1
3442id = "budgets"
3443
3444[[cases]]
3445id = "latency-budget"
3446run = "run.json"
3447severity = "warning"
3448
3449[cases.thresholds]
3450max-latency-ms = 1
3451"#,
3452 )
3453 .unwrap();
3454
3455 let manifest = load_eval_pack_manifest(&pack_path).unwrap();
3456 let report = evaluate_eval_pack_manifest(&manifest).unwrap();
3457
3458 assert!(report.pass);
3459 assert_eq!(report.warning_failed, 1);
3460 assert!(report.cases[0].warnings[0].contains("latency"));
3461 }
3462
    /// Exercises the friction-events path of an eval pack: instead of a
    /// recorded run, a fixture file of recurring "repeated_query" friction
    /// events feeds a `context-pack-suggestion` assertion.
    #[test]
    fn eval_pack_manifest_runs_friction_context_pack_case() {
        let temp = tempfile::tempdir().unwrap();
        let events_path = temp.path().join("incident-friction.json");
        // Two near-identical repeated_query events sharing the same query,
        // capability, secret ref, and output slot — recurring friction that
        // should yield a context-pack suggestion.
        fs::write(
            &events_path,
            r#"
{
  "events": [
    {
      "kind": "repeated_query",
      "source": "incident-triage",
      "actor": "sre",
      "tool": "splunk",
      "provider": "splunk",
      "redacted_summary": "Checkout incidents need the same Splunk search",
      "recurrence_hints": ["checkout incident queries"],
      "estimated_time_ms": 300000,
      "metadata": {
        "query": "index=checkout service=api error",
        "capability": "splunk.search",
        "secret_ref": "SPLUNK_READ_TOKEN",
        "output_slot": "splunk_errors"
      }
    },
    {
      "kind": "repeated_query",
      "source": "incident-triage",
      "actor": "sre",
      "tool": "splunk",
      "provider": "splunk",
      "redacted_summary": "Checkout incident triage repeated the Splunk search",
      "recurrence_hints": ["checkout incident queries"],
      "estimated_time_ms": 240000,
      "metadata": {
        "query": "index=checkout service=api error",
        "capability": "splunk.search",
        "secret_ref": "SPLUNK_READ_TOKEN",
        "output_slot": "splunk_errors"
      }
    }
  ]
}
"#,
        )
        .unwrap();
        let pack_path = temp.path().join("harn.eval.toml");
        // The manifest wires the fixture into the case via `friction_events`
        // and validates the suggestion's shape through the `context-pack`
        // rubric (min count, artifact kind, capability, output slot).
        fs::write(
            &pack_path,
            r#"
version = 1
id = "team-learning"
name = "Team learning evals"

[[fixtures]]
id = "incident-friction"
kind = "friction-events"
path = "incident-friction.json"

[[cases]]
id = "incident-context-pack"
name = "Incident context pack suggestion"
friction_events = "incident-friction"
rubrics = ["context-pack"]

[[rubrics]]
id = "context-pack"
kind = "friction"

[[rubrics.assertions]]
kind = "context-pack-suggestion"
contains = "incident"
expected = { min_suggestions = 1, recommended_artifact = "context_pack", required_capability = "splunk.search", required_output_slot = "splunk_errors" }
"#,
        )
        .unwrap();

        let manifest = load_eval_pack_manifest(&pack_path).unwrap();
        let report = evaluate_eval_pack_manifest(&manifest).unwrap();

        assert!(report.pass);
        assert_eq!(report.total, 1);
        // Fixture-driven cases report the sentinel run id "friction_events";
        // stage_count matches the two ingested events — presumably one stage
        // per event (TODO confirm against the evaluator).
        assert_eq!(report.cases[0].run_id, "friction_events");
        assert_eq!(report.cases[0].stage_count, 2);
    }
3548}