1use std::collections::{BTreeMap, BTreeSet};
20use std::fmt;
21use std::fs;
22use std::io::{BufRead, BufReader};
23use std::path::{Path, PathBuf};
24
25use serde::{Deserialize, Serialize};
26
27use crate::agent_events::{AgentEvent, PersistedAgentEvent, ToolCallErrorCategory, ToolCallStatus};
28use crate::value::VmError;
29
30#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, Serialize, Deserialize)]
34#[serde(rename_all = "snake_case")]
35pub enum FindingSeverity {
36 Info,
37 Warn,
38 Error,
39}
40
41impl FindingSeverity {
42 pub fn as_str(self) -> &'static str {
43 match self {
44 Self::Info => "info",
45 Self::Warn => "warn",
46 Self::Error => "error",
47 }
48 }
49}
50
51#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, Serialize, Deserialize)]
54#[serde(rename_all = "snake_case")]
55pub enum FindingCategory {
56 ExtraModelCall,
58 InvalidStructuredOutput,
61 RepeatedRead,
65 BadWait,
69 UnsafeAttemptedAction,
73 SkippedVerification,
77 MissingApproval,
80 NonMinimalToolUsage,
82 MissingStateStep,
84 StateOutOfOrder,
86 StateSequenceMismatch,
89 IncompleteTranscript,
92 ForbiddenAction,
95}
96
97impl FindingCategory {
98 pub fn as_str(self) -> &'static str {
99 match self {
100 Self::ExtraModelCall => "extra_model_call",
101 Self::InvalidStructuredOutput => "invalid_structured_output",
102 Self::RepeatedRead => "repeated_read",
103 Self::BadWait => "bad_wait",
104 Self::UnsafeAttemptedAction => "unsafe_attempted_action",
105 Self::SkippedVerification => "skipped_verification",
106 Self::MissingApproval => "missing_approval",
107 Self::NonMinimalToolUsage => "non_minimal_tool_usage",
108 Self::MissingStateStep => "missing_state_step",
109 Self::StateOutOfOrder => "state_out_of_order",
110 Self::StateSequenceMismatch => "state_sequence_mismatch",
111 Self::IncompleteTranscript => "incomplete_transcript",
112 Self::ForbiddenAction => "forbidden_action",
113 }
114 }
115}
116
117#[derive(Clone, Debug, Serialize, Deserialize)]
121pub struct AuditFinding {
122 pub category: FindingCategory,
123 pub severity: FindingSeverity,
124 pub message: String,
125 #[serde(default, skip_serializing_if = "Vec::is_empty")]
129 pub event_indices: Vec<u64>,
130 #[serde(default, skip_serializing_if = "Option::is_none")]
132 pub state_step: Option<String>,
133 #[serde(default, skip_serializing_if = "Vec::is_empty")]
135 pub tools: Vec<String>,
136}
137
138#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
140pub struct StateTransition {
141 pub step: String,
144 pub event_index: u64,
146 pub triggered_by: String,
148}
149
150#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Default)]
154#[serde(default)]
155pub struct ToolPattern {
156 pub name: Option<String>,
158 pub glob: Option<String>,
161}
162
163impl ToolPattern {
164 pub fn matches(&self, tool: &str) -> bool {
165 let needle = tool.to_lowercase();
166 if let Some(name) = &self.name {
167 return name.eq_ignore_ascii_case(tool);
168 }
169 if let Some(glob) = &self.glob {
170 return glob_match(&glob.to_lowercase(), &needle);
171 }
172 false
173 }
174}
175
176fn glob_match(pattern: &str, value: &str) -> bool {
177 if !pattern.contains('*') {
178 return pattern == value;
179 }
180 let parts: Vec<&str> = pattern.split('*').collect();
181 let mut cursor = 0usize;
182 let last = parts.len().saturating_sub(1);
183 for (i, part) in parts.iter().enumerate() {
184 if part.is_empty() {
185 if i == 0 || i == last {
186 continue;
187 }
188 continue;
189 }
190 if i == 0 && !pattern.starts_with('*') {
191 if !value[cursor..].starts_with(part) {
192 return false;
193 }
194 cursor += part.len();
195 continue;
196 }
197 if i == last && !pattern.ends_with('*') {
198 return value[cursor..].ends_with(part);
199 }
200 match value[cursor..].find(part) {
201 Some(idx) => cursor += idx + part.len(),
202 None => return false,
203 }
204 }
205 pattern.ends_with('*') || cursor == value.len()
206}
207
208#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Default)]
210#[serde(default)]
211pub struct GoldenStateStep {
212 pub step: String,
215 pub tools: Vec<ToolPattern>,
217 pub plan_fields: Vec<String>,
221 pub events: Vec<String>,
224 pub required: bool,
227 #[serde(default)]
231 pub approval_gate: bool,
232 #[serde(default)]
236 pub verifier: bool,
237 #[serde(default)]
240 pub merge_action: bool,
241}
242
243#[derive(Clone, Debug, Serialize, Deserialize, Default, PartialEq, Eq)]
247#[serde(default)]
248pub struct MergeCaptainGolden {
249 #[serde(rename = "_type")]
250 pub type_name: String,
251 pub scenario: String,
254 pub description: Option<String>,
255 pub max_model_calls: Option<u64>,
257 pub max_tool_calls: Option<u64>,
259 pub max_repeat: Option<u32>,
262 pub require_approval_for: Vec<ToolPattern>,
265 pub forbidden_actions: Vec<ToolPattern>,
267 pub state_steps: Vec<GoldenStateStep>,
270 pub expected_state_transitions: Vec<String>,
274}
275
276#[derive(Clone, Debug, Serialize, Deserialize, Default)]
279pub struct AuditReport {
280 pub scenario: Option<String>,
281 pub source_path: Option<String>,
283 pub session_ids: Vec<String>,
285 pub event_count: u64,
286 pub model_call_count: u64,
287 pub tool_call_count: u64,
288 pub findings: Vec<AuditFinding>,
289 pub state_transitions: Vec<StateTransition>,
290 pub pass: bool,
291}
292
293impl AuditReport {
294 pub fn error_findings(&self) -> usize {
295 self.findings
296 .iter()
297 .filter(|f| f.severity == FindingSeverity::Error)
298 .count()
299 }
300
301 pub fn warn_findings(&self) -> usize {
302 self.findings
303 .iter()
304 .filter(|f| f.severity == FindingSeverity::Warn)
305 .count()
306 }
307}
308
309impl fmt::Display for AuditReport {
310 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
311 writeln!(
312 f,
313 "{} scenario={} events={} tool_calls={} model_calls={}",
314 if self.pass { "PASS" } else { "FAIL" },
315 self.scenario.as_deref().unwrap_or("<none>"),
316 self.event_count,
317 self.tool_call_count,
318 self.model_call_count
319 )?;
320 if let Some(path) = &self.source_path {
321 writeln!(f, " transcript: {path}")?;
322 }
323 if !self.state_transitions.is_empty() {
324 writeln!(f, " state transitions:")?;
325 for t in &self.state_transitions {
326 writeln!(
327 f,
328 " [{}] {} <- {}",
329 t.event_index, t.step, t.triggered_by
330 )?;
331 }
332 }
333 if self.findings.is_empty() {
334 writeln!(f, " findings: none")?;
335 } else {
336 writeln!(f, " findings ({}):", self.findings.len())?;
337 for finding in &self.findings {
338 let step = finding
339 .state_step
340 .as_deref()
341 .map(|s| format!(" step={s}"))
342 .unwrap_or_default();
343 let tools = if finding.tools.is_empty() {
344 String::new()
345 } else {
346 format!(" tools={}", finding.tools.join(","))
347 };
348 let events = if finding.event_indices.is_empty() {
349 String::new()
350 } else {
351 format!(
352 " events=[{}]",
353 finding
354 .event_indices
355 .iter()
356 .map(u64::to_string)
357 .collect::<Vec<_>>()
358 .join(",")
359 )
360 };
361 writeln!(
362 f,
363 " [{}] {}: {}{}{}{}",
364 finding.severity.as_str(),
365 finding.category.as_str(),
366 finding.message,
367 step,
368 tools,
369 events
370 )?;
371 }
372 }
373 Ok(())
374 }
375}
376
377#[derive(Clone, Debug)]
380pub struct LoadedTranscript {
381 pub source_path: PathBuf,
382 pub events: Vec<PersistedAgentEvent>,
383}
384
385pub fn load_transcript_jsonl(path: &Path) -> Result<LoadedTranscript, VmError> {
390 let metadata = fs::metadata(path).map_err(|e| {
391 VmError::Runtime(format!("failed to stat transcript {}: {e}", path.display()))
392 })?;
393 let mut events = Vec::new();
394 if metadata.is_dir() {
395 let mut files: Vec<PathBuf> = fs::read_dir(path)
396 .map_err(|e| {
397 VmError::Runtime(format!(
398 "failed to read transcript directory {}: {e}",
399 path.display()
400 ))
401 })?
402 .filter_map(|entry| entry.ok())
403 .map(|entry| entry.path())
404 .filter(|p| {
405 p.file_name()
406 .and_then(|n| n.to_str())
407 .map(|name| {
408 name.starts_with("event_log")
409 && p.extension().and_then(|e| e.to_str()) == Some("jsonl")
410 })
411 .unwrap_or(false)
412 })
413 .collect();
414 files.sort();
415 if files.is_empty() {
416 return Err(VmError::Runtime(format!(
417 "no event_log*.jsonl files under {}",
418 path.display()
419 )));
420 }
421 for file in &files {
422 events.extend(read_jsonl_file(file)?);
423 }
424 } else {
425 events.extend(read_jsonl_file(path)?);
426 }
427 events.sort_by_key(|e| e.index);
429 Ok(LoadedTranscript {
430 source_path: path.to_path_buf(),
431 events,
432 })
433}
434
435fn read_jsonl_file(path: &Path) -> Result<Vec<PersistedAgentEvent>, VmError> {
436 let file = fs::File::open(path).map_err(|e| {
437 VmError::Runtime(format!("failed to open transcript {}: {e}", path.display()))
438 })?;
439 let reader = BufReader::new(file);
440 let mut events = Vec::new();
441 for (line_no, line) in reader.lines().enumerate() {
442 let line = line.map_err(|e| {
443 VmError::Runtime(format!(
444 "failed to read line {} of {}: {e}",
445 line_no + 1,
446 path.display()
447 ))
448 })?;
449 let trimmed = line.trim();
450 if trimmed.is_empty() {
451 continue;
452 }
453 let event: PersistedAgentEvent = serde_json::from_str(trimmed).map_err(|e| {
454 VmError::Runtime(format!(
455 "failed to parse line {} of {} as PersistedAgentEvent: {e}",
456 line_no + 1,
457 path.display()
458 ))
459 })?;
460 events.push(event);
461 }
462 Ok(events)
463}
464
465pub fn load_merge_captain_golden(path: &Path) -> Result<MergeCaptainGolden, VmError> {
467 let bytes = fs::read(path).map_err(|e| {
468 VmError::Runtime(format!(
469 "failed to read merge_captain golden {}: {e}",
470 path.display()
471 ))
472 })?;
473 let golden: MergeCaptainGolden = serde_json::from_slice(&bytes).map_err(|e| {
474 VmError::Runtime(format!(
475 "failed to parse merge_captain golden {}: {e}",
476 path.display()
477 ))
478 })?;
479 Ok(golden)
480}
481
482fn default_state_steps() -> Vec<GoldenStateStep> {
487 vec![
488 GoldenStateStep {
489 step: "intake".into(),
490 tools: vec![ToolPattern {
491 glob: Some("*pull_request*".into()),
492 ..Default::default()
493 }],
494 plan_fields: vec!["pr_number".into()],
495 events: vec!["plan".into()],
496 ..Default::default()
497 },
498 GoldenStateStep {
499 step: "verify_checks".into(),
500 tools: vec![
501 ToolPattern {
502 glob: Some("*check*".into()),
503 ..Default::default()
504 },
505 ToolPattern {
506 glob: Some("*ci*".into()),
507 ..Default::default()
508 },
509 ToolPattern {
510 glob: Some("*workflow_run*".into()),
511 ..Default::default()
512 },
513 ],
514 verifier: true,
515 ..Default::default()
516 },
517 GoldenStateStep {
518 step: "decide_risk".into(),
519 plan_fields: vec!["review_risk".into()],
520 events: vec!["plan".into()],
521 ..Default::default()
522 },
523 GoldenStateStep {
524 step: "approval_gate".into(),
525 plan_fields: vec!["approval_required".into()],
526 events: vec!["handoff".into(), "feedback_injected".into()],
527 approval_gate: true,
528 ..Default::default()
529 },
530 GoldenStateStep {
531 step: "merge_or_handoff".into(),
532 tools: vec![
533 ToolPattern {
534 glob: Some("*merge*".into()),
535 ..Default::default()
536 },
537 ToolPattern {
538 glob: Some("*label*".into()),
539 ..Default::default()
540 },
541 ],
542 events: vec!["handoff".into()],
543 merge_action: true,
544 ..Default::default()
545 },
546 ]
547}
548
549pub(crate) fn is_merge_captain_write_tool(name: &str) -> bool {
553 let lower = name.to_lowercase();
554 lower.contains("merge")
555 || lower.contains("write_file")
556 || lower.contains("create_pull")
557 || lower.contains("_create")
558 || lower.contains("create_")
559 || lower.contains("delete")
560 || lower.contains("force_push")
561 || lower.contains("apply_patch")
562 || lower.contains("set_label")
563 || lower.contains("post_comment")
564 || lower.contains("approve")
565}
566
567fn is_wait_tool(name: &str) -> bool {
569 let lower = name.to_lowercase();
570 lower.contains("sleep") || lower.contains("wait") || lower.contains("poll")
571}
572
573pub fn audit_transcript(
575 events: &[PersistedAgentEvent],
576 golden: Option<&MergeCaptainGolden>,
577) -> AuditReport {
578 let scenario = golden.map(|g| g.scenario.clone());
579 let mut session_ids: Vec<String> = Vec::new();
580 let mut model_calls: u64 = 0;
581 let mut tool_calls: u64 = 0;
582 let mut findings: Vec<AuditFinding> = Vec::new();
583 let mut transitions: Vec<StateTransition> = Vec::new();
584
585 let state_steps_owned: Vec<GoldenStateStep> = match golden {
586 Some(g) if !g.state_steps.is_empty() => g.state_steps.clone(),
587 _ => default_state_steps(),
588 };
589 let max_repeat = golden.and_then(|g| g.max_repeat).unwrap_or(1);
590
591 let mut last_tool_call: BTreeMap<String, (String, String, Vec<u64>)> = BTreeMap::new();
593
594 let mut pending_approvals: Vec<u64> = Vec::new();
598
599 let mut verifier_scopes: BTreeSet<String> = BTreeSet::new();
603
604 let mut steps_seen: Vec<String> = Vec::new();
606
607 let mut last_index: u64 = 0;
608 let mut saw_terminal: bool = false;
609
610 for env in events {
611 last_index = env.index;
612 let event = &env.event;
613 let session = event.session_id().to_string();
614 if !session_ids.contains(&session) {
615 session_ids.push(session.clone());
616 }
617
618 match event {
619 AgentEvent::AgentMessageChunk { .. } | AgentEvent::AgentThoughtChunk { .. } => {
620 }
625 AgentEvent::IterationStart { .. } => {
626 model_calls += 1;
627 }
628 AgentEvent::IterationEnd { .. } => {
629 saw_terminal = true;
630 }
631 AgentEvent::BudgetExhausted { .. } => {
632 saw_terminal = true;
633 findings.push(AuditFinding {
634 category: FindingCategory::ExtraModelCall,
635 severity: FindingSeverity::Error,
636 message: "loop hit max_iterations without resolving".into(),
637 event_indices: vec![env.index],
638 state_step: None,
639 tools: vec![],
640 });
641 }
642 AgentEvent::LoopStuck { .. } => {
643 saw_terminal = true;
644 findings.push(AuditFinding {
645 category: FindingCategory::ExtraModelCall,
646 severity: FindingSeverity::Error,
647 message: "loop stuck on consecutive text-only turns".into(),
648 event_indices: vec![env.index],
649 state_step: None,
650 tools: vec![],
651 });
652 }
653 AgentEvent::Handoff { .. } => {
654 saw_terminal = true;
655 if !pending_approvals.is_empty() {
658 pending_approvals.clear();
659 }
660 check_state_transition(
661 &state_steps_owned,
662 StepTrigger::Event("handoff"),
663 env.index,
664 "handoff",
665 &mut transitions,
666 &mut steps_seen,
667 &mut findings,
668 &mut pending_approvals,
669 &mut verifier_scopes,
670 );
671 }
672 AgentEvent::FeedbackInjected { kind, .. } => {
673 if kind.eq_ignore_ascii_case("approval") || kind.eq_ignore_ascii_case("approved") {
674 pending_approvals.clear();
675 }
676 check_state_transition(
677 &state_steps_owned,
678 StepTrigger::Event("feedback_injected"),
679 env.index,
680 "feedback_injected",
681 &mut transitions,
682 &mut steps_seen,
683 &mut findings,
684 &mut pending_approvals,
685 &mut verifier_scopes,
686 );
687 }
688 AgentEvent::Plan { plan, .. } => {
689 check_plan_transitions(
690 &state_steps_owned,
691 plan,
692 env.index,
693 &mut transitions,
694 &mut steps_seen,
695 &mut findings,
696 &mut pending_approvals,
697 &mut verifier_scopes,
698 );
699 if let Some(approval) = plan
700 .get("approval_required")
701 .and_then(serde_json::Value::as_bool)
702 {
703 if approval {
704 pending_approvals.push(env.index);
705 }
706 }
707 if !plan.is_object() {
708 findings.push(AuditFinding {
709 category: FindingCategory::InvalidStructuredOutput,
710 severity: FindingSeverity::Error,
711 message: "Plan event payload was not a JSON object".into(),
712 event_indices: vec![env.index],
713 state_step: None,
714 tools: vec![],
715 });
716 }
717 }
718 AgentEvent::ToolCall {
719 tool_name,
720 raw_input,
721 status,
722 ..
723 } => {
724 tool_calls += 1;
725 let arg_hash = canonical_json(raw_input);
727 match last_tool_call.get_mut(&session) {
728 Some(entry) if entry.0 == *tool_name && entry.1 == arg_hash => {
729 entry.2.push(env.index);
730 if (entry.2.len() as u32) > max_repeat {
731 let indices = entry.2.clone();
732 findings.push(AuditFinding {
733 category: FindingCategory::RepeatedRead,
734 severity: FindingSeverity::Error,
735 message: format!(
736 "tool `{}` called {} times consecutively with identical args",
737 tool_name,
738 indices.len()
739 ),
740 event_indices: indices,
741 state_step: None,
742 tools: vec![tool_name.clone()],
743 });
744 *entry = (tool_name.clone(), arg_hash.clone(), vec![env.index]);
746 }
747 }
748 _ => {
749 last_tool_call.insert(
750 session.clone(),
751 (tool_name.clone(), arg_hash.clone(), vec![env.index]),
752 );
753 }
754 }
755
756 if is_wait_tool(tool_name) {
759 let indicates_progress = raw_input
760 .as_object()
761 .map(|obj| {
762 obj.contains_key("until")
763 || obj.contains_key("condition")
764 || obj.contains_key("subscription_id")
765 })
766 .unwrap_or(false);
767 if !indicates_progress {
768 findings.push(AuditFinding {
769 category: FindingCategory::BadWait,
770 severity: FindingSeverity::Warn,
771 message: format!(
772 "wait/poll tool `{tool_name}` invoked without progress predicate (until/condition/subscription_id)"
773 ),
774 event_indices: vec![env.index],
775 state_step: None,
776 tools: vec![tool_name.clone()],
777 });
778 }
779 }
780
781 let needs_approval_match = match golden {
785 Some(g) if !g.require_approval_for.is_empty() => {
786 g.require_approval_for.iter().any(|p| p.matches(tool_name))
787 }
788 _ => is_merge_captain_write_tool(tool_name),
789 };
790 if needs_approval_match
791 && pending_approvals.is_empty()
792 && !already_approved(&steps_seen, &state_steps_owned)
793 {
794 findings.push(AuditFinding {
795 category: FindingCategory::UnsafeAttemptedAction,
796 severity: FindingSeverity::Error,
797 message: format!(
798 "tool `{tool_name}` requires prior approval gate, but none observed"
799 ),
800 event_indices: vec![env.index],
801 state_step: None,
802 tools: vec![tool_name.clone()],
803 });
804 }
805
806 if let Some(g) = golden {
808 if g.forbidden_actions.iter().any(|p| p.matches(tool_name)) {
809 findings.push(AuditFinding {
810 category: FindingCategory::ForbiddenAction,
811 severity: FindingSeverity::Error,
812 message: format!(
813 "tool `{}` is forbidden in scenario `{}`",
814 tool_name, g.scenario
815 ),
816 event_indices: vec![env.index],
817 state_step: None,
818 tools: vec![tool_name.clone()],
819 });
820 }
821 }
822
823 check_state_transition(
827 &state_steps_owned,
828 StepTrigger::Tool {
829 name: tool_name,
830 scope: transition_scope(raw_input),
831 },
832 env.index,
833 tool_name,
834 &mut transitions,
835 &mut steps_seen,
836 &mut findings,
837 &mut pending_approvals,
838 &mut verifier_scopes,
839 );
840 let _ = status;
841 }
842 AgentEvent::ToolCallUpdate {
843 status,
844 error,
845 error_category,
846 tool_name,
847 ..
848 } => {
849 if matches!(status, ToolCallStatus::Failed) {
850 if let Some(category) = error_category {
851 if matches!(category, ToolCallErrorCategory::SchemaValidation) {
852 findings.push(AuditFinding {
853 category: FindingCategory::InvalidStructuredOutput,
854 severity: FindingSeverity::Error,
855 message: format!(
856 "tool `{}` failed schema validation: {}",
857 tool_name,
858 error.clone().unwrap_or_default()
859 ),
860 event_indices: vec![env.index],
861 state_step: None,
862 tools: vec![tool_name.clone()],
863 });
864 }
865 }
866 }
867 }
868 _ => {
869 }
872 }
873 }
874
875 if !pending_approvals.is_empty() {
877 findings.push(AuditFinding {
878 category: FindingCategory::MissingApproval,
879 severity: FindingSeverity::Error,
880 message: format!(
881 "{} plan(s) declared approval_required: true with no following approval gate",
882 pending_approvals.len()
883 ),
884 event_indices: pending_approvals.clone(),
885 state_step: Some("approval_gate".into()),
886 tools: vec![],
887 });
888 }
889
890 if !events.is_empty() && !saw_terminal {
891 findings.push(AuditFinding {
892 category: FindingCategory::IncompleteTranscript,
893 severity: FindingSeverity::Warn,
894 message:
895 "transcript ended without a IterationEnd / Handoff / BudgetExhausted / LoopStuck event"
896 .into(),
897 event_indices: vec![last_index],
898 state_step: None,
899 tools: vec![],
900 });
901 }
902
903 for step in &state_steps_owned {
905 if step.required && !steps_seen.iter().any(|s| s == &step.step) {
906 findings.push(AuditFinding {
907 category: FindingCategory::MissingStateStep,
908 severity: FindingSeverity::Error,
909 message: format!("required state step `{}` was never reached", step.step),
910 event_indices: vec![],
911 state_step: Some(step.step.clone()),
912 tools: vec![],
913 });
914 }
915 }
916
917 let order: BTreeMap<&str, usize> = state_steps_owned
922 .iter()
923 .enumerate()
924 .map(|(i, s)| (s.step.as_str(), i))
925 .collect();
926 let mut highest: usize = 0;
927 let mut last_step: Option<&str> = None;
928 for step in &steps_seen {
929 if let Some(idx) = order.get(step.as_str()) {
930 if *idx + 1 < highest && last_step != Some(step.as_str()) {
931 findings.push(AuditFinding {
932 category: FindingCategory::StateOutOfOrder,
933 severity: FindingSeverity::Warn,
934 message: format!("state step `{step}` fired after a later step"),
935 event_indices: vec![],
936 state_step: Some(step.clone()),
937 tools: vec![],
938 });
939 }
940 if *idx > highest {
941 highest = *idx;
942 }
943 last_step = Some(step.as_str());
944 }
945 }
946
947 if let Some(g) = golden {
948 if !g.expected_state_transitions.is_empty() {
949 let observed: Vec<String> = transitions
950 .iter()
951 .map(|transition| transition.step.clone())
952 .collect();
953 if observed != g.expected_state_transitions {
954 findings.push(AuditFinding {
955 category: FindingCategory::StateSequenceMismatch,
956 severity: FindingSeverity::Error,
957 message: format!(
958 "state transitions {:?} did not match expected {:?}",
959 observed, g.expected_state_transitions
960 ),
961 event_indices: vec![],
962 state_step: None,
963 tools: vec![],
964 });
965 }
966 }
967 }
968
969 if let Some(g) = golden {
971 if let Some(max) = g.max_tool_calls {
972 if tool_calls > max {
973 findings.push(AuditFinding {
974 category: FindingCategory::NonMinimalToolUsage,
975 severity: FindingSeverity::Error,
976 message: format!("tool calls ({tool_calls}) exceeded scenario budget ({max})"),
977 event_indices: vec![],
978 state_step: None,
979 tools: vec![],
980 });
981 }
982 }
983 if let Some(max) = g.max_model_calls {
984 if model_calls > max {
985 findings.push(AuditFinding {
986 category: FindingCategory::ExtraModelCall,
987 severity: FindingSeverity::Error,
988 message: format!(
989 "model calls ({model_calls}) exceeded scenario budget ({max})"
990 ),
991 event_indices: vec![],
992 state_step: None,
993 tools: vec![],
994 });
995 }
996 }
997 }
998
999 let pass = findings
1000 .iter()
1001 .all(|f| f.severity != FindingSeverity::Error);
1002
1003 AuditReport {
1004 scenario,
1005 source_path: None,
1006 session_ids,
1007 event_count: events.len() as u64,
1008 model_call_count: model_calls,
1009 tool_call_count: tool_calls,
1010 findings,
1011 state_transitions: transitions,
1012 pass,
1013 }
1014}
1015
1016enum StepTrigger<'a> {
1017 Tool {
1018 name: &'a str,
1019 scope: Option<String>,
1020 },
1021 Event(&'a str),
1022}
1023
1024#[allow(clippy::too_many_arguments)]
1025fn check_state_transition(
1026 steps: &[GoldenStateStep],
1027 trigger: StepTrigger,
1028 event_index: u64,
1029 triggered_by: &str,
1030 transitions: &mut Vec<StateTransition>,
1031 steps_seen: &mut Vec<String>,
1032 findings: &mut Vec<AuditFinding>,
1033 pending_approvals: &mut Vec<u64>,
1034 verifier_scopes: &mut BTreeSet<String>,
1035) {
1036 for step in steps {
1037 let matched = match &trigger {
1038 StepTrigger::Tool { name, .. } => step.tools.iter().any(|p| p.matches(name)),
1039 StepTrigger::Event(name) => step.events.iter().any(|e| e.eq_ignore_ascii_case(name)),
1040 };
1041 if !matched {
1042 continue;
1043 }
1044 let scope = match &trigger {
1045 StepTrigger::Tool { scope, .. } => scope.clone(),
1046 StepTrigger::Event(_) => None,
1047 };
1048 record_step(
1049 step,
1050 event_index,
1051 triggered_by,
1052 scope.as_deref(),
1053 transitions,
1054 steps_seen,
1055 findings,
1056 pending_approvals,
1057 verifier_scopes,
1058 );
1059 }
1064}
1065
1066#[allow(clippy::too_many_arguments)]
1067fn check_plan_transitions(
1068 steps: &[GoldenStateStep],
1069 plan: &serde_json::Value,
1070 event_index: u64,
1071 transitions: &mut Vec<StateTransition>,
1072 steps_seen: &mut Vec<String>,
1073 findings: &mut Vec<AuditFinding>,
1074 pending_approvals: &mut Vec<u64>,
1075 verifier_scopes: &mut BTreeSet<String>,
1076) {
1077 let obj = match plan.as_object() {
1078 Some(o) => o,
1079 None => return,
1080 };
1081 for step in steps {
1082 let plan_match = step.plan_fields.iter().any(|field| {
1083 if step.approval_gate && field == "approval_required" {
1084 obj.get(field).and_then(serde_json::Value::as_bool) == Some(true)
1085 } else {
1086 obj.contains_key(field)
1087 }
1088 });
1089 let event_match = step.events.iter().any(|e| e.eq_ignore_ascii_case("plan"));
1090 if !(plan_match || (event_match && step.plan_fields.is_empty())) {
1091 continue;
1092 }
1093 if !plan_match && !event_match {
1094 continue;
1095 }
1096 record_step(
1097 step,
1098 event_index,
1099 "plan",
1100 transition_scope(plan).as_deref(),
1101 transitions,
1102 steps_seen,
1103 findings,
1104 pending_approvals,
1105 verifier_scopes,
1106 );
1107 }
1108}
1109
1110#[allow(clippy::too_many_arguments)]
1111fn record_step(
1112 step: &GoldenStateStep,
1113 event_index: u64,
1114 triggered_by: &str,
1115 scope: Option<&str>,
1116 transitions: &mut Vec<StateTransition>,
1117 steps_seen: &mut Vec<String>,
1118 findings: &mut Vec<AuditFinding>,
1119 pending_approvals: &mut Vec<u64>,
1120 verifier_scopes: &mut BTreeSet<String>,
1121) {
1122 transitions.push(StateTransition {
1123 step: step.step.clone(),
1124 event_index,
1125 triggered_by: triggered_by.to_string(),
1126 });
1127 if !steps_seen.contains(&step.step) {
1128 steps_seen.push(step.step.clone());
1129 }
1130 if step.approval_gate {
1131 pending_approvals.clear();
1132 }
1133 if step.verifier {
1134 verifier_scopes.insert(scope.unwrap_or("*").to_string());
1135 }
1136 let verified = scope
1137 .map(|scope| verifier_scopes.contains(scope) || verifier_scopes.contains("*"))
1138 .unwrap_or_else(|| !verifier_scopes.is_empty());
1139 if step.merge_action && !verified {
1140 findings.push(AuditFinding {
1141 category: FindingCategory::SkippedVerification,
1142 severity: FindingSeverity::Error,
1143 message: format!(
1144 "merge action `{}` reached without a preceding verifier step",
1145 step.step
1146 ),
1147 event_indices: vec![event_index],
1148 state_step: Some(step.step.clone()),
1149 tools: vec![],
1150 });
1151 }
1152}
1153
1154fn transition_scope(value: &serde_json::Value) -> Option<String> {
1155 let repo = value.get("repo").and_then(serde_json::Value::as_str)?;
1156 let pr_number = value
1157 .get("pr_number")
1158 .or_else(|| value.get("number"))
1159 .and_then(serde_json::Value::as_u64)?;
1160 Some(format!("{repo}#{pr_number}"))
1161}
1162
1163fn already_approved(steps_seen: &[String], steps: &[GoldenStateStep]) -> bool {
1164 steps
1165 .iter()
1166 .filter(|s| s.approval_gate)
1167 .any(|s| steps_seen.contains(&s.step))
1168}
1169
1170fn canonical_json(value: &serde_json::Value) -> String {
1171 serde_json::to_string(value).unwrap_or_default()
1173}
1174
1175#[cfg(test)]
1176mod tests {
1177 use super::*;
1178 use crate::agent_events::{AgentEvent, PersistedAgentEvent, ToolCallStatus};
1179 use serde_json::json;
1180
1181 fn env(index: u64, event: AgentEvent) -> PersistedAgentEvent {
1182 PersistedAgentEvent {
1183 index,
1184 emitted_at_ms: 0,
1185 frame_depth: None,
1186 event,
1187 }
1188 }
1189
1190 fn iteration_start(index: u64, session: &str, iter: usize) -> PersistedAgentEvent {
1191 env(
1192 index,
1193 AgentEvent::IterationStart {
1194 session_id: session.into(),
1195 iteration: iter,
1196 provider: String::new(),
1197 model: String::new(),
1198 },
1199 )
1200 }
1201
1202 fn iteration_end(index: u64, session: &str, iter: usize) -> PersistedAgentEvent {
1203 env(
1204 index,
1205 AgentEvent::IterationEnd {
1206 session_id: session.into(),
1207 iteration: iter,
1208 iteration_info: serde_json::Value::Null,
1209 },
1210 )
1211 }
1212
1213 fn tool_call(
1214 index: u64,
1215 session: &str,
1216 tool: &str,
1217 args: serde_json::Value,
1218 ) -> PersistedAgentEvent {
1219 env(
1220 index,
1221 AgentEvent::ToolCall {
1222 session_id: session.into(),
1223 tool_call_id: format!("call_{index}"),
1224 tool_name: tool.into(),
1225 kind: None,
1226 status: ToolCallStatus::Pending,
1227 raw_input: args,
1228 parsing: None,
1229 audit: None,
1230 },
1231 )
1232 }
1233
1234 fn plan(index: u64, session: &str, plan: serde_json::Value) -> PersistedAgentEvent {
1235 env(
1236 index,
1237 AgentEvent::Plan {
1238 session_id: session.into(),
1239 plan,
1240 },
1241 )
1242 }
1243
1244 fn handoff(index: u64, session: &str) -> PersistedAgentEvent {
1245 env(
1246 index,
1247 AgentEvent::Handoff {
1248 session_id: session.into(),
1249 artifact_id: format!("artifact_{index}"),
1250 handoff: Box::new(crate::orchestration::HandoffArtifact::default()),
1251 },
1252 )
1253 }
1254
1255 #[test]
1256 fn pass_minimal_green_pr_default_rules() {
1257 let events = vec![
1258 iteration_start(1, "s", 1),
1259 tool_call(2, "s", "fetch_pull_request", json!({"number": 1})),
1260 tool_call(3, "s", "list_checks", json!({"pr": 1})),
1261 plan(
1262 4,
1263 "s",
1264 json!({
1265 "review_risk": "low",
1266 "approval_required": false,
1267 "pr_number": 1,
1268 }),
1269 ),
1270 iteration_end(5, "s", 1),
1271 ];
1272 let report = audit_transcript(&events, None);
1273 assert!(report.pass, "report: {report}");
1274 assert_eq!(report.tool_call_count, 2);
1275 assert_eq!(report.model_call_count, 1);
1276 assert!(
1277 report.findings.is_empty(),
1278 "findings: {:?}",
1279 report.findings
1280 );
1281 }
1282
1283 #[test]
1284 fn flags_repeated_reads_with_default_threshold() {
1285 let events = vec![
1286 iteration_start(1, "s", 1),
1287 tool_call(2, "s", "list_checks", json!({"pr": 1})),
1288 tool_call(3, "s", "list_checks", json!({"pr": 1})),
1289 tool_call(4, "s", "list_checks", json!({"pr": 1})),
1290 iteration_end(5, "s", 1),
1291 ];
1292 let report = audit_transcript(&events, None);
1293 assert!(!report.pass);
1294 assert!(report
1295 .findings
1296 .iter()
1297 .any(|f| f.category == FindingCategory::RepeatedRead));
1298 }
1299
1300 #[test]
1301 fn flags_unsafe_action_without_approval() {
1302 let events = vec![
1303 iteration_start(1, "s", 1),
1304 tool_call(2, "s", "merge_pull_request", json!({"number": 1})),
1305 iteration_end(3, "s", 1),
1306 ];
1307 let report = audit_transcript(&events, None);
1308 assert!(!report.pass);
1309 assert!(report
1310 .findings
1311 .iter()
1312 .any(|f| f.category == FindingCategory::UnsafeAttemptedAction));
1313 }
1314
1315 #[test]
1316 fn approval_required_false_does_not_open_approval_gate() {
1317 let events = vec![
1318 iteration_start(1, "s", 1),
1319 plan(
1320 2,
1321 "s",
1322 json!({"approval_required": false, "review_risk": "low"}),
1323 ),
1324 tool_call(3, "s", "merge_pull_request", json!({"number": 1})),
1325 iteration_end(4, "s", 1),
1326 ];
1327 let report = audit_transcript(&events, None);
1328 assert!(!report.pass);
1329 assert!(report
1330 .findings
1331 .iter()
1332 .any(|f| f.category == FindingCategory::UnsafeAttemptedAction));
1333 }
1334
1335 #[test]
1336 fn flags_missing_approval_after_required_plan() {
1337 let events = vec![
1338 iteration_start(1, "s", 1),
1339 plan(
1340 2,
1341 "s",
1342 json!({"approval_required": true, "review_risk": "high"}),
1343 ),
1344 iteration_end(3, "s", 1),
1345 ];
1346 let report = audit_transcript(&events, None);
1347 assert!(!report.pass);
1348 assert!(report
1349 .findings
1350 .iter()
1351 .any(|f| f.category == FindingCategory::MissingApproval));
1352 }
1353
1354 #[test]
1355 fn handoff_satisfies_pending_approval() {
1356 let events = vec![
1357 iteration_start(1, "s", 1),
1358 plan(
1359 2,
1360 "s",
1361 json!({"approval_required": true, "review_risk": "high"}),
1362 ),
1363 handoff(3, "s"),
1364 ];
1365 let report = audit_transcript(&events, None);
1366 assert!(
1367 !report
1368 .findings
1369 .iter()
1370 .any(|f| f.category == FindingCategory::MissingApproval),
1371 "findings: {:?}",
1372 report.findings
1373 );
1374 }
1375
1376 #[test]
1377 fn flags_skipped_verification_when_merge_runs_without_verifier() {
1378 let golden = MergeCaptainGolden {
1379 type_name: "merge_captain_golden".into(),
1380 scenario: "test".into(),
1381 state_steps: vec![
1382 GoldenStateStep {
1383 step: "verify".into(),
1384 tools: vec![ToolPattern {
1385 glob: Some("*list_checks*".into()),
1386 ..Default::default()
1387 }],
1388 verifier: true,
1389 ..Default::default()
1390 },
1391 GoldenStateStep {
1392 step: "approve".into(),
1393 events: vec!["feedback_injected".into()],
1394 approval_gate: true,
1395 ..Default::default()
1396 },
1397 GoldenStateStep {
1398 step: "merge".into(),
1399 tools: vec![ToolPattern {
1400 glob: Some("*merge*".into()),
1401 ..Default::default()
1402 }],
1403 merge_action: true,
1404 required: true,
1405 ..Default::default()
1406 },
1407 ],
1408 ..Default::default()
1409 };
1410 let events = vec![
1411 iteration_start(1, "s", 1),
1412 env(
1413 2,
1414 AgentEvent::FeedbackInjected {
1415 session_id: "s".into(),
1416 kind: "approval".into(),
1417 content: "ok".into(),
1418 },
1419 ),
1420 tool_call(3, "s", "merge_pull_request", json!({"number": 1})),
1421 iteration_end(4, "s", 1),
1422 ];
1423 let report = audit_transcript(&events, Some(&golden));
1424 assert!(report
1425 .findings
1426 .iter()
1427 .any(|f| f.category == FindingCategory::SkippedVerification));
1428 }
1429
1430 #[test]
1431 fn verifier_scope_must_match_merge_scope() {
1432 let golden = MergeCaptainGolden {
1433 type_name: "merge_captain_golden".into(),
1434 scenario: "test".into(),
1435 state_steps: vec![
1436 GoldenStateStep {
1437 step: "verify".into(),
1438 tools: vec![ToolPattern {
1439 glob: Some("*list_checks*".into()),
1440 ..Default::default()
1441 }],
1442 verifier: true,
1443 ..Default::default()
1444 },
1445 GoldenStateStep {
1446 step: "merge".into(),
1447 tools: vec![ToolPattern {
1448 glob: Some("*merge*".into()),
1449 ..Default::default()
1450 }],
1451 merge_action: true,
1452 ..Default::default()
1453 },
1454 ],
1455 ..Default::default()
1456 };
1457 let events = vec![
1458 iteration_start(1, "s", 1),
1459 tool_call(
1460 2,
1461 "s",
1462 "list_checks",
1463 json!({"repo": "burin-labs/harn", "pr_number": 1}),
1464 ),
1465 tool_call(
1466 3,
1467 "s",
1468 "merge_pull_request",
1469 json!({"repo": "burin-labs/harn", "pr_number": 2}),
1470 ),
1471 iteration_end(4, "s", 1),
1472 ];
1473 let report = audit_transcript(&events, Some(&golden));
1474 assert!(report
1475 .findings
1476 .iter()
1477 .any(|f| f.category == FindingCategory::SkippedVerification));
1478 }
1479
1480 #[test]
1481 fn flags_extra_model_calls_against_golden() {
1482 let golden = MergeCaptainGolden {
1483 type_name: "merge_captain_golden".into(),
1484 scenario: "test".into(),
1485 max_model_calls: Some(1),
1486 ..Default::default()
1487 };
1488 let events = vec![
1489 iteration_start(1, "s", 1),
1490 iteration_end(2, "s", 1),
1491 iteration_start(3, "s", 2),
1492 iteration_end(4, "s", 2),
1493 ];
1494 let report = audit_transcript(&events, Some(&golden));
1495 assert!(!report.pass);
1496 assert!(report
1497 .findings
1498 .iter()
1499 .any(|f| f.category == FindingCategory::ExtraModelCall));
1500 }
1501
1502 #[test]
1503 fn flags_non_minimal_tool_usage() {
1504 let golden = MergeCaptainGolden {
1505 type_name: "merge_captain_golden".into(),
1506 scenario: "test".into(),
1507 max_tool_calls: Some(1),
1508 ..Default::default()
1509 };
1510 let events = vec![
1511 iteration_start(1, "s", 1),
1512 tool_call(2, "s", "list_checks", json!({"a": 1})),
1513 tool_call(3, "s", "list_threads", json!({"a": 2})),
1514 iteration_end(4, "s", 1),
1515 ];
1516 let report = audit_transcript(&events, Some(&golden));
1517 assert!(!report.pass);
1518 assert!(report
1519 .findings
1520 .iter()
1521 .any(|f| f.category == FindingCategory::NonMinimalToolUsage));
1522 }
1523
1524 #[test]
1525 fn flags_invalid_structured_output_from_failed_tool_update() {
1526 let events = vec![
1527 iteration_start(1, "s", 1),
1528 tool_call(2, "s", "list_checks", json!({"a": 1})),
1529 env(
1530 3,
1531 AgentEvent::ToolCallUpdate {
1532 session_id: "s".into(),
1533 tool_call_id: "call_2".into(),
1534 tool_name: "list_checks".into(),
1535 status: ToolCallStatus::Failed,
1536 raw_output: None,
1537 error: Some("missing required field".into()),
1538 duration_ms: None,
1539 execution_duration_ms: None,
1540 error_category: Some(ToolCallErrorCategory::SchemaValidation),
1541 executor: None,
1542 parsing: None,
1543 raw_input: None,
1544 raw_input_partial: None,
1545 audit: None,
1546 },
1547 ),
1548 iteration_end(4, "s", 1),
1549 ];
1550 let report = audit_transcript(&events, None);
1551 assert!(report
1552 .findings
1553 .iter()
1554 .any(|f| f.category == FindingCategory::InvalidStructuredOutput));
1555 }
1556
1557 #[test]
1558 fn flags_forbidden_action() {
1559 let golden = MergeCaptainGolden {
1560 type_name: "merge_captain_golden".into(),
1561 scenario: "test".into(),
1562 forbidden_actions: vec![ToolPattern {
1563 glob: Some("*force_push*".into()),
1564 ..Default::default()
1565 }],
1566 ..Default::default()
1567 };
1568 let events = vec![
1570 iteration_start(1, "s", 1),
1571 env(
1572 2,
1573 AgentEvent::FeedbackInjected {
1574 session_id: "s".into(),
1575 kind: "approval".into(),
1576 content: "ok".into(),
1577 },
1578 ),
1579 tool_call(3, "s", "force_push", json!({"branch": "main"})),
1580 iteration_end(4, "s", 1),
1581 ];
1582 let report = audit_transcript(&events, Some(&golden));
1583 assert!(!report.pass);
1584 assert!(report
1585 .findings
1586 .iter()
1587 .any(|f| f.category == FindingCategory::ForbiddenAction));
1588 }
1589
1590 #[test]
1591 fn missing_required_state_step() {
1592 let golden = MergeCaptainGolden {
1593 type_name: "merge_captain_golden".into(),
1594 scenario: "test".into(),
1595 state_steps: vec![GoldenStateStep {
1596 step: "verify".into(),
1597 tools: vec![ToolPattern {
1598 glob: Some("*list_checks*".into()),
1599 ..Default::default()
1600 }],
1601 required: true,
1602 verifier: true,
1603 ..Default::default()
1604 }],
1605 ..Default::default()
1606 };
1607 let events = vec![iteration_start(1, "s", 1), iteration_end(2, "s", 1)];
1608 let report = audit_transcript(&events, Some(&golden));
1609 assert!(!report.pass);
1610 assert!(report
1611 .findings
1612 .iter()
1613 .any(|f| f.category == FindingCategory::MissingStateStep));
1614 }
1615
1616 #[test]
1617 fn glob_matching_basic_cases() {
1618 let p = ToolPattern {
1619 glob: Some("*merge*".into()),
1620 ..Default::default()
1621 };
1622 assert!(p.matches("gh_merge_pr"));
1623 assert!(p.matches("MERGE"));
1624 assert!(!p.matches("approve"));
1625
1626 let prefix = ToolPattern {
1627 glob: Some("gh_*".into()),
1628 ..Default::default()
1629 };
1630 assert!(prefix.matches("gh_pr_list"));
1631 assert!(!prefix.matches("git_pr_list"));
1632
1633 let suffix = ToolPattern {
1634 glob: Some("*_merge".into()),
1635 ..Default::default()
1636 };
1637 assert!(suffix.matches("force_merge"));
1638 assert!(!suffix.matches("merge_force"));
1639
1640 let exact = ToolPattern {
1641 name: Some("read_file".into()),
1642 ..Default::default()
1643 };
1644 assert!(exact.matches("read_file"));
1645 assert!(!exact.matches("read_files"));
1646 }
1647
1648 #[test]
1649 fn round_trip_report_serialization() {
1650 let events = vec![
1651 iteration_start(1, "s", 1),
1652 tool_call(2, "s", "list_checks", json!({"pr": 1})),
1653 iteration_end(3, "s", 1),
1654 ];
1655 let report = audit_transcript(&events, None);
1656 let json = serde_json::to_string(&report).expect("serialize");
1657 let parsed: AuditReport = serde_json::from_str(&json).expect("deserialize");
1658 assert_eq!(parsed.pass, report.pass);
1659 assert_eq!(parsed.event_count, report.event_count);
1660 }
1661
1662 #[test]
1663 fn loads_jsonl_transcript_from_file() {
1664 use std::io::Write;
1665 let dir = tempfile::tempdir().expect("tempdir");
1666 let path = dir.path().join("event_log.jsonl");
1667 let mut file = fs::File::create(&path).expect("create");
1668 for env in [iteration_start(1, "s", 1), iteration_end(2, "s", 1)] {
1669 let line = serde_json::to_string(&env).expect("ser");
1670 writeln!(file, "{line}").expect("write");
1671 }
1672 drop(file);
1673 let loaded = load_transcript_jsonl(&path).expect("load");
1674 assert_eq!(loaded.events.len(), 2);
1675 }
1676
1677 #[test]
1678 fn loads_jsonl_transcript_from_directory() {
1679 use std::io::Write;
1680 let dir = tempfile::tempdir().expect("tempdir");
1681 let path1 = dir.path().join("event_log.jsonl");
1682 let path2 = dir.path().join("event_log-000001.jsonl");
1683 {
1684 let mut file = fs::File::create(&path1).expect("create");
1685 writeln!(
1686 file,
1687 "{}",
1688 serde_json::to_string(&iteration_start(1, "s", 1)).unwrap()
1689 )
1690 .unwrap();
1691 }
1692 {
1693 let mut file = fs::File::create(&path2).expect("create");
1694 writeln!(
1695 file,
1696 "{}",
1697 serde_json::to_string(&iteration_end(2, "s", 1)).unwrap()
1698 )
1699 .unwrap();
1700 }
1701 let loaded = load_transcript_jsonl(dir.path()).expect("load");
1702 assert_eq!(loaded.events.len(), 2);
1703 assert_eq!(loaded.events[0].index, 1);
1704 assert_eq!(loaded.events[1].index, 2);
1705 }
1706}