1use serde::{Deserialize, Serialize};
44
45use crate::diff::alignment::{DivergenceKind, FirstDivergence};
46use crate::diff::axes::{Axis, Severity as AxisSeverity};
47use crate::diff::report::DiffReport;
48
49#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
52#[serde(rename_all = "snake_case")]
53pub enum RecommendationSeverity {
54 Error,
58 Warning,
62 Info,
66}
67
68impl RecommendationSeverity {
69 pub fn label(&self) -> &'static str {
71 match self {
72 RecommendationSeverity::Error => "error",
73 RecommendationSeverity::Warning => "warning",
74 RecommendationSeverity::Info => "info",
75 }
76 }
77
78 pub fn rank(&self) -> u8 {
81 match self {
82 RecommendationSeverity::Error => 3,
83 RecommendationSeverity::Warning => 2,
84 RecommendationSeverity::Info => 1,
85 }
86 }
87}
88
89#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
92#[serde(rename_all = "snake_case")]
93pub enum ActionKind {
94 Restore,
97 Remove,
100 Revert,
102 Review,
105 Verify,
107 RootCause,
114}
115
116impl ActionKind {
117 pub fn label(&self) -> &'static str {
119 match self {
120 ActionKind::Restore => "restore",
121 ActionKind::Remove => "remove",
122 ActionKind::Revert => "revert",
123 ActionKind::Review => "review",
124 ActionKind::Verify => "verify",
125 ActionKind::RootCause => "root_cause",
126 }
127 }
128}
129
130#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
136pub struct Recommendation {
137 pub severity: RecommendationSeverity,
139 pub action: ActionKind,
141 pub turn: usize,
151 #[serde(default)]
155 pub baseline_turn: usize,
156 #[serde(default)]
160 pub candidate_turn: usize,
161 pub message: String,
164 pub rationale: String,
168 pub axis: Axis,
171 pub confidence: f64,
174}
175
176pub fn generate(report: &DiffReport) -> Vec<Recommendation> {
192 let mut out: Vec<Recommendation> = Vec::new();
193
194 out.extend(detect_cross_axis_patterns(report));
198
199 for dv in &report.divergences {
200 if let Some(rec) = rule_for_divergence(dv) {
201 out.push(rec);
202 }
203 }
204
205 let worst_axis_row = report
214 .rows
215 .iter()
216 .filter(|r| r.severity == AxisSeverity::Severe)
217 .max_by(|a, b| a.severity.cmp(&b.severity));
218 if let Some(worst) = worst_axis_row {
219 let has_error = out
220 .iter()
221 .any(|r| r.severity == RecommendationSeverity::Error);
222 if !has_error {
223 out.push(Recommendation {
224 severity: RecommendationSeverity::Error,
225 action: ActionKind::Review,
226 turn: 0,
227 baseline_turn: 0,
228 candidate_turn: 0,
229 message: format!(
230 "Review the candidate: {} axis shifted with severity {}.",
231 worst.axis.label(),
232 worst.severity.label(),
233 ),
234 rationale: format!(
235 "Aggregate signal crosses the `severe` threshold \
236 ({}: delta {:+.3}, CI [{:+.3}, {:+.3}]).",
237 worst.axis.label(),
238 worst.delta,
239 worst.ci95_low,
240 worst.ci95_high,
241 ),
242 axis: worst.axis,
243 confidence: 0.8,
244 });
245 }
246 }
247
248 out.sort_by(|a, b| {
250 b.severity
251 .rank()
252 .cmp(&a.severity.rank())
253 .then_with(|| {
254 b.confidence
255 .partial_cmp(&a.confidence)
256 .unwrap_or(std::cmp::Ordering::Equal)
257 })
258 .then_with(|| a.turn.cmp(&b.turn))
259 });
260 out.truncate(8);
261 out
262}
263
264fn rule_for_divergence(dv: &FirstDivergence) -> Option<Recommendation> {
267 let exp = dv.explanation.to_lowercase();
268 match dv.kind {
269 DivergenceKind::Structural => {
273 if exp.contains("dropped tool")
275 || exp.contains("dropped a response turn")
276 || exp.contains("dropped a turn")
277 {
278 let tool_ref = extract_backticked(&dv.explanation).unwrap_or("missing element");
279 Some(Recommendation {
280 severity: RecommendationSeverity::Error,
281 action: ActionKind::Restore,
282 turn: dv.baseline_turn,
283 baseline_turn: dv.baseline_turn,
284 candidate_turn: dv.candidate_turn,
285 message: format!("Restore {tool_ref} at turn {}.", dv.baseline_turn),
286 rationale: dv.explanation.clone(),
287 axis: dv.primary_axis,
288 confidence: dv.confidence,
289 })
290 }
291 else if exp.contains("added tool") || exp.contains("inserted an extra") {
294 let tool_ref = extract_backticked(&dv.explanation).unwrap_or("extra element");
295 Some(Recommendation {
296 severity: RecommendationSeverity::Error,
297 action: ActionKind::Review,
298 turn: dv.baseline_turn,
299 baseline_turn: dv.baseline_turn,
300 candidate_turn: dv.candidate_turn,
301 message: format!(
302 "Review unexpected addition at turn {}: {tool_ref}.",
303 dv.baseline_turn
304 ),
305 rationale: dv.explanation.clone(),
306 axis: dv.primary_axis,
307 confidence: dv.confidence,
308 })
309 }
310 else if exp.contains("duplicate tool") {
312 let tool_ref = extract_backticked(&dv.explanation).unwrap_or("the duplicated tool");
313 Some(Recommendation {
314 severity: RecommendationSeverity::Error,
315 action: ActionKind::Remove,
316 turn: dv.baseline_turn,
317 baseline_turn: dv.baseline_turn,
318 candidate_turn: dv.candidate_turn,
319 message: format!(
320 "Remove duplicate invocation of {tool_ref} at turn {}.",
321 dv.baseline_turn
322 ),
323 rationale: dv.explanation.clone(),
324 axis: dv.primary_axis,
325 confidence: dv.confidence,
326 })
327 }
328 else if exp.contains("tool set changed") || exp.contains("tool ordering differs") {
330 Some(Recommendation {
331 severity: RecommendationSeverity::Error,
332 action: ActionKind::Review,
333 turn: dv.baseline_turn,
334 baseline_turn: dv.baseline_turn,
335 candidate_turn: dv.candidate_turn,
336 message: format!(
337 "Review tool-schema change at turn {}: call shape diverged.",
338 dv.baseline_turn
339 ),
340 rationale: dv.explanation.clone(),
341 axis: dv.primary_axis,
342 confidence: dv.confidence,
343 })
344 }
345 else {
347 Some(Recommendation {
348 severity: RecommendationSeverity::Error,
349 action: ActionKind::Review,
350 turn: dv.baseline_turn,
351 baseline_turn: dv.baseline_turn,
352 candidate_turn: dv.candidate_turn,
353 message: format!("Review structural change at turn {}.", dv.baseline_turn),
354 rationale: dv.explanation.clone(),
355 axis: dv.primary_axis,
356 confidence: dv.confidence,
357 })
358 }
359 }
360 DivergenceKind::Decision => {
364 if dv.primary_axis == Axis::Safety && exp.contains("stop_reason") {
366 let is_new_refusal = exp.contains("content_filter");
368 let severity = if is_new_refusal {
369 RecommendationSeverity::Error
370 } else {
371 RecommendationSeverity::Warning
372 };
373 Some(Recommendation {
374 severity,
375 action: ActionKind::Review,
376 turn: dv.baseline_turn,
377 baseline_turn: dv.baseline_turn,
378 candidate_turn: dv.candidate_turn,
379 message: format!(
380 "Review refusal behaviour at turn {}: candidate may be over-refusing.",
381 dv.baseline_turn
382 ),
383 rationale: dv.explanation.clone(),
384 axis: dv.primary_axis,
385 confidence: dv.confidence,
386 })
387 }
388 else if dv.primary_axis == Axis::Trajectory && exp.contains("arg value") {
390 let arg_ref = extract_backticked(&dv.explanation).unwrap_or("arg value");
391 Some(Recommendation {
392 severity: RecommendationSeverity::Warning,
393 action: ActionKind::Revert,
394 turn: dv.baseline_turn,
395 baseline_turn: dv.baseline_turn,
396 candidate_turn: dv.candidate_turn,
397 message: format!(
398 "Revert {arg_ref} at turn {} to the baseline value.",
399 dv.baseline_turn
400 ),
401 rationale: dv.explanation.clone(),
402 axis: dv.primary_axis,
403 confidence: dv.confidence,
404 })
405 }
406 else if dv.primary_axis == Axis::Semantic {
408 Some(Recommendation {
409 severity: RecommendationSeverity::Warning,
410 action: ActionKind::Review,
411 turn: dv.baseline_turn,
412 baseline_turn: dv.baseline_turn,
413 candidate_turn: dv.candidate_turn,
414 message: format!(
415 "Review response text at turn {}: semantic content shifted.",
416 dv.baseline_turn
417 ),
418 rationale: dv.explanation.clone(),
419 axis: dv.primary_axis,
420 confidence: dv.confidence,
421 })
422 }
423 else {
425 Some(Recommendation {
426 severity: RecommendationSeverity::Warning,
427 action: ActionKind::Review,
428 turn: dv.baseline_turn,
429 baseline_turn: dv.baseline_turn,
430 candidate_turn: dv.candidate_turn,
431 message: format!("Review decision change at turn {}.", dv.baseline_turn),
432 rationale: dv.explanation.clone(),
433 axis: dv.primary_axis,
434 confidence: dv.confidence,
435 })
436 }
437 }
438 DivergenceKind::Style => Some(Recommendation {
442 severity: RecommendationSeverity::Info,
443 action: ActionKind::Verify,
444 turn: dv.baseline_turn,
445 baseline_turn: dv.baseline_turn,
446 candidate_turn: dv.candidate_turn,
447 message: format!(
448 "Cosmetic wording change at turn {} — verify intended.",
449 dv.baseline_turn
450 ),
451 rationale: dv.explanation.clone(),
452 axis: dv.primary_axis,
453 confidence: dv.confidence,
454 }),
455 }
456}
457
/// Returns the text between the first pair of backticks in `s`.
///
/// Yields `None` when `s` has no backtick or only an unmatched opening one.
/// An empty pair (two adjacent backticks) yields `Some("")`.
fn extract_backticked(s: &str) -> Option<&str> {
    let (_, after_open) = s.split_once('`')?;
    let (quoted, _) = after_open.split_once('`')?;
    Some(quoted)
}
468
469fn axis_moved(report: &DiffReport, axis: Axis) -> bool {
476 report
477 .rows
478 .iter()
479 .find(|r| r.axis == axis)
480 .map(|r| matches!(r.severity, AxisSeverity::Moderate | AxisSeverity::Severe))
481 .unwrap_or(false)
482}
483
484fn axis_severe(report: &DiffReport, axis: Axis) -> bool {
486 report
487 .rows
488 .iter()
489 .find(|r| r.axis == axis)
490 .map(|r| r.severity == AxisSeverity::Severe)
491 .unwrap_or(false)
492}
493
494fn detect_cross_axis_patterns(report: &DiffReport) -> Vec<Recommendation> {
530 let mut out = Vec::new();
531
532 if axis_moved(report, Axis::Cost)
534 && axis_moved(report, Axis::Latency)
535 && axis_moved(report, Axis::Semantic)
536 {
537 let cost_delta = axis_delta(report, Axis::Cost);
538 let lat_delta = axis_delta(report, Axis::Latency);
539 let sem_delta = axis_delta(report, Axis::Semantic);
540 out.push(Recommendation {
541 severity: RecommendationSeverity::Error,
542 action: ActionKind::RootCause,
543 turn: 0,
544 baseline_turn: 0,
545 candidate_turn: 0,
546 message:
547 "Looks like a model change. Cost, latency, and semantic axes all shifted together."
548 .to_string(),
549 rationale: format!(
550 "Cross-axis signature: cost Δ {cost_delta:+.3}, latency Δ {lat_delta:+.3}, \
551 semantic Δ {sem_delta:+.3}. Three axes moving together is the canonical \
552 model-swap signature (provider change, frontier→haiku, etc.). Diff the \
553 `model` field across configs first."
554 ),
555 axis: Axis::Cost,
556 confidence: 0.85,
557 });
558 }
559
560 if axis_moved(report, Axis::Semantic) && axis_moved(report, Axis::Verbosity) {
562 let already_model_swap = out.iter().any(|r| {
564 matches!(r.action, ActionKind::RootCause)
565 && r.message.starts_with("Looks like a model change")
566 });
567 if !already_model_swap {
568 let sem_delta = axis_delta(report, Axis::Semantic);
569 let vrb_delta = axis_delta(report, Axis::Verbosity);
570 let safety_part = if axis_moved(report, Axis::Safety) {
571 " plus safety axis (refusal-style instruction change)"
572 } else {
573 ""
574 };
575 out.push(Recommendation {
576 severity: RecommendationSeverity::Warning,
577 action: ActionKind::RootCause,
578 turn: 0,
579 baseline_turn: 0,
580 candidate_turn: 0,
581 message: format!(
582 "Looks like a system-prompt edit. Semantic + verbosity moved together{safety_part}."
583 ),
584 rationale: format!(
585 "Cross-axis signature: semantic Δ {sem_delta:+.3}, verbosity Δ {vrb_delta:+.3}. \
586 Diff the `system` field of the request across configs."
587 ),
588 axis: Axis::Semantic,
589 confidence: 0.70,
590 });
591 }
592 }
593
594 if axis_severe(report, Axis::Safety) {
596 let safety_delta = axis_delta(report, Axis::Safety);
597 if safety_delta > 0.0 {
598 out.push(Recommendation {
599 severity: RecommendationSeverity::Error,
600 action: ActionKind::RootCause,
601 turn: 0,
602 baseline_turn: 0,
603 candidate_turn: 0,
604 message: "Refusal rate is up severely. Check for stricter system instructions \
605 or tighter content policies."
606 .to_string(),
607 rationale: format!(
608 "Safety axis severe with positive delta {safety_delta:+.3} — the candidate \
609 refused or was content-filtered more often than baseline. Common causes: \
610 added safety preamble in system prompt, model upgrade with stricter RLHF, \
611 provider-side content-filter tightening."
612 ),
613 axis: Axis::Safety,
614 confidence: 0.80,
615 });
616 }
617 }
618
619 if axis_severe(report, Axis::Trajectory) && axis_moved(report, Axis::Reasoning) {
621 let traj_delta = axis_delta(report, Axis::Trajectory);
622 let reason_delta = axis_delta(report, Axis::Reasoning);
623 out.push(Recommendation {
624 severity: RecommendationSeverity::Error,
625 action: ActionKind::RootCause,
626 turn: 0,
627 baseline_turn: 0,
628 candidate_turn: 0,
629 message: "Looks like a tool-schema migration. Trajectory + reasoning both moved."
630 .to_string(),
631 rationale: format!(
632 "Cross-axis signature: trajectory Δ {traj_delta:+.3} (tool sequence/args \
633 changed), reasoning Δ {reason_delta:+.3} (the model is thinking through a \
634 different schema). Diff the `tools` array across configs and check whether \
635 arg keys were added or removed."
636 ),
637 axis: Axis::Trajectory,
638 confidence: 0.78,
639 });
640 }
641
642 if axis_moved(report, Axis::Semantic) && axis_moved(report, Axis::Judge) {
644 let sem_delta = axis_delta(report, Axis::Semantic);
645 let judge_delta = axis_delta(report, Axis::Judge);
646 let verbosity_part = if axis_moved(report, Axis::Verbosity) {
647 ", with verbosity also up"
648 } else {
649 ""
650 };
651 out.push(Recommendation {
652 severity: RecommendationSeverity::Error,
653 action: ActionKind::RootCause,
654 turn: 0,
655 baseline_turn: 0,
656 candidate_turn: 0,
657 message: format!(
658 "Possible hallucination regression. Semantic and judge axes both moved{verbosity_part}."
659 ),
660 rationale: format!(
661 "Cross-axis signature: semantic Δ {sem_delta:+.3}, judge Δ {judge_delta:+.3}. \
662 The classic 'confident-and-wrong' signature — the response diverged \
663 semantically AND was scored lower by the rubric. Sample 3-5 candidate \
664 outputs and verify factual claims against ground truth before merging."
665 ),
666 axis: Axis::Judge,
667 confidence: 0.82,
668 });
669 }
670
671 if axis_severe(report, Axis::Cost) && axis_moved(report, Axis::Reasoning) {
677 let cost_d = axis_delta(report, Axis::Cost);
678 let reason_d = axis_delta(report, Axis::Reasoning);
679 let model_swap_active = out
681 .iter()
682 .any(|r| r.action == ActionKind::RootCause && r.message.contains("model change"));
683 if !model_swap_active && cost_d > 0.0 {
684 out.push(Recommendation {
685 severity: RecommendationSeverity::Error,
686 action: ActionKind::RootCause,
687 turn: 0,
688 baseline_turn: 0,
689 candidate_turn: 0,
690 message: "Possible context-window overflow. Cost spiked severely without a model \
691 change, and reasoning shifted with it."
692 .to_string(),
693 rationale: format!(
694 "Cross-axis signature: cost Δ {cost_d:+.3} (severe) with reasoning \
695 Δ {reason_d:+.3}, model unchanged. Common cause: prompt-length growth \
696 past the effective context window — providers either truncate (lossy \
697 reasoning) or charge for the full prompt every turn (cost balloons). \
698 Check prompt-token usage trend across the candidate's turns."
699 ),
700 axis: Axis::Cost,
701 confidence: 0.72,
702 });
703 }
704 }
705
706 if axis_severe(report, Axis::Trajectory) && axis_moved(report, Axis::Latency) {
711 let schema_active = out
714 .iter()
715 .any(|r| r.action == ActionKind::RootCause && r.message.contains("tool-schema"));
716 if !schema_active {
717 let traj_d = axis_delta(report, Axis::Trajectory);
718 let lat_d = axis_delta(report, Axis::Latency);
719 if lat_d > 0.0 {
720 out.push(Recommendation {
721 severity: RecommendationSeverity::Error,
722 action: ActionKind::RootCause,
723 turn: 0,
724 baseline_turn: 0,
725 candidate_turn: 0,
726 message: "Possible retry loop. Trajectory diverged severely with latency \
727 spike but no reasoning shift."
728 .to_string(),
729 rationale: format!(
730 "Cross-axis signature: trajectory Δ {traj_d:+.3}, latency Δ \
731 {lat_d:+.3}, reasoning stable. Suggests the agent is retrying a \
732 failing tool call (each retry inflates the tool-call count and \
733 adds latency, but doesn't change reasoning depth). Inspect tool \
734 results for transient errors that the agent is silently retrying."
735 ),
736 axis: Axis::Trajectory,
737 confidence: 0.70,
738 });
739 }
740 }
741 }
742
743 if axis_severe(report, Axis::Cost)
748 && !axis_moved(report, Axis::Latency)
749 && !axis_moved(report, Axis::Semantic)
750 {
751 let cost_d = axis_delta(report, Axis::Cost);
752 if cost_d > 0.0 {
753 out.push(Recommendation {
754 severity: RecommendationSeverity::Error,
755 action: ActionKind::RootCause,
756 turn: 0,
757 baseline_turn: 0,
758 candidate_turn: 0,
759 message: "Cost up severely with latency stable. Suggests cache control \
760 stopped being honored."
761 .to_string(),
762 rationale: format!(
763 "Cross-axis signature: cost Δ {cost_d:+.3}, latency stable, semantic \
764 stable. Cache-hit latency without cache-hit pricing means the request \
765 hit the cache for performance but billed at the uncached rate. Common \
766 causes: SDK upgrade dropped the `cache_control` flag, prompt-prefix \
767 drift broke cache reuse, or the provider changed cache pricing."
768 ),
769 axis: Axis::Cost,
770 confidence: 0.68,
771 });
772 }
773 }
774
775 if axis_severe(report, Axis::Trajectory) && axis_moved(report, Axis::Safety) {
782 let safety_d = axis_delta(report, Axis::Safety);
783 if safety_d < 0.0 {
786 let traj_d = axis_delta(report, Axis::Trajectory);
787 out.push(Recommendation {
788 severity: RecommendationSeverity::Error,
789 action: ActionKind::RootCause,
790 turn: 0,
791 baseline_turn: 0,
792 candidate_turn: 0,
793 message: "Possible prompt-injection or tool-args exfiltration. Trajectory \
794 severe AND refusal rate dropped."
795 .to_string(),
796 rationale: format!(
797 "Cross-axis signature: trajectory Δ {traj_d:+.3} with safety Δ \
798 {safety_d:+.3} (refusing less). Tool calls diverged AND the agent \
799 became more permissive — the canonical signature of a prompt-injected \
800 trace where tool args are being used to exfiltrate or escalate. \
801 Sample 3-5 candidate tool-call payloads against the baseline; look \
802 for unexpected URLs, IDs, or tokens in the input objects."
803 ),
804 axis: Axis::Safety,
805 confidence: 0.75,
806 });
807 }
808 }
809
810 if axis_severe(report, Axis::Latency)
817 && !axis_moved(report, Axis::Cost)
818 && !axis_moved(report, Axis::Semantic)
819 {
820 let lat_d = axis_delta(report, Axis::Latency);
821 let already_explained = out.iter().any(|r| {
824 r.action == ActionKind::RootCause
825 && (r.message.contains("model change") || r.message.contains("context-window"))
826 });
827 if !already_explained && lat_d > 0.0 {
828 out.push(Recommendation {
829 severity: RecommendationSeverity::Warning,
830 action: ActionKind::RootCause,
831 turn: 0,
832 baseline_turn: 0,
833 candidate_turn: 0,
834 message: "Latency up severely with cost stable. Provider-side capacity or \
835 network change."
836 .to_string(),
837 rationale: format!(
838 "Cross-axis signature: latency Δ {lat_d:+.3}, cost stable, semantic \
839 stable. Same model, same output length, slower response. Common \
840 causes: provider capacity event, network path change, regional \
841 fail-over. Check provider status pages for the candidate's run window \
842 before treating this as a code regression."
843 ),
844 axis: Axis::Latency,
845 confidence: 0.65,
846 });
847 }
848 }
849
850 out
851}
852
853fn axis_delta(report: &DiffReport, axis: Axis) -> f64 {
856 report
857 .rows
858 .iter()
859 .find(|r| r.axis == axis)
860 .map(|r| r.delta)
861 .unwrap_or(0.0)
862}
863
864#[cfg(test)]
869mod tests {
870 use super::*;
871 use crate::diff::axes::{Axis, AxisStat, Severity};
872
873 fn empty_report() -> DiffReport {
874 let rows = Axis::all().iter().map(|a| AxisStat::empty(*a)).collect();
875 DiffReport {
876 rows,
877 baseline_trace_id: String::new(),
878 candidate_trace_id: String::new(),
879 pair_count: 0,
880 first_divergence: None,
881 divergences: Vec::new(),
882 recommendations: Vec::new(),
883 drill_down: Vec::new(),
884 }
885 }
886
887 fn divergence(
888 kind: DivergenceKind,
889 axis: Axis,
890 explanation: &str,
891 confidence: f64,
892 ) -> FirstDivergence {
893 FirstDivergence {
894 baseline_turn: 3,
895 candidate_turn: 3,
896 kind,
897 primary_axis: axis,
898 explanation: explanation.to_string(),
899 confidence,
900 }
901 }
902
903 #[test]
904 fn no_divergences_produces_no_recommendations() {
905 let out = generate(&empty_report());
906 assert!(out.is_empty());
907 }
908
909 #[test]
910 fn dropped_tool_becomes_restore_error() {
911 let mut r = empty_report();
912 r.divergences.push(divergence(
913 DivergenceKind::Structural,
914 Axis::Trajectory,
915 "candidate dropped tool call(s): `send_confirmation_email(order_id,to)`",
916 0.9,
917 ));
918 let recs = generate(&r);
919 assert_eq!(recs.len(), 1);
920 let rec = &recs[0];
921 assert_eq!(rec.severity, RecommendationSeverity::Error);
922 assert_eq!(rec.action, ActionKind::Restore);
923 assert!(rec.message.contains("Restore"));
924 assert!(rec.message.contains("send_confirmation_email"));
925 assert_eq!(rec.turn, 3);
926 assert_eq!(rec.baseline_turn, rec.turn);
930 assert_eq!(rec.candidate_turn, 3);
931 }
932
933 #[test]
934 fn baseline_turn_can_exceed_pair_count_when_candidate_dropped_turns() {
935 let mut r = empty_report();
943 r.pair_count = 3; r.divergences.push(FirstDivergence {
945 baseline_turn: 4,
946 candidate_turn: 2, kind: DivergenceKind::Structural,
948 primary_axis: Axis::Trajectory,
949 explanation: "candidate dropped tool call(s): `send_email(to)`".to_string(),
950 confidence: 0.9,
951 });
952 let recs = generate(&r);
953 let rec = recs
954 .iter()
955 .find(|r| r.action == ActionKind::Restore)
956 .unwrap();
957 assert!(rec.baseline_turn >= r.pair_count);
958 assert_eq!(rec.baseline_turn, 4);
959 assert_eq!(rec.candidate_turn, 2);
960 assert_eq!(rec.turn, rec.baseline_turn);
961 }
962
963 #[test]
964 fn recommendation_serializes_with_both_turn_fields() {
965 let mut r = empty_report();
969 r.divergences.push(divergence(
970 DivergenceKind::Structural,
971 Axis::Trajectory,
972 "candidate dropped tool call(s): `x(y)`",
973 0.9,
974 ));
975 let recs = generate(&r);
976 let json = serde_json::to_value(&recs[0]).unwrap();
977 assert!(json.get("turn").is_some());
978 assert!(json.get("baseline_turn").is_some());
979 assert!(json.get("candidate_turn").is_some());
980 }
981
982 #[test]
983 fn duplicate_tool_becomes_remove_error() {
984 let mut r = empty_report();
985 r.divergences.push(divergence(
986 DivergenceKind::Structural,
987 Axis::Trajectory,
988 "candidate called `lookup_order(order_id)` 2 time(s) vs baseline's 1 — duplicate tool invocation",
989 0.5,
990 ));
991 let recs = generate(&r);
992 assert_eq!(recs.len(), 1);
993 let rec = &recs[0];
994 assert_eq!(rec.severity, RecommendationSeverity::Error);
995 assert_eq!(rec.action, ActionKind::Remove);
996 assert!(rec.message.contains("Remove duplicate"));
997 assert!(rec.message.contains("lookup_order"));
998 }
999
1000 #[test]
1001 fn added_tool_becomes_review_error() {
1002 let mut r = empty_report();
1003 r.divergences.push(divergence(
1004 DivergenceKind::Structural,
1005 Axis::Trajectory,
1006 "candidate added tool call(s): `new_tool(arg)`",
1007 0.7,
1008 ));
1009 let recs = generate(&r);
1010 assert_eq!(recs.len(), 1);
1011 assert_eq!(recs[0].action, ActionKind::Review);
1012 assert_eq!(recs[0].severity, RecommendationSeverity::Error);
1013 }
1014
1015 #[test]
1016 fn refusal_flip_to_content_filter_is_error() {
1017 let mut r = empty_report();
1018 r.divergences.push(divergence(
1019 DivergenceKind::Decision,
1020 Axis::Safety,
1021 "stop_reason changed: `end_turn` → `content_filter`",
1022 0.8,
1023 ));
1024 let recs = generate(&r);
1025 assert_eq!(recs.len(), 1);
1026 assert_eq!(recs[0].severity, RecommendationSeverity::Error);
1028 assert_eq!(recs[0].action, ActionKind::Review);
1029 assert!(recs[0].message.to_lowercase().contains("refusal"));
1030 }
1031
1032 #[test]
1033 fn arg_value_change_becomes_revert_warning() {
1034 let mut r = empty_report();
1035 r.divergences.push(divergence(
1036 DivergenceKind::Decision,
1037 Axis::Trajectory,
1038 "tool arg value changed: `refund(amount)`: `99.99` → `9.99`",
1039 0.6,
1040 ));
1041 let recs = generate(&r);
1042 assert_eq!(recs.len(), 1);
1043 assert_eq!(recs[0].severity, RecommendationSeverity::Warning);
1044 assert_eq!(recs[0].action, ActionKind::Revert);
1045 assert!(recs[0].message.contains("Revert"));
1046 assert!(recs[0].message.contains("refund(amount)"));
1047 }
1048
1049 #[test]
1050 fn semantic_decision_drift_becomes_review_warning() {
1051 let mut r = empty_report();
1052 r.divergences.push(divergence(
1053 DivergenceKind::Decision,
1054 Axis::Semantic,
1055 "response text diverged (text similarity 0.10); same tool sequence",
1056 0.6,
1057 ));
1058 let recs = generate(&r);
1059 assert_eq!(recs.len(), 1);
1060 assert_eq!(recs[0].severity, RecommendationSeverity::Warning);
1061 assert_eq!(recs[0].action, ActionKind::Review);
1062 }
1063
1064 #[test]
1065 fn style_drift_becomes_verify_info() {
1066 let mut r = empty_report();
1067 r.divergences.push(divergence(
1068 DivergenceKind::Style,
1069 Axis::Semantic,
1070 "cosmetic wording change — tool sequence preserved",
1071 0.3,
1072 ));
1073 let recs = generate(&r);
1074 assert_eq!(recs.len(), 1);
1075 assert_eq!(recs[0].severity, RecommendationSeverity::Info);
1076 assert_eq!(recs[0].action, ActionKind::Verify);
1077 }
1078
1079 #[test]
1080 fn sort_puts_errors_before_warnings_before_info() {
1081 let mut r = empty_report();
1082 r.divergences.push(divergence(
1083 DivergenceKind::Style,
1084 Axis::Semantic,
1085 "cosmetic wording change",
1086 0.9, ));
1088 r.divergences.push(divergence(
1089 DivergenceKind::Structural,
1090 Axis::Trajectory,
1091 "candidate dropped tool call(s): `x(y)`",
1092 0.2, ));
1094 r.divergences.push(divergence(
1095 DivergenceKind::Decision,
1096 Axis::Trajectory,
1097 "tool arg value changed: `f(a)`: `1` → `2`",
1098 0.5,
1099 ));
1100 let recs = generate(&r);
1101 assert_eq!(recs.len(), 3);
1102 assert_eq!(recs[0].severity, RecommendationSeverity::Error);
1103 assert_eq!(recs[1].severity, RecommendationSeverity::Warning);
1104 assert_eq!(recs[2].severity, RecommendationSeverity::Info);
1105 }
1106
1107 #[test]
1108 fn trace_wide_severe_axis_adds_fallback_recommendation() {
1109 let mut r = empty_report();
1112 let row = r
1113 .rows
1114 .iter_mut()
1115 .find(|a| a.axis == Axis::Semantic)
1116 .unwrap();
1117 row.delta = -0.6;
1118 row.baseline_median = 1.0;
1119 row.candidate_median = 0.4;
1120 row.ci95_low = -0.7;
1121 row.ci95_high = -0.5;
1122 row.severity = Severity::Severe;
1123 row.n = 20;
1124 let recs = generate(&r);
1125 assert_eq!(recs.len(), 1);
1126 assert_eq!(recs[0].severity, RecommendationSeverity::Error);
1127 assert_eq!(recs[0].action, ActionKind::Review);
1128 assert_eq!(recs[0].turn, 0);
1129 assert!(recs[0].message.contains("semantic"));
1130 assert!(recs[0].rationale.contains("severe"));
1131 }
1132
1133 #[test]
1134 fn trace_wide_fallback_skipped_when_error_already_exists() {
1135 let mut r = empty_report();
1136 r.divergences.push(divergence(
1137 DivergenceKind::Structural,
1138 Axis::Trajectory,
1139 "candidate dropped tool call(s): `x(y)`",
1140 0.8,
1141 ));
1142 let row = r
1143 .rows
1144 .iter_mut()
1145 .find(|a| a.axis == Axis::Semantic)
1146 .unwrap();
1147 row.delta = -0.6;
1148 row.severity = Severity::Severe;
1149 row.n = 20;
1150 let recs = generate(&r);
1151 assert_eq!(recs.len(), 1);
1153 assert_eq!(recs[0].severity, RecommendationSeverity::Error);
1154 }
1155
1156 #[test]
1157 fn output_capped_at_8() {
1158 let mut r = empty_report();
1159 for i in 0..15 {
1160 r.divergences.push(divergence(
1161 DivergenceKind::Decision,
1162 Axis::Trajectory,
1163 &format!("tool arg value changed: `f(a)`: `{i}` → `{}`", i + 1),
1164 0.5,
1165 ));
1166 }
1167 let recs = generate(&r);
1168 assert_eq!(recs.len(), 8);
1169 }
1170
1171 #[test]
1172 fn extract_backticked_pulls_first_token() {
1173 assert_eq!(
1174 extract_backticked("before `first(token)` middle `second`"),
1175 Some("first(token)")
1176 );
1177 assert_eq!(extract_backticked("no backticks here"), None);
1178 assert_eq!(extract_backticked("`only-one`"), Some("only-one"));
1179 }
1180
1181 #[test]
1182 fn severity_rank_ordering_is_error_above_warning_above_info() {
1183 assert!(RecommendationSeverity::Error.rank() > RecommendationSeverity::Warning.rank());
1184 assert!(RecommendationSeverity::Warning.rank() > RecommendationSeverity::Info.rank());
1185 }
1186
1187 fn force_axis_severe(report: &mut DiffReport, axis: Axis, delta: f64) {
1192 let row = report.rows.iter_mut().find(|a| a.axis == axis).unwrap();
1193 row.delta = delta;
1194 row.baseline_median = if delta < 0.0 { 1.0 } else { 0.0 };
1195 row.candidate_median = row.baseline_median + delta;
1196 row.ci95_low = delta - 0.05;
1197 row.ci95_high = delta + 0.05;
1198 row.severity = Severity::Severe;
1199 row.n = 20;
1200 }
1201
1202 fn force_axis_moderate(report: &mut DiffReport, axis: Axis, delta: f64) {
1203 let row = report.rows.iter_mut().find(|a| a.axis == axis).unwrap();
1204 row.delta = delta;
1205 row.baseline_median = if delta < 0.0 { 1.0 } else { 0.0 };
1206 row.candidate_median = row.baseline_median + delta;
1207 row.ci95_low = delta - 0.05;
1208 row.ci95_high = delta + 0.05;
1209 row.severity = Severity::Moderate;
1210 row.n = 20;
1211 }
1212
1213 #[test]
1214 fn model_swap_signature_emits_root_cause() {
1215 let mut r = empty_report();
1216 force_axis_moderate(&mut r, Axis::Cost, 0.6);
1217 force_axis_moderate(&mut r, Axis::Latency, 0.8);
1218 force_axis_moderate(&mut r, Axis::Semantic, -0.3);
1219 let recs = generate(&r);
1220 let model_swap_rec = recs
1221 .iter()
1222 .find(|r| r.action == ActionKind::RootCause && r.message.contains("model change"));
1223 assert!(
1224 model_swap_rec.is_some(),
1225 "model-swap signature should produce a root-cause recommendation; got {:#?}",
1226 recs
1227 );
1228 let rec = model_swap_rec.unwrap();
1229 assert_eq!(rec.severity, RecommendationSeverity::Error);
1230 assert!(rec.rationale.contains("cost"));
1231 assert!(rec.rationale.contains("latency"));
1232 assert!(rec.rationale.contains("semantic"));
1233 }
1234
1235 #[test]
1236 fn prompt_drift_signature_fires_when_only_two_axes_move() {
1237 let mut r = empty_report();
1238 force_axis_moderate(&mut r, Axis::Semantic, -0.2);
1239 force_axis_moderate(&mut r, Axis::Verbosity, 0.4);
1240 let recs = generate(&r);
1242 let prompt_rec = recs
1243 .iter()
1244 .find(|r| r.action == ActionKind::RootCause && r.message.contains("prompt"));
1245 assert!(prompt_rec.is_some());
1246 let no_model = recs
1247 .iter()
1248 .all(|r| !(r.action == ActionKind::RootCause && r.message.contains("model change")));
1249 assert!(no_model, "prompt-drift should not also fire model_swap");
1250 }
1251
1252 #[test]
1253 fn prompt_drift_suppressed_when_model_swap_already_fires() {
1254 let mut r = empty_report();
1255 force_axis_moderate(&mut r, Axis::Cost, 0.5);
1256 force_axis_moderate(&mut r, Axis::Latency, 0.7);
1257 force_axis_moderate(&mut r, Axis::Semantic, -0.3);
1258 force_axis_moderate(&mut r, Axis::Verbosity, 0.4);
1259 let recs = generate(&r);
1260 let n_root_cause = recs
1261 .iter()
1262 .filter(|r| r.action == ActionKind::RootCause)
1263 .count();
1264 let n_model = recs
1268 .iter()
1269 .filter(|r| r.action == ActionKind::RootCause && r.message.contains("model change"))
1270 .count();
1271 let n_prompt = recs
1272 .iter()
1273 .filter(|r| r.action == ActionKind::RootCause && r.message.contains("prompt"))
1274 .count();
1275 assert_eq!(n_model, 1);
1276 assert_eq!(
1277 n_prompt, 0,
1278 "prompt drift should be suppressed; got {n_root_cause} root-causes"
1279 );
1280 }
1281
1282 #[test]
1283 fn refusal_escalation_fires_on_severe_safety_with_positive_delta() {
1284 let mut r = empty_report();
1285 force_axis_severe(&mut r, Axis::Safety, 0.4); let recs = generate(&r);
1287 let refusal_rec = recs
1288 .iter()
1289 .find(|r| r.action == ActionKind::RootCause && r.message.contains("Refusal rate"));
1290 assert!(refusal_rec.is_some(), "got {:#?}", recs);
1291 assert_eq!(refusal_rec.unwrap().severity, RecommendationSeverity::Error);
1292 }
1293
1294 #[test]
1295 fn refusal_escalation_does_not_fire_on_negative_safety_delta() {
1296 let mut r = empty_report();
1299 force_axis_severe(&mut r, Axis::Safety, -0.4);
1300 let recs = generate(&r);
1301 let refusal_rec = recs
1302 .iter()
1303 .find(|r| r.action == ActionKind::RootCause && r.message.contains("Refusal rate"));
1304 assert!(refusal_rec.is_none());
1305 }
1306
1307 #[test]
1308 fn tool_schema_migration_fires_on_severe_trajectory_plus_reasoning() {
1309 let mut r = empty_report();
1310 force_axis_severe(&mut r, Axis::Trajectory, 0.5);
1311 force_axis_moderate(&mut r, Axis::Reasoning, 0.3);
1312 let recs = generate(&r);
1313 let tool_rec = recs
1314 .iter()
1315 .find(|r| r.action == ActionKind::RootCause && r.message.contains("tool-schema"));
1316 assert!(tool_rec.is_some(), "got {:#?}", recs);
1317 }
1318
1319 #[test]
1320 fn hallucination_cluster_fires_on_semantic_plus_judge() {
1321 let mut r = empty_report();
1322 force_axis_moderate(&mut r, Axis::Semantic, -0.3);
1323 force_axis_moderate(&mut r, Axis::Judge, -0.4);
1324 let recs = generate(&r);
1325 let halluc_rec = recs
1326 .iter()
1327 .find(|r| r.action == ActionKind::RootCause && r.message.contains("hallucination"));
1328 assert!(halluc_rec.is_some(), "got {:#?}", recs);
1329 assert_eq!(halluc_rec.unwrap().severity, RecommendationSeverity::Error);
1330 }
1331
1332 #[test]
1333 fn single_axis_movement_triggers_at_most_one_root_cause() {
1334 let mut r = empty_report();
1340 force_axis_severe(&mut r, Axis::Trajectory, 0.7);
1341 let recs = generate(&r);
1342 let n_root = recs
1343 .iter()
1344 .filter(|r| r.action == ActionKind::RootCause)
1345 .count();
1346 assert!(
1347 n_root <= 1,
1348 "single-axis trajectory fired {n_root} root-causes: {recs:#?}"
1349 );
1350 }
1351
1352 #[test]
1353 fn root_cause_action_label_is_root_cause() {
1354 assert_eq!(ActionKind::RootCause.label(), "root_cause");
1355 }
1356
1357 #[test]
1362 fn context_window_overflow_fires_on_severe_cost_plus_reasoning() {
1363 let mut r = empty_report();
1364 force_axis_severe(&mut r, Axis::Cost, 0.7);
1365 force_axis_moderate(&mut r, Axis::Reasoning, -0.4);
1366 let recs = generate(&r);
1367 let context_rec = recs
1368 .iter()
1369 .find(|r| r.action == ActionKind::RootCause && r.message.contains("context-window"));
1370 assert!(context_rec.is_some(), "got {:#?}", recs);
1371 }
1372
1373 #[test]
1374 fn context_window_suppressed_when_model_swap_explains_cost() {
1375 let mut r = empty_report();
1376 force_axis_severe(&mut r, Axis::Cost, 0.7);
1378 force_axis_moderate(&mut r, Axis::Latency, 0.5);
1379 force_axis_moderate(&mut r, Axis::Semantic, -0.3);
1380 force_axis_moderate(&mut r, Axis::Reasoning, -0.4);
1381 let recs = generate(&r);
1382 let n_model = recs
1383 .iter()
1384 .filter(|r| r.action == ActionKind::RootCause && r.message.contains("model change"))
1385 .count();
1386 let n_context = recs
1387 .iter()
1388 .filter(|r| r.action == ActionKind::RootCause && r.message.contains("context-window"))
1389 .count();
1390 assert_eq!(n_model, 1);
1391 assert_eq!(
1392 n_context, 0,
1393 "context-window should be suppressed; got {:#?}",
1394 recs
1395 );
1396 }
1397
1398 #[test]
1399 fn retry_loop_fires_on_severe_trajectory_plus_latency_without_reasoning() {
1400 let mut r = empty_report();
1401 force_axis_severe(&mut r, Axis::Trajectory, 0.5);
1402 force_axis_moderate(&mut r, Axis::Latency, 0.4);
1403 let recs = generate(&r);
1405 let retry_rec = recs
1406 .iter()
1407 .find(|r| r.action == ActionKind::RootCause && r.message.contains("retry loop"));
1408 assert!(retry_rec.is_some(), "got {:#?}", recs);
1409 }
1410
1411 #[test]
1412 fn retry_loop_suppressed_when_tool_schema_explains_trajectory() {
1413 let mut r = empty_report();
1414 force_axis_severe(&mut r, Axis::Trajectory, 0.5);
1415 force_axis_moderate(&mut r, Axis::Reasoning, 0.3);
1416 force_axis_moderate(&mut r, Axis::Latency, 0.4);
1417 let recs = generate(&r);
1418 let n_schema = recs
1419 .iter()
1420 .filter(|r| r.action == ActionKind::RootCause && r.message.contains("tool-schema"))
1421 .count();
1422 let n_retry = recs
1423 .iter()
1424 .filter(|r| r.action == ActionKind::RootCause && r.message.contains("retry loop"))
1425 .count();
1426 assert_eq!(n_schema, 1);
1427 assert_eq!(n_retry, 0);
1428 }
1429
1430 #[test]
1431 fn cost_explosion_cached_mismatch_fires_on_severe_cost_with_stable_latency_and_semantic() {
1432 let mut r = empty_report();
1433 force_axis_severe(&mut r, Axis::Cost, 0.6);
1434 let recs = generate(&r);
1436 let cache_rec = recs
1437 .iter()
1438 .find(|r| r.action == ActionKind::RootCause && r.message.contains("cache control"));
1439 assert!(cache_rec.is_some(), "got {:#?}", recs);
1440 }
1441
1442 #[test]
1443 fn prompt_injection_fires_on_severe_trajectory_plus_negative_safety() {
1444 let mut r = empty_report();
1445 force_axis_severe(&mut r, Axis::Trajectory, 0.5);
1446 force_axis_moderate(&mut r, Axis::Safety, -0.4); let recs = generate(&r);
1448 let inj_rec = recs
1449 .iter()
1450 .find(|r| r.action == ActionKind::RootCause && r.message.contains("prompt-injection"));
1451 assert!(inj_rec.is_some(), "got {:#?}", recs);
1452 }
1453
1454 #[test]
1455 fn prompt_injection_does_not_fire_on_positive_safety_delta() {
1456 let mut r = empty_report();
1458 force_axis_severe(&mut r, Axis::Trajectory, 0.5);
1459 force_axis_moderate(&mut r, Axis::Safety, 0.4);
1460 let recs = generate(&r);
1461 let inj_rec = recs
1462 .iter()
1463 .find(|r| r.action == ActionKind::RootCause && r.message.contains("prompt-injection"));
1464 assert!(inj_rec.is_none());
1465 }
1466
1467 #[test]
1468 fn latency_spike_without_cost_fires_on_severe_latency_alone() {
1469 let mut r = empty_report();
1470 force_axis_severe(&mut r, Axis::Latency, 0.6);
1471 let recs = generate(&r);
1473 let lat_rec = recs.iter().find(|r| {
1474 r.action == ActionKind::RootCause && r.message.contains("Provider-side capacity")
1475 });
1476 assert!(lat_rec.is_some(), "got {:#?}", recs);
1477 }
1478
1479 #[test]
1480 fn latency_spike_suppressed_when_model_swap_explains_it() {
1481 let mut r = empty_report();
1482 force_axis_severe(&mut r, Axis::Cost, 0.5);
1484 force_axis_severe(&mut r, Axis::Latency, 0.6);
1485 force_axis_moderate(&mut r, Axis::Semantic, -0.3);
1486 let recs = generate(&r);
1487 let n_model = recs
1488 .iter()
1489 .filter(|r| r.action == ActionKind::RootCause && r.message.contains("model change"))
1490 .count();
1491 let n_lat_alone = recs
1492 .iter()
1493 .filter(|r| {
1494 r.action == ActionKind::RootCause && r.message.contains("Provider-side capacity")
1495 })
1496 .count();
1497 assert_eq!(n_model, 1);
1498 assert_eq!(n_lat_alone, 0);
1499 }
1500
1501 #[test]
1506 fn no_two_root_causes_fire_on_single_axis_movement() {
1507 for axis in [
1508 Axis::Semantic,
1509 Axis::Trajectory,
1510 Axis::Safety,
1511 Axis::Verbosity,
1512 Axis::Latency,
1513 Axis::Cost,
1514 Axis::Reasoning,
1515 Axis::Judge,
1516 Axis::Conformance,
1517 ] {
1518 let mut r = empty_report();
1519 force_axis_severe(&mut r, axis, 0.5);
1520 let recs = generate(&r);
1521 let n_root = recs
1522 .iter()
1523 .filter(|r| r.action == ActionKind::RootCause)
1524 .count();
1525 assert!(
1526 n_root <= 1,
1527 "single-axis severe on {axis:?} fired {n_root} root-causes; \
1528 patterns must be mutually exclusive on the same single-axis evidence: {recs:#?}"
1529 );
1530 }
1531 }
1532}