use serde::{Deserialize, Serialize};
use crate::diff::alignment::{DivergenceKind, FirstDivergence};
use crate::diff::axes::{Axis, Severity as AxisSeverity};
use crate::diff::report::DiffReport;
/// Severity level attached to a [`Recommendation`].
///
/// Serialized with snake_case variant names. `rank()` orders the variants
/// `Error` > `Warning` > `Info`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum RecommendationSeverity {
/// Highest rank (3).
Error,
/// Middle rank (2).
Warning,
/// Lowest rank (1).
Info,
}
impl RecommendationSeverity {
pub fn label(&self) -> &'static str {
match self {
RecommendationSeverity::Error => "error",
RecommendationSeverity::Warning => "warning",
RecommendationSeverity::Info => "info",
}
}
pub fn rank(&self) -> u8 {
match self {
RecommendationSeverity::Error => 3,
RecommendationSeverity::Warning => 2,
RecommendationSeverity::Info => 1,
}
}
}
/// The kind of action a [`Recommendation`] asks for.
///
/// Serialized with snake_case variant names (e.g. `RootCause` -> `root_cause`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ActionKind {
/// Emitted by `rule_for_divergence` when the candidate dropped a tool call or turn.
Restore,
/// Emitted by `rule_for_divergence` for a duplicate tool invocation.
Remove,
/// Emitted by `rule_for_divergence` when a tool argument value changed.
Revert,
/// Generic "look at this" action; also used for the trace-wide fallback in `generate`.
Review,
/// Emitted by `rule_for_divergence` for style-only divergences.
Verify,
/// Emitted by `detect_cross_axis_patterns` for trace-wide cross-axis signatures.
RootCause,
}
impl ActionKind {
pub fn label(&self) -> &'static str {
match self {
ActionKind::Restore => "restore",
ActionKind::Remove => "remove",
ActionKind::Revert => "revert",
ActionKind::Review => "review",
ActionKind::Verify => "verify",
ActionKind::RootCause => "root_cause",
}
}
}
/// One actionable finding derived from a diff report.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Recommendation {
/// How urgent the finding is; primary sort key in `generate`.
pub severity: RecommendationSeverity,
/// What the reader is asked to do.
pub action: ActionKind,
/// Turn the finding refers to. Per-divergence rules set it to the divergence's
/// `baseline_turn`; trace-wide findings set it to 0.
pub turn: usize,
/// Baseline-side turn index; defaults to 0 when absent from serialized input.
#[serde(default)]
pub baseline_turn: usize,
/// Candidate-side turn index; defaults to 0 when absent from serialized input.
#[serde(default)]
pub candidate_turn: usize,
/// Short imperative summary of the finding.
pub message: String,
/// Supporting explanation (the divergence explanation, or a formatted axis summary).
pub rationale: String,
/// Axis the finding is attributed to.
pub axis: Axis,
/// Confidence score; copied from the divergence or a fixed per-pattern constant.
/// Secondary sort key in `generate` (higher first).
pub confidence: f64,
}
pub fn generate(report: &DiffReport) -> Vec<Recommendation> {
let mut out: Vec<Recommendation> = Vec::new();
out.extend(detect_cross_axis_patterns(report));
for dv in &report.divergences {
if let Some(rec) = rule_for_divergence(dv) {
out.push(rec);
}
}
let worst_axis_row = report
.rows
.iter()
.filter(|r| r.severity == AxisSeverity::Severe)
.max_by(|a, b| a.severity.cmp(&b.severity));
if let Some(worst) = worst_axis_row {
let has_error = out
.iter()
.any(|r| r.severity == RecommendationSeverity::Error);
if !has_error {
out.push(Recommendation {
severity: RecommendationSeverity::Error,
action: ActionKind::Review,
turn: 0,
baseline_turn: 0,
candidate_turn: 0,
message: format!(
"Review the candidate: {} axis shifted with severity {}.",
worst.axis.label(),
worst.severity.label(),
),
rationale: format!(
"Aggregate signal crosses the `severe` threshold \
({}: delta {:+.3}, CI [{:+.3}, {:+.3}]).",
worst.axis.label(),
worst.delta,
worst.ci95_low,
worst.ci95_high,
),
axis: worst.axis,
confidence: 0.8,
});
}
}
out.sort_by(|a, b| {
b.severity
.rank()
.cmp(&a.severity.rank())
.then_with(|| {
b.confidence
.partial_cmp(&a.confidence)
.unwrap_or(std::cmp::Ordering::Equal)
})
.then_with(|| a.turn.cmp(&b.turn))
});
out.truncate(8);
out
}
fn rule_for_divergence(dv: &FirstDivergence) -> Option<Recommendation> {
let exp = dv.explanation.to_lowercase();
match dv.kind {
DivergenceKind::Structural => {
if exp.contains("dropped tool")
|| exp.contains("dropped a response turn")
|| exp.contains("dropped a turn")
{
let tool_ref = extract_backticked(&dv.explanation).unwrap_or("missing element");
Some(Recommendation {
severity: RecommendationSeverity::Error,
action: ActionKind::Restore,
turn: dv.baseline_turn,
baseline_turn: dv.baseline_turn,
candidate_turn: dv.candidate_turn,
message: format!("Restore {tool_ref} at turn {}.", dv.baseline_turn),
rationale: dv.explanation.clone(),
axis: dv.primary_axis,
confidence: dv.confidence,
})
}
else if exp.contains("added tool") || exp.contains("inserted an extra") {
let tool_ref = extract_backticked(&dv.explanation).unwrap_or("extra element");
Some(Recommendation {
severity: RecommendationSeverity::Error,
action: ActionKind::Review,
turn: dv.baseline_turn,
baseline_turn: dv.baseline_turn,
candidate_turn: dv.candidate_turn,
message: format!(
"Review unexpected addition at turn {}: {tool_ref}.",
dv.baseline_turn
),
rationale: dv.explanation.clone(),
axis: dv.primary_axis,
confidence: dv.confidence,
})
}
else if exp.contains("duplicate tool") {
let tool_ref = extract_backticked(&dv.explanation).unwrap_or("the duplicated tool");
Some(Recommendation {
severity: RecommendationSeverity::Error,
action: ActionKind::Remove,
turn: dv.baseline_turn,
baseline_turn: dv.baseline_turn,
candidate_turn: dv.candidate_turn,
message: format!(
"Remove duplicate invocation of {tool_ref} at turn {}.",
dv.baseline_turn
),
rationale: dv.explanation.clone(),
axis: dv.primary_axis,
confidence: dv.confidence,
})
}
else if exp.contains("tool set changed") || exp.contains("tool ordering differs") {
Some(Recommendation {
severity: RecommendationSeverity::Error,
action: ActionKind::Review,
turn: dv.baseline_turn,
baseline_turn: dv.baseline_turn,
candidate_turn: dv.candidate_turn,
message: format!(
"Review tool-schema change at turn {}: call shape diverged.",
dv.baseline_turn
),
rationale: dv.explanation.clone(),
axis: dv.primary_axis,
confidence: dv.confidence,
})
}
else {
Some(Recommendation {
severity: RecommendationSeverity::Error,
action: ActionKind::Review,
turn: dv.baseline_turn,
baseline_turn: dv.baseline_turn,
candidate_turn: dv.candidate_turn,
message: format!("Review structural change at turn {}.", dv.baseline_turn),
rationale: dv.explanation.clone(),
axis: dv.primary_axis,
confidence: dv.confidence,
})
}
}
DivergenceKind::Decision => {
if dv.primary_axis == Axis::Safety && exp.contains("stop_reason") {
let is_new_refusal = exp.contains("content_filter");
let severity = if is_new_refusal {
RecommendationSeverity::Error
} else {
RecommendationSeverity::Warning
};
Some(Recommendation {
severity,
action: ActionKind::Review,
turn: dv.baseline_turn,
baseline_turn: dv.baseline_turn,
candidate_turn: dv.candidate_turn,
message: format!(
"Review refusal behaviour at turn {}: candidate may be over-refusing.",
dv.baseline_turn
),
rationale: dv.explanation.clone(),
axis: dv.primary_axis,
confidence: dv.confidence,
})
}
else if dv.primary_axis == Axis::Trajectory && exp.contains("arg value") {
let arg_ref = extract_backticked(&dv.explanation).unwrap_or("arg value");
Some(Recommendation {
severity: RecommendationSeverity::Warning,
action: ActionKind::Revert,
turn: dv.baseline_turn,
baseline_turn: dv.baseline_turn,
candidate_turn: dv.candidate_turn,
message: format!(
"Revert {arg_ref} at turn {} to the baseline value.",
dv.baseline_turn
),
rationale: dv.explanation.clone(),
axis: dv.primary_axis,
confidence: dv.confidence,
})
}
else if dv.primary_axis == Axis::Semantic {
Some(Recommendation {
severity: RecommendationSeverity::Warning,
action: ActionKind::Review,
turn: dv.baseline_turn,
baseline_turn: dv.baseline_turn,
candidate_turn: dv.candidate_turn,
message: format!(
"Review response text at turn {}: semantic content shifted.",
dv.baseline_turn
),
rationale: dv.explanation.clone(),
axis: dv.primary_axis,
confidence: dv.confidence,
})
}
else {
Some(Recommendation {
severity: RecommendationSeverity::Warning,
action: ActionKind::Review,
turn: dv.baseline_turn,
baseline_turn: dv.baseline_turn,
candidate_turn: dv.candidate_turn,
message: format!("Review decision change at turn {}.", dv.baseline_turn),
rationale: dv.explanation.clone(),
axis: dv.primary_axis,
confidence: dv.confidence,
})
}
}
DivergenceKind::Style => Some(Recommendation {
severity: RecommendationSeverity::Info,
action: ActionKind::Verify,
turn: dv.baseline_turn,
baseline_turn: dv.baseline_turn,
candidate_turn: dv.candidate_turn,
message: format!(
"Cosmetic wording change at turn {} — verify intended.",
dv.baseline_turn
),
rationale: dv.explanation.clone(),
axis: dv.primary_axis,
confidence: dv.confidence,
}),
}
}
/// Returns the text between the first pair of backticks in `s`.
///
/// Yields `None` when `s` has no backtick or the first backtick is never closed.
/// An empty pair (two adjacent backticks) yields `Some("")`.
fn extract_backticked(s: &str) -> Option<&str> {
    let (_before, after_open) = s.split_once('`')?;
    let (inner, _after_close) = after_open.split_once('`')?;
    Some(inner)
}
/// Reports whether the first row for `axis` has `Moderate` or `Severe` severity.
///
/// Returns `false` when the report has no row for that axis.
fn axis_moved(report: &DiffReport, axis: Axis) -> bool {
    match report.rows.iter().find(|row| row.axis == axis) {
        Some(row) => {
            row.severity == AxisSeverity::Moderate || row.severity == AxisSeverity::Severe
        }
        None => false,
    }
}
/// Reports whether the first row for `axis` has `Severe` severity.
///
/// Returns `false` when the report has no row for that axis.
fn axis_severe(report: &DiffReport, axis: Axis) -> bool {
    for row in report.rows.iter() {
        if row.axis == axis {
            // Only the first row for the axis is consulted.
            return row.severity == AxisSeverity::Severe;
        }
    }
    false
}
fn detect_cross_axis_patterns(report: &DiffReport) -> Vec<Recommendation> {
let mut out = Vec::new();
if axis_moved(report, Axis::Cost)
&& axis_moved(report, Axis::Latency)
&& axis_moved(report, Axis::Semantic)
{
let cost_delta = axis_delta(report, Axis::Cost);
let lat_delta = axis_delta(report, Axis::Latency);
let sem_delta = axis_delta(report, Axis::Semantic);
out.push(Recommendation {
severity: RecommendationSeverity::Error,
action: ActionKind::RootCause,
turn: 0,
baseline_turn: 0,
candidate_turn: 0,
message:
"Looks like a model change. Cost, latency, and semantic axes all shifted together."
.to_string(),
rationale: format!(
"Cross-axis signature: cost Δ {cost_delta:+.3}, latency Δ {lat_delta:+.3}, \
semantic Δ {sem_delta:+.3}. Three axes moving together is the canonical \
model-swap signature (provider change, frontier→haiku, etc.). Diff the \
`model` field across configs first."
),
axis: Axis::Cost,
confidence: 0.85,
});
}
if axis_moved(report, Axis::Semantic) && axis_moved(report, Axis::Verbosity) {
let already_model_swap = out.iter().any(|r| {
matches!(r.action, ActionKind::RootCause)
&& r.message.starts_with("Looks like a model change")
});
if !already_model_swap {
let sem_delta = axis_delta(report, Axis::Semantic);
let vrb_delta = axis_delta(report, Axis::Verbosity);
let safety_part = if axis_moved(report, Axis::Safety) {
" plus safety axis (refusal-style instruction change)"
} else {
""
};
out.push(Recommendation {
severity: RecommendationSeverity::Warning,
action: ActionKind::RootCause,
turn: 0,
baseline_turn: 0,
candidate_turn: 0,
message: format!(
"Looks like a system-prompt edit. Semantic + verbosity moved together{safety_part}."
),
rationale: format!(
"Cross-axis signature: semantic Δ {sem_delta:+.3}, verbosity Δ {vrb_delta:+.3}. \
Diff the `system` field of the request across configs."
),
axis: Axis::Semantic,
confidence: 0.70,
});
}
}
if axis_severe(report, Axis::Safety) {
let safety_delta = axis_delta(report, Axis::Safety);
if safety_delta > 0.0 {
out.push(Recommendation {
severity: RecommendationSeverity::Error,
action: ActionKind::RootCause,
turn: 0,
baseline_turn: 0,
candidate_turn: 0,
message: "Refusal rate is up severely. Check for stricter system instructions \
or tighter content policies."
.to_string(),
rationale: format!(
"Safety axis severe with positive delta {safety_delta:+.3} — the candidate \
refused or was content-filtered more often than baseline. Common causes: \
added safety preamble in system prompt, model upgrade with stricter RLHF, \
provider-side content-filter tightening."
),
axis: Axis::Safety,
confidence: 0.80,
});
}
}
if axis_severe(report, Axis::Trajectory) && axis_moved(report, Axis::Reasoning) {
let traj_delta = axis_delta(report, Axis::Trajectory);
let reason_delta = axis_delta(report, Axis::Reasoning);
out.push(Recommendation {
severity: RecommendationSeverity::Error,
action: ActionKind::RootCause,
turn: 0,
baseline_turn: 0,
candidate_turn: 0,
message: "Looks like a tool-schema migration. Trajectory + reasoning both moved."
.to_string(),
rationale: format!(
"Cross-axis signature: trajectory Δ {traj_delta:+.3} (tool sequence/args \
changed), reasoning Δ {reason_delta:+.3} (the model is thinking through a \
different schema). Diff the `tools` array across configs and check whether \
arg keys were added or removed."
),
axis: Axis::Trajectory,
confidence: 0.78,
});
}
if axis_moved(report, Axis::Semantic) && axis_moved(report, Axis::Judge) {
let sem_delta = axis_delta(report, Axis::Semantic);
let judge_delta = axis_delta(report, Axis::Judge);
let verbosity_part = if axis_moved(report, Axis::Verbosity) {
", with verbosity also up"
} else {
""
};
out.push(Recommendation {
severity: RecommendationSeverity::Error,
action: ActionKind::RootCause,
turn: 0,
baseline_turn: 0,
candidate_turn: 0,
message: format!(
"Possible hallucination regression. Semantic and judge axes both moved{verbosity_part}."
),
rationale: format!(
"Cross-axis signature: semantic Δ {sem_delta:+.3}, judge Δ {judge_delta:+.3}. \
The classic 'confident-and-wrong' signature — the response diverged \
semantically AND was scored lower by the rubric. Sample 3-5 candidate \
outputs and verify factual claims against ground truth before merging."
),
axis: Axis::Judge,
confidence: 0.82,
});
}
if axis_severe(report, Axis::Cost) && axis_moved(report, Axis::Reasoning) {
let cost_d = axis_delta(report, Axis::Cost);
let reason_d = axis_delta(report, Axis::Reasoning);
let model_swap_active = out
.iter()
.any(|r| r.action == ActionKind::RootCause && r.message.contains("model change"));
if !model_swap_active && cost_d > 0.0 {
out.push(Recommendation {
severity: RecommendationSeverity::Error,
action: ActionKind::RootCause,
turn: 0,
baseline_turn: 0,
candidate_turn: 0,
message: "Possible context-window overflow. Cost spiked severely without a model \
change, and reasoning shifted with it."
.to_string(),
rationale: format!(
"Cross-axis signature: cost Δ {cost_d:+.3} (severe) with reasoning \
Δ {reason_d:+.3}, model unchanged. Common cause: prompt-length growth \
past the effective context window — providers either truncate (lossy \
reasoning) or charge for the full prompt every turn (cost balloons). \
Check prompt-token usage trend across the candidate's turns."
),
axis: Axis::Cost,
confidence: 0.72,
});
}
}
if axis_severe(report, Axis::Trajectory) && axis_moved(report, Axis::Latency) {
let schema_active = out
.iter()
.any(|r| r.action == ActionKind::RootCause && r.message.contains("tool-schema"));
if !schema_active {
let traj_d = axis_delta(report, Axis::Trajectory);
let lat_d = axis_delta(report, Axis::Latency);
if lat_d > 0.0 {
out.push(Recommendation {
severity: RecommendationSeverity::Error,
action: ActionKind::RootCause,
turn: 0,
baseline_turn: 0,
candidate_turn: 0,
message: "Possible retry loop. Trajectory diverged severely with latency \
spike but no reasoning shift."
.to_string(),
rationale: format!(
"Cross-axis signature: trajectory Δ {traj_d:+.3}, latency Δ \
{lat_d:+.3}, reasoning stable. Suggests the agent is retrying a \
failing tool call (each retry inflates the tool-call count and \
adds latency, but doesn't change reasoning depth). Inspect tool \
results for transient errors that the agent is silently retrying."
),
axis: Axis::Trajectory,
confidence: 0.70,
});
}
}
}
if axis_severe(report, Axis::Cost)
&& !axis_moved(report, Axis::Latency)
&& !axis_moved(report, Axis::Semantic)
{
let cost_d = axis_delta(report, Axis::Cost);
if cost_d > 0.0 {
out.push(Recommendation {
severity: RecommendationSeverity::Error,
action: ActionKind::RootCause,
turn: 0,
baseline_turn: 0,
candidate_turn: 0,
message: "Cost up severely with latency stable. Suggests cache control \
stopped being honored."
.to_string(),
rationale: format!(
"Cross-axis signature: cost Δ {cost_d:+.3}, latency stable, semantic \
stable. Cache-hit latency without cache-hit pricing means the request \
hit the cache for performance but billed at the uncached rate. Common \
causes: SDK upgrade dropped the `cache_control` flag, prompt-prefix \
drift broke cache reuse, or the provider changed cache pricing."
),
axis: Axis::Cost,
confidence: 0.68,
});
}
}
if axis_severe(report, Axis::Trajectory) && axis_moved(report, Axis::Safety) {
let safety_d = axis_delta(report, Axis::Safety);
if safety_d < 0.0 {
let traj_d = axis_delta(report, Axis::Trajectory);
out.push(Recommendation {
severity: RecommendationSeverity::Error,
action: ActionKind::RootCause,
turn: 0,
baseline_turn: 0,
candidate_turn: 0,
message: "Possible prompt-injection or tool-args exfiltration. Trajectory \
severe AND refusal rate dropped."
.to_string(),
rationale: format!(
"Cross-axis signature: trajectory Δ {traj_d:+.3} with safety Δ \
{safety_d:+.3} (refusing less). Tool calls diverged AND the agent \
became more permissive — the canonical signature of a prompt-injected \
trace where tool args are being used to exfiltrate or escalate. \
Sample 3-5 candidate tool-call payloads against the baseline; look \
for unexpected URLs, IDs, or tokens in the input objects."
),
axis: Axis::Safety,
confidence: 0.75,
});
}
}
if axis_severe(report, Axis::Latency)
&& !axis_moved(report, Axis::Cost)
&& !axis_moved(report, Axis::Semantic)
{
let lat_d = axis_delta(report, Axis::Latency);
let already_explained = out.iter().any(|r| {
r.action == ActionKind::RootCause
&& (r.message.contains("model change") || r.message.contains("context-window"))
});
if !already_explained && lat_d > 0.0 {
out.push(Recommendation {
severity: RecommendationSeverity::Warning,
action: ActionKind::RootCause,
turn: 0,
baseline_turn: 0,
candidate_turn: 0,
message: "Latency up severely with cost stable. Provider-side capacity or \
network change."
.to_string(),
rationale: format!(
"Cross-axis signature: latency Δ {lat_d:+.3}, cost stable, semantic \
stable. Same model, same output length, slower response. Common \
causes: provider capacity event, network path change, regional \
fail-over. Check provider status pages for the candidate's run window \
before treating this as a code regression."
),
axis: Axis::Latency,
confidence: 0.65,
});
}
}
out
}
/// Returns the `delta` of the first row for `axis`, or `0.0` when the report has
/// no row for that axis.
fn axis_delta(report: &DiffReport, axis: Axis) -> f64 {
    let row_for_axis = report.rows.iter().find(|row| row.axis == axis);
    row_for_axis.map_or(0.0, |row| row.delta)
}
// Unit tests for recommendation generation: per-divergence rules, the trace-wide
// severe-axis fallback, sorting/truncation, and the cross-axis root-cause patterns.
#[cfg(test)]
mod tests {
use super::*;
use crate::diff::axes::{Axis, AxisStat, Severity};
// Report with one `AxisStat::empty` row per axis in `Axis::all()` and no divergences.
fn empty_report() -> DiffReport {
let rows = Axis::all().iter().map(|a| AxisStat::empty(*a)).collect();
DiffReport {
rows,
baseline_trace_id: String::new(),
candidate_trace_id: String::new(),
pair_count: 0,
first_divergence: None,
divergences: Vec::new(),
recommendations: Vec::new(),
drill_down: Vec::new(),
}
}
// Divergence fixture pinned to baseline/candidate turn 3.
fn divergence(
kind: DivergenceKind,
axis: Axis,
explanation: &str,
confidence: f64,
) -> FirstDivergence {
FirstDivergence {
baseline_turn: 3,
candidate_turn: 3,
kind,
primary_axis: axis,
explanation: explanation.to_string(),
confidence,
}
}
#[test]
fn no_divergences_produces_no_recommendations() {
let out = generate(&empty_report());
assert!(out.is_empty());
}
// "dropped tool" explanation -> Error/Restore naming the back-ticked tool.
#[test]
fn dropped_tool_becomes_restore_error() {
let mut r = empty_report();
r.divergences.push(divergence(
DivergenceKind::Structural,
Axis::Trajectory,
"candidate dropped tool call(s): `send_confirmation_email(order_id,to)`",
0.9,
));
let recs = generate(&r);
assert_eq!(recs.len(), 1);
let rec = &recs[0];
assert_eq!(rec.severity, RecommendationSeverity::Error);
assert_eq!(rec.action, ActionKind::Restore);
assert!(rec.message.contains("Restore"));
assert!(rec.message.contains("send_confirmation_email"));
assert_eq!(rec.turn, 3);
assert_eq!(rec.baseline_turn, rec.turn);
assert_eq!(rec.candidate_turn, 3);
}
// Baseline and candidate turn indices are carried through independently, and
// `turn` mirrors the baseline index even when it is >= `pair_count`.
#[test]
fn baseline_turn_can_exceed_pair_count_when_candidate_dropped_turns() {
let mut r = empty_report();
r.pair_count = 3; r.divergences.push(FirstDivergence {
baseline_turn: 4,
candidate_turn: 2, kind: DivergenceKind::Structural,
primary_axis: Axis::Trajectory,
explanation: "candidate dropped tool call(s): `send_email(to)`".to_string(),
confidence: 0.9,
});
let recs = generate(&r);
let rec = recs
.iter()
.find(|r| r.action == ActionKind::Restore)
.unwrap();
assert!(rec.baseline_turn >= r.pair_count);
assert_eq!(rec.baseline_turn, 4);
assert_eq!(rec.candidate_turn, 2);
assert_eq!(rec.turn, rec.baseline_turn);
}
// The JSON form exposes `turn`, `baseline_turn` and `candidate_turn`.
#[test]
fn recommendation_serializes_with_both_turn_fields() {
let mut r = empty_report();
r.divergences.push(divergence(
DivergenceKind::Structural,
Axis::Trajectory,
"candidate dropped tool call(s): `x(y)`",
0.9,
));
let recs = generate(&r);
let json = serde_json::to_value(&recs[0]).unwrap();
assert!(json.get("turn").is_some());
assert!(json.get("baseline_turn").is_some());
assert!(json.get("candidate_turn").is_some());
}
// "duplicate tool" explanation -> Error/Remove naming the back-ticked tool.
#[test]
fn duplicate_tool_becomes_remove_error() {
let mut r = empty_report();
r.divergences.push(divergence(
DivergenceKind::Structural,
Axis::Trajectory,
"candidate called `lookup_order(order_id)` 2 time(s) vs baseline's 1 — duplicate tool invocation",
0.5,
));
let recs = generate(&r);
assert_eq!(recs.len(), 1);
let rec = &recs[0];
assert_eq!(rec.severity, RecommendationSeverity::Error);
assert_eq!(rec.action, ActionKind::Remove);
assert!(rec.message.contains("Remove duplicate"));
assert!(rec.message.contains("lookup_order"));
}
// "added tool" explanation -> Error/Review.
#[test]
fn added_tool_becomes_review_error() {
let mut r = empty_report();
r.divergences.push(divergence(
DivergenceKind::Structural,
Axis::Trajectory,
"candidate added tool call(s): `new_tool(arg)`",
0.7,
));
let recs = generate(&r);
assert_eq!(recs.len(), 1);
assert_eq!(recs[0].action, ActionKind::Review);
assert_eq!(recs[0].severity, RecommendationSeverity::Error);
}
// Safety-axis stop_reason change mentioning `content_filter` -> Error/Review.
#[test]
fn refusal_flip_to_content_filter_is_error() {
let mut r = empty_report();
r.divergences.push(divergence(
DivergenceKind::Decision,
Axis::Safety,
"stop_reason changed: `end_turn` → `content_filter`",
0.8,
));
let recs = generate(&r);
assert_eq!(recs.len(), 1);
assert_eq!(recs[0].severity, RecommendationSeverity::Error);
assert_eq!(recs[0].action, ActionKind::Review);
assert!(recs[0].message.to_lowercase().contains("refusal"));
}
// Trajectory-axis "arg value" change -> Warning/Revert naming the first
// back-ticked span.
#[test]
fn arg_value_change_becomes_revert_warning() {
let mut r = empty_report();
r.divergences.push(divergence(
DivergenceKind::Decision,
Axis::Trajectory,
"tool arg value changed: `refund(amount)`: `99.99` → `9.99`",
0.6,
));
let recs = generate(&r);
assert_eq!(recs.len(), 1);
assert_eq!(recs[0].severity, RecommendationSeverity::Warning);
assert_eq!(recs[0].action, ActionKind::Revert);
assert!(recs[0].message.contains("Revert"));
assert!(recs[0].message.contains("refund(amount)"));
}
// Semantic-axis decision divergence -> Warning/Review.
#[test]
fn semantic_decision_drift_becomes_review_warning() {
let mut r = empty_report();
r.divergences.push(divergence(
DivergenceKind::Decision,
Axis::Semantic,
"response text diverged (text similarity 0.10); same tool sequence",
0.6,
));
let recs = generate(&r);
assert_eq!(recs.len(), 1);
assert_eq!(recs[0].severity, RecommendationSeverity::Warning);
assert_eq!(recs[0].action, ActionKind::Review);
}
// Style divergence -> Info/Verify.
#[test]
fn style_drift_becomes_verify_info() {
let mut r = empty_report();
r.divergences.push(divergence(
DivergenceKind::Style,
Axis::Semantic,
"cosmetic wording change — tool sequence preserved",
0.3,
));
let recs = generate(&r);
assert_eq!(recs.len(), 1);
assert_eq!(recs[0].severity, RecommendationSeverity::Info);
assert_eq!(recs[0].action, ActionKind::Verify);
}
// Severity outranks confidence: the 0.9-confidence Info entry still sorts last
// and the 0.2-confidence Error entry first.
#[test]
fn sort_puts_errors_before_warnings_before_info() {
let mut r = empty_report();
r.divergences.push(divergence(
DivergenceKind::Style,
Axis::Semantic,
"cosmetic wording change",
0.9, ));
r.divergences.push(divergence(
DivergenceKind::Structural,
Axis::Trajectory,
"candidate dropped tool call(s): `x(y)`",
0.2, ));
r.divergences.push(divergence(
DivergenceKind::Decision,
Axis::Trajectory,
"tool arg value changed: `f(a)`: `1` → `2`",
0.5,
));
let recs = generate(&r);
assert_eq!(recs.len(), 3);
assert_eq!(recs[0].severity, RecommendationSeverity::Error);
assert_eq!(recs[1].severity, RecommendationSeverity::Warning);
assert_eq!(recs[2].severity, RecommendationSeverity::Info);
}
// A lone severe axis with no other Error yields the generic turn-0 Review fallback.
#[test]
fn trace_wide_severe_axis_adds_fallback_recommendation() {
let mut r = empty_report();
let row = r
.rows
.iter_mut()
.find(|a| a.axis == Axis::Semantic)
.unwrap();
row.delta = -0.6;
row.baseline_median = 1.0;
row.candidate_median = 0.4;
row.ci95_low = -0.7;
row.ci95_high = -0.5;
row.severity = Severity::Severe;
row.n = 20;
let recs = generate(&r);
assert_eq!(recs.len(), 1);
assert_eq!(recs[0].severity, RecommendationSeverity::Error);
assert_eq!(recs[0].action, ActionKind::Review);
assert_eq!(recs[0].turn, 0);
assert!(recs[0].message.contains("semantic"));
assert!(recs[0].rationale.contains("severe"));
}
// The fallback is not added when a divergence already produced an Error.
#[test]
fn trace_wide_fallback_skipped_when_error_already_exists() {
let mut r = empty_report();
r.divergences.push(divergence(
DivergenceKind::Structural,
Axis::Trajectory,
"candidate dropped tool call(s): `x(y)`",
0.8,
));
let row = r
.rows
.iter_mut()
.find(|a| a.axis == Axis::Semantic)
.unwrap();
row.delta = -0.6;
row.severity = Severity::Severe;
row.n = 20;
let recs = generate(&r);
assert_eq!(recs.len(), 1);
assert_eq!(recs[0].severity, RecommendationSeverity::Error);
}
// Fifteen divergences are truncated to eight recommendations.
#[test]
fn output_capped_at_8() {
let mut r = empty_report();
for i in 0..15 {
r.divergences.push(divergence(
DivergenceKind::Decision,
Axis::Trajectory,
&format!("tool arg value changed: `f(a)`: `{i}` → `{}`", i + 1),
0.5,
));
}
let recs = generate(&r);
assert_eq!(recs.len(), 8);
}
#[test]
fn extract_backticked_pulls_first_token() {
assert_eq!(
extract_backticked("before `first(token)` middle `second`"),
Some("first(token)")
);
assert_eq!(extract_backticked("no backticks here"), None);
assert_eq!(extract_backticked("`only-one`"), Some("only-one"));
}
#[test]
fn severity_rank_ordering_is_error_above_warning_above_info() {
assert!(RecommendationSeverity::Error.rank() > RecommendationSeverity::Warning.rank());
assert!(RecommendationSeverity::Warning.rank() > RecommendationSeverity::Info.rank());
}
// Marks the row for `axis` as Severe with the given delta; medians and the CI
// (delta ± 0.05) are filled in to match.
fn force_axis_severe(report: &mut DiffReport, axis: Axis, delta: f64) {
let row = report.rows.iter_mut().find(|a| a.axis == axis).unwrap();
row.delta = delta;
row.baseline_median = if delta < 0.0 { 1.0 } else { 0.0 };
row.candidate_median = row.baseline_median + delta;
row.ci95_low = delta - 0.05;
row.ci95_high = delta + 0.05;
row.severity = Severity::Severe;
row.n = 20;
}
// Same as `force_axis_severe` but with Moderate severity.
fn force_axis_moderate(report: &mut DiffReport, axis: Axis, delta: f64) {
let row = report.rows.iter_mut().find(|a| a.axis == axis).unwrap();
row.delta = delta;
row.baseline_median = if delta < 0.0 { 1.0 } else { 0.0 };
row.candidate_median = row.baseline_median + delta;
row.ci95_low = delta - 0.05;
row.ci95_high = delta + 0.05;
row.severity = Severity::Moderate;
row.n = 20;
}
// Cost + latency + semantic all moved -> "model change" root cause at Error.
#[test]
fn model_swap_signature_emits_root_cause() {
let mut r = empty_report();
force_axis_moderate(&mut r, Axis::Cost, 0.6);
force_axis_moderate(&mut r, Axis::Latency, 0.8);
force_axis_moderate(&mut r, Axis::Semantic, -0.3);
let recs = generate(&r);
let model_swap_rec = recs
.iter()
.find(|r| r.action == ActionKind::RootCause && r.message.contains("model change"));
assert!(
model_swap_rec.is_some(),
"model-swap signature should produce a root-cause recommendation; got {:#?}",
recs
);
let rec = model_swap_rec.unwrap();
assert_eq!(rec.severity, RecommendationSeverity::Error);
assert!(rec.rationale.contains("cost"));
assert!(rec.rationale.contains("latency"));
assert!(rec.rationale.contains("semantic"));
}
// Semantic + verbosity without cost/latency -> prompt-edit root cause only.
#[test]
fn prompt_drift_signature_fires_when_only_two_axes_move() {
let mut r = empty_report();
force_axis_moderate(&mut r, Axis::Semantic, -0.2);
force_axis_moderate(&mut r, Axis::Verbosity, 0.4);
let recs = generate(&r);
let prompt_rec = recs
.iter()
.find(|r| r.action == ActionKind::RootCause && r.message.contains("prompt"));
assert!(prompt_rec.is_some());
let no_model = recs
.iter()
.all(|r| !(r.action == ActionKind::RootCause && r.message.contains("model change")));
assert!(no_model, "prompt-drift should not also fire model_swap");
}
// When the model-change pattern fires, the prompt-edit pattern must stay silent.
#[test]
fn prompt_drift_suppressed_when_model_swap_already_fires() {
let mut r = empty_report();
force_axis_moderate(&mut r, Axis::Cost, 0.5);
force_axis_moderate(&mut r, Axis::Latency, 0.7);
force_axis_moderate(&mut r, Axis::Semantic, -0.3);
force_axis_moderate(&mut r, Axis::Verbosity, 0.4);
let recs = generate(&r);
let n_root_cause = recs
.iter()
.filter(|r| r.action == ActionKind::RootCause)
.count();
let n_model = recs
.iter()
.filter(|r| r.action == ActionKind::RootCause && r.message.contains("model change"))
.count();
let n_prompt = recs
.iter()
.filter(|r| r.action == ActionKind::RootCause && r.message.contains("prompt"))
.count();
assert_eq!(n_model, 1);
assert_eq!(
n_prompt, 0,
"prompt drift should be suppressed; got {n_root_cause} root-causes"
);
}
// Severe safety with a positive delta -> refusal-escalation root cause.
#[test]
fn refusal_escalation_fires_on_severe_safety_with_positive_delta() {
let mut r = empty_report();
force_axis_severe(&mut r, Axis::Safety, 0.4); let recs = generate(&r);
let refusal_rec = recs
.iter()
.find(|r| r.action == ActionKind::RootCause && r.message.contains("Refusal rate"));
assert!(refusal_rec.is_some(), "got {:#?}", recs);
assert_eq!(refusal_rec.unwrap().severity, RecommendationSeverity::Error);
}
// A negative safety delta must not be reported as refusal escalation.
#[test]
fn refusal_escalation_does_not_fire_on_negative_safety_delta() {
let mut r = empty_report();
force_axis_severe(&mut r, Axis::Safety, -0.4);
let recs = generate(&r);
let refusal_rec = recs
.iter()
.find(|r| r.action == ActionKind::RootCause && r.message.contains("Refusal rate"));
assert!(refusal_rec.is_none());
}
// Severe trajectory + moved reasoning -> tool-schema root cause.
#[test]
fn tool_schema_migration_fires_on_severe_trajectory_plus_reasoning() {
let mut r = empty_report();
force_axis_severe(&mut r, Axis::Trajectory, 0.5);
force_axis_moderate(&mut r, Axis::Reasoning, 0.3);
let recs = generate(&r);
let tool_rec = recs
.iter()
.find(|r| r.action == ActionKind::RootCause && r.message.contains("tool-schema"));
assert!(tool_rec.is_some(), "got {:#?}", recs);
}
// Semantic + judge both moved (negative deltas) -> hallucination root cause.
#[test]
fn hallucination_cluster_fires_on_semantic_plus_judge() {
let mut r = empty_report();
force_axis_moderate(&mut r, Axis::Semantic, -0.3);
force_axis_moderate(&mut r, Axis::Judge, -0.4);
let recs = generate(&r);
let halluc_rec = recs
.iter()
.find(|r| r.action == ActionKind::RootCause && r.message.contains("hallucination"));
assert!(halluc_rec.is_some(), "got {:#?}", recs);
assert_eq!(halluc_rec.unwrap().severity, RecommendationSeverity::Error);
}
// A single severe trajectory axis must not trigger more than one root cause.
#[test]
fn single_axis_movement_triggers_at_most_one_root_cause() {
let mut r = empty_report();
force_axis_severe(&mut r, Axis::Trajectory, 0.7);
let recs = generate(&r);
let n_root = recs
.iter()
.filter(|r| r.action == ActionKind::RootCause)
.count();
assert!(
n_root <= 1,
"single-axis trajectory fired {n_root} root-causes: {recs:#?}"
);
}
#[test]
fn root_cause_action_label_is_root_cause() {
assert_eq!(ActionKind::RootCause.label(), "root_cause");
}
// Severe rising cost + moved reasoning -> context-window root cause.
#[test]
fn context_window_overflow_fires_on_severe_cost_plus_reasoning() {
let mut r = empty_report();
force_axis_severe(&mut r, Axis::Cost, 0.7);
force_axis_moderate(&mut r, Axis::Reasoning, -0.4);
let recs = generate(&r);
let context_rec = recs
.iter()
.find(|r| r.action == ActionKind::RootCause && r.message.contains("context-window"));
assert!(context_rec.is_some(), "got {:#?}", recs);
}
// The model-change pattern takes precedence over context-window overflow.
#[test]
fn context_window_suppressed_when_model_swap_explains_cost() {
let mut r = empty_report();
force_axis_severe(&mut r, Axis::Cost, 0.7);
force_axis_moderate(&mut r, Axis::Latency, 0.5);
force_axis_moderate(&mut r, Axis::Semantic, -0.3);
force_axis_moderate(&mut r, Axis::Reasoning, -0.4);
let recs = generate(&r);
let n_model = recs
.iter()
.filter(|r| r.action == ActionKind::RootCause && r.message.contains("model change"))
.count();
let n_context = recs
.iter()
.filter(|r| r.action == ActionKind::RootCause && r.message.contains("context-window"))
.count();
assert_eq!(n_model, 1);
assert_eq!(
n_context, 0,
"context-window should be suppressed; got {:#?}",
recs
);
}
// Severe trajectory + rising latency, reasoning untouched -> retry-loop root cause.
#[test]
fn retry_loop_fires_on_severe_trajectory_plus_latency_without_reasoning() {
let mut r = empty_report();
force_axis_severe(&mut r, Axis::Trajectory, 0.5);
force_axis_moderate(&mut r, Axis::Latency, 0.4);
let recs = generate(&r);
let retry_rec = recs
.iter()
.find(|r| r.action == ActionKind::RootCause && r.message.contains("retry loop"));
assert!(retry_rec.is_some(), "got {:#?}", recs);
}
// The tool-schema pattern takes precedence over the retry-loop pattern.
#[test]
fn retry_loop_suppressed_when_tool_schema_explains_trajectory() {
let mut r = empty_report();
force_axis_severe(&mut r, Axis::Trajectory, 0.5);
force_axis_moderate(&mut r, Axis::Reasoning, 0.3);
force_axis_moderate(&mut r, Axis::Latency, 0.4);
let recs = generate(&r);
let n_schema = recs
.iter()
.filter(|r| r.action == ActionKind::RootCause && r.message.contains("tool-schema"))
.count();
let n_retry = recs
.iter()
.filter(|r| r.action == ActionKind::RootCause && r.message.contains("retry loop"))
.count();
assert_eq!(n_schema, 1);
assert_eq!(n_retry, 0);
}
// Severe rising cost with latency and semantic unmoved -> cache-control root cause.
#[test]
fn cost_explosion_cached_mismatch_fires_on_severe_cost_with_stable_latency_and_semantic() {
let mut r = empty_report();
force_axis_severe(&mut r, Axis::Cost, 0.6);
let recs = generate(&r);
let cache_rec = recs
.iter()
.find(|r| r.action == ActionKind::RootCause && r.message.contains("cache control"));
assert!(cache_rec.is_some(), "got {:#?}", recs);
}
// Severe trajectory + falling safety -> prompt-injection root cause.
#[test]
fn prompt_injection_fires_on_severe_trajectory_plus_negative_safety() {
let mut r = empty_report();
force_axis_severe(&mut r, Axis::Trajectory, 0.5);
force_axis_moderate(&mut r, Axis::Safety, -0.4); let recs = generate(&r);
let inj_rec = recs
.iter()
.find(|r| r.action == ActionKind::RootCause && r.message.contains("prompt-injection"));
assert!(inj_rec.is_some(), "got {:#?}", recs);
}
// A rising safety delta must not be reported as prompt injection.
#[test]
fn prompt_injection_does_not_fire_on_positive_safety_delta() {
let mut r = empty_report();
force_axis_severe(&mut r, Axis::Trajectory, 0.5);
force_axis_moderate(&mut r, Axis::Safety, 0.4);
let recs = generate(&r);
let inj_rec = recs
.iter()
.find(|r| r.action == ActionKind::RootCause && r.message.contains("prompt-injection"));
assert!(inj_rec.is_none());
}
// Severe rising latency alone -> provider-side latency root cause.
#[test]
fn latency_spike_without_cost_fires_on_severe_latency_alone() {
let mut r = empty_report();
force_axis_severe(&mut r, Axis::Latency, 0.6);
let recs = generate(&r);
let lat_rec = recs.iter().find(|r| {
r.action == ActionKind::RootCause && r.message.contains("Provider-side capacity")
});
assert!(lat_rec.is_some(), "got {:#?}", recs);
}
// With cost and semantic also moved, only the model-change pattern fires.
#[test]
fn latency_spike_suppressed_when_model_swap_explains_it() {
let mut r = empty_report();
force_axis_severe(&mut r, Axis::Cost, 0.5);
force_axis_severe(&mut r, Axis::Latency, 0.6);
force_axis_moderate(&mut r, Axis::Semantic, -0.3);
let recs = generate(&r);
let n_model = recs
.iter()
.filter(|r| r.action == ActionKind::RootCause && r.message.contains("model change"))
.count();
let n_lat_alone = recs
.iter()
.filter(|r| {
r.action == ActionKind::RootCause && r.message.contains("Provider-side capacity")
})
.count();
assert_eq!(n_model, 1);
assert_eq!(n_lat_alone, 0);
}
// Sweep: forcing any one listed axis to Severe (delta +0.5) must yield at most
// one root-cause recommendation.
#[test]
fn no_two_root_causes_fire_on_single_axis_movement() {
for axis in [
Axis::Semantic,
Axis::Trajectory,
Axis::Safety,
Axis::Verbosity,
Axis::Latency,
Axis::Cost,
Axis::Reasoning,
Axis::Judge,
Axis::Conformance,
] {
let mut r = empty_report();
force_axis_severe(&mut r, axis, 0.5);
let recs = generate(&r);
let n_root = recs
.iter()
.filter(|r| r.action == ActionKind::RootCause)
.count();
assert!(
n_root <= 1,
"single-axis severe on {axis:?} fired {n_root} root-causes; \
patterns must be mutually exclusive on the same single-axis evidence: {recs:#?}"
);
}
}
}