use crate::agentlog::Record;
use crate::diff::axes::{Axis, AxisStat};
use crate::diff::bootstrap::paired_ci;
/// Concatenates the text of every `"text"`-typed part in the record's
/// `content` array, separating consecutive parts with a single space.
///
/// Returns an empty string when the payload has no `content` array or when
/// the array holds no usable text part. Parts of any other type, and text
/// parts whose `text` field is missing or not a string, are skipped.
fn response_text(r: &Record) -> String {
    let parts = match r.payload.get("content").and_then(|c| c.as_array()) {
        Some(parts) => parts,
        None => return String::new(),
    };

    let mut texts = Vec::new();
    for part in parts {
        let is_text_part = part.get("type").and_then(|t| t.as_str()) == Some("text");
        if !is_text_part {
            continue;
        }
        if let Some(text) = part.get("text").and_then(|t| t.as_str()) {
            texts.push(text);
        }
    }
    texts.join(" ")
}
/// Removes a surrounding Markdown code fence from `text`, if one is present.
///
/// The input is trimmed first; text that does not open with three backticks
/// is returned trimmed and otherwise untouched. For fenced text:
///
/// * when a newline exists, everything up to and including the first newline
///   (the opening fence line, with any language tag) is dropped;
/// * otherwise the three backticks and any ASCII-alphanumeric run directly
///   after them are dropped, which covers single-line fences.
///
/// A trailing closing fence is removed when the remaining text ends with one
/// (an unclosed fence is tolerated), and the result is trimmed again.
fn strip_markdown_fences(text: &str) -> &str {
    const FENCE: &str = "```";

    let trimmed = text.trim();
    let Some(unfenced) = trimmed.strip_prefix(FENCE) else {
        return trimmed;
    };

    let inner = match trimmed.split_once('\n') {
        // Multi-line: the whole opening line is fence plus info string.
        Some((_opening_line, rest)) => rest,
        // Single-line: only a leading alphanumeric run can be the tag.
        None => unfenced.trim_start_matches(|c: char| c.is_ascii_alphanumeric()),
    };

    let inner = inner.trim_end();
    inner.strip_suffix(FENCE).unwrap_or(inner).trim()
}
/// Reports whether `text`, once any surrounding Markdown fence has been
/// stripped, parses as a JSON value.
fn is_json_parseable(text: &str) -> bool {
    let body = strip_markdown_fences(text);
    let parsed: Result<serde_json::Value, _> = serde_json::from_str(body);
    parsed.is_ok()
}
/// Reports whether `text` looks like an attempt at JSON: after stripping any
/// surrounding Markdown fence, its first character is `{` or `[`.
///
/// This only inspects the leading character; it does not check that the
/// text actually parses.
fn has_json_intent(text: &str) -> bool {
    matches!(
        strip_markdown_fences(text).chars().next(),
        Some('{' | '[')
    )
}
/// Arithmetic mean of `xs`; an empty slice yields `0.0` rather than NaN.
fn mean(xs: &[f64]) -> f64 {
    match xs.len() {
        0 => 0.0,
        count => xs.iter().sum::<f64>() / count as f64,
    }
}
/// Returns the set of `input` key names from the first `tool_use` part in
/// the record's `content` array whose `input` is a JSON object.
///
/// Yields `None` when the payload has no `content` array, when no part is
/// typed `"tool_use"`, or when no `tool_use` part carries an object `input`.
/// A `tool_use` part with a missing or non-object `input` is passed over
/// and later parts are still considered.
fn first_tool_use_keys(r: &Record) -> Option<std::collections::BTreeSet<String>> {
    r.payload
        .get("content")
        .and_then(|c| c.as_array())?
        .iter()
        .filter(|part| part.get("type").and_then(|t| t.as_str()) == Some("tool_use"))
        .find_map(|part| part.get("input").and_then(|i| i.as_object()))
        .map(|input| input.keys().cloned().collect())
}
/// Computes the conformance axis statistic over baseline/candidate pairs.
///
/// Each pair is `(baseline, candidate)`. A pair is scored only when the
/// baseline shows structured-output intent; all other pairs are ignored:
///
/// * **JSON intent** (baseline text starts with `{` or `[` after fence
///   stripping): each side scores `1.0` if its text parses as JSON, else
///   `0.0`. This check takes precedence over the tool-use check.
/// * **Tool-use intent** (baseline has a `tool_use` part with an object
///   `input`): the baseline scores `1.0`; the candidate scores `1.0` only
///   when its first such part has exactly the same set of input keys.
///
/// Returns `AxisStat::empty(Axis::Conformance)` when no pair qualifies.
/// Otherwise the per-side means, their difference (candidate minus
/// baseline), the bounds returned by `paired_ci` for that difference, and
/// the number of scored pairs are handed to `AxisStat::new_rate`. `seed` is
/// forwarded to `paired_ci` unchanged.
pub fn compute(pairs: &[(&Record, &Record)], seed: Option<u64>) -> AxisStat {
    let (baseline_scores, candidate_scores): (Vec<f64>, Vec<f64>) = pairs
        .iter()
        .filter_map(|&(baseline, candidate)| {
            let baseline_text = response_text(baseline);
            let baseline_keys = first_tool_use_keys(baseline);

            if has_json_intent(&baseline_text) {
                let baseline_ok = is_json_parseable(&baseline_text);
                let candidate_ok = is_json_parseable(&response_text(candidate));
                Some((f64::from(baseline_ok), f64::from(candidate_ok)))
            } else if baseline_keys.is_some() {
                let candidate_keys = first_tool_use_keys(candidate);
                let same_keys = matches!(
                    (&baseline_keys, &candidate_keys),
                    (Some(expected), Some(actual)) if expected == actual
                );
                Some((1.0, f64::from(same_keys)))
            } else {
                // No structured-output intent on the baseline: not scored.
                None
            }
        })
        .unzip();

    if baseline_scores.is_empty() {
        return AxisStat::empty(Axis::Conformance);
    }

    let baseline_rate = mean(&baseline_scores);
    let candidate_rate = mean(&candidate_scores);
    let interval = paired_ci(
        &baseline_scores,
        &candidate_scores,
        |base, cand| mean(cand) - mean(base),
        0,
        seed,
    );
    AxisStat::new_rate(
        Axis::Conformance,
        baseline_rate,
        candidate_rate,
        candidate_rate - baseline_rate,
        interval.low,
        interval.high,
        baseline_scores.len(),
    )
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::agentlog::Kind;
    use crate::diff::axes::{Flag, Severity};
    use serde_json::json;

    /// Builds a chat-response record whose content is a single text part.
    fn response(text: &str) -> Record {
        let payload = json!({
            "model": "x",
            "content": [{"type": "text", "text": text}],
            "stop_reason": "end_turn",
            "latency_ms": 0,
            "usage": {"input_tokens": 1, "output_tokens": 1, "thinking_tokens": 0},
        });
        Record::new(Kind::ChatResponse, payload, "2026-04-21T10:00:00Z", None)
    }

    /// Builds a chat-response record whose content is a single `tool_use`
    /// part; every key in `keys` becomes an input field with value `"v"`.
    fn tool_use_response(name: &str, keys: &[&str]) -> Record {
        let mut input = serde_json::Map::new();
        for key in keys {
            input.insert((*key).to_string(), json!("v"));
        }
        let payload = json!({
            "model": "x",
            "content": [{
                "type": "tool_use",
                "id": "t1",
                "name": name,
                "input": input,
            }],
            "stop_reason": "tool_use",
            "latency_ms": 0,
            "usage": {"input_tokens": 1, "output_tokens": 1, "thinking_tokens": 0},
        });
        Record::new(Kind::ChatResponse, payload, "2026-04-21T10:00:00Z", None)
    }

    /// Repeats one `(baseline, candidate)` pair `n` times.
    fn repeated<'a>(
        baseline: &'a Record,
        candidate: &'a Record,
        n: usize,
    ) -> Vec<(&'a Record, &'a Record)> {
        vec![(baseline, candidate); n]
    }

    /// JSON baseline against a prose candidate: rates 1.0 vs 0.0, severe.
    #[test]
    fn baseline_json_intent_candidate_prose_flags_severe() {
        let json_baseline = response(r#"[{"a": 1}]"#);
        let prose_candidate = response("Here are your results: ...");
        let stat = compute(&repeated(&json_baseline, &prose_candidate, 3), Some(1));
        assert!((1.0 - stat.baseline_median).abs() < 1e-9);
        assert!(stat.candidate_median.abs() < 1e-9);
        assert_eq!(stat.severity, Severity::Severe);
        assert_eq!(stat.n, 3);
    }

    /// Identical valid JSON on both sides is not a regression.
    #[test]
    fn both_sides_valid_json_is_no_regression() {
        let same = response(r#"{"a": 1}"#);
        let stat = compute(&repeated(&same, &same, 5), Some(1));
        assert_eq!(stat.severity, Severity::None);
    }

    /// Pairs whose baseline shows no JSON or tool-use intent are not scored.
    #[test]
    fn baseline_without_json_intent_is_excluded_from_population() {
        let prose_baseline = response("hello");
        let prose_candidate = response("world");
        let stat = compute(&repeated(&prose_baseline, &prose_candidate, 3), Some(1));
        assert_eq!(stat.n, 0);
    }

    /// A candidate that starts like JSON but does not parse scores zero.
    #[test]
    fn baseline_json_candidate_broken_json_is_counted() {
        let json_baseline = response(r#"{"ok": true}"#);
        let broken_candidate = response("{broken");
        let stat = compute(&repeated(&json_baseline, &broken_candidate, 4), Some(1));
        assert!((1.0 - stat.baseline_median).abs() < 1e-9);
        assert!(stat.candidate_median.abs() < 1e-9);
        assert_eq!(stat.severity, Severity::Severe);
    }

    /// One in three candidates regressing (10 of 30) is at least moderate
    /// and is not flagged as low-power.
    #[test]
    fn partial_regression_is_moderate() {
        let baseline = response(r#"{"ok": true}"#);
        let good = response(r#"{"ok": true}"#);
        let bad = response("plain text response");
        let pairs: Vec<(&Record, &Record)> = (0..30)
            .map(|i| {
                let candidate = if i % 3 == 0 { &bad } else { &good };
                (&baseline, candidate)
            })
            .collect();
        let stat = compute(&pairs, Some(1));
        assert!(matches!(
            stat.severity,
            Severity::Moderate | Severity::Severe
        ));
        assert!(!stat.flags.contains(&Flag::LowPower));
    }

    /// JSON wrapped in a multi-line Markdown fence still counts as intent.
    #[test]
    fn markdown_fenced_json_is_still_json_intent() {
        let fenced_baseline = response("```json\n{\"ok\": true}\n```");
        let fenced_candidate = response("```json\n{\"ok\": true}\n```");
        let stat = compute(&repeated(&fenced_baseline, &fenced_candidate, 5), Some(1));
        assert!(stat.n > 0, "fenced JSON should count toward conformance");
    }

    /// A tool-use baseline compared with itself is scored and shows no
    /// regression.
    #[test]
    fn tool_use_intent_counts_toward_conformance_axis() {
        let call = tool_use_response("submit", &["ticker", "revenue", "net_income"]);
        let stat = compute(&repeated(&call, &call, 5), Some(1));
        assert_eq!(stat.n, 5, "tool_use intent must produce n>0");
        assert_eq!(stat.severity, Severity::None);
    }

    /// A candidate tool call with a different input key set scores zero.
    #[test]
    fn tool_use_key_regression_flags_severe() {
        let full_call = tool_use_response("submit", &["ticker", "revenue", "net_income"]);
        let partial_call = tool_use_response("submit", &["ticker"]);
        let stat = compute(&repeated(&full_call, &partial_call, 5), Some(1));
        assert_eq!(stat.n, 5);
        assert!((1.0 - stat.baseline_median).abs() < 1e-9);
        assert!(stat.candidate_median.abs() < 1e-9);
        assert_eq!(stat.severity, Severity::Severe);
    }

    /// A candidate that answers in prose instead of calling a tool is severe.
    #[test]
    fn tool_use_candidate_no_tool_call_flags_severe() {
        let tool_baseline = tool_use_response("submit", &["ticker", "revenue"]);
        let prose_candidate = response("I think the answer is 42.");
        let stat = compute(&repeated(&tool_baseline, &prose_candidate, 4), Some(1));
        assert_eq!(stat.n, 4);
        assert_eq!(stat.severity, Severity::Severe);
    }

    /// Single-line fences are stripped with or without a language tag.
    #[test]
    fn strip_fences_handles_single_line_fence() {
        let expected = "{\"a\":1}";
        assert_eq!(strip_markdown_fences("```json{\"a\":1}```"), expected);
        assert_eq!(strip_markdown_fences("```{\"a\":1}```"), expected);
    }

    /// An unclosed fence with an indented body still yields the bare body.
    #[test]
    fn strip_fences_handles_unclosed_fence_with_indent() {
        let unclosed = "```json\n {\"a\":1}";
        assert_eq!(strip_markdown_fences(unclosed), "{\"a\":1}");
    }

    /// A single-line fenced JSON baseline is recognised as JSON intent.
    #[test]
    fn single_line_fenced_json_intent_is_recognised() {
        let fenced_baseline = response("```json{\"ok\":true}```");
        let prose_candidate = response("plain prose");
        let stat = compute(&repeated(&fenced_baseline, &prose_candidate, 5), Some(1));
        assert!(stat.n > 0, "single-line fenced JSON should count");
    }

    /// Fenced JSON baseline against fenced SQL candidate: the candidate rate
    /// drops and the severity is at least moderate.
    #[test]
    fn fenced_json_vs_fenced_sql_is_severe_regression() {
        let json_baseline = response("```json\n{\"sql\": \"SELECT 1\"}\n```");
        let sql_candidate = response("```sql\nSELECT 1\n```");
        let stat = compute(&repeated(&json_baseline, &sql_candidate, 30), Some(1));
        assert!(stat.n > 0);
        assert!(stat.delta < 0.0);
        assert!(matches!(
            stat.severity,
            Severity::Moderate | Severity::Severe
        ));
    }
}