1use crate::agentlog::Record;
21use crate::diff::axes::{Axis, AxisStat};
22use crate::diff::bootstrap::paired_ci;
23
24fn response_text(r: &Record) -> String {
25 let Some(arr) = r.payload.get("content").and_then(|c| c.as_array()) else {
26 return String::new();
27 };
28 arr.iter()
29 .filter_map(|p| {
30 if p.get("type").and_then(|t| t.as_str()) == Some("text") {
31 p.get("text")
32 .and_then(|t| t.as_str())
33 .map(ToString::to_string)
34 } else {
35 None
36 }
37 })
38 .collect::<Vec<_>>()
39 .join(" ")
40}
41
/// Returns the body of `text` with a surrounding Markdown code fence removed.
///
/// The input is whitespace-trimmed first. If it does not open with three
/// backticks it is returned as-is (trimmed). Otherwise:
///
/// * when the text contains a newline, everything up to and including the
///   first newline (the opening fence line) is dropped;
/// * when it is a single line, the three backticks and any ASCII-alphanumeric
///   run directly after them (e.g. `json`) are dropped.
///
/// After that, trailing whitespace is trimmed, one closing ``` is removed if
/// present, and the remainder is trimmed again. An unclosed fence is tolerated.
fn strip_markdown_fences(text: &str) -> &str {
    let trimmed = text.trim();
    let Some(unfenced) = trimmed.strip_prefix("```") else {
        return trimmed;
    };

    let inner = match unfenced.split_once('\n') {
        // Multi-line: the rest of the opening fence line is discarded wholesale.
        Some((_opening_line, rest)) => rest,
        // Single line: only a leading alphanumeric tag can be told apart from the body.
        None => unfenced.trim_start_matches(|c: char| c.is_ascii_alphanumeric()),
    };

    let inner = inner.trim_end();
    inner.strip_suffix("```").unwrap_or(inner).trim()
}
77
78fn is_json_parseable(text: &str) -> bool {
80 serde_json::from_str::<serde_json::Value>(strip_markdown_fences(text)).is_ok()
81}
82
83fn has_json_intent(text: &str) -> bool {
86 let body = strip_markdown_fences(text);
87 body.starts_with('{') || body.starts_with('[')
88}
89
/// Arithmetic mean of `xs`; an empty slice yields `0.0` rather than NaN.
fn mean(xs: &[f64]) -> f64 {
    match xs.len() {
        0 => 0.0,
        count => xs.iter().sum::<f64>() / count as f64,
    }
}
97
98fn first_tool_use_keys(r: &Record) -> Option<std::collections::BTreeSet<String>> {
101 let arr = r.payload.get("content").and_then(|c| c.as_array())?;
102 for part in arr {
103 if part.get("type").and_then(|t| t.as_str()) == Some("tool_use") {
104 if let Some(input) = part.get("input").and_then(|i| i.as_object()) {
105 return Some(input.keys().cloned().collect());
106 }
107 }
108 }
109 None
110}
111
112pub fn compute(pairs: &[(&Record, &Record)], seed: Option<u64>) -> AxisStat {
114 let mut b = Vec::new();
115 let mut c = Vec::new();
116 for (br, cr) in pairs {
117 let baseline_text = response_text(br);
118 let baseline_tool_keys = first_tool_use_keys(br);
119 let baseline_json_intent = has_json_intent(&baseline_text);
120 let baseline_tool_intent = baseline_tool_keys.is_some();
121 if !baseline_json_intent && !baseline_tool_intent {
122 continue;
125 }
126 let b_score;
132 let c_score;
133 if baseline_json_intent {
134 b_score = f64::from(is_json_parseable(&baseline_text));
135 c_score = f64::from(is_json_parseable(&response_text(cr)));
136 } else {
137 b_score = 1.0;
140 let candidate_tool_keys = first_tool_use_keys(cr);
141 c_score = match (&baseline_tool_keys, &candidate_tool_keys) {
142 (Some(bk), Some(ck)) if bk == ck => 1.0,
143 _ => 0.0,
144 };
145 }
146 b.push(b_score);
147 c.push(c_score);
148 }
149 if b.is_empty() {
150 return AxisStat::empty(Axis::Conformance);
151 }
152 let bm = mean(&b);
153 let cm = mean(&c);
154 let delta = cm - bm;
155 let ci = paired_ci(&b, &c, |bs, cs| mean(cs) - mean(bs), 0, seed);
156 AxisStat::new_rate(Axis::Conformance, bm, cm, delta, ci.low, ci.high, b.len())
159}
160
#[cfg(test)]
mod tests {
    use super::*;
    use crate::agentlog::Kind;
    use crate::diff::axes::Severity;
    use serde_json::json;

    /// Builds a chat-response record whose content is a single text part.
    fn response(text: &str) -> Record {
        let payload = json!({
            "model": "x",
            "content": [{"type": "text", "text": text}],
            "stop_reason": "end_turn",
            "latency_ms": 0,
            "usage": {"input_tokens": 1, "output_tokens": 1, "thinking_tokens": 0},
        });
        Record::new(Kind::ChatResponse, payload, "2026-04-21T10:00:00Z", None)
    }

    /// Builds a chat-response record holding one `tool_use` part whose input
    /// object has the given keys (each mapped to the string `"v"`).
    fn tool_use_response(name: &str, keys: &[&str]) -> Record {
        let mut input = serde_json::Map::new();
        for key in keys {
            input.insert((*key).to_string(), json!("v"));
        }
        let payload = json!({
            "model": "x",
            "content": [{
                "type": "tool_use",
                "id": "t1",
                "name": name,
                "input": input,
            }],
            "stop_reason": "tool_use",
            "latency_ms": 0,
            "usage": {"input_tokens": 1, "output_tokens": 1, "thinking_tokens": 0},
        });
        Record::new(Kind::ChatResponse, payload, "2026-04-21T10:00:00Z", None)
    }

    // Baseline emits a JSON array, candidate answers in prose: full regression.
    #[test]
    fn baseline_json_intent_candidate_prose_flags_severe() {
        let json_baseline = response(r#"[{"a": 1}]"#);
        let prose_candidate = response("Here are your results: ...");
        let pairs = vec![(&json_baseline, &prose_candidate); 3];

        let stat = compute(&pairs, Some(1));

        assert!((stat.baseline_median - 1.0).abs() < 1e-9);
        assert!(stat.candidate_median.abs() < 1e-9);
        assert_eq!(stat.severity, Severity::Severe);
        assert_eq!(stat.n, 3);
    }

    // Identical valid JSON on both sides must not be reported as a regression.
    #[test]
    fn both_sides_valid_json_is_no_regression() {
        let record = response(r#"{"a": 1}"#);
        let pairs = vec![(&record, &record); 5];

        let stat = compute(&pairs, Some(1));

        assert_eq!(stat.severity, Severity::None);
    }

    // Plain-prose baselines show no structured intent, so nothing is scored.
    #[test]
    fn baseline_without_json_intent_is_excluded_from_population() {
        let prose_baseline = response("hello");
        let prose_candidate = response("world");
        let pairs = vec![(&prose_baseline, &prose_candidate); 3];

        let stat = compute(&pairs, Some(1));

        assert_eq!(stat.n, 0);
    }

    // A candidate that starts like JSON but does not parse scores zero.
    #[test]
    fn baseline_json_candidate_broken_json_is_counted() {
        let valid_baseline = response(r#"{"ok": true}"#);
        let broken_candidate = response("{broken");
        let pairs = vec![(&valid_baseline, &broken_candidate); 4];

        let stat = compute(&pairs, Some(1));

        assert!((stat.baseline_median - 1.0).abs() < 1e-9);
        assert!(stat.candidate_median.abs() < 1e-9);
        assert_eq!(stat.severity, Severity::Severe);
    }

    // One in three candidates regresses across 30 pairs: at least moderate,
    // and the sample must not be flagged as low-power.
    #[test]
    fn partial_regression_is_moderate() {
        use crate::diff::axes::Flag;

        let baseline = response(r#"{"ok": true}"#);
        let good = response(r#"{"ok": true}"#);
        let bad = response("plain text response");
        let pairs: Vec<(&Record, &Record)> = (0..30)
            .map(|i| {
                if i % 3 == 0 {
                    (&baseline, &bad)
                } else {
                    (&baseline, &good)
                }
            })
            .collect();

        let stat = compute(&pairs, Some(1));

        assert!(matches!(
            stat.severity,
            Severity::Moderate | Severity::Severe
        ));
        assert!(!stat.flags.contains(&Flag::LowPower));
    }

    // JSON wrapped in a multi-line Markdown fence still counts as JSON intent.
    #[test]
    fn markdown_fenced_json_is_still_json_intent() {
        let fenced_baseline = response("```json\n{\"ok\": true}\n```");
        let fenced_candidate = response("```json\n{\"ok\": true}\n```");
        let pairs = vec![(&fenced_baseline, &fenced_candidate); 5];

        let stat = compute(&pairs, Some(1));

        assert!(stat.n > 0, "fenced JSON should count toward conformance");
    }

    // A tool call in the baseline is enough to enter the scored population.
    #[test]
    fn tool_use_intent_counts_toward_conformance_axis() {
        let tool_call = tool_use_response("submit", &["ticker", "revenue", "net_income"]);
        let pairs = vec![(&tool_call, &tool_call); 5];

        let stat = compute(&pairs, Some(1));

        assert_eq!(stat.n, 5, "tool_use intent must produce n>0");
        assert_eq!(stat.severity, Severity::None);
    }

    // Candidate tool call drops argument keys: scored as non-conforming.
    #[test]
    fn tool_use_key_regression_flags_severe() {
        let full_call = tool_use_response("submit", &["ticker", "revenue", "net_income"]);
        let partial_call = tool_use_response("submit", &["ticker"]);
        let pairs = vec![(&full_call, &partial_call); 5];

        let stat = compute(&pairs, Some(1));

        assert_eq!(stat.n, 5);
        assert!((stat.baseline_median - 1.0).abs() < 1e-9);
        assert!(stat.candidate_median.abs() < 1e-9);
        assert_eq!(stat.severity, Severity::Severe);
    }

    // Candidate answers in prose where the baseline made a tool call.
    #[test]
    fn tool_use_candidate_no_tool_call_flags_severe() {
        let tool_baseline = tool_use_response("submit", &["ticker", "revenue"]);
        let prose_candidate = response("I think the answer is 42.");
        let pairs = vec![(&tool_baseline, &prose_candidate); 4];

        let stat = compute(&pairs, Some(1));

        assert_eq!(stat.n, 4);
        assert_eq!(stat.severity, Severity::Severe);
    }

    // Fence and body on one line, with and without a language tag.
    #[test]
    fn strip_fences_handles_single_line_fence() {
        let expected = "{\"a\":1}";
        assert_eq!(strip_markdown_fences("```json{\"a\":1}```"), expected);
        assert_eq!(strip_markdown_fences("```{\"a\":1}```"), expected);
    }

    // No closing fence and an indented body: the body is still extracted.
    #[test]
    fn strip_fences_handles_unclosed_fence_with_indent() {
        let unclosed = "```json\n  {\"a\":1}";
        assert_eq!(strip_markdown_fences(unclosed), "{\"a\":1}");
    }

    // Single-line fenced JSON in the baseline must still register as intent.
    #[test]
    fn single_line_fenced_json_intent_is_recognised() {
        let fenced_baseline = response("```json{\"ok\":true}```");
        let prose_candidate = response("plain prose");
        let pairs = vec![(&fenced_baseline, &prose_candidate); 5];

        let stat = compute(&pairs, Some(1));

        assert!(stat.n > 0, "single-line fenced JSON should count");
    }

    // A fenced non-JSON candidate (SQL) against a fenced-JSON baseline regresses.
    #[test]
    fn fenced_json_vs_fenced_sql_is_severe_regression() {
        let json_baseline = response("```json\n{\"sql\": \"SELECT 1\"}\n```");
        let sql_candidate = response("```sql\nSELECT 1\n```");
        let pairs = vec![(&json_baseline, &sql_candidate); 30];

        let stat = compute(&pairs, Some(1));

        assert!(stat.n > 0);
        assert!(stat.delta < 0.0);
        assert!(matches!(
            stat.severity,
            Severity::Moderate | Severity::Severe
        ));
    }
}