Skip to main content

shadow_core/diff/
conformance.rs

1//! Axis 9: schema / format conformance rate.
2//!
3//! Intent-gated on the baseline side. A pair is counted (and scored) if
4//! the baseline response has EITHER:
5//!
6//!   1. JSON-text intent: its text starts with `{` or `[` (after fence-
7//!      strip). Both sides are scored on whether their text parses.
8//!
9//!   2. Tool-use intent: it emits at least one `tool_use` block with a
10//!      dict-shaped `input`. Both sides are scored on whether their
11//!      (first) tool_use's input keys match the baseline's. This covers
12//!      the common agent pattern where the "structured final answer"
13//!      is a tool call (e.g. `submit_answer(...)`) rather than JSON
14//!      text — the same conformance question in a different syntax.
15//!
16//! Pairs where baseline has neither JSON intent nor tool_use intent are
17//! excluded (we don't penalise the candidate for not structuring when
18//! nothing asked for structure).
19
20use crate::agentlog::Record;
21use crate::diff::axes::{Axis, AxisStat};
22use crate::diff::bootstrap::paired_ci;
23
24fn response_text(r: &Record) -> String {
25    let Some(arr) = r.payload.get("content").and_then(|c| c.as_array()) else {
26        return String::new();
27    };
28    arr.iter()
29        .filter_map(|p| {
30            if p.get("type").and_then(|t| t.as_str()) == Some("text") {
31                p.get("text")
32                    .and_then(|t| t.as_str())
33                    .map(ToString::to_string)
34            } else {
35                None
36            }
37        })
38        .collect::<Vec<_>>()
39        .join(" ")
40}
41
/// Remove a leading markdown code fence (``` plus optional language tag)
/// and a trailing ``` closer from `text`, returning the trimmed body.
///
/// Models frequently wrap JSON in fences even when told not to; if the
/// fence counted as "not JSON intent" the axis would go silent on real
/// traffic, so the fence is peeled off before any intent/parse check.
///
/// Input that does not start with ``` (after trimming) is returned trimmed
/// and otherwise unchanged. Fenced input comes in four shapes:
///   1. Multi-line with closer:  "```json\n{...}\n```"
///   2. Multi-line, no closer:   "```json\n{...}"   (truncated streams)
///   3. Single-line with closer: "```json{...}```"
///   4. Single-line, no closer:  "```json{...}"
///
/// The returned slice has no surrounding whitespace, so callers can
/// inspect its first character directly.
fn strip_markdown_fences(text: &str) -> &str {
    const FENCE: &str = "```";

    let trimmed = text.trim();
    let Some(unfenced) = trimmed.strip_prefix(FENCE) else {
        return trimmed;
    };

    let body = match trimmed.split_once('\n') {
        // Multi-line: the whole opening line (fence + any tag) is dropped.
        Some((_opening_line, rest)) => rest,
        // Single-line: drop the fence, then skip an ASCII-alphanumeric
        // language tag such as `json` or `sql`.
        None => unfenced.trim_start_matches(|c: char| c.is_ascii_alphanumeric()),
    };

    // A closing fence is optional; remove it when present.
    let body = body.trim_end();
    body.strip_suffix(FENCE).unwrap_or(body).trim()
}
77
78/// True if `text` (after trim + fence-strip) parses as JSON.
79fn is_json_parseable(text: &str) -> bool {
80    serde_json::from_str::<serde_json::Value>(strip_markdown_fences(text)).is_ok()
81}
82
83/// True if `text` (after fence-strip + trim) starts with `{` or `[` — our
84/// heuristic for "this response intends to be JSON."
85fn has_json_intent(text: &str) -> bool {
86    let body = strip_markdown_fences(text);
87    body.starts_with('{') || body.starts_with('[')
88}
89
/// Arithmetic mean of `xs`; an empty slice yields `0.0` rather than NaN.
fn mean(xs: &[f64]) -> f64 {
    let count = xs.len();
    if count == 0 {
        return 0.0;
    }
    let total: f64 = xs.iter().sum();
    total / count as f64
}
97
98/// Return the top-level key set of the first `tool_use` block whose
99/// `input` is a dict. Returns None if no such block exists.
100fn first_tool_use_keys(r: &Record) -> Option<std::collections::BTreeSet<String>> {
101    let arr = r.payload.get("content").and_then(|c| c.as_array())?;
102    for part in arr {
103        if part.get("type").and_then(|t| t.as_str()) == Some("tool_use") {
104            if let Some(input) = part.get("input").and_then(|i| i.as_object()) {
105                return Some(input.keys().cloned().collect());
106            }
107        }
108    }
109    None
110}
111
112/// Compute the schema-conformance axis.
113pub fn compute(pairs: &[(&Record, &Record)], seed: Option<u64>) -> AxisStat {
114    let mut b = Vec::new();
115    let mut c = Vec::new();
116    for (br, cr) in pairs {
117        let baseline_text = response_text(br);
118        let baseline_tool_keys = first_tool_use_keys(br);
119        let baseline_json_intent = has_json_intent(&baseline_text);
120        let baseline_tool_intent = baseline_tool_keys.is_some();
121        if !baseline_json_intent && !baseline_tool_intent {
122            // Baseline wasn't producing structure of either flavour —
123            // this pair isn't part of the conformance population.
124            continue;
125        }
126        // Score each side. Conformance is "did the structured contract
127        // survive". We use the union of signals: a pair scores 1.0 if
128        // EITHER the JSON intent was honoured OR (when applicable) the
129        // tool_use key set matched. A candidate that abandons both
130        // scores 0.0 — the full regression signal.
131        let b_score;
132        let c_score;
133        if baseline_json_intent {
134            b_score = f64::from(is_json_parseable(&baseline_text));
135            c_score = f64::from(is_json_parseable(&response_text(cr)));
136        } else {
137            // Tool-use intent only. Score by key-set match.
138            // Baseline always 1.0 (by construction — it has the keys).
139            b_score = 1.0;
140            let candidate_tool_keys = first_tool_use_keys(cr);
141            c_score = match (&baseline_tool_keys, &candidate_tool_keys) {
142                (Some(bk), Some(ck)) if bk == ck => 1.0,
143                _ => 0.0,
144            };
145        }
146        b.push(b_score);
147        c.push(c_score);
148    }
149    if b.is_empty() {
150        return AxisStat::empty(Axis::Conformance);
151    }
152    let bm = mean(&b);
153    let cm = mean(&c);
154    let delta = cm - bm;
155    let ci = paired_ci(&b, &c, |bs, cs| mean(cs) - mean(bs), 0, seed);
156    // Use absolute-scale severity: a rate axis whose baseline is 1.0
157    // and candidate is 0.5 is a 50% regression, not "within noise."
158    AxisStat::new_rate(Axis::Conformance, bm, cm, delta, ci.low, ci.high, b.len())
159}
160
#[cfg(test)]
mod tests {
    use super::*;
    use crate::agentlog::Kind;
    use crate::diff::axes::{Flag, Severity};
    use serde_json::json;

    /// Timestamp shared by every fixture record.
    const TIMESTAMP: &str = "2026-04-21T10:00:00Z";

    /// Build a chat_response record around the given `content` array.
    fn chat_response(content: serde_json::Value, stop_reason: &str) -> Record {
        Record::new(
            Kind::ChatResponse,
            json!({
                "model": "x",
                "content": content,
                "stop_reason": stop_reason,
                "latency_ms": 0,
                "usage": {"input_tokens": 1, "output_tokens": 1, "thinking_tokens": 0},
            }),
            TIMESTAMP,
            None,
        )
    }

    /// Build a chat_response holding a single text part.
    fn response(text: &str) -> Record {
        chat_response(json!([{"type": "text", "text": text}]), "end_turn")
    }

    /// Build a chat_response holding a single tool_use part whose `input`
    /// object has exactly the given keys.
    fn tool_use_response(name: &str, keys: &[&str]) -> Record {
        let mut input = serde_json::Map::new();
        for key in keys {
            input.insert((*key).to_string(), json!("v"));
        }
        chat_response(
            json!([{
                "type": "tool_use",
                "id": "t1",
                "name": name,
                "input": input,
            }]),
            "tool_use",
        )
    }

    #[test]
    fn baseline_json_intent_candidate_prose_flags_severe() {
        // Headline case: valid JSON on the baseline, prose on the
        // candidate, must show up as a severe drop.
        let json_side = response(r#"[{"a": 1}]"#);
        let prose_side = response("Here are your results: ...");
        let pairs = vec![(&json_side, &prose_side); 3];
        let stat = compute(&pairs, Some(1));
        assert!((stat.baseline_median - 1.0).abs() < 1e-9);
        assert!((stat.candidate_median - 0.0).abs() < 1e-9);
        assert_eq!(stat.severity, Severity::Severe);
        assert_eq!(stat.n, 3);
    }

    #[test]
    fn both_sides_valid_json_is_no_regression() {
        let same = response(r#"{"a": 1}"#);
        let pairs = vec![(&same, &same); 5];
        let stat = compute(&pairs, Some(1));
        assert_eq!(stat.severity, Severity::None);
    }

    #[test]
    fn baseline_without_json_intent_is_excluded_from_population() {
        // Prose on both sides: the baseline shows no JSON intent, so no
        // pair is counted.
        let prose_a = response("hello");
        let prose_b = response("world");
        let pairs = vec![(&prose_a, &prose_b); 3];
        let stat = compute(&pairs, Some(1));
        assert_eq!(stat.n, 0);
    }

    #[test]
    fn baseline_json_candidate_broken_json_is_counted() {
        // The candidate tried to emit JSON but the output does not parse.
        let valid = response(r#"{"ok": true}"#);
        let broken = response("{broken");
        let pairs = vec![(&valid, &broken); 4];
        let stat = compute(&pairs, Some(1));
        assert!((stat.baseline_median - 1.0).abs() < 1e-9);
        assert!((stat.candidate_median - 0.0).abs() < 1e-9);
        assert_eq!(stat.severity, Severity::Severe);
    }

    #[test]
    fn partial_regression_is_moderate() {
        // Baseline is always JSON; every third candidate falls back to
        // prose. With n=30 the interval is tight enough for
        // Moderate/Severe and the low-power flag must not be set.
        let baseline = response(r#"{"ok": true}"#);
        let good = response(r#"{"ok": true}"#);
        let bad = response("plain text response");
        let pairs: Vec<(&Record, &Record)> = (0..30)
            .map(|i| {
                if i % 3 == 0 {
                    (&baseline, &bad)
                } else {
                    (&baseline, &good)
                }
            })
            .collect();
        let stat = compute(&pairs, Some(1));
        assert!(matches!(
            stat.severity,
            Severity::Moderate | Severity::Severe
        ));
        assert!(!stat.flags.contains(&Flag::LowPower));
    }

    #[test]
    fn markdown_fenced_json_is_still_json_intent() {
        // Real traffic wraps JSON in ```json ... ``` fences; the intent
        // gate has to see through them.
        let baseline = response("```json\n{\"ok\": true}\n```");
        let candidate = response("```json\n{\"ok\": true}\n```");
        let pairs = vec![(&baseline, &candidate); 5];
        let stat = compute(&pairs, Some(1));
        // The historical bug was n=0 here.
        assert!(stat.n > 0, "fenced JSON should count toward conformance");
    }

    #[test]
    fn tool_use_intent_counts_toward_conformance_axis() {
        // REGRESSION TEST: the v0.1 axis only fired on JSON text, so
        // agents that deliver the final answer through a tool_use call
        // (e.g. `submit_answer(...)`) produced n=0. tool_use blocks with
        // an object input now count.
        let record = tool_use_response("submit", &["ticker", "revenue", "net_income"]);
        let pairs = vec![(&record, &record); 5];
        let stat = compute(&pairs, Some(1));
        assert_eq!(stat.n, 5, "tool_use intent must produce n>0");
        assert_eq!(stat.severity, Severity::None); // same on both sides
    }

    #[test]
    fn tool_use_key_regression_flags_severe() {
        // Baseline submits three keys; the candidate keeps only one.
        let full = tool_use_response("submit", &["ticker", "revenue", "net_income"]);
        let reduced = tool_use_response("submit", &["ticker"]);
        let pairs = vec![(&full, &reduced); 5];
        let stat = compute(&pairs, Some(1));
        assert_eq!(stat.n, 5);
        assert!((stat.baseline_median - 1.0).abs() < 1e-9);
        assert!((stat.candidate_median - 0.0).abs() < 1e-9);
        assert_eq!(stat.severity, Severity::Severe);
    }

    #[test]
    fn tool_use_candidate_no_tool_call_flags_severe() {
        // The candidate answered in prose and never called the tool.
        let tool_call = tool_use_response("submit", &["ticker", "revenue"]);
        let prose = response("I think the answer is 42.");
        let pairs = vec![(&tool_call, &prose); 4];
        let stat = compute(&pairs, Some(1));
        assert_eq!(stat.n, 4);
        assert_eq!(stat.severity, Severity::Severe);
    }

    #[test]
    fn strip_fences_handles_single_line_fence() {
        // Fence, body and closing fence all on one line. Earlier code
        // bailed out when there was no newline and returned the input
        // verbatim.
        for input in ["```json{\"a\":1}```", "```{\"a\":1}```"] {
            assert_eq!(strip_markdown_fences(input), "{\"a\":1}");
        }
    }

    #[test]
    fn strip_fences_handles_unclosed_fence_with_indent() {
        // Truncated stream: opening fence, newline, indented body, no
        // closer. The body's leading whitespace must not leak through.
        assert_eq!(strip_markdown_fences("```json\n   {\"a\":1}"), "{\"a\":1}");
    }

    #[test]
    fn single_line_fenced_json_intent_is_recognised() {
        let fenced = response("```json{\"ok\":true}```");
        let prose = response("plain prose");
        let pairs = vec![(&fenced, &prose); 5];
        let stat = compute(&pairs, Some(1));
        assert!(stat.n > 0, "single-line fenced JSON should count");
    }

    #[test]
    fn fenced_json_vs_fenced_sql_is_severe_regression() {
        let fenced_json = response("```json\n{\"sql\": \"SELECT 1\"}\n```");
        let fenced_sql = response("```sql\nSELECT 1\n```");
        let pairs = vec![(&fenced_json, &fenced_sql); 30];
        let stat = compute(&pairs, Some(1));
        assert!(stat.n > 0);
        assert!(stat.delta < 0.0);
        assert!(matches!(
            stat.severity,
            Severity::Moderate | Severity::Severe
        ));
    }
}