shadow_core/diff/
reasoning.rs1use crate::agentlog::Record;
8use crate::diff::axes::{Axis, AxisStat};
9use crate::diff::bootstrap::{median, paired_ci};
10
11fn reasoning_score(r: &Record) -> Option<f64> {
12 let thinking_tokens = r
13 .payload
14 .get("usage")
15 .and_then(|u| u.get("thinking_tokens"))
16 .and_then(|v| v.as_f64())
17 .unwrap_or(0.0);
18 let thinking_parts = r
19 .payload
20 .get("content")
21 .and_then(|c| c.as_array())
22 .map(|arr| {
23 arr.iter()
24 .filter(|p| p.get("type").and_then(|t| t.as_str()) == Some("thinking"))
25 .count() as f64
26 })
27 .unwrap_or(0.0);
28 Some(thinking_tokens + thinking_parts)
29}
30
31pub fn compute(pairs: &[(&Record, &Record)], seed: Option<u64>) -> AxisStat {
33 let mut b = Vec::with_capacity(pairs.len());
34 let mut c = Vec::with_capacity(pairs.len());
35 for (br, cr) in pairs {
36 if let (Some(bv), Some(cv)) = (reasoning_score(br), reasoning_score(cr)) {
37 b.push(bv);
38 c.push(cv);
39 }
40 }
41 if b.is_empty() {
42 return AxisStat::empty(Axis::Reasoning);
43 }
44 let bm = median(&b);
45 let cm = median(&c);
46 let delta = cm - bm;
47 let ci = paired_ci(&b, &c, |bs, cs| median(cs) - median(bs), 0, seed);
48 AxisStat::new_value(Axis::Reasoning, bm, cm, delta, ci.low, ci.high, b.len())
49}
50
#[cfg(test)]
mod tests {
    use super::*;
    use crate::agentlog::Kind;
    use serde_json::json;

    /// Builds a chat-response record whose usage reports `thinking_tokens` and
    /// whose content optionally starts with a `thinking` part ahead of the
    /// final `text` part.
    fn response(thinking_tokens: u64, with_thinking_part: bool) -> Record {
        let mut parts = Vec::with_capacity(2);
        if with_thinking_part {
            parts.push(json!({"type": "thinking", "text": "..."}));
        }
        parts.push(json!({"type": "text", "text": "done"}));

        let payload = json!({
            "model": "x",
            "content": parts,
            "stop_reason": "end_turn",
            "latency_ms": 0,
            "usage": {"input_tokens": 1, "output_tokens": 1, "thinking_tokens": thinking_tokens},
        });

        Record::new(Kind::ChatResponse, payload, "2026-04-21T10:00:00Z", None)
    }

    /// A candidate with 100 thinking tokens and one thinking part must score
    /// 101, while a baseline with neither must score 0.
    #[test]
    fn thinking_tokens_and_parts_are_summed() {
        const SAMPLE_COUNT: usize = 10;

        let baseline: Vec<Record> = std::iter::repeat_with(|| response(0, false))
            .take(SAMPLE_COUNT)
            .collect();
        let candidate: Vec<Record> = std::iter::repeat_with(|| response(100, true))
            .take(SAMPLE_COUNT)
            .collect();
        let pairs: Vec<(&Record, &Record)> = baseline.iter().zip(&candidate).collect();

        let stat = compute(&pairs, Some(1));

        assert_eq!(stat.baseline_median, 0.0);
        assert_eq!(stat.candidate_median, 101.0);
    }
}