Skip to main content

shadow_core/diff/
reasoning.rs

//! Axis 7: reasoning depth — thinking tokens + self-correction markers.
//!
//! "Thinking" tokens come from the response `usage.thinking_tokens` field.
//! Self-correction markers are counted conservatively, as the number of
//! content parts whose `type == "thinking"`. The per-response score is the
//! sum of the two; a missing field contributes zero.
6
7use crate::agentlog::Record;
8use crate::diff::axes::{Axis, AxisStat};
9use crate::diff::bootstrap::{median, paired_ci};
10
11fn reasoning_score(r: &Record) -> Option<f64> {
12    let thinking_tokens = r
13        .payload
14        .get("usage")
15        .and_then(|u| u.get("thinking_tokens"))
16        .and_then(|v| v.as_f64())
17        .unwrap_or(0.0);
18    let thinking_parts = r
19        .payload
20        .get("content")
21        .and_then(|c| c.as_array())
22        .map(|arr| {
23            arr.iter()
24                .filter(|p| p.get("type").and_then(|t| t.as_str()) == Some("thinking"))
25                .count() as f64
26        })
27        .unwrap_or(0.0);
28    Some(thinking_tokens + thinking_parts)
29}
30
31/// Compute the reasoning-depth axis.
32pub fn compute(pairs: &[(&Record, &Record)], seed: Option<u64>) -> AxisStat {
33    let mut b = Vec::with_capacity(pairs.len());
34    let mut c = Vec::with_capacity(pairs.len());
35    for (br, cr) in pairs {
36        if let (Some(bv), Some(cv)) = (reasoning_score(br), reasoning_score(cr)) {
37            b.push(bv);
38            c.push(cv);
39        }
40    }
41    if b.is_empty() {
42        return AxisStat::empty(Axis::Reasoning);
43    }
44    let bm = median(&b);
45    let cm = median(&c);
46    let delta = cm - bm;
47    let ci = paired_ci(&b, &c, |bs, cs| median(cs) - median(bs), 0, seed);
48    AxisStat::new_value(Axis::Reasoning, bm, cm, delta, ci.low, ci.high, b.len())
49}
50
#[cfg(test)]
mod tests {
    use super::*;
    use crate::agentlog::Kind;
    use serde_json::json;

    /// Build a chat-response record with the given `usage.thinking_tokens`
    /// value and, optionally, one leading `"thinking"` content part ahead of
    /// the final `"text"` part.
    fn response(thinking_tokens: u64, with_thinking_part: bool) -> Record {
        let mut parts = Vec::new();
        if with_thinking_part {
            parts.push(json!({"type": "thinking", "text": "..."}));
        }
        parts.push(json!({"type": "text", "text": "done"}));

        let payload = json!({
            "model": "x",
            "content": parts,
            "stop_reason": "end_turn",
            "latency_ms": 0,
            "usage": {"input_tokens": 1, "output_tokens": 1, "thinking_tokens": thinking_tokens},
        });

        Record::new(Kind::ChatResponse, payload, "2026-04-21T10:00:00Z", None)
    }

    /// Ten baseline records with no thinking at all against ten candidates
    /// with 100 thinking tokens and one thinking part each: the medians must
    /// be 0 and 100 + 1 respectively.
    #[test]
    fn thinking_tokens_and_parts_are_summed() {
        const SAMPLES: usize = 10;

        let baseline: Vec<Record> = std::iter::repeat_with(|| response(0, false))
            .take(SAMPLES)
            .collect();
        let candidate: Vec<Record> = std::iter::repeat_with(|| response(100, true))
            .take(SAMPLES)
            .collect();

        let mut pairs: Vec<(&Record, &Record)> = Vec::with_capacity(SAMPLES);
        for (base, cand) in baseline.iter().zip(candidate.iter()) {
            pairs.push((base, cand));
        }

        let stat = compute(&pairs, Some(1));
        assert_eq!(stat.baseline_median, 0.0);
        assert_eq!(stat.candidate_median, 101.0);
    }
}