lean_ctx/core/
surprise.rs

//! Predictive Surprise Scoring — conditional entropy relative to LLM knowledge.
//!
//! Instead of measuring Shannon entropy in isolation (H(X)), we measure
//! how surprising each line is to the LLM: H(X | LLM_knowledge).
//!
//! Approximation: use the o200k_base BPE token ID as a proxy for the token's
//! frequency rank, and hence for P(token | LLM). Common tokens (low IDs) carry
//! low surprise; rare tokens (high IDs / unknown to the vocab) carry high surprise.
//!
//! Scientific basis: Cross-entropy H(P,Q) = -sum(P(x) * log Q(x))
//! where P is the true distribution and Q is the model's prior.
12
13use std::sync::OnceLock;
14
15use super::tokens::encode_tokens;
16
/// Lazily initialised table of per-token surprise values, built on first use.
static VOCAB_LOG_PROBS: OnceLock<Vec<f64>> = OnceLock::new();

/// Return the shared surprise table, building it on the first call.
///
/// Entry `i` holds `-log2(p)` for rank `i + 1` under a Zipfian prior
/// `p(r) = 1 / (r * H_n)`, where `H_n` is the harmonic number of the table
/// size (200,000 entries). Values therefore grow with the index: low indices
/// are treated as common, high indices as rare.
fn get_vocab_log_probs() -> &'static Vec<f64> {
    const VOCAB_SIZE: usize = 200_000;

    VOCAB_LOG_PROBS.get_or_init(|| {
        // Normalising constant so the Zipfian probabilities sum to one.
        let harmonic: f64 = (1..=VOCAB_SIZE).map(|rank| 1.0 / rank as f64).sum();

        let mut table = Vec::with_capacity(VOCAB_SIZE);
        // Ranks are 1-based; the entry for rank `r` lands at index `r - 1`.
        for rank in 1..=VOCAB_SIZE {
            let prob = 1.0 / (rank as f64 * harmonic);
            table.push(-prob.log2());
        }
        table
    })
}
35
36/// Compute the surprise score for a line of text.
37///
38/// Returns the mean negative log-probability (cross-entropy) of the line's
39/// BPE tokens under the Zipfian prior. Higher values = more surprising to
40/// the LLM = more important to keep.
41///
42/// Range: typically 5.0 (very common) to 17.0+ (very rare).
43pub fn line_surprise(text: &str) -> f64 {
44    let tokens = encode_tokens(text);
45    if tokens.is_empty() {
46        return 0.0;
47    }
48    let log_probs = get_vocab_log_probs();
49    let max_id = log_probs.len();
50
51    let total: f64 = tokens
52        .iter()
53        .map(|&t| {
54            let id = t as usize;
55            if id < max_id {
56                log_probs[id]
57            } else {
58                17.6 // max surprise for OOV tokens (~log2(200000))
59            }
60        })
61        .sum();
62
63    total / tokens.len() as f64
64}
65
/// Coarse bucket describing how surprising a line is relative to the LLM's
/// expected knowledge, as produced by `classify_surprise`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SurpriseLevel {
    /// Common patterns: can be compressed aggressively.
    Low,
    /// Mixed content: apply standard compression.
    Medium,
    /// Rare or unique tokens: preserve carefully.
    High,
}
77
78pub fn classify_surprise(text: &str) -> SurpriseLevel {
79    let s = line_surprise(text);
80    if s < 8.0 {
81        SurpriseLevel::Low
82    } else if s < 12.0 {
83        SurpriseLevel::Medium
84    } else {
85        SurpriseLevel::High
86    }
87}
88
89/// Enhanced entropy filter that combines Shannon entropy with predictive surprise.
90/// Lines pass if EITHER their entropy is above threshold OR their surprise is high.
91/// This prevents dropping lines that look "low entropy" but contain rare, unique tokens.
92pub fn should_keep_line(trimmed: &str, entropy_threshold: f64) -> bool {
93    if trimmed.is_empty() || trimmed.len() < 3 {
94        return true;
95    }
96
97    let h = super::entropy::token_entropy(trimmed);
98    if h >= entropy_threshold {
99        return true;
100    }
101
102    let h_norm = super::entropy::normalized_token_entropy(trimmed);
103    if h_norm >= 0.3 {
104        return true;
105    }
106
107    // New: check if line has high surprise despite low entropy.
108    // This catches lines like `CustomDomainType::validate()`
109    // which have low token diversity but high surprise per-token.
110    let surprise = line_surprise(trimmed);
111    surprise >= 11.0
112}
113
#[cfg(test)]
mod tests {
    use super::*;

    /// Everyday code used as the baseline in the comparisons below.
    const COMMON_LINE: &str = "let x = 1;";

    #[test]
    fn common_code_has_low_surprise() {
        // Only positivity is checked here; no upper bound is asserted.
        let score = line_surprise(COMMON_LINE);
        assert!(score > 0.0, "surprise should be positive");
    }

    #[test]
    fn rare_identifiers_have_higher_surprise() {
        let rare_score = line_surprise("let zygomorphic_validator = XenolithProcessor::new();");
        let common_score = line_surprise(COMMON_LINE);
        assert!(
            rare_score > common_score,
            "rare identifiers should have higher surprise"
        );
    }

    #[test]
    fn empty_returns_zero() {
        let score = line_surprise("");
        assert_eq!(score, 0.0);
    }

    #[test]
    fn classify_surprise_is_consistent() {
        // The local names are referenced by the failure message below.
        let s_simple = line_surprise(COMMON_LINE);
        let s_complex =
            line_surprise("ZygomorphicXenolithValidator::process_quantum_state(&mut ctx)");
        assert!(
            s_complex > s_simple,
            "rare identifiers ({s_complex}) should have higher surprise than common code ({s_simple})"
        );
    }

    #[test]
    fn should_keep_preserves_rare_lines() {
        // Encodes the implication "surprise >= 11.0 => the line is kept".
        let line = "ZygomorphicValidator::process_xenolith(&mut state)";
        let kept = should_keep_line(line, 1.0);
        let below_cutoff = line_surprise(line) < 11.0;
        assert!(
            kept || below_cutoff,
            "rare lines should be preserved or have measurable surprise"
        );
    }
}
lean_ctx/core/surprise.rs

lean_ctx/core/
surprise.rs