lean_ctx/core/
surprise.rs1use std::sync::OnceLock;
14
15use super::tokens::encode_tokens;
16
/// Process-wide cache for the per-rank surprisal table; filled on first use by
/// [`get_vocab_log_probs`].
static VOCAB_LOG_PROBS: OnceLock<Vec<f64>> = OnceLock::new();

/// Returns the shared surprisal table, building it on the first call.
///
/// The table models a Zipf-like distribution over `VOCAB_SIZE` ranks: rank `r`
/// (1-based) has probability `1 / (r * H_N)`, where `H_N` is the harmonic number
/// of the vocabulary size. Entry `i` stores `-log2(p)` for rank `i + 1`, i.e. a
/// cost in bits, so values grow with the index.
fn get_vocab_log_probs() -> &'static Vec<f64> {
    const VOCAB_SIZE: usize = 200_000;

    VOCAB_LOG_PROBS.get_or_init(|| {
        // Normalising constant H_N = 1/1 + 1/2 + ... + 1/N, accumulated in
        // ascending rank order.
        let harmonic: f64 = (1..=VOCAB_SIZE).fold(0.0, |acc, rank| acc + 1.0 / rank as f64);

        (1..=VOCAB_SIZE)
            .map(|rank| {
                let probability = 1.0 / (rank as f64 * harmonic);
                -probability.log2()
            })
            .collect()
    })
}
35
36pub fn line_surprise(text: &str) -> f64 {
44 let tokens = encode_tokens(text);
45 if tokens.is_empty() {
46 return 0.0;
47 }
48 let log_probs = get_vocab_log_probs();
49 let max_id = log_probs.len();
50
51 let total: f64 = tokens
52 .iter()
53 .map(|&t| {
54 let id = t as usize;
55 if id < max_id {
56 log_probs[id]
57 } else {
58 17.6 }
60 })
61 .sum();
62
63 total / tokens.len() as f64
64}
65
/// Coarse bucket for a line's surprise score, as assigned by
/// [`classify_surprise`].
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum SurpriseLevel {
    /// Score below 8.0.
    Low,
    /// Score of at least 8.0 but below 12.0.
    Medium,
    /// Any other score (12.0 and above).
    High,
}
77
78pub fn classify_surprise(text: &str) -> SurpriseLevel {
79 let s = line_surprise(text);
80 if s < 8.0 {
81 SurpriseLevel::Low
82 } else if s < 12.0 {
83 SurpriseLevel::Medium
84 } else {
85 SurpriseLevel::High
86 }
87}
88
89pub fn should_keep_line(trimmed: &str, entropy_threshold: f64) -> bool {
93 if trimmed.is_empty() || trimmed.len() < 3 {
94 return true;
95 }
96
97 let h = super::entropy::token_entropy(trimmed);
98 if h >= entropy_threshold {
99 return true;
100 }
101
102 let h_norm = super::entropy::normalized_token_entropy(trimmed);
103 if h_norm >= 0.3 {
104 return true;
105 }
106
107 let surprise = line_surprise(trimmed);
111 surprise >= 11.0
112}
113
#[cfg(test)]
mod tests {
    //! Unit tests for the surprise scoring helpers.

    use super::*;

    /// Position of a level in the `Low < Medium < High` ordering, so tests can
    /// compare levels without requiring `Ord` on the enum.
    fn level_rank(level: SurpriseLevel) -> u8 {
        match level {
            SurpriseLevel::Low => 0,
            SurpriseLevel::Medium => 1,
            SurpriseLevel::High => 2,
        }
    }

    #[test]
    fn common_code_has_low_surprise() {
        let common = "let x = 1;";
        let s = line_surprise(common);
        assert!(s > 0.0, "surprise should be positive");
    }

    #[test]
    fn rare_identifiers_have_higher_surprise() {
        let common = "let x = 1;";
        let rare = "let zygomorphic_validator = XenolithProcessor::new();";
        assert!(
            line_surprise(rare) > line_surprise(common),
            "rare identifiers should have higher surprise"
        );
    }

    #[test]
    fn empty_returns_zero() {
        assert_eq!(line_surprise(""), 0.0);
    }

    #[test]
    fn classify_surprise_is_consistent() {
        let simple = "let x = 1;";
        let complex = "ZygomorphicXenolithValidator::process_quantum_state(&mut ctx)";
        let s_simple = line_surprise(simple);
        let s_complex = line_surprise(complex);
        assert!(
            s_complex > s_simple,
            "rare identifiers ({s_complex}) should have higher surprise than common code ({s_simple})"
        );

        // Exercise `classify_surprise` itself: a zero score must land in the
        // lowest bucket, and a higher score may never map to a lower bucket.
        assert_eq!(
            classify_surprise(""),
            SurpriseLevel::Low,
            "an empty line scores 0.0 and must classify as Low"
        );
        assert!(
            level_rank(classify_surprise(complex)) >= level_rank(classify_surprise(simple)),
            "a higher surprise score must not classify into a lower level"
        );
    }

    #[test]
    fn should_keep_preserves_rare_lines() {
        let rare = "ZygomorphicValidator::process_xenolith(&mut state)";
        // Equivalent to: whenever the surprise reaches 11.0, the line is kept.
        assert!(
            should_keep_line(rare, 1.0) || line_surprise(rare) < 11.0,
            "rare lines should be preserved or have measurable surprise"
        );
    }
}