// lean_ctx/core/terse/scoring.rs

//! Surprisal-based line scoring for deterministic compression.
//!
//! Each line receives an information density score based on:
//! - Character-level entropy (Shannon)
//! - Structural markers (paths, numbers, identifiers)
//! - Repetition detection (overlap with previous lines)

use std::collections::HashSet;

/// Score for a single line — higher means more informative.
///
/// [`score_lines`] produces one of these per input line, in input order.
#[derive(Debug, Clone, PartialEq)]
pub struct LineScore {
    /// Zero-based position of the line within the scored text.
    pub line_idx: usize,
    /// Shannon entropy, in bits, of the trimmed line's character distribution.
    pub entropy: f32,
    /// `true` when the line looks like it carries a path, a digit, an
    /// error/warning keyword, or at least two long identifiers.
    pub has_structural_marker: bool,
    /// Fraction of the line's character trigrams that already appeared on
    /// earlier lines (`0.0` = nothing repeated, `1.0` = everything repeated).
    pub repetition_ratio: f32,
    /// Final score: entropy plus the marker bonus minus the repetition
    /// penalty, clamped so it is never negative.
    pub combined: f32,
}

/// Number of distinct trigrams after which repetition tracking is switched off.
///
/// Once the seen-trigram set reaches this size, `score_lines` stops recording
/// new trigrams and reports a repetition ratio of `0.0` for the remaining
/// lines, which keeps memory use bounded on very large inputs.
const MAX_TRIGRAM_SET_SIZE: usize = 10_000;

22/// Scores all lines in the input text for information density.
23pub fn score_lines(text: &str) -> Vec<LineScore> {
24    let lines: Vec<&str> = text.lines().collect();
25    let mut seen_trigrams: HashSet<String> = HashSet::new();
26    let mut trigram_saturated = false;
27    let mut scores = Vec::with_capacity(lines.len());
28
29    for (idx, line) in lines.iter().enumerate() {
30        let trimmed = line.trim();
31
32        let entropy = char_entropy(trimmed);
33        let has_marker = has_structural_marker(trimmed);
34        let rep_ratio = if trigram_saturated {
35            0.0
36        } else {
37            repetition_ratio(trimmed, &seen_trigrams)
38        };
39
40        if !trigram_saturated {
41            register_trigrams(trimmed, &mut seen_trigrams);
42            if seen_trigrams.len() >= MAX_TRIGRAM_SET_SIZE {
43                trigram_saturated = true;
44            }
45        }
46
47        let combined = compute_combined(entropy, has_marker, rep_ratio);
48
49        scores.push(LineScore {
50            line_idx: idx,
51            entropy,
52            has_structural_marker: has_marker,
53            repetition_ratio: rep_ratio,
54            combined,
55        });
56    }
57
58    scores
59}
60
/// Returns the Shannon entropy, in bits per character, of `line`.
///
/// The entropy is computed over the distribution of Unicode scalar values
/// (`char`s) in the line. An empty line, or a line made of a single repeated
/// character, yields `0.0`.
///
/// ASCII characters are tallied in a fixed 128-slot table; any other
/// characters are collected and sorted so that equal characters form runs.
/// Terms are always summed in the same order (ASCII by code point, then
/// non-ASCII by code point), so the floating-point result is reproducible.
fn char_entropy(line: &str) -> f32 {
    if line.is_empty() {
        return 0.0;
    }

    let mut ascii_freq = [0u32; 128];
    let mut non_ascii: Vec<char> = Vec::new();
    let mut total = 0u32;
    for c in line.chars() {
        if c.is_ascii() {
            ascii_freq[c as usize] += 1;
        } else {
            // Previously these were skipped entirely, which made lines in
            // non-Latin scripts look like they carried no information.
            non_ascii.push(c);
        }
        total += 1;
    }
    non_ascii.sort_unstable();

    // `total` is at least 1 here because the line is non-empty.
    let total = total as f32;
    let mut ent = 0.0f32;
    let mut accumulate = |count: u32| {
        let p = count as f32 / total;
        ent -= p * p.log2();
    };

    for &count in ascii_freq.iter().filter(|&&n| n > 0) {
        accumulate(count);
    }

    // Sorted input means each distinct character occupies one contiguous run.
    let mut start = 0;
    while start < non_ascii.len() {
        let current = non_ascii[start];
        let run = non_ascii[start..]
            .iter()
            .take_while(|&&c| c == current)
            .count();
        accumulate(run as u32);
        start += run;
    }

    ent
}

/// Reports whether `line` contains something that usually signals important
/// content.
///
/// A line qualifies when any of the following holds:
/// - it contains `/` together with either `.` or `src` (path-like),
/// - it contains an ASCII digit,
/// - it contains one of the error/warning keywords below (case-sensitive),
/// - it contains at least two words of six or more characters, where a word
///   is a maximal run of alphanumeric characters and underscores.
fn has_structural_marker(line: &str) -> bool {
    const KEYWORDS: [&str; 6] = ["error", "Error", "ERROR", "warning", "Warning", "WARN"];
    const MIN_IDENT_CHARS: usize = 6;
    const MIN_LONG_IDENTS: usize = 2;

    let looks_like_path = line.contains('/') && (line.contains('.') || line.contains("src"));
    if looks_like_path
        || line.chars().any(|c| c.is_ascii_digit())
        || KEYWORDS.iter().any(|kw| line.contains(*kw))
    {
        return true;
    }

    // Length is measured in characters, not bytes: a two-character CJK word
    // is six bytes long in UTF-8 but is not a "long identifier".
    line.split(|c: char| !c.is_alphanumeric() && c != '_')
        .filter(|word| word.chars().count() >= MIN_IDENT_CHARS)
        .take(MIN_LONG_IDENTS)
        .count()
        == MIN_LONG_IDENTS
}

/// Returns the fraction of `line`'s character trigrams that are already in
/// `seen`.
///
/// Trigrams are overlapping windows of three consecutive characters. Lines
/// shorter than nine characters are considered too short to judge and always
/// yield `0.0`. Otherwise the result lies in `0.0..=1.0`.
fn repetition_ratio(line: &str, seen: &HashSet<String>) -> f32 {
    const MIN_CHARS: usize = 9;

    // Byte offset of every character start, so trigrams can be sliced out of
    // `line` directly instead of being rebuilt as fresh `String`s.
    let mut bounds: Vec<usize> = line.char_indices().map(|(offset, _)| offset).collect();
    let char_count = bounds.len();
    if char_count < MIN_CHARS {
        return 0.0;
    }
    bounds.push(line.len());

    // Four consecutive boundaries delimit three characters, giving exactly
    // `char_count - 2` windows.
    let total = char_count - 2;
    let repeated = bounds
        .windows(4)
        .filter(|w| seen.contains(&line[w[0]..w[3]]))
        .count();

    repeated as f32 / total as f32
}

/// Adds every character trigram of `line` to `seen`.
///
/// Trigrams are overlapping windows of three consecutive characters. Lines
/// with fewer than three characters contribute nothing.
fn register_trigrams(line: &str, seen: &mut HashSet<String>) {
    // Byte offset of every character start plus the end of the string; four
    // consecutive boundaries delimit one three-character window.
    let mut bounds: Vec<usize> = line.char_indices().map(|(offset, _)| offset).collect();
    bounds.push(line.len());

    for w in bounds.windows(4) {
        let trigram = &line[w[0]..w[3]];
        // Allocate an owned copy only for trigrams that are actually new.
        if !seen.contains(trigram) {
            seen.insert(trigram.to_owned());
        }
    }
}

/// Folds the three per-line signals into a single non-negative score.
///
/// The score is the entropy, plus a fixed bonus when the line has a
/// structural marker, minus a penalty proportional to the repetition ratio.
/// Negative results are clamped to `0.0`.
fn compute_combined(entropy: f32, has_marker: bool, rep_ratio: f32) -> f32 {
    /// Flat bonus for lines that carry a structural marker.
    const MARKER_BONUS: f32 = 0.3;
    /// Score removed from a line whose trigrams have all been seen before.
    const REPETITION_WEIGHT: f32 = 0.5;

    let marker_bonus = if has_marker { MARKER_BONUS } else { 0.0 };
    let rep_penalty = rep_ratio * REPETITION_WEIGHT;
    (entropy + marker_bonus - rep_penalty).max(0.0)
}

/// Unit tests for the scoring helpers and the public `score_lines` entry point.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn empty_line_zero_entropy() {
        assert_eq!(char_entropy(""), 0.0);
    }

    #[test]
    fn uniform_string_low_entropy() {
        let e = char_entropy("aaaaaaaaaa");
        assert!(e < 0.01, "uniform string should have ~0 entropy, got {e}");
    }

    #[test]
    fn mixed_string_higher_entropy() {
        let low = char_entropy("aaaaaaaaaa");
        let high = char_entropy("abcdefghij");
        assert!(high > low, "mixed > uniform entropy");
    }

    #[test]
    fn structural_marker_path() {
        assert!(has_structural_marker("src/core/config.rs"));
    }

    #[test]
    fn structural_marker_error() {
        assert!(has_structural_marker("error[E0308]: mismatched types"));
    }

    #[test]
    fn structural_marker_missing() {
        assert!(!has_structural_marker("this is a simple line"));
    }

    #[test]
    fn score_lines_returns_all_lines() {
        let text = "line one\nline two\nline three";
        let scores = score_lines(text);
        assert_eq!(scores.len(), 3);
    }

    #[test]
    fn repetitive_lines_get_lower_score() {
        let text = "exactly the same line repeated here\nexactly the same line repeated here\nunique content with different words";
        let scores = score_lines(text);
        assert!(
            scores[2].combined >= scores[1].combined,
            "unique line should score >= repeated: {} vs {}",
            scores[2].combined,
            scores[1].combined
        );
    }

    #[test]
    fn combined_score_is_never_negative() {
        assert_eq!(compute_combined(0.0, false, 1.0), 0.0);
    }

    #[test]
    fn short_lines_have_zero_repetition() {
        let mut seen = HashSet::new();
        register_trigrams("short", &mut seen);
        // Below the nine-character minimum, so overlap is not measured.
        assert_eq!(repetition_ratio("short", &seen), 0.0);
    }
}
199}