Skip to main content

lean_ctx/core/terse/
scoring.rs

//! Surprisal-based line scoring for deterministic compression.
//!
//! Each line receives an information density score based on:
//! - Character-level entropy (Shannon)
//! - Structural markers (paths, numbers, identifiers)
//! - Repetition detection (overlap with previous lines)
7
8use std::collections::HashSet;
9
/// Score for a single line — higher means more informative.
///
/// One value is produced per input line by `score_lines`. The type derives
/// `PartialEq` so that scores can be compared directly (for example with
/// `assert_eq!`); it is not `Eq` because the fields are floating point.
#[derive(Debug, Clone, PartialEq)]
pub struct LineScore {
    /// Zero-based position of the line within the scored text.
    pub line_idx: usize,
    /// Shannon entropy of the trimmed line as computed by `char_entropy`.
    pub entropy: f32,
    /// Whether `has_structural_marker` flagged the trimmed line.
    pub has_structural_marker: bool,
    /// Fraction of the line's character trigrams already seen on earlier lines.
    pub repetition_ratio: f32,
    /// Final score produced by `compute_combined`; never negative.
    pub combined: f32,
}
19
20/// Scores all lines in the input text for information density.
21pub fn score_lines(text: &str) -> Vec<LineScore> {
22    let lines: Vec<&str> = text.lines().collect();
23    let mut seen_trigrams: HashSet<String> = HashSet::new();
24    let mut scores = Vec::with_capacity(lines.len());
25
26    for (idx, line) in lines.iter().enumerate() {
27        let trimmed = line.trim();
28
29        let entropy = char_entropy(trimmed);
30        let has_marker = has_structural_marker(trimmed);
31        let rep_ratio = repetition_ratio(trimmed, &seen_trigrams);
32
33        register_trigrams(trimmed, &mut seen_trigrams);
34
35        let combined = compute_combined(entropy, has_marker, rep_ratio);
36
37        scores.push(LineScore {
38            line_idx: idx,
39            entropy,
40            has_structural_marker: has_marker,
41            repetition_ratio: rep_ratio,
42            combined,
43        });
44    }
45
46    scores
47}
48
/// Computes the Shannon entropy of the characters in `line`, in bits.
///
/// Every Unicode scalar value counts as one symbol, so lines written in
/// non-ASCII scripts are measured the same way as ASCII text instead of
/// being treated as empty. Returns `0.0` for an empty line and for a line
/// made of a single repeated character.
fn char_entropy(line: &str) -> f32 {
    // ASCII is the common case and is tallied in a fixed table. Anything
    // else goes into an ordered map: iterating it in key order keeps the
    // floating-point summation order (and therefore the result) the same on
    // every run.
    let mut ascii_freq = [0u32; 128];
    let mut other_freq: std::collections::BTreeMap<char, u32> =
        std::collections::BTreeMap::new();
    let mut total = 0u32;

    for c in line.chars() {
        if c.is_ascii() {
            ascii_freq[c as usize] += 1;
        } else {
            *other_freq.entry(c).or_insert(0) += 1;
        }
        total += 1;
    }
    if total == 0 {
        return 0.0;
    }

    // Sum -p * log2(p) over every symbol that occurs, in ascending
    // code-point order (ASCII table first, then the ordered map).
    ascii_freq
        .iter()
        .chain(other_freq.values())
        .filter(|&&count| count > 0)
        .fold(0.0f32, |ent, &count| {
            let p = count as f32 / total as f32;
            ent - p * p.log2()
        })
}
73
/// Reports whether `line` contains something that looks structurally
/// significant.
///
/// A line qualifies when any of the following holds:
/// - it contains `/` together with either `.` or the text `src`;
/// - it contains an ASCII digit;
/// - it contains one of the severity keywords listed below (case-sensitive);
/// - it contains at least two words of six or more bytes, where a word is a
///   run of alphanumeric characters and underscores.
fn has_structural_marker(line: &str) -> bool {
    const SEVERITY_WORDS: [&str; 6] = ["error", "Error", "ERROR", "warning", "Warning", "WARN"];

    let looks_like_path = line.contains('/') && (line.contains('.') || line.contains("src"));
    // ASCII digits are single bytes in UTF-8, so a byte scan is sufficient.
    let mentions_digit = line.bytes().any(|b| b.is_ascii_digit());
    let mentions_severity = SEVERITY_WORDS.iter().any(|&word| line.contains(word));
    if looks_like_path || mentions_digit || mentions_severity {
        return true;
    }

    // Stop scanning as soon as a second long word has been found.
    line.split(|c: char| !(c.is_alphanumeric() || c == '_'))
        .filter(|word| word.len() >= 6)
        .take(2)
        .count()
        == 2
}
93
/// Returns the fraction of `line`'s character trigrams that already appear
/// in `seen`.
///
/// Lines shorter than nine characters always yield `0.0`. Otherwise the
/// result is `repeated / total`, where `total` is the number of
/// three-character windows in the line (character count minus two).
fn repetition_ratio(line: &str, seen: &HashSet<String>) -> f32 {
    // Lines below this many characters are not measured.
    const MIN_CHARS: usize = 9;

    // Byte offset of every character start, plus the end of the line, so
    // that any run of three characters can be borrowed straight from `line`
    // without allocating a new string per lookup.
    let bounds: Vec<usize> = line
        .char_indices()
        .map(|(offset, _)| offset)
        .chain(std::iter::once(line.len()))
        .collect();
    let char_count = bounds.len() - 1;
    if char_count < MIN_CHARS {
        return 0.0;
    }

    // Four consecutive boundaries delimit exactly three characters.
    let total = char_count - 2;
    let repeated = bounds
        .windows(4)
        .map(|w| &line[w[0]..w[3]])
        .filter(|trigram| seen.contains(*trigram))
        .count();
    repeated as f32 / total as f32
}
113
/// Adds every three-character window of `line` to `seen`.
///
/// Windows are taken over Unicode scalar values, not bytes. Lines with
/// fewer than three characters contribute nothing.
fn register_trigrams(line: &str, seen: &mut HashSet<String>) {
    let symbols: Vec<char> = line.chars().collect();
    // `windows(3)` is empty for inputs shorter than three characters.
    seen.extend(
        symbols
            .windows(3)
            .map(|window| window.iter().collect::<String>()),
    );
}
125
/// Folds the three per-line measurements into a single score.
///
/// The score is the entropy, plus a fixed bonus when a structural marker is
/// present, minus a penalty proportional to the repetition ratio. The result
/// is clamped so that it is never below `0.0`.
fn compute_combined(entropy: f32, has_marker: bool, rep_ratio: f32) -> f32 {
    // Flat bonus granted to lines carrying a structural marker.
    const MARKER_BONUS: f32 = 0.3;
    // Weight applied to the repetition ratio before it is subtracted.
    const REPETITION_WEIGHT: f32 = 0.5;

    let bonus = if has_marker { MARKER_BONUS } else { 0.0 };
    let penalty = rep_ratio * REPETITION_WEIGHT;
    let raw = entropy + bonus - penalty;
    f32::max(raw, 0.0)
}
131
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty line has no symbols, so its entropy is exactly zero.
    #[test]
    fn empty_line_zero_entropy() {
        let entropy = char_entropy("");
        assert_eq!(entropy, 0.0);
    }

    /// A line made of one repeated character carries (almost) no information.
    #[test]
    fn uniform_string_low_entropy() {
        let e = char_entropy(&"a".repeat(10));
        assert!(e < 0.01, "uniform string should have ~0 entropy, got {e}");
    }

    /// Ten distinct characters must score above ten identical ones.
    #[test]
    fn mixed_string_higher_entropy() {
        let uniform = char_entropy("aaaaaaaaaa");
        let mixed = char_entropy("abcdefghij");
        assert!(mixed > uniform, "mixed > uniform entropy");
    }

    /// A file path counts as a structural marker.
    #[test]
    fn structural_marker_path() {
        let line = "src/core/config.rs";
        assert!(has_structural_marker(line));
    }

    /// A compiler-style error line counts as a structural marker.
    #[test]
    fn structural_marker_error() {
        let line = "error[E0308]: mismatched types";
        assert!(has_structural_marker(line));
    }

    /// Plain prose without paths, digits or keywords is not marked.
    #[test]
    fn structural_marker_missing() {
        let line = "this is a simple line";
        assert!(!has_structural_marker(line));
    }

    /// One score is produced per input line.
    #[test]
    fn score_lines_returns_all_lines() {
        let text = ["line one", "line two", "line three"].join("\n");
        assert_eq!(score_lines(&text).len(), 3);
    }

    /// A verbatim repeat must not outscore a later line with fresh content.
    #[test]
    fn repetitive_lines_get_lower_score() {
        let text = [
            "exactly the same line repeated here",
            "exactly the same line repeated here",
            "unique content with different words",
        ]
        .join("\n");
        let scores = score_lines(&text);
        let (repeated, unique) = (&scores[1], &scores[2]);
        assert!(
            unique.combined >= repeated.combined,
            "unique line should score >= repeated: {} vs {}",
            unique.combined,
            repeated.combined
        );
    }
}