lean_ctx/core/terse/
scoring.rs1use std::collections::HashSet;
9
/// Scoring record for a single line of text, as produced by [`score_lines`].
///
/// `PartialEq` is derived so that records can be compared directly, e.g. with
/// `assert_eq!` in tests.
#[derive(Debug, Clone, PartialEq)]
pub struct LineScore {
    /// Zero-based position of the line within the scored text.
    pub line_idx: usize,
    /// Shannon entropy, in bits, of the trimmed line's ASCII bytes.
    pub entropy: f32,
    /// `true` when the trimmed line matched one of the structural heuristics
    /// (path-like text, a digit, an error/warning keyword, or two long words).
    pub has_structural_marker: bool,
    /// Fraction of the line's character trigrams already seen on earlier lines.
    pub repetition_ratio: f32,
    /// Final score: entropy plus the marker bonus minus the repetition
    /// penalty, never below zero.
    pub combined: f32,
}
19
20const MAX_TRIGRAM_SET_SIZE: usize = 10_000;
21
22pub fn score_lines(text: &str) -> Vec<LineScore> {
24 let lines: Vec<&str> = text.lines().collect();
25 let mut seen_trigrams: HashSet<String> = HashSet::new();
26 let mut trigram_saturated = false;
27 let mut scores = Vec::with_capacity(lines.len());
28
29 for (idx, line) in lines.iter().enumerate() {
30 let trimmed = line.trim();
31
32 let entropy = char_entropy(trimmed);
33 let has_marker = has_structural_marker(trimmed);
34 let rep_ratio = if trigram_saturated {
35 0.0
36 } else {
37 repetition_ratio(trimmed, &seen_trigrams)
38 };
39
40 if !trigram_saturated {
41 register_trigrams(trimmed, &mut seen_trigrams);
42 if seen_trigrams.len() >= MAX_TRIGRAM_SET_SIZE {
43 trigram_saturated = true;
44 }
45 }
46
47 let combined = compute_combined(entropy, has_marker, rep_ratio);
48
49 scores.push(LineScore {
50 line_idx: idx,
51 entropy,
52 has_structural_marker: has_marker,
53 repetition_ratio: rep_ratio,
54 combined,
55 });
56 }
57
58 scores
59}
60
/// Returns the Shannon entropy, in bits, of the ASCII bytes in `line`.
///
/// Bytes with a value of 128 or above (i.e. every byte of a non-ASCII
/// character) are ignored entirely. A line with no ASCII bytes, including the
/// empty line, yields `0.0`.
fn char_entropy(line: &str) -> f32 {
    let mut histogram = [0u32; 128];
    let mut ascii_len = 0u32;
    for byte in line.bytes().filter(u8::is_ascii) {
        histogram[usize::from(byte)] += 1;
        ascii_len += 1;
    }

    if ascii_len == 0 {
        return 0.0;
    }

    let n = ascii_len as f32;
    // Accumulate -p * log2(p) over the byte values that actually occur.
    histogram
        .iter()
        .filter(|&&occurrences| occurrences > 0)
        .fold(0.0f32, |acc, &occurrences| {
            let p = occurrences as f32 / n;
            acc - p * p.log2()
        })
}
85
/// Heuristically decides whether `line` carries structural information.
///
/// Returns `true` when any of the following holds:
/// - the line contains `/` together with either `.` or the text `src`;
/// - the line contains an ASCII digit;
/// - the line contains one of the error/warning keywords listed below;
/// - the line contains at least two words (runs of alphanumerics and `_`)
///   whose length is 6 bytes or more.
fn has_structural_marker(line: &str) -> bool {
    const SEVERITY_KEYWORDS: [&str; 6] =
        ["error", "Error", "ERROR", "warning", "Warning", "WARN"];

    let path_like = line.contains('/') && (line.contains('.') || line.contains("src"));
    if path_like
        || line.chars().any(|c| c.is_ascii_digit())
        || SEVERITY_KEYWORDS.iter().any(|&kw| line.contains(kw))
    {
        return true;
    }

    // A second qualifying word exists exactly when the count would be >= 2.
    line.split(|c: char| !(c.is_alphanumeric() || c == '_'))
        .filter(|word| word.len() >= 6)
        .nth(1)
        .is_some()
}
105
/// Returns the fraction of character trigrams in `line` that are already
/// present in `seen`.
///
/// Trigrams are taken over `char`s, so multi-byte characters count as one
/// position. Lines shorter than 9 characters always yield `0.0`; for longer
/// lines the result lies in `0.0..=1.0`.
fn repetition_ratio(line: &str, seen: &HashSet<String>) -> f32 {
    let chars: Vec<char> = line.chars().collect();
    if chars.len() < 9 {
        return 0.0;
    }

    // At least 9 chars are present, so there are `len - 2 >= 7` trigrams and
    // the division below can never be by zero.
    let total = chars.len() - 2;

    // One buffer is reused for every lookup instead of allocating a new
    // `String` per trigram; three chars need at most 12 bytes of UTF-8.
    let mut trigram = String::with_capacity(12);
    let mut repeated = 0usize;
    for window in chars.windows(3) {
        trigram.clear();
        trigram.extend(window.iter());
        if seen.contains(trigram.as_str()) {
            repeated += 1;
        }
    }

    repeated as f32 / total as f32
}
125
/// Inserts every character trigram of `line` into `seen`.
///
/// Trigrams are taken over `char`s rather than bytes. Lines with fewer than
/// three characters contribute nothing.
fn register_trigrams(line: &str, seen: &mut HashSet<String>) {
    let chars: Vec<char> = line.chars().collect();
    // `windows(3)` yields nothing for slices shorter than three elements.
    seen.extend(
        chars
            .windows(3)
            .map(|window| window.iter().collect::<String>()),
    );
}
137
/// Combines the individual signals into a single non-negative score.
///
/// The score is `entropy`, plus a fixed bonus when `has_marker` is set, minus
/// half of `rep_ratio`, floored at `0.0`.
fn compute_combined(entropy: f32, has_marker: bool, rep_ratio: f32) -> f32 {
    const MARKER_BONUS: f32 = 0.3;
    const REPETITION_WEIGHT: f32 = 0.5;

    let bonus = match has_marker {
        true => MARKER_BONUS,
        false => 0.0,
    };
    let penalty = rep_ratio * REPETITION_WEIGHT;

    f32::max(entropy + bonus - penalty, 0.0)
}
143
#[cfg(test)]
mod tests {
    use super::*;

    // --- char_entropy ---

    #[test]
    fn empty_line_zero_entropy() {
        let entropy = char_entropy("");
        assert_eq!(entropy, 0.0);
    }

    #[test]
    fn uniform_string_low_entropy() {
        // A single repeated character carries no information.
        let e = char_entropy("aaaaaaaaaa");
        assert!(e < 0.01, "uniform string should have ~0 entropy, got {e}");
    }

    #[test]
    fn mixed_string_higher_entropy() {
        let uniform = char_entropy("aaaaaaaaaa");
        let mixed = char_entropy("abcdefghij");
        assert!(mixed > uniform, "mixed > uniform entropy");
    }

    // --- has_structural_marker ---

    #[test]
    fn structural_marker_path() {
        let line = "src/core/config.rs";
        assert!(has_structural_marker(line));
    }

    #[test]
    fn structural_marker_error() {
        let line = "error[E0308]: mismatched types";
        assert!(has_structural_marker(line));
    }

    #[test]
    fn structural_marker_missing() {
        let line = "this is a simple line";
        assert!(!has_structural_marker(line));
    }

    // --- score_lines ---

    #[test]
    fn score_lines_returns_all_lines() {
        let scored = score_lines("line one\nline two\nline three");
        assert_eq!(scored.len(), 3);
    }

    #[test]
    fn repetitive_lines_get_lower_score() {
        // The second line duplicates the first; the third is new content.
        let scored = score_lines(
            "exactly the same line repeated here\nexactly the same line repeated here\nunique content with different words",
        );
        let (repeated, unique) = (&scored[1], &scored[2]);
        assert!(
            unique.combined >= repeated.combined,
            "unique line should score >= repeated: {} vs {}",
            unique.combined,
            repeated.combined
        );
    }
}