lean_ctx/core/terse/
scoring.rs1use std::collections::HashSet;
9
/// Per-line metrics produced by [`score_lines`].
///
/// `PartialEq` is derived so that scores can be compared directly, e.g. in
/// `assert_eq!` or when checking that two scoring runs agree.
#[derive(Debug, Clone, PartialEq)]
pub struct LineScore {
    /// Zero-based index of the line within the scored text.
    pub line_idx: usize,
    /// Entropy measured on the trimmed line.
    pub entropy: f32,
    /// Whether the trimmed line was flagged as carrying a structural marker.
    pub has_structural_marker: bool,
    /// Fraction of the line's trigrams that already occurred in earlier lines.
    pub repetition_ratio: f32,
    /// Aggregate score derived from the three signals above.
    pub combined: f32,
}
19
20pub fn score_lines(text: &str) -> Vec<LineScore> {
22 let lines: Vec<&str> = text.lines().collect();
23 let mut seen_trigrams: HashSet<String> = HashSet::new();
24 let mut scores = Vec::with_capacity(lines.len());
25
26 for (idx, line) in lines.iter().enumerate() {
27 let trimmed = line.trim();
28
29 let entropy = char_entropy(trimmed);
30 let has_marker = has_structural_marker(trimmed);
31 let rep_ratio = repetition_ratio(trimmed, &seen_trigrams);
32
33 register_trigrams(trimmed, &mut seen_trigrams);
34
35 let combined = compute_combined(entropy, has_marker, rep_ratio);
36
37 scores.push(LineScore {
38 line_idx: idx,
39 entropy,
40 has_structural_marker: has_marker,
41 repetition_ratio: rep_ratio,
42 combined,
43 });
44 }
45
46 scores
47}
48
/// Shannon entropy, in bits per character, of the character distribution of `line`.
///
/// Returns `0.0` for an empty line. Frequencies are counted per Unicode scalar
/// value (`char`), matching the trigram helpers in this module, so non-ASCII
/// text contributes to the result instead of being silently dropped. For
/// ASCII-only input the result is the same as a plain byte-frequency entropy.
fn char_entropy(line: &str) -> f32 {
    // Fast path table for ASCII; everything else goes into an ordered map so the
    // floating-point summation order (and thus the result) is deterministic.
    let mut ascii_freq = [0u32; 128];
    let mut wide_freq: std::collections::BTreeMap<char, u32> = std::collections::BTreeMap::new();
    let mut total = 0u32;

    for ch in line.chars() {
        if ch.is_ascii() {
            ascii_freq[ch as usize] += 1;
        } else {
            *wide_freq.entry(ch).or_insert(0) += 1;
        }
        total += 1;
    }
    if total == 0 {
        return 0.0;
    }

    let total = total as f32;
    ascii_freq
        .iter()
        .chain(wide_freq.values())
        .filter(|&&count| count > 0)
        .fold(0.0f32, |ent, &count| {
            let p = count as f32 / total;
            ent - p * p.log2()
        })
}
73
/// Heuristically decides whether `line` carries a structural marker.
///
/// A line is flagged when any of the following holds:
/// - it contains `/` together with either `.` or the substring `src`;
/// - it contains an ASCII digit;
/// - it contains one of the severity tokens `error`, `Error`, `ERROR`,
///   `warning`, `Warning`, `WARN`;
/// - it contains at least two identifier-like words (alphanumerics and `_`)
///   of six or more characters.
///
/// Word length is measured in characters rather than UTF-8 bytes, so short
/// non-ASCII words are not mistaken for long identifiers.
fn has_structural_marker(line: &str) -> bool {
    const SEVERITY_TOKENS: [&str; 6] = ["error", "Error", "ERROR", "warning", "Warning", "WARN"];
    const MIN_IDENT_CHARS: usize = 6;

    let looks_like_path = line.contains('/') && (line.contains('.') || line.contains("src"));
    if looks_like_path
        || line.chars().any(|c| c.is_ascii_digit())
        || SEVERITY_TOKENS.iter().any(|&token| line.contains(token))
    {
        return true;
    }

    // Two long identifier-like words are enough; stop scanning at the second.
    line.split(|c: char| !c.is_alphanumeric() && c != '_')
        .filter(|word| word.chars().count() >= MIN_IDENT_CHARS)
        .nth(1)
        .is_some()
}
93
/// Fraction of the character trigrams of `line` that are already present in `seen`.
///
/// Lines shorter than nine characters always yield `0.0`. Otherwise every
/// sliding window of three characters is looked up in `seen`, and the number
/// of hits is divided by the number of windows.
fn repetition_ratio(line: &str, seen: &HashSet<String>) -> f32 {
    const MIN_CHARS: usize = 9;

    let symbols: Vec<char> = line.chars().collect();
    if symbols.len() < MIN_CHARS {
        return 0.0;
    }

    let window_count = symbols.len() - 2;
    let hits = symbols
        .windows(3)
        .filter(|tri| seen.contains(&tri.iter().collect::<String>()))
        .count();

    hits as f32 / window_count as f32
}
113
/// Inserts every character trigram of `line` into `seen`.
///
/// Lines with fewer than three characters have no trigrams and leave `seen`
/// untouched.
fn register_trigrams(line: &str, seen: &mut HashSet<String>) {
    let symbols: Vec<char> = line.chars().collect();
    // `windows(3)` yields nothing for slices shorter than three elements.
    seen.extend(
        symbols
            .windows(3)
            .map(|tri| tri.iter().collect::<String>()),
    );
}
125
/// Folds the three per-line signals into a single non-negative score.
///
/// The score is the entropy, plus a fixed bonus when a structural marker is
/// present, minus a penalty proportional to the repetition ratio, clamped so
/// it never drops below zero.
fn compute_combined(entropy: f32, has_marker: bool, rep_ratio: f32) -> f32 {
    const MARKER_BONUS: f32 = 0.3;
    const REPETITION_WEIGHT: f32 = 0.5;

    let bonus = match has_marker {
        true => MARKER_BONUS,
        false => 0.0,
    };
    let penalty = rep_ratio * REPETITION_WEIGHT;

    f32::max(entropy + bonus - penalty, 0.0)
}
131
#[cfg(test)]
mod tests {
    use super::*;

    // --- char_entropy ---

    #[test]
    fn empty_line_zero_entropy() {
        let entropy = char_entropy("");
        assert_eq!(entropy, 0.0);
    }

    #[test]
    fn uniform_string_low_entropy() {
        // A single repeated symbol carries no information.
        let e = char_entropy("aaaaaaaaaa");
        assert!(e < 0.01, "uniform string should have ~0 entropy, got {e}");
    }

    #[test]
    fn mixed_string_higher_entropy() {
        let (low, high) = (char_entropy("aaaaaaaaaa"), char_entropy("abcdefghij"));
        assert!(high > low, "mixed > uniform entropy");
    }

    // --- has_structural_marker ---

    #[test]
    fn structural_marker_path() {
        let line = "src/core/config.rs";
        assert!(has_structural_marker(line));
    }

    #[test]
    fn structural_marker_error() {
        let line = "error[E0308]: mismatched types";
        assert!(has_structural_marker(line));
    }

    #[test]
    fn structural_marker_missing() {
        let line = "this is a simple line";
        assert!(!has_structural_marker(line));
    }

    // --- score_lines ---

    #[test]
    fn score_lines_returns_all_lines() {
        let scored = score_lines("line one\nline two\nline three");
        assert_eq!(scored.len(), 3);
    }

    #[test]
    fn repetitive_lines_get_lower_score() {
        // The second line duplicates the first; the third shares little with either.
        let text = "exactly the same line repeated here\nexactly the same line repeated here\nunique content with different words";
        let scores = score_lines(text);
        let (repeated, unique) = (&scores[1], &scores[2]);
        assert!(
            unique.combined >= repeated.combined,
            "unique line should score >= repeated: {} vs {}",
            unique.combined,
            repeated.combined
        );
    }
}