// lean_ctx/core/codebook.rs
use std::collections::HashMap;
2
/// A single reusable pattern discovered across the input files.
#[derive(Debug, Clone)]
pub struct CodebookEntry {
    // Short reference id (e.g. "§0") substituted into compressed output.
    pub id: String,
    // The normalized line of text this entry stands for.
    pub pattern: String,
    // Total occurrences of the pattern across all files (term frequency).
    pub frequency: usize,
    // Inverse document frequency: ln(total_docs / docs_containing_pattern).
    pub idf: f64,
}
16
/// Dictionary of frequently repeated lines, used to compress file contents
/// by replacing each repeated line with a short "§N" reference.
#[derive(Debug, Default)]
pub struct Codebook {
    // Entries in assignment order (id §0, §1, ...).
    entries: Vec<CodebookEntry>,
    // Reverse lookup: normalized line -> assigned id.
    pattern_to_id: HashMap<String, String>,
    // Counter for the next "§N" id to hand out.
    next_id: usize,
}
23
24impl Codebook {
25 pub fn new() -> Self {
26 Self::default()
27 }
28
29 pub fn build_from_files(&mut self, files: &[(String, String)]) {
32 let total_docs = files.len() as f64;
33 if total_docs < 2.0 {
34 return;
35 }
36
37 let mut doc_freq: HashMap<String, usize> = HashMap::new();
39 let mut term_freq: HashMap<String, usize> = HashMap::new();
40
41 for (_, content) in files {
42 let mut seen_in_doc: std::collections::HashSet<String> =
43 std::collections::HashSet::new();
44 for line in content.lines() {
45 let normalized = normalize_line(line);
46 if normalized.len() < 10 {
47 continue;
48 }
49
50 *term_freq.entry(normalized.clone()).or_insert(0) += 1;
51
52 if seen_in_doc.insert(normalized.clone()) {
53 *doc_freq.entry(normalized).or_insert(0) += 1;
54 }
55 }
56 }
57
58 let mut candidates: Vec<(String, usize, f64)> = doc_freq
60 .into_iter()
61 .filter(|(_, df)| *df >= 3) .map(|(pattern, df)| {
63 let idf = (total_docs / df as f64).ln();
64 let tf = *term_freq.get(&pattern).unwrap_or(&0);
65 (pattern, tf, idf)
66 })
67 .collect();
68
69 candidates.sort_by(|a, b| b.1.cmp(&a.1));
71
72 for (pattern, freq, idf) in candidates.into_iter().take(50) {
74 let id = format!("§{}", self.next_id);
75 self.next_id += 1;
76 self.pattern_to_id.insert(pattern.clone(), id.clone());
77 self.entries.push(CodebookEntry {
78 id,
79 pattern,
80 frequency: freq,
81 idf,
82 });
83 }
84 }
85
86 pub fn compress(&self, content: &str) -> (String, Vec<String>) {
89 if self.entries.is_empty() {
90 return (content.to_string(), vec![]);
91 }
92
93 let mut result = Vec::new();
94 let mut refs_used = Vec::new();
95
96 for line in content.lines() {
97 let normalized = normalize_line(line);
98 if let Some(id) = self.pattern_to_id.get(&normalized) {
99 if !refs_used.contains(id) {
100 refs_used.push(id.clone());
101 }
102 result.push(format!("[{id}]"));
103 } else {
104 result.push(line.to_string());
105 }
106 }
107
108 (result.join("\n"), refs_used)
109 }
110
111 pub fn format_legend(&self, refs_used: &[String]) -> String {
113 if refs_used.is_empty() {
114 return String::new();
115 }
116
117 let mut lines = vec!["§CODEBOOK:".to_string()];
118 for entry in &self.entries {
119 if refs_used.contains(&entry.id) {
120 let short = if entry.pattern.len() > 60 {
121 format!("{}...", &entry.pattern[..57])
122 } else {
123 entry.pattern.clone()
124 };
125 lines.push(format!(" {}={}", entry.id, short));
126 }
127 }
128 lines.join("\n")
129 }
130
131 pub fn len(&self) -> usize {
132 self.entries.len()
133 }
134
135 pub fn is_empty(&self) -> bool {
136 self.entries.is_empty()
137 }
138}
139
140pub fn tfidf_cosine_similarity(doc_a: &str, doc_b: &str) -> f64 {
143 let tf_a = term_frequencies(doc_a);
144 let tf_b = term_frequencies(doc_b);
145
146 let all_terms: std::collections::HashSet<&str> =
147 tf_a.keys().chain(tf_b.keys()).copied().collect();
148 if all_terms.is_empty() {
149 return 0.0;
150 }
151
152 let mut dot = 0.0;
153 let mut mag_a = 0.0;
154 let mut mag_b = 0.0;
155
156 for term in &all_terms {
157 let a = *tf_a.get(term).unwrap_or(&0.0);
158 let b = *tf_b.get(term).unwrap_or(&0.0);
159 dot += a * b;
160 mag_a += a * a;
161 mag_b += b * b;
162 }
163
164 let magnitude = (mag_a * mag_b).sqrt();
165 if magnitude < f64::EPSILON {
166 return 0.0;
167 }
168
169 dot / magnitude
170}
171
172pub fn find_semantic_duplicates(
175 files: &[(String, String)],
176 threshold: f64,
177) -> Vec<(String, String, f64)> {
178 let mut duplicates = Vec::new();
179
180 for i in 0..files.len() {
181 for j in (i + 1)..files.len() {
182 let sim = tfidf_cosine_similarity(&files[i].1, &files[j].1);
183 if sim >= threshold {
184 duplicates.push((files[i].0.clone(), files[j].0.clone(), sim));
185 }
186 }
187 }
188
189 duplicates.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal));
190 duplicates
191}
192
/// Computes relative term frequencies for whitespace-separated words.
///
/// Each word maps to `count / total_words`; an empty or all-whitespace
/// input yields an empty map.
fn term_frequencies(text: &str) -> HashMap<&str, f64> {
    let tokens: Vec<&str> = text.split_whitespace().collect();
    if tokens.is_empty() {
        return HashMap::new();
    }
    let denom = tokens.len() as f64;

    let mut counts: HashMap<&str, f64> = HashMap::with_capacity(tokens.len());
    for token in tokens {
        *counts.entry(token).or_insert(0.0) += 1.0;
    }

    // Normalize raw counts into frequencies in place.
    counts.values_mut().for_each(|c| *c /= denom);
    counts
}
211
/// Collapses all runs of whitespace in `line` to single spaces and trims
/// leading/trailing whitespace, so formatting differences don't prevent
/// two lines from matching the same pattern.
fn normalize_line(line: &str) -> String {
    let mut out = String::with_capacity(line.len());
    for (i, token) in line.split_whitespace().enumerate() {
        if i > 0 {
            out.push(' ');
        }
        out.push_str(token);
    }
    out
}
215
#[cfg(test)]
mod tests {
    use super::*;

    // The two `use` lines appear in 3+ of the 4 files and are long enough
    // (>= 10 chars normalized) to become codebook entries.
    #[test]
    fn codebook_identifies_common_patterns() {
        let files = vec![
            (
                "a.rs".to_string(),
                "use std::io;\nuse std::collections::HashMap;\nfn main() {}\n".to_string(),
            ),
            (
                "b.rs".to_string(),
                "use std::io;\nuse std::collections::HashMap;\nfn helper() {}\n".to_string(),
            ),
            (
                "c.rs".to_string(),
                "use std::io;\nuse std::collections::HashMap;\nfn other() {}\n".to_string(),
            ),
            (
                "d.rs".to_string(),
                "use std::io;\nfn unique() {}\n".to_string(),
            ),
        ];

        let mut cb = Codebook::new();
        cb.build_from_files(&files);
        assert!(!cb.is_empty(), "should find common patterns");
    }

    // A document compared against itself has similarity 1 (up to float error).
    #[test]
    fn cosine_identical_is_one() {
        let sim = tfidf_cosine_similarity("hello world foo", "hello world foo");
        assert!((sim - 1.0).abs() < 0.01);
    }

    // Documents with no shared words have zero similarity.
    #[test]
    fn cosine_disjoint_is_zero() {
        let sim = tfidf_cosine_similarity("alpha beta gamma", "delta epsilon zeta");
        assert!(sim < 0.01);
    }

    // Two of four words shared: similarity is strictly between 0 and 1.
    #[test]
    fn cosine_partial_overlap() {
        let sim = tfidf_cosine_similarity("hello world foo bar", "hello world baz qux");
        assert!(sim > 0.0 && sim < 1.0);
    }

    // a.rs and b.rs are identical, c.rs is unrelated: exactly one pair
    // should clear the 0.8 threshold, with near-perfect similarity.
    #[test]
    fn find_duplicates_detects_similar_files() {
        let files = vec![
            (
                "a.rs".to_string(),
                "fn main() { let x = 1; let y = 2; println!(x + y); }".to_string(),
            ),
            (
                "b.rs".to_string(),
                "fn main() { let x = 1; let y = 2; println!(x + y); }".to_string(),
            ),
            (
                "c.rs".to_string(),
                "completely different content here with no overlap at all".to_string(),
            ),
        ];

        let dups = find_semantic_duplicates(&files, 0.8);
        assert_eq!(dups.len(), 1);
        assert!(dups[0].2 > 0.99);
    }
}
285}