1use std::collections::HashMap;
2
3#[derive(Debug, Clone)]
10pub struct CodebookEntry {
11 pub id: String,
12 pub pattern: String,
13 pub frequency: usize,
14 pub idf: f64,
15}
16
17#[derive(Debug, Default)]
18pub struct Codebook {
19 entries: Vec<CodebookEntry>,
20 pattern_to_id: HashMap<String, String>,
21 next_id: usize,
22}
23
24impl Codebook {
25 pub fn new() -> Self {
26 Self::default()
27 }
28
29 pub fn build_from_files(&mut self, files: &[(&str, &str)]) {
34 let total_docs = files.len() as f64;
35 if total_docs < 2.0 {
36 return;
37 }
38
39 let total_lines: usize = files.iter().map(|(_, c)| c.lines().count()).sum();
40 if total_lines > 50_000 {
41 return;
42 }
43
44 let mut doc_freq: HashMap<String, usize> = HashMap::new();
45 let mut term_freq: HashMap<String, usize> = HashMap::new();
46
47 for (_, content) in files {
48 let mut seen_in_doc: std::collections::HashSet<String> =
49 std::collections::HashSet::new();
50 for line in content.lines() {
51 let normalized = normalize_line(line);
52 if normalized.len() < 10 {
53 continue;
54 }
55
56 *term_freq.entry(normalized.clone()).or_insert(0) += 1;
57
58 if seen_in_doc.insert(normalized.clone()) {
59 *doc_freq.entry(normalized).or_insert(0) += 1;
60 }
61 }
62 }
63
64 let mut candidates: Vec<(String, usize, f64)> = doc_freq
66 .into_iter()
67 .filter(|(_, df)| *df >= 3) .map(|(pattern, df)| {
69 let idf = (total_docs / df as f64).ln();
70 let tf = *term_freq.get(&pattern).unwrap_or(&0);
71 (pattern, tf, idf)
72 })
73 .collect();
74
75 candidates.sort_by_key(|x| std::cmp::Reverse(x.1));
77
78 for (pattern, freq, idf) in candidates.into_iter().take(50) {
80 let id = format!("§{}", self.next_id);
81 self.next_id += 1;
82 self.pattern_to_id.insert(pattern.clone(), id.clone());
83 self.entries.push(CodebookEntry {
84 id,
85 pattern,
86 frequency: freq,
87 idf,
88 });
89 }
90 }
91
92 pub fn compress(&self, content: &str) -> (String, Vec<String>) {
95 if self.entries.is_empty() {
96 return (content.to_string(), vec![]);
97 }
98
99 let mut result = Vec::new();
100 let mut refs_used = Vec::new();
101
102 for line in content.lines() {
103 let normalized = normalize_line(line);
104 if let Some(id) = self.pattern_to_id.get(&normalized) {
105 if !refs_used.contains(id) {
106 refs_used.push(id.clone());
107 }
108 result.push(format!("[{id}]"));
109 } else {
110 result.push(line.to_string());
111 }
112 }
113
114 (result.join("\n"), refs_used)
115 }
116
117 pub fn format_legend(&self, refs_used: &[String]) -> String {
119 if refs_used.is_empty() {
120 return String::new();
121 }
122
123 let mut lines = vec!["§CODEBOOK:".to_string()];
124 for entry in &self.entries {
125 if refs_used.contains(&entry.id) {
126 let short = if entry.pattern.len() > 60 {
127 format!("{}...", &entry.pattern[..57])
128 } else {
129 entry.pattern.clone()
130 };
131 lines.push(format!(" {}={}", entry.id, short));
132 }
133 }
134 lines.join("\n")
135 }
136
137 pub fn len(&self) -> usize {
138 self.entries.len()
139 }
140
141 pub fn is_empty(&self) -> bool {
142 self.entries.is_empty()
143 }
144}
145
146pub fn tfidf_cosine_similarity(doc_a: &str, doc_b: &str) -> f64 {
150 tfidf_cosine_similarity_with_corpus(&[doc_a, doc_b], doc_a, doc_b)
151}
152
153pub fn tfidf_cosine_similarity_with_corpus(corpus: &[&str], doc_a: &str, doc_b: &str) -> f64 {
155 let idf = compute_idf(corpus);
156 let tfidf_a = tfidf_vector(doc_a, &idf);
157 let tfidf_b = tfidf_vector(doc_b, &idf);
158
159 let all_terms: std::collections::HashSet<&str> =
160 tfidf_a.keys().chain(tfidf_b.keys()).copied().collect();
161 if all_terms.is_empty() {
162 return 0.0;
163 }
164
165 let mut dot = 0.0;
166 let mut mag_a = 0.0;
167 let mut mag_b = 0.0;
168
169 for term in &all_terms {
170 let a = *tfidf_a.get(term).unwrap_or(&0.0);
171 let b = *tfidf_b.get(term).unwrap_or(&0.0);
172 dot += a * b;
173 mag_a += a * a;
174 mag_b += b * b;
175 }
176
177 let magnitude = (mag_a * mag_b).sqrt();
178 if magnitude < f64::EPSILON {
179 return 0.0;
180 }
181
182 dot / magnitude
183}
184
185pub fn find_semantic_duplicates(
188 files: &[(String, String)],
189 threshold: f64,
190) -> Vec<(String, String, f64)> {
191 let corpus: Vec<&str> = files.iter().map(|(_, c)| c.as_str()).collect();
192 let idf = compute_idf(&corpus);
193 let vectors: Vec<HashMap<&str, f64>> =
194 files.iter().map(|(_, c)| tfidf_vector(c, &idf)).collect();
195
196 let mut duplicates = Vec::new();
197
198 for i in 0..files.len() {
199 for j in (i + 1)..files.len() {
200 let sim = cosine_from_vectors(&vectors[i], &vectors[j]);
201 if sim >= threshold {
202 duplicates.push((files[i].0.clone(), files[j].0.clone(), sim));
203 }
204 }
205 }
206
207 duplicates.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal));
208 duplicates
209}
210
211fn compute_idf<'a>(corpus: &[&'a str]) -> HashMap<&'a str, f64> {
212 let n = corpus.len() as f64;
213 if n == 0.0 {
214 return HashMap::new();
215 }
216
217 let mut doc_freq: HashMap<&str, usize> = HashMap::new();
218 for doc in corpus {
219 let mut seen: std::collections::HashSet<&str> = std::collections::HashSet::new();
220 for word in doc.split_whitespace() {
221 if seen.insert(word) {
222 *doc_freq.entry(word).or_insert(0) += 1;
223 }
224 }
225 }
226
227 doc_freq
228 .into_iter()
229 .map(|(term, df)| (term, (n / (1.0 + df as f64)).ln() + 1.0))
230 .collect()
231}
232
233fn tfidf_vector<'a>(doc: &'a str, idf: &HashMap<&str, f64>) -> HashMap<&'a str, f64> {
234 let words: Vec<&str> = doc.split_whitespace().collect();
235 let total = words.len() as f64;
236 if total == 0.0 {
237 return HashMap::new();
238 }
239
240 let mut tf: HashMap<&str, f64> = HashMap::new();
241 for word in &words {
242 *tf.entry(word).or_insert(0.0) += 1.0;
243 }
244 for val in tf.values_mut() {
245 *val /= total;
246 }
247
248 tf.into_iter()
249 .map(|(term, tf_val)| {
250 let idf_val = idf.get(term).copied().unwrap_or(1.0);
251 (term, tf_val * idf_val)
252 })
253 .collect()
254}
255
256fn cosine_from_vectors(a: &HashMap<&str, f64>, b: &HashMap<&str, f64>) -> f64 {
257 let all_terms: std::collections::HashSet<&&str> = a.keys().chain(b.keys()).collect();
258 if all_terms.is_empty() {
259 return 0.0;
260 }
261
262 let mut dot = 0.0;
263 let mut mag_a = 0.0;
264 let mut mag_b = 0.0;
265
266 for term in &all_terms {
267 let va = a.get(*term).copied().unwrap_or(0.0);
268 let vb = b.get(*term).copied().unwrap_or(0.0);
269 dot += va * vb;
270 mag_a += va * va;
271 mag_b += vb * vb;
272 }
273
274 let magnitude = (mag_a * mag_b).sqrt();
275 if magnitude < f64::EPSILON {
276 return 0.0;
277 }
278
279 dot / magnitude
280}
281
282fn normalize_line(line: &str) -> String {
283 line.split_whitespace().collect::<Vec<&str>>().join(" ")
284}
285
286#[cfg(test)]
287mod tests {
288 use super::*;
289
290 #[test]
291 fn codebook_identifies_common_patterns() {
292 let files: Vec<(&str, &str)> = vec![
293 (
294 "a.rs",
295 "use std::io;\nuse std::collections::HashMap;\nfn main() {}\n",
296 ),
297 (
298 "b.rs",
299 "use std::io;\nuse std::collections::HashMap;\nfn helper() {}\n",
300 ),
301 (
302 "c.rs",
303 "use std::io;\nuse std::collections::HashMap;\nfn other() {}\n",
304 ),
305 ("d.rs", "use std::io;\nfn unique() {}\n"),
306 ];
307
308 let mut cb = Codebook::new();
309 cb.build_from_files(&files);
310 assert!(!cb.is_empty(), "should find common patterns");
311 }
312
313 #[test]
314 fn cosine_identical_is_one() {
315 let sim = tfidf_cosine_similarity("hello world foo", "hello world foo");
316 assert!((sim - 1.0).abs() < 0.01);
317 }
318
319 #[test]
320 fn cosine_disjoint_is_zero() {
321 let sim = tfidf_cosine_similarity("alpha beta gamma", "delta epsilon zeta");
322 assert!(sim < 0.01);
323 }
324
325 #[test]
326 fn cosine_partial_overlap() {
327 let sim = tfidf_cosine_similarity("hello world foo bar", "hello world baz qux");
328 assert!(sim > 0.0 && sim < 1.0);
329 }
330
331 #[test]
332 fn find_duplicates_detects_similar_files() {
333 let files = vec![
334 (
335 "a.rs".to_string(),
336 "fn main() { let x = 1; let y = 2; println!(x + y); }".to_string(),
337 ),
338 (
339 "b.rs".to_string(),
340 "fn main() { let x = 1; let y = 2; println!(x + y); }".to_string(),
341 ),
342 (
343 "c.rs".to_string(),
344 "completely different content here with no overlap at all".to_string(),
345 ),
346 ];
347
348 let dups = find_semantic_duplicates(&files, 0.8);
349 assert_eq!(dups.len(), 1);
350 assert!(dups[0].2 > 0.99);
351 }
352}