// lean_ctx/core/codebook.rs
use std::collections::HashMap;

// Cross-file semantic deduplication via TF-IDF codebook.
//
// Identifies patterns that appear frequently across files (high TF, low IDF)
// and creates short references for them. This avoids sending the same
// boilerplate to the LLM multiple times across different file reads.

/// A single deduplicated pattern tracked by a `Codebook`.
#[derive(Debug, Clone)]
pub struct CodebookEntry {
    /// Short reference token (e.g. "§0") substituted for the pattern.
    pub id: String,
    /// The whitespace-normalized line this entry stands for.
    pub pattern: String,
    /// Total occurrences of the pattern across all files (term frequency).
    pub frequency: usize,
    /// Inverse document frequency: ln(total_docs / docs_containing_pattern).
    pub idf: f64,
}

/// TF-IDF codebook that maps boilerplate lines to short `§N` references.
#[derive(Debug, Default)]
pub struct Codebook {
    /// Selected patterns, most frequent first.
    entries: Vec<CodebookEntry>,
    /// Normalized pattern text -> assigned reference id.
    pattern_to_id: HashMap<String, String>,
    /// Counter used to mint the next `§N` id (starts at 0 via `Default`).
    next_id: usize,
}

24impl Codebook {
25    pub fn new() -> Self {
26        Self::default()
27    }
28
29    /// Build codebook from multiple file contents.
30    /// Identifies lines that appear in 3+ files and creates short references.
31    pub fn build_from_files(&mut self, files: &[(String, String)]) {
32        let total_docs = files.len() as f64;
33        if total_docs < 2.0 {
34            return;
35        }
36
37        // Count document frequency for each normalized line
38        let mut doc_freq: HashMap<String, usize> = HashMap::new();
39        let mut term_freq: HashMap<String, usize> = HashMap::new();
40
41        for (_, content) in files {
42            let mut seen_in_doc: std::collections::HashSet<String> =
43                std::collections::HashSet::new();
44            for line in content.lines() {
45                let normalized = normalize_line(line);
46                if normalized.len() < 10 {
47                    continue;
48                }
49
50                *term_freq.entry(normalized.clone()).or_insert(0) += 1;
51
52                if seen_in_doc.insert(normalized.clone()) {
53                    *doc_freq.entry(normalized).or_insert(0) += 1;
54                }
55            }
56        }
57
58        // Select patterns with high DF (appear in many files) — these are boilerplate
59        let mut candidates: Vec<(String, usize, f64)> = doc_freq
60            .into_iter()
61            .filter(|(_, df)| *df >= 3) // appears in 3+ files
62            .map(|(pattern, df)| {
63                let idf = (total_docs / df as f64).ln();
64                let tf = *term_freq.get(&pattern).unwrap_or(&0);
65                (pattern, tf, idf)
66            })
67            .collect();
68
69        // Sort by frequency descending (most common boilerplate first)
70        candidates.sort_by(|a, b| b.1.cmp(&a.1));
71
72        // Take top 50 patterns to keep codebook compact
73        for (pattern, freq, idf) in candidates.into_iter().take(50) {
74            let id = format!("§{}", self.next_id);
75            self.next_id += 1;
76            self.pattern_to_id.insert(pattern.clone(), id.clone());
77            self.entries.push(CodebookEntry {
78                id,
79                pattern,
80                frequency: freq,
81                idf,
82            });
83        }
84    }
85
86    /// Apply codebook to content: replace known patterns with short references.
87    /// Returns (compressed content, references used).
88    pub fn compress(&self, content: &str) -> (String, Vec<String>) {
89        if self.entries.is_empty() {
90            return (content.to_string(), vec![]);
91        }
92
93        let mut result = Vec::new();
94        let mut refs_used = Vec::new();
95
96        for line in content.lines() {
97            let normalized = normalize_line(line);
98            if let Some(id) = self.pattern_to_id.get(&normalized) {
99                if !refs_used.contains(id) {
100                    refs_used.push(id.clone());
101                }
102                result.push(format!("[{id}]"));
103            } else {
104                result.push(line.to_string());
105            }
106        }
107
108        (result.join("\n"), refs_used)
109    }
110
111    /// Format the codebook legend for lines that were referenced.
112    pub fn format_legend(&self, refs_used: &[String]) -> String {
113        if refs_used.is_empty() {
114            return String::new();
115        }
116
117        let mut lines = vec!["§CODEBOOK:".to_string()];
118        for entry in &self.entries {
119            if refs_used.contains(&entry.id) {
120                let short = if entry.pattern.len() > 60 {
121                    format!("{}...", &entry.pattern[..57])
122                } else {
123                    entry.pattern.clone()
124                };
125                lines.push(format!("  {}={}", entry.id, short));
126            }
127        }
128        lines.join("\n")
129    }
130
131    pub fn len(&self) -> usize {
132        self.entries.len()
133    }
134
135    pub fn is_empty(&self) -> bool {
136        self.entries.is_empty()
137    }
138}
139
140/// Cosine similarity between two documents using TF-IDF vectors.
141/// Used for embedding-space deduplication approximation.
142pub fn tfidf_cosine_similarity(doc_a: &str, doc_b: &str) -> f64 {
143    let tf_a = term_frequencies(doc_a);
144    let tf_b = term_frequencies(doc_b);
145
146    let all_terms: std::collections::HashSet<&str> =
147        tf_a.keys().chain(tf_b.keys()).copied().collect();
148    if all_terms.is_empty() {
149        return 0.0;
150    }
151
152    let mut dot = 0.0;
153    let mut mag_a = 0.0;
154    let mut mag_b = 0.0;
155
156    for term in &all_terms {
157        let a = *tf_a.get(term).unwrap_or(&0.0);
158        let b = *tf_b.get(term).unwrap_or(&0.0);
159        dot += a * b;
160        mag_a += a * a;
161        mag_b += b * b;
162    }
163
164    let magnitude = (mag_a * mag_b).sqrt();
165    if magnitude < f64::EPSILON {
166        return 0.0;
167    }
168
169    dot / magnitude
170}
171
172/// Identify semantically duplicate blocks across files.
173/// Returns pairs of (file_a, file_b, similarity) where similarity > threshold.
174pub fn find_semantic_duplicates(
175    files: &[(String, String)],
176    threshold: f64,
177) -> Vec<(String, String, f64)> {
178    let mut duplicates = Vec::new();
179
180    for i in 0..files.len() {
181        for j in (i + 1)..files.len() {
182            let sim = tfidf_cosine_similarity(&files[i].1, &files[j].1);
183            if sim >= threshold {
184                duplicates.push((files[i].0.clone(), files[j].0.clone(), sim));
185            }
186        }
187    }
188
189    duplicates.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal));
190    duplicates
191}
192
/// Relative term frequencies of whitespace-separated words in `text`.
///
/// Each value is `count / total_words`; an empty or whitespace-only input
/// yields an empty map.
fn term_frequencies(text: &str) -> HashMap<&str, f64> {
    let mut counts: HashMap<&str, f64> = HashMap::new();
    let mut total: f64 = 0.0;

    // Single pass: tally raw counts and the word total together.
    for word in text.split_whitespace() {
        *counts.entry(word).or_insert(0.0) += 1.0;
        total += 1.0;
    }

    // Normalize counts into frequencies (skip when there were no words).
    if total > 0.0 {
        for value in counts.values_mut() {
            *value /= total;
        }
    }

    counts
}

/// Collapse runs of whitespace in `line` to single spaces and trim both ends.
fn normalize_line(line: &str) -> String {
    let mut normalized = String::with_capacity(line.len());
    for token in line.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(token);
    }
    normalized
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Turn borrowed `(name, content)` pairs into the owned tuples the API takes.
    fn owned_files(pairs: &[(&str, &str)]) -> Vec<(String, String)> {
        pairs
            .iter()
            .map(|&(name, body)| (name.to_string(), body.to_string()))
            .collect()
    }

    #[test]
    fn codebook_identifies_common_patterns() {
        // Two lines shared by three files; "use std::io;" is too short to qualify.
        let files = owned_files(&[
            ("a.rs", "use std::io;\nuse std::collections::HashMap;\nfn main() {}\n"),
            ("b.rs", "use std::io;\nuse std::collections::HashMap;\nfn helper() {}\n"),
            ("c.rs", "use std::io;\nuse std::collections::HashMap;\nfn other() {}\n"),
            ("d.rs", "use std::io;\nfn unique() {}\n"),
        ]);

        let mut cb = Codebook::new();
        cb.build_from_files(&files);
        assert!(!cb.is_empty(), "should find common patterns");
    }

    #[test]
    fn cosine_identical_is_one() {
        let text = "hello world foo";
        let sim = tfidf_cosine_similarity(text, text);
        assert!((sim - 1.0).abs() < 0.01);
    }

    #[test]
    fn cosine_disjoint_is_zero() {
        assert!(tfidf_cosine_similarity("alpha beta gamma", "delta epsilon zeta") < 0.01);
    }

    #[test]
    fn cosine_partial_overlap() {
        let sim = tfidf_cosine_similarity("hello world foo bar", "hello world baz qux");
        assert!(sim > 0.0);
        assert!(sim < 1.0);
    }

    #[test]
    fn find_duplicates_detects_similar_files() {
        let same = "fn main() { let x = 1; let y = 2; println!(x + y); }";
        let files = owned_files(&[
            ("a.rs", same),
            ("b.rs", same),
            ("c.rs", "completely different content here with no overlap at all"),
        ]);

        let dups = find_semantic_duplicates(&files, 0.8);
        assert_eq!(dups.len(), 1);
        assert!(dups[0].2 > 0.99);
    }
}