lean_ctx/tools/
ctx_dedup.rs

1use std::collections::{HashMap, HashSet};
2
3use crate::core::cache::{SessionCache, SharedBlock};
4use crate::core::codebook;
5use crate::core::tokens::count_tokens;
6
7pub fn handle(cache: &SessionCache) -> String {
8    analyze(cache)
9}
10
11pub fn handle_action(cache: &mut SessionCache, action: &str) -> String {
12    match action {
13        "apply" => apply_dedup(cache),
14        _ => analyze(cache),
15    }
16}
17
18fn apply_dedup(cache: &mut SessionCache) -> String {
19    let entries = cache.get_all_entries();
20    if entries.len() < 2 {
21        return "Need at least 2 cached files for cross-file dedup.".to_string();
22    }
23
24    let mut block_occurrences: HashMap<String, Vec<(String, usize)>> = HashMap::new();
25    for (path, entry) in &entries {
26        let Some(full_content) = entry.content() else {
27            continue;
28        };
29        let lines: Vec<&str> = full_content.lines().collect();
30        for (idx, chunk) in lines.chunks(5).enumerate() {
31            if chunk.len() == 5 {
32                let block = chunk.join("\n");
33                let trimmed = block.trim().to_string();
34                if !trimmed.is_empty() && count_tokens(&trimmed) > 10 {
35                    block_occurrences
36                        .entry(trimmed)
37                        .or_default()
38                        .push(((*path).clone(), idx * 5 + 1));
39                }
40            }
41        }
42    }
43
44    let mut shared = Vec::new();
45    for (content, occurrences) in &block_occurrences {
46        let unique_files: HashSet<&str> = occurrences.iter().map(|(p, _)| p.as_str()).collect();
47        if unique_files.len() >= 2 {
48            let (canonical_path, start_line) = &occurrences[0];
49            let ref_label = cache
50                .file_ref_map()
51                .get(canonical_path)
52                .cloned()
53                .unwrap_or_else(|| "F?".to_string());
54            shared.push(SharedBlock {
55                canonical_path: canonical_path.clone(),
56                canonical_ref: ref_label,
57                start_line: *start_line,
58                end_line: start_line + 4,
59                content: content.clone(),
60            });
61        }
62    }
63
64    let count = shared.len();
65    let savings: usize = shared
66        .iter()
67        .map(|b| {
68            let occurrences = block_occurrences.get(&b.content).map_or(0, |o| {
69                let unique: HashSet<&str> = o.iter().map(|(p, _)| p.as_str()).collect();
70                unique.len() - 1
71            });
72            count_tokens(&b.content) * occurrences
73        })
74        .sum();
75
76    cache.set_shared_blocks(shared);
77
78    format!(
79        "Applied cross-file dedup: {count} shared blocks registered (~{savings} tokens saveable)"
80    )
81}
82
83fn analyze(cache: &SessionCache) -> String {
84    let entries = cache.get_all_entries();
85    if entries.len() < 2 {
86        return "Need at least 2 cached files for cross-file deduplication analysis.".to_string();
87    }
88
89    let mut import_patterns: HashMap<String, Vec<String>> = HashMap::new();
90    let mut boilerplate_blocks: HashMap<String, Vec<String>> = HashMap::new();
91
92    for (path, entry) in &entries {
93        let Some(full_content) = entry.content() else {
94            continue;
95        };
96        let lines: Vec<&str> = full_content.lines().collect();
97
98        let imports: Vec<&str> = lines
99            .iter()
100            .copied()
101            .filter(|l| {
102                let t = l.trim();
103                t.starts_with("import ")
104                    || t.starts_with("use ")
105                    || t.starts_with("from ")
106                    || t.starts_with("require(")
107                    || t.starts_with("#include")
108            })
109            .collect();
110
111        for imp in &imports {
112            let key = imp.trim().to_string();
113            import_patterns
114                .entry(key)
115                .or_default()
116                .push((*path).clone());
117        }
118
119        for chunk in lines.chunks(5) {
120            if chunk.len() == 5 {
121                let block = chunk.join("\n");
122                let block_trimmed = block.trim().to_string();
123                if !block_trimmed.is_empty() && count_tokens(&block_trimmed) > 10 {
124                    boilerplate_blocks
125                        .entry(block_trimmed)
126                        .or_default()
127                        .push((*path).clone());
128                }
129            }
130        }
131    }
132
133    let shared_imports: Vec<_> = import_patterns
134        .iter()
135        .filter(|(_, files)| files.len() >= 2)
136        .collect();
137
138    let shared_blocks: Vec<_> = boilerplate_blocks
139        .iter()
140        .filter(|(_, files)| {
141            let unique: std::collections::HashSet<_> = files.iter().collect();
142            unique.len() >= 2
143        })
144        .collect();
145
146    let mut result = Vec::new();
147    result.push(format!(
148        "Cross-file deduplication analysis ({} cached files):",
149        entries.len()
150    ));
151
152    if !shared_imports.is_empty() {
153        let total_import_tokens: usize = shared_imports
154            .iter()
155            .map(|(imp, files)| count_tokens(imp) * (files.len() - 1))
156            .sum();
157
158        result.push(format!(
159            "\nShared imports ({}, ~{total_import_tokens} redundant tokens):",
160            shared_imports.len()
161        ));
162        for (imp, files) in shared_imports.iter().take(10) {
163            let short_files: Vec<String> = files
164                .iter()
165                .map(|f| crate::core::protocol::shorten_path(f))
166                .collect();
167            result.push(format!("  {imp}"));
168            result.push(format!("    in: {}", short_files.join(", ")));
169        }
170        if shared_imports.len() > 10 {
171            result.push(format!("  ... +{} more", shared_imports.len() - 10));
172        }
173    }
174
175    if !shared_blocks.is_empty() {
176        let total_block_tokens: usize = shared_blocks
177            .iter()
178            .map(|(block, files)| {
179                let unique: std::collections::HashSet<_> = files.iter().collect();
180                count_tokens(block) * (unique.len() - 1)
181            })
182            .sum();
183
184        result.push(format!(
185            "\nShared code blocks ({}, ~{total_block_tokens} redundant tokens):",
186            shared_blocks.len()
187        ));
188        for (block, files) in shared_blocks.iter().take(5) {
189            let unique: std::collections::HashSet<_> = files.iter().collect();
190            let preview = block.lines().next().unwrap_or("...");
191            result.push(format!("  \"{preview}...\" (in {} files)", unique.len()));
192        }
193    }
194
195    // TF-IDF cosine similarity analysis for semantic duplicates
196    let file_pairs: Vec<(String, String)> = entries
197        .iter()
198        .filter_map(|(path, entry)| Some(((*path).clone(), entry.content()?)))
199        .collect();
200    let semantic_dups = codebook::find_semantic_duplicates(&file_pairs, 0.75);
201    if !semantic_dups.is_empty() {
202        result.push(format!(
203            "\nSemantic duplicates (TF-IDF cosine > 0.75, {} pairs):",
204            semantic_dups.len()
205        ));
206        for (a, b, sim) in semantic_dups.iter().take(8) {
207            result.push(format!(
208                "  {:.0}% similar: {} ↔ {}",
209                sim * 100.0,
210                crate::core::protocol::shorten_path(a),
211                crate::core::protocol::shorten_path(b)
212            ));
213        }
214        if semantic_dups.len() > 8 {
215            result.push(format!("  ... +{} more pairs", semantic_dups.len() - 8));
216        }
217    }
218
219    if shared_imports.is_empty() && shared_blocks.is_empty() && semantic_dups.is_empty() {
220        result.push("\nNo significant cross-file duplication detected.".to_string());
221    } else {
222        let total_savings: usize = shared_imports
223            .iter()
224            .map(|(imp, files)| count_tokens(imp) * (files.len() - 1))
225            .sum::<usize>()
226            + shared_blocks
227                .iter()
228                .map(|(block, files)| {
229                    let unique: std::collections::HashSet<_> = files.iter().collect();
230                    count_tokens(block) * (unique.len() - 1)
231                })
232                .sum::<usize>();
233
234        result.push(format!(
235            "\nTotal potential savings: ~{total_savings} tokens"
236        ));
237    }
238
239    result.join("\n")
240}
lean_ctx/tools/ctx_dedup.rs

lean_ctx/tools/
ctx_dedup.rs