lean_ctx/tools/
ctx_dedup.rs1use std::collections::{HashMap, HashSet};
2
3use crate::core::cache::{SessionCache, SharedBlock};
4use crate::core::codebook;
5use crate::core::tokens::count_tokens;
6
7pub fn handle(cache: &SessionCache) -> String {
8 analyze(cache)
9}
10
11pub fn handle_action(cache: &mut SessionCache, action: &str) -> String {
12 match action {
13 "apply" => apply_dedup(cache),
14 _ => analyze(cache),
15 }
16}
17
18fn apply_dedup(cache: &mut SessionCache) -> String {
19 let entries = cache.get_all_entries();
20 if entries.len() < 2 {
21 return "Need at least 2 cached files for cross-file dedup.".to_string();
22 }
23
24 let mut block_occurrences: HashMap<String, Vec<(String, usize)>> = HashMap::new();
25 for (path, entry) in &entries {
26 let Some(full_content) = entry.content() else {
27 continue;
28 };
29 let lines: Vec<&str> = full_content.lines().collect();
30 for (idx, chunk) in lines.chunks(5).enumerate() {
31 if chunk.len() == 5 {
32 let block = chunk.join("\n");
33 let trimmed = block.trim().to_string();
34 if !trimmed.is_empty() && count_tokens(&trimmed) > 10 {
35 block_occurrences
36 .entry(trimmed)
37 .or_default()
38 .push(((*path).clone(), idx * 5 + 1));
39 }
40 }
41 }
42 }
43
44 let mut shared = Vec::new();
45 for (content, occurrences) in &block_occurrences {
46 let unique_files: HashSet<&str> = occurrences.iter().map(|(p, _)| p.as_str()).collect();
47 if unique_files.len() >= 2 {
48 let (canonical_path, start_line) = &occurrences[0];
49 let ref_label = cache
50 .file_ref_map()
51 .get(canonical_path)
52 .cloned()
53 .unwrap_or_else(|| "F?".to_string());
54 shared.push(SharedBlock {
55 canonical_path: canonical_path.clone(),
56 canonical_ref: ref_label,
57 start_line: *start_line,
58 end_line: start_line + 4,
59 content: content.clone(),
60 });
61 }
62 }
63
64 let count = shared.len();
65 let savings: usize = shared
66 .iter()
67 .map(|b| {
68 let occurrences = block_occurrences.get(&b.content).map_or(0, |o| {
69 let unique: HashSet<&str> = o.iter().map(|(p, _)| p.as_str()).collect();
70 unique.len() - 1
71 });
72 count_tokens(&b.content) * occurrences
73 })
74 .sum();
75
76 cache.set_shared_blocks(shared);
77
78 format!(
79 "Applied cross-file dedup: {count} shared blocks registered (~{savings} tokens saveable)"
80 )
81}
82
83fn analyze(cache: &SessionCache) -> String {
84 let entries = cache.get_all_entries();
85 if entries.len() < 2 {
86 return "Need at least 2 cached files for cross-file deduplication analysis.".to_string();
87 }
88
89 let mut import_patterns: HashMap<String, Vec<String>> = HashMap::new();
90 let mut boilerplate_blocks: HashMap<String, Vec<String>> = HashMap::new();
91
92 for (path, entry) in &entries {
93 let Some(full_content) = entry.content() else {
94 continue;
95 };
96 let lines: Vec<&str> = full_content.lines().collect();
97
98 let imports: Vec<&str> = lines
99 .iter()
100 .copied()
101 .filter(|l| {
102 let t = l.trim();
103 t.starts_with("import ")
104 || t.starts_with("use ")
105 || t.starts_with("from ")
106 || t.starts_with("require(")
107 || t.starts_with("#include")
108 })
109 .collect();
110
111 for imp in &imports {
112 let key = imp.trim().to_string();
113 import_patterns
114 .entry(key)
115 .or_default()
116 .push((*path).clone());
117 }
118
119 for chunk in lines.chunks(5) {
120 if chunk.len() == 5 {
121 let block = chunk.join("\n");
122 let block_trimmed = block.trim().to_string();
123 if !block_trimmed.is_empty() && count_tokens(&block_trimmed) > 10 {
124 boilerplate_blocks
125 .entry(block_trimmed)
126 .or_default()
127 .push((*path).clone());
128 }
129 }
130 }
131 }
132
133 let shared_imports: Vec<_> = import_patterns
134 .iter()
135 .filter(|(_, files)| files.len() >= 2)
136 .collect();
137
138 let shared_blocks: Vec<_> = boilerplate_blocks
139 .iter()
140 .filter(|(_, files)| {
141 let unique: std::collections::HashSet<_> = files.iter().collect();
142 unique.len() >= 2
143 })
144 .collect();
145
146 let mut result = Vec::new();
147 result.push(format!(
148 "Cross-file deduplication analysis ({} cached files):",
149 entries.len()
150 ));
151
152 if !shared_imports.is_empty() {
153 let total_import_tokens: usize = shared_imports
154 .iter()
155 .map(|(imp, files)| count_tokens(imp) * (files.len() - 1))
156 .sum();
157
158 result.push(format!(
159 "\nShared imports ({}, ~{total_import_tokens} redundant tokens):",
160 shared_imports.len()
161 ));
162 for (imp, files) in shared_imports.iter().take(10) {
163 let short_files: Vec<String> = files
164 .iter()
165 .map(|f| crate::core::protocol::shorten_path(f))
166 .collect();
167 result.push(format!(" {imp}"));
168 result.push(format!(" in: {}", short_files.join(", ")));
169 }
170 if shared_imports.len() > 10 {
171 result.push(format!(" ... +{} more", shared_imports.len() - 10));
172 }
173 }
174
175 if !shared_blocks.is_empty() {
176 let total_block_tokens: usize = shared_blocks
177 .iter()
178 .map(|(block, files)| {
179 let unique: std::collections::HashSet<_> = files.iter().collect();
180 count_tokens(block) * (unique.len() - 1)
181 })
182 .sum();
183
184 result.push(format!(
185 "\nShared code blocks ({}, ~{total_block_tokens} redundant tokens):",
186 shared_blocks.len()
187 ));
188 for (block, files) in shared_blocks.iter().take(5) {
189 let unique: std::collections::HashSet<_> = files.iter().collect();
190 let preview = block.lines().next().unwrap_or("...");
191 result.push(format!(" \"{preview}...\" (in {} files)", unique.len()));
192 }
193 }
194
195 let file_pairs: Vec<(String, String)> = entries
197 .iter()
198 .filter_map(|(path, entry)| Some(((*path).clone(), entry.content()?)))
199 .collect();
200 let semantic_dups = codebook::find_semantic_duplicates(&file_pairs, 0.75);
201 if !semantic_dups.is_empty() {
202 result.push(format!(
203 "\nSemantic duplicates (TF-IDF cosine > 0.75, {} pairs):",
204 semantic_dups.len()
205 ));
206 for (a, b, sim) in semantic_dups.iter().take(8) {
207 result.push(format!(
208 " {:.0}% similar: {} ↔ {}",
209 sim * 100.0,
210 crate::core::protocol::shorten_path(a),
211 crate::core::protocol::shorten_path(b)
212 ));
213 }
214 if semantic_dups.len() > 8 {
215 result.push(format!(" ... +{} more pairs", semantic_dups.len() - 8));
216 }
217 }
218
219 if shared_imports.is_empty() && shared_blocks.is_empty() && semantic_dups.is_empty() {
220 result.push("\nNo significant cross-file duplication detected.".to_string());
221 } else {
222 let total_savings: usize = shared_imports
223 .iter()
224 .map(|(imp, files)| count_tokens(imp) * (files.len() - 1))
225 .sum::<usize>()
226 + shared_blocks
227 .iter()
228 .map(|(block, files)| {
229 let unique: std::collections::HashSet<_> = files.iter().collect();
230 count_tokens(block) * (unique.len() - 1)
231 })
232 .sum::<usize>();
233
234 result.push(format!(
235 "\nTotal potential savings: ~{total_savings} tokens"
236 ));
237 }
238
239 result.join("\n")
240}