lean_ctx/tools/
ctx_dedup.rs1use std::collections::{HashMap, HashSet};
2
3use crate::core::cache::{SessionCache, SharedBlock};
4use crate::core::codebook;
5use crate::core::tokens::count_tokens;
6
7pub fn handle(cache: &SessionCache) -> String {
8 analyze(cache)
9}
10
11pub fn handle_action(cache: &mut SessionCache, action: &str) -> String {
12 match action {
13 "apply" => apply_dedup(cache),
14 _ => analyze(cache),
15 }
16}
17
18fn apply_dedup(cache: &mut SessionCache) -> String {
19 let entries = cache.get_all_entries();
20 if entries.len() < 2 {
21 return "Need at least 2 cached files for cross-file dedup.".to_string();
22 }
23
24 let mut block_occurrences: HashMap<String, Vec<(String, usize)>> = HashMap::new();
25 for (path, entry) in &entries {
26 let lines: Vec<&str> = entry.content.lines().collect();
27 for (idx, chunk) in lines.chunks(5).enumerate() {
28 if chunk.len() == 5 {
29 let block = chunk.join("\n");
30 let trimmed = block.trim().to_string();
31 if !trimmed.is_empty() && count_tokens(&trimmed) > 10 {
32 block_occurrences
33 .entry(trimmed)
34 .or_default()
35 .push(((*path).clone(), idx * 5 + 1));
36 }
37 }
38 }
39 }
40
41 let mut shared = Vec::new();
42 for (content, occurrences) in &block_occurrences {
43 let unique_files: HashSet<&str> = occurrences.iter().map(|(p, _)| p.as_str()).collect();
44 if unique_files.len() >= 2 {
45 let (canonical_path, start_line) = &occurrences[0];
46 let ref_label = cache
47 .file_ref_map()
48 .get(canonical_path)
49 .cloned()
50 .unwrap_or_else(|| "F?".to_string());
51 shared.push(SharedBlock {
52 canonical_path: canonical_path.clone(),
53 canonical_ref: ref_label,
54 start_line: *start_line,
55 end_line: start_line + 4,
56 content: content.clone(),
57 });
58 }
59 }
60
61 let count = shared.len();
62 let savings: usize = shared
63 .iter()
64 .map(|b| {
65 let occurrences = block_occurrences.get(&b.content).map_or(0, |o| {
66 let unique: HashSet<&str> = o.iter().map(|(p, _)| p.as_str()).collect();
67 unique.len() - 1
68 });
69 count_tokens(&b.content) * occurrences
70 })
71 .sum();
72
73 cache.set_shared_blocks(shared);
74
75 format!(
76 "Applied cross-file dedup: {count} shared blocks registered (~{savings} tokens saveable)"
77 )
78}
79
80fn analyze(cache: &SessionCache) -> String {
81 let entries = cache.get_all_entries();
82 if entries.len() < 2 {
83 return "Need at least 2 cached files for cross-file deduplication analysis.".to_string();
84 }
85
86 let mut import_patterns: HashMap<String, Vec<String>> = HashMap::new();
87 let mut boilerplate_blocks: HashMap<String, Vec<String>> = HashMap::new();
88
89 for (path, entry) in &entries {
90 let lines: Vec<&str> = entry.content.lines().collect();
91
92 let imports: Vec<&str> = lines
93 .iter()
94 .copied()
95 .filter(|l| {
96 let t = l.trim();
97 t.starts_with("import ")
98 || t.starts_with("use ")
99 || t.starts_with("from ")
100 || t.starts_with("require(")
101 || t.starts_with("#include")
102 })
103 .collect();
104
105 for imp in &imports {
106 let key = imp.trim().to_string();
107 import_patterns
108 .entry(key)
109 .or_default()
110 .push((*path).clone());
111 }
112
113 for chunk in lines.chunks(5) {
114 if chunk.len() == 5 {
115 let block = chunk.join("\n");
116 let block_trimmed = block.trim().to_string();
117 if !block_trimmed.is_empty() && count_tokens(&block_trimmed) > 10 {
118 boilerplate_blocks
119 .entry(block_trimmed)
120 .or_default()
121 .push((*path).clone());
122 }
123 }
124 }
125 }
126
127 let shared_imports: Vec<_> = import_patterns
128 .iter()
129 .filter(|(_, files)| files.len() >= 2)
130 .collect();
131
132 let shared_blocks: Vec<_> = boilerplate_blocks
133 .iter()
134 .filter(|(_, files)| {
135 let unique: std::collections::HashSet<_> = files.iter().collect();
136 unique.len() >= 2
137 })
138 .collect();
139
140 let mut result = Vec::new();
141 result.push(format!(
142 "Cross-file deduplication analysis ({} cached files):",
143 entries.len()
144 ));
145
146 if !shared_imports.is_empty() {
147 let total_import_tokens: usize = shared_imports
148 .iter()
149 .map(|(imp, files)| count_tokens(imp) * (files.len() - 1))
150 .sum();
151
152 result.push(format!(
153 "\nShared imports ({}, ~{total_import_tokens} redundant tokens):",
154 shared_imports.len()
155 ));
156 for (imp, files) in shared_imports.iter().take(10) {
157 let short_files: Vec<String> = files
158 .iter()
159 .map(|f| crate::core::protocol::shorten_path(f))
160 .collect();
161 result.push(format!(" {imp}"));
162 result.push(format!(" in: {}", short_files.join(", ")));
163 }
164 if shared_imports.len() > 10 {
165 result.push(format!(" ... +{} more", shared_imports.len() - 10));
166 }
167 }
168
169 if !shared_blocks.is_empty() {
170 let total_block_tokens: usize = shared_blocks
171 .iter()
172 .map(|(block, files)| {
173 let unique: std::collections::HashSet<_> = files.iter().collect();
174 count_tokens(block) * (unique.len() - 1)
175 })
176 .sum();
177
178 result.push(format!(
179 "\nShared code blocks ({}, ~{total_block_tokens} redundant tokens):",
180 shared_blocks.len()
181 ));
182 for (block, files) in shared_blocks.iter().take(5) {
183 let unique: std::collections::HashSet<_> = files.iter().collect();
184 let preview = block.lines().next().unwrap_or("...");
185 result.push(format!(" \"{preview}...\" (in {} files)", unique.len()));
186 }
187 }
188
189 let file_pairs: Vec<(String, String)> = entries
191 .iter()
192 .map(|(path, entry)| ((*path).clone(), entry.content.clone()))
193 .collect();
194 let semantic_dups = codebook::find_semantic_duplicates(&file_pairs, 0.75);
195 if !semantic_dups.is_empty() {
196 result.push(format!(
197 "\nSemantic duplicates (TF-IDF cosine > 0.75, {} pairs):",
198 semantic_dups.len()
199 ));
200 for (a, b, sim) in semantic_dups.iter().take(8) {
201 result.push(format!(
202 " {:.0}% similar: {} ↔ {}",
203 sim * 100.0,
204 crate::core::protocol::shorten_path(a),
205 crate::core::protocol::shorten_path(b)
206 ));
207 }
208 if semantic_dups.len() > 8 {
209 result.push(format!(" ... +{} more pairs", semantic_dups.len() - 8));
210 }
211 }
212
213 if shared_imports.is_empty() && shared_blocks.is_empty() && semantic_dups.is_empty() {
214 result.push("\nNo significant cross-file duplication detected.".to_string());
215 } else {
216 let total_savings: usize = shared_imports
217 .iter()
218 .map(|(imp, files)| count_tokens(imp) * (files.len() - 1))
219 .sum::<usize>()
220 + shared_blocks
221 .iter()
222 .map(|(block, files)| {
223 let unique: std::collections::HashSet<_> = files.iter().collect();
224 count_tokens(block) * (unique.len() - 1)
225 })
226 .sum::<usize>();
227
228 result.push(format!(
229 "\nTotal potential savings: ~{total_savings} tokens"
230 ));
231 }
232
233 result.join("\n")
234}