// lean_ctx/tools/ctx_dedup.rs
use std::collections::{HashMap, HashSet};

use crate::core::cache::{SessionCache, SharedBlock};
use crate::core::codebook;
use crate::core::tokens::count_tokens;

7pub fn handle(cache: &SessionCache) -> String {
8 analyze(cache)
9}
10
11pub fn handle_action(cache: &mut SessionCache, action: &str) -> String {
12 match action {
13 "apply" => apply_dedup(cache),
14 _ => analyze(cache),
15 }
16}
17
18fn apply_dedup(cache: &mut SessionCache) -> String {
19 let entries = cache.get_all_entries();
20 if entries.len() < 2 {
21 return "Need at least 2 cached files for cross-file dedup.".to_string();
22 }
23
24 let mut block_occurrences: HashMap<String, Vec<(String, usize)>> = HashMap::new();
25 for (path, entry) in &entries {
26 let lines: Vec<&str> = entry.content.lines().collect();
27 for (idx, chunk) in lines.chunks(5).enumerate() {
28 if chunk.len() == 5 {
29 let block = chunk.join("\n");
30 let trimmed = block.trim().to_string();
31 if !trimmed.is_empty() && count_tokens(&trimmed) > 10 {
32 block_occurrences
33 .entry(trimmed)
34 .or_default()
35 .push((path.to_string(), idx * 5 + 1));
36 }
37 }
38 }
39 }
40
41 let mut shared = Vec::new();
42 for (content, occurrences) in &block_occurrences {
43 let unique_files: HashSet<&str> = occurrences.iter().map(|(p, _)| p.as_str()).collect();
44 if unique_files.len() >= 2 {
45 let (canonical_path, start_line) = &occurrences[0];
46 let ref_label = cache
47 .file_ref_map()
48 .get(canonical_path)
49 .cloned()
50 .unwrap_or_else(|| "F?".to_string());
51 shared.push(SharedBlock {
52 canonical_path: canonical_path.clone(),
53 canonical_ref: ref_label,
54 start_line: *start_line,
55 end_line: start_line + 4,
56 content: content.clone(),
57 });
58 }
59 }
60
61 let count = shared.len();
62 let savings: usize = shared
63 .iter()
64 .map(|b| {
65 let occurrences = block_occurrences
66 .get(&b.content)
67 .map(|o| {
68 let unique: HashSet<&str> = o.iter().map(|(p, _)| p.as_str()).collect();
69 unique.len() - 1
70 })
71 .unwrap_or(0);
72 count_tokens(&b.content) * occurrences
73 })
74 .sum();
75
76 cache.set_shared_blocks(shared);
77
78 format!(
79 "Applied cross-file dedup: {count} shared blocks registered (~{savings} tokens saveable)"
80 )
81}
82
83fn analyze(cache: &SessionCache) -> String {
84 let entries = cache.get_all_entries();
85 if entries.len() < 2 {
86 return "Need at least 2 cached files for cross-file deduplication analysis.".to_string();
87 }
88
89 let mut import_patterns: HashMap<String, Vec<String>> = HashMap::new();
90 let mut boilerplate_blocks: HashMap<String, Vec<String>> = HashMap::new();
91
92 for (path, entry) in &entries {
93 let lines: Vec<&str> = entry.content.lines().collect();
94
95 let imports: Vec<&str> = lines
96 .iter()
97 .copied()
98 .filter(|l| {
99 let t = l.trim();
100 t.starts_with("import ")
101 || t.starts_with("use ")
102 || t.starts_with("from ")
103 || t.starts_with("require(")
104 || t.starts_with("#include")
105 })
106 .collect();
107
108 for imp in &imports {
109 let key = imp.trim().to_string();
110 import_patterns
111 .entry(key)
112 .or_default()
113 .push(path.to_string());
114 }
115
116 for chunk in lines.chunks(5) {
117 if chunk.len() == 5 {
118 let block = chunk.join("\n");
119 let block_trimmed = block.trim().to_string();
120 if !block_trimmed.is_empty() && count_tokens(&block_trimmed) > 10 {
121 boilerplate_blocks
122 .entry(block_trimmed)
123 .or_default()
124 .push(path.to_string());
125 }
126 }
127 }
128 }
129
130 let shared_imports: Vec<_> = import_patterns
131 .iter()
132 .filter(|(_, files)| files.len() >= 2)
133 .collect();
134
135 let shared_blocks: Vec<_> = boilerplate_blocks
136 .iter()
137 .filter(|(_, files)| {
138 let unique: std::collections::HashSet<_> = files.iter().collect();
139 unique.len() >= 2
140 })
141 .collect();
142
143 let mut result = Vec::new();
144 result.push(format!(
145 "Cross-file deduplication analysis ({} cached files):",
146 entries.len()
147 ));
148
149 if !shared_imports.is_empty() {
150 let total_import_tokens: usize = shared_imports
151 .iter()
152 .map(|(imp, files)| count_tokens(imp) * (files.len() - 1))
153 .sum();
154
155 result.push(format!(
156 "\nShared imports ({}, ~{total_import_tokens} redundant tokens):",
157 shared_imports.len()
158 ));
159 for (imp, files) in shared_imports.iter().take(10) {
160 let short_files: Vec<String> = files
161 .iter()
162 .map(|f| crate::core::protocol::shorten_path(f))
163 .collect();
164 result.push(format!(" {imp}"));
165 result.push(format!(" in: {}", short_files.join(", ")));
166 }
167 if shared_imports.len() > 10 {
168 result.push(format!(" ... +{} more", shared_imports.len() - 10));
169 }
170 }
171
172 if !shared_blocks.is_empty() {
173 let total_block_tokens: usize = shared_blocks
174 .iter()
175 .map(|(block, files)| {
176 let unique: std::collections::HashSet<_> = files.iter().collect();
177 count_tokens(block) * (unique.len() - 1)
178 })
179 .sum();
180
181 result.push(format!(
182 "\nShared code blocks ({}, ~{total_block_tokens} redundant tokens):",
183 shared_blocks.len()
184 ));
185 for (block, files) in shared_blocks.iter().take(5) {
186 let unique: std::collections::HashSet<_> = files.iter().collect();
187 let preview = block.lines().next().unwrap_or("...");
188 result.push(format!(" \"{preview}...\" (in {} files)", unique.len()));
189 }
190 }
191
192 let file_pairs: Vec<(String, String)> = entries
194 .iter()
195 .map(|(path, entry)| (path.to_string(), entry.content.clone()))
196 .collect();
197 let semantic_dups = codebook::find_semantic_duplicates(&file_pairs, 0.75);
198 if !semantic_dups.is_empty() {
199 result.push(format!(
200 "\nSemantic duplicates (TF-IDF cosine > 0.75, {} pairs):",
201 semantic_dups.len()
202 ));
203 for (a, b, sim) in semantic_dups.iter().take(8) {
204 result.push(format!(
205 " {:.0}% similar: {} ↔ {}",
206 sim * 100.0,
207 crate::core::protocol::shorten_path(a),
208 crate::core::protocol::shorten_path(b)
209 ));
210 }
211 if semantic_dups.len() > 8 {
212 result.push(format!(" ... +{} more pairs", semantic_dups.len() - 8));
213 }
214 }
215
216 if shared_imports.is_empty() && shared_blocks.is_empty() && semantic_dups.is_empty() {
217 result.push("\nNo significant cross-file duplication detected.".to_string());
218 } else {
219 let total_savings: usize = shared_imports
220 .iter()
221 .map(|(imp, files)| count_tokens(imp) * (files.len() - 1))
222 .sum::<usize>()
223 + shared_blocks
224 .iter()
225 .map(|(block, files)| {
226 let unique: std::collections::HashSet<_> = files.iter().collect();
227 count_tokens(block) * (unique.len() - 1)
228 })
229 .sum::<usize>();
230
231 result.push(format!(
232 "\nTotal potential savings: ~{total_savings} tokens"
233 ));
234 }
235
236 result.join("\n")
237}