lean_ctx/tools/
ctx_dedup.rs1use std::collections::{HashMap, HashSet};
2
3use crate::core::cache::{SessionCache, SharedBlock};
4use crate::core::codebook;
5use crate::core::tokens::count_tokens;
6
7pub fn handle(cache: &SessionCache) -> String {
8 analyze(cache)
9}
10
11pub fn handle_action(cache: &mut SessionCache, action: &str) -> String {
12 match action {
13 "apply" => apply_dedup(cache),
14 _ => analyze(cache),
15 }
16}
17
18fn apply_dedup(cache: &mut SessionCache) -> String {
19 let entries = cache.get_all_entries();
20 if entries.len() < 2 {
21 return "Need at least 2 cached files for cross-file dedup.".to_string();
22 }
23
24 let mut block_occurrences: HashMap<String, Vec<(String, usize)>> = HashMap::new();
25 for (path, entry) in &entries {
26 let full_content = entry.content();
27 let lines: Vec<&str> = full_content.lines().collect();
28 for (idx, chunk) in lines.chunks(5).enumerate() {
29 if chunk.len() == 5 {
30 let block = chunk.join("\n");
31 let trimmed = block.trim().to_string();
32 if !trimmed.is_empty() && count_tokens(&trimmed) > 10 {
33 block_occurrences
34 .entry(trimmed)
35 .or_default()
36 .push(((*path).clone(), idx * 5 + 1));
37 }
38 }
39 }
40 }
41
42 let mut shared = Vec::new();
43 for (content, occurrences) in &block_occurrences {
44 let unique_files: HashSet<&str> = occurrences.iter().map(|(p, _)| p.as_str()).collect();
45 if unique_files.len() >= 2 {
46 let (canonical_path, start_line) = &occurrences[0];
47 let ref_label = cache
48 .file_ref_map()
49 .get(canonical_path)
50 .cloned()
51 .unwrap_or_else(|| "F?".to_string());
52 shared.push(SharedBlock {
53 canonical_path: canonical_path.clone(),
54 canonical_ref: ref_label,
55 start_line: *start_line,
56 end_line: start_line + 4,
57 content: content.clone(),
58 });
59 }
60 }
61
62 let count = shared.len();
63 let savings: usize = shared
64 .iter()
65 .map(|b| {
66 let occurrences = block_occurrences.get(&b.content).map_or(0, |o| {
67 let unique: HashSet<&str> = o.iter().map(|(p, _)| p.as_str()).collect();
68 unique.len() - 1
69 });
70 count_tokens(&b.content) * occurrences
71 })
72 .sum();
73
74 cache.set_shared_blocks(shared);
75
76 format!(
77 "Applied cross-file dedup: {count} shared blocks registered (~{savings} tokens saveable)"
78 )
79}
80
81fn analyze(cache: &SessionCache) -> String {
82 let entries = cache.get_all_entries();
83 if entries.len() < 2 {
84 return "Need at least 2 cached files for cross-file deduplication analysis.".to_string();
85 }
86
87 let mut import_patterns: HashMap<String, Vec<String>> = HashMap::new();
88 let mut boilerplate_blocks: HashMap<String, Vec<String>> = HashMap::new();
89
90 for (path, entry) in &entries {
91 let full_content = entry.content();
92 let lines: Vec<&str> = full_content.lines().collect();
93
94 let imports: Vec<&str> = lines
95 .iter()
96 .copied()
97 .filter(|l| {
98 let t = l.trim();
99 t.starts_with("import ")
100 || t.starts_with("use ")
101 || t.starts_with("from ")
102 || t.starts_with("require(")
103 || t.starts_with("#include")
104 })
105 .collect();
106
107 for imp in &imports {
108 let key = imp.trim().to_string();
109 import_patterns
110 .entry(key)
111 .or_default()
112 .push((*path).clone());
113 }
114
115 for chunk in lines.chunks(5) {
116 if chunk.len() == 5 {
117 let block = chunk.join("\n");
118 let block_trimmed = block.trim().to_string();
119 if !block_trimmed.is_empty() && count_tokens(&block_trimmed) > 10 {
120 boilerplate_blocks
121 .entry(block_trimmed)
122 .or_default()
123 .push((*path).clone());
124 }
125 }
126 }
127 }
128
129 let shared_imports: Vec<_> = import_patterns
130 .iter()
131 .filter(|(_, files)| files.len() >= 2)
132 .collect();
133
134 let shared_blocks: Vec<_> = boilerplate_blocks
135 .iter()
136 .filter(|(_, files)| {
137 let unique: std::collections::HashSet<_> = files.iter().collect();
138 unique.len() >= 2
139 })
140 .collect();
141
142 let mut result = Vec::new();
143 result.push(format!(
144 "Cross-file deduplication analysis ({} cached files):",
145 entries.len()
146 ));
147
148 if !shared_imports.is_empty() {
149 let total_import_tokens: usize = shared_imports
150 .iter()
151 .map(|(imp, files)| count_tokens(imp) * (files.len() - 1))
152 .sum();
153
154 result.push(format!(
155 "\nShared imports ({}, ~{total_import_tokens} redundant tokens):",
156 shared_imports.len()
157 ));
158 for (imp, files) in shared_imports.iter().take(10) {
159 let short_files: Vec<String> = files
160 .iter()
161 .map(|f| crate::core::protocol::shorten_path(f))
162 .collect();
163 result.push(format!(" {imp}"));
164 result.push(format!(" in: {}", short_files.join(", ")));
165 }
166 if shared_imports.len() > 10 {
167 result.push(format!(" ... +{} more", shared_imports.len() - 10));
168 }
169 }
170
171 if !shared_blocks.is_empty() {
172 let total_block_tokens: usize = shared_blocks
173 .iter()
174 .map(|(block, files)| {
175 let unique: std::collections::HashSet<_> = files.iter().collect();
176 count_tokens(block) * (unique.len() - 1)
177 })
178 .sum();
179
180 result.push(format!(
181 "\nShared code blocks ({}, ~{total_block_tokens} redundant tokens):",
182 shared_blocks.len()
183 ));
184 for (block, files) in shared_blocks.iter().take(5) {
185 let unique: std::collections::HashSet<_> = files.iter().collect();
186 let preview = block.lines().next().unwrap_or("...");
187 result.push(format!(" \"{preview}...\" (in {} files)", unique.len()));
188 }
189 }
190
191 let file_pairs: Vec<(String, String)> = entries
193 .iter()
194 .map(|(path, entry)| ((*path).clone(), entry.content()))
195 .collect();
196 let semantic_dups = codebook::find_semantic_duplicates(&file_pairs, 0.75);
197 if !semantic_dups.is_empty() {
198 result.push(format!(
199 "\nSemantic duplicates (TF-IDF cosine > 0.75, {} pairs):",
200 semantic_dups.len()
201 ));
202 for (a, b, sim) in semantic_dups.iter().take(8) {
203 result.push(format!(
204 " {:.0}% similar: {} ↔ {}",
205 sim * 100.0,
206 crate::core::protocol::shorten_path(a),
207 crate::core::protocol::shorten_path(b)
208 ));
209 }
210 if semantic_dups.len() > 8 {
211 result.push(format!(" ... +{} more pairs", semantic_dups.len() - 8));
212 }
213 }
214
215 if shared_imports.is_empty() && shared_blocks.is_empty() && semantic_dups.is_empty() {
216 result.push("\nNo significant cross-file duplication detected.".to_string());
217 } else {
218 let total_savings: usize = shared_imports
219 .iter()
220 .map(|(imp, files)| count_tokens(imp) * (files.len() - 1))
221 .sum::<usize>()
222 + shared_blocks
223 .iter()
224 .map(|(block, files)| {
225 let unique: std::collections::HashSet<_> = files.iter().collect();
226 count_tokens(block) * (unique.len() - 1)
227 })
228 .sum::<usize>();
229
230 result.push(format!(
231 "\nTotal potential savings: ~{total_savings} tokens"
232 ));
233 }
234
235 result.join("\n")
236}