//! ctx_dedup — cross-file deduplication analysis tool (lean_ctx/tools/ctx_dedup.rs).
1use std::collections::{HashMap, HashSet};
2
3use crate::core::cache::{SessionCache, SharedBlock};
4use crate::core::codebook;
5use crate::core::tokens::count_tokens;
6
/// Read-only entry point: delegates to [`analyze`] to produce the
/// cross-file duplication report without mutating the cache.
pub fn handle(cache: &SessionCache) -> String {
    analyze(cache)
}
10
11pub fn handle_action(cache: &mut SessionCache, action: &str) -> String {
12    match action {
13        "apply" => apply_dedup(cache),
14        _ => analyze(cache),
15    }
16}
17
18fn apply_dedup(cache: &mut SessionCache) -> String {
19    let entries = cache.get_all_entries();
20    if entries.len() < 2 {
21        return "Need at least 2 cached files for cross-file dedup.".to_string();
22    }
23
24    let mut block_occurrences: HashMap<String, Vec<(String, usize)>> = HashMap::new();
25    for (path, entry) in &entries {
26        let lines: Vec<&str> = entry.content.lines().collect();
27        for (idx, chunk) in lines.chunks(5).enumerate() {
28            if chunk.len() == 5 {
29                let block = chunk.join("\n");
30                let trimmed = block.trim().to_string();
31                if !trimmed.is_empty() && count_tokens(&trimmed) > 10 {
32                    block_occurrences
33                        .entry(trimmed)
34                        .or_default()
35                        .push((path.to_string(), idx * 5 + 1));
36                }
37            }
38        }
39    }
40
41    let mut shared = Vec::new();
42    for (content, occurrences) in &block_occurrences {
43        let unique_files: HashSet<&str> = occurrences.iter().map(|(p, _)| p.as_str()).collect();
44        if unique_files.len() >= 2 {
45            let (canonical_path, start_line) = &occurrences[0];
46            let ref_label = cache
47                .file_ref_map()
48                .get(canonical_path)
49                .cloned()
50                .unwrap_or_else(|| "F?".to_string());
51            shared.push(SharedBlock {
52                canonical_path: canonical_path.clone(),
53                canonical_ref: ref_label,
54                start_line: *start_line,
55                end_line: start_line + 4,
56                content: content.clone(),
57            });
58        }
59    }
60
61    let count = shared.len();
62    let savings: usize = shared
63        .iter()
64        .map(|b| {
65            let occurrences = block_occurrences
66                .get(&b.content)
67                .map(|o| {
68                    let unique: HashSet<&str> = o.iter().map(|(p, _)| p.as_str()).collect();
69                    unique.len() - 1
70                })
71                .unwrap_or(0);
72            count_tokens(&b.content) * occurrences
73        })
74        .sum();
75
76    cache.set_shared_blocks(shared);
77
78    format!(
79        "Applied cross-file dedup: {count} shared blocks registered (~{savings} tokens saveable)"
80    )
81}
82
83fn analyze(cache: &SessionCache) -> String {
84    let entries = cache.get_all_entries();
85    if entries.len() < 2 {
86        return "Need at least 2 cached files for cross-file deduplication analysis.".to_string();
87    }
88
89    let mut import_patterns: HashMap<String, Vec<String>> = HashMap::new();
90    let mut boilerplate_blocks: HashMap<String, Vec<String>> = HashMap::new();
91
92    for (path, entry) in &entries {
93        let lines: Vec<&str> = entry.content.lines().collect();
94
95        let imports: Vec<&str> = lines
96            .iter()
97            .copied()
98            .filter(|l| {
99                let t = l.trim();
100                t.starts_with("import ")
101                    || t.starts_with("use ")
102                    || t.starts_with("from ")
103                    || t.starts_with("require(")
104                    || t.starts_with("#include")
105            })
106            .collect();
107
108        for imp in &imports {
109            let key = imp.trim().to_string();
110            import_patterns
111                .entry(key)
112                .or_default()
113                .push(path.to_string());
114        }
115
116        for chunk in lines.chunks(5) {
117            if chunk.len() == 5 {
118                let block = chunk.join("\n");
119                let block_trimmed = block.trim().to_string();
120                if !block_trimmed.is_empty() && count_tokens(&block_trimmed) > 10 {
121                    boilerplate_blocks
122                        .entry(block_trimmed)
123                        .or_default()
124                        .push(path.to_string());
125                }
126            }
127        }
128    }
129
130    let shared_imports: Vec<_> = import_patterns
131        .iter()
132        .filter(|(_, files)| files.len() >= 2)
133        .collect();
134
135    let shared_blocks: Vec<_> = boilerplate_blocks
136        .iter()
137        .filter(|(_, files)| {
138            let unique: std::collections::HashSet<_> = files.iter().collect();
139            unique.len() >= 2
140        })
141        .collect();
142
143    let mut result = Vec::new();
144    result.push(format!(
145        "Cross-file deduplication analysis ({} cached files):",
146        entries.len()
147    ));
148
149    if !shared_imports.is_empty() {
150        let total_import_tokens: usize = shared_imports
151            .iter()
152            .map(|(imp, files)| count_tokens(imp) * (files.len() - 1))
153            .sum();
154
155        result.push(format!(
156            "\nShared imports ({}, ~{total_import_tokens} redundant tokens):",
157            shared_imports.len()
158        ));
159        for (imp, files) in shared_imports.iter().take(10) {
160            let short_files: Vec<String> = files
161                .iter()
162                .map(|f| crate::core::protocol::shorten_path(f))
163                .collect();
164            result.push(format!("  {imp}"));
165            result.push(format!("    in: {}", short_files.join(", ")));
166        }
167        if shared_imports.len() > 10 {
168            result.push(format!("  ... +{} more", shared_imports.len() - 10));
169        }
170    }
171
172    if !shared_blocks.is_empty() {
173        let total_block_tokens: usize = shared_blocks
174            .iter()
175            .map(|(block, files)| {
176                let unique: std::collections::HashSet<_> = files.iter().collect();
177                count_tokens(block) * (unique.len() - 1)
178            })
179            .sum();
180
181        result.push(format!(
182            "\nShared code blocks ({}, ~{total_block_tokens} redundant tokens):",
183            shared_blocks.len()
184        ));
185        for (block, files) in shared_blocks.iter().take(5) {
186            let unique: std::collections::HashSet<_> = files.iter().collect();
187            let preview = block.lines().next().unwrap_or("...");
188            result.push(format!("  \"{preview}...\" (in {} files)", unique.len()));
189        }
190    }
191
192    // TF-IDF cosine similarity analysis for semantic duplicates
193    let file_pairs: Vec<(String, String)> = entries
194        .iter()
195        .map(|(path, entry)| (path.to_string(), entry.content.clone()))
196        .collect();
197    let semantic_dups = codebook::find_semantic_duplicates(&file_pairs, 0.75);
198    if !semantic_dups.is_empty() {
199        result.push(format!(
200            "\nSemantic duplicates (TF-IDF cosine > 0.75, {} pairs):",
201            semantic_dups.len()
202        ));
203        for (a, b, sim) in semantic_dups.iter().take(8) {
204            result.push(format!(
205                "  {:.0}% similar: {} ↔ {}",
206                sim * 100.0,
207                crate::core::protocol::shorten_path(a),
208                crate::core::protocol::shorten_path(b)
209            ));
210        }
211        if semantic_dups.len() > 8 {
212            result.push(format!("  ... +{} more pairs", semantic_dups.len() - 8));
213        }
214    }
215
216    if shared_imports.is_empty() && shared_blocks.is_empty() && semantic_dups.is_empty() {
217        result.push("\nNo significant cross-file duplication detected.".to_string());
218    } else {
219        let total_savings: usize = shared_imports
220            .iter()
221            .map(|(imp, files)| count_tokens(imp) * (files.len() - 1))
222            .sum::<usize>()
223            + shared_blocks
224                .iter()
225                .map(|(block, files)| {
226                    let unique: std::collections::HashSet<_> = files.iter().collect();
227                    count_tokens(block) * (unique.len() - 1)
228                })
229                .sum::<usize>();
230
231        result.push(format!(
232            "\nTotal potential savings: ~{total_savings} tokens"
233        ));
234    }
235
236    result.join("\n")
237}