Skip to main content

lean_ctx/tools/
ctx_dedup.rs

1use std::collections::{HashMap, HashSet};
2
3use crate::core::cache::{SessionCache, SharedBlock};
4use crate::core::codebook;
5use crate::core::tokens::count_tokens;
6
7pub fn handle(cache: &SessionCache) -> String {
8    analyze(cache)
9}
10
11pub fn handle_action(cache: &mut SessionCache, action: &str) -> String {
12    match action {
13        "apply" => apply_dedup(cache),
14        _ => analyze(cache),
15    }
16}
17
18fn apply_dedup(cache: &mut SessionCache) -> String {
19    let entries = cache.get_all_entries();
20    if entries.len() < 2 {
21        return "Need at least 2 cached files for cross-file dedup.".to_string();
22    }
23
24    let mut block_occurrences: HashMap<String, Vec<(String, usize)>> = HashMap::new();
25    for (path, entry) in &entries {
26        let full_content = entry.content();
27        let lines: Vec<&str> = full_content.lines().collect();
28        for (idx, chunk) in lines.chunks(5).enumerate() {
29            if chunk.len() == 5 {
30                let block = chunk.join("\n");
31                let trimmed = block.trim().to_string();
32                if !trimmed.is_empty() && count_tokens(&trimmed) > 10 {
33                    block_occurrences
34                        .entry(trimmed)
35                        .or_default()
36                        .push(((*path).clone(), idx * 5 + 1));
37                }
38            }
39        }
40    }
41
42    let mut shared = Vec::new();
43    for (content, occurrences) in &block_occurrences {
44        let unique_files: HashSet<&str> = occurrences.iter().map(|(p, _)| p.as_str()).collect();
45        if unique_files.len() >= 2 {
46            let (canonical_path, start_line) = &occurrences[0];
47            let ref_label = cache
48                .file_ref_map()
49                .get(canonical_path)
50                .cloned()
51                .unwrap_or_else(|| "F?".to_string());
52            shared.push(SharedBlock {
53                canonical_path: canonical_path.clone(),
54                canonical_ref: ref_label,
55                start_line: *start_line,
56                end_line: start_line + 4,
57                content: content.clone(),
58            });
59        }
60    }
61
62    let count = shared.len();
63    let savings: usize = shared
64        .iter()
65        .map(|b| {
66            let occurrences = block_occurrences.get(&b.content).map_or(0, |o| {
67                let unique: HashSet<&str> = o.iter().map(|(p, _)| p.as_str()).collect();
68                unique.len() - 1
69            });
70            count_tokens(&b.content) * occurrences
71        })
72        .sum();
73
74    cache.set_shared_blocks(shared);
75
76    format!(
77        "Applied cross-file dedup: {count} shared blocks registered (~{savings} tokens saveable)"
78    )
79}
80
81fn analyze(cache: &SessionCache) -> String {
82    let entries = cache.get_all_entries();
83    if entries.len() < 2 {
84        return "Need at least 2 cached files for cross-file deduplication analysis.".to_string();
85    }
86
87    let mut import_patterns: HashMap<String, Vec<String>> = HashMap::new();
88    let mut boilerplate_blocks: HashMap<String, Vec<String>> = HashMap::new();
89
90    for (path, entry) in &entries {
91        let full_content = entry.content();
92        let lines: Vec<&str> = full_content.lines().collect();
93
94        let imports: Vec<&str> = lines
95            .iter()
96            .copied()
97            .filter(|l| {
98                let t = l.trim();
99                t.starts_with("import ")
100                    || t.starts_with("use ")
101                    || t.starts_with("from ")
102                    || t.starts_with("require(")
103                    || t.starts_with("#include")
104            })
105            .collect();
106
107        for imp in &imports {
108            let key = imp.trim().to_string();
109            import_patterns
110                .entry(key)
111                .or_default()
112                .push((*path).clone());
113        }
114
115        for chunk in lines.chunks(5) {
116            if chunk.len() == 5 {
117                let block = chunk.join("\n");
118                let block_trimmed = block.trim().to_string();
119                if !block_trimmed.is_empty() && count_tokens(&block_trimmed) > 10 {
120                    boilerplate_blocks
121                        .entry(block_trimmed)
122                        .or_default()
123                        .push((*path).clone());
124                }
125            }
126        }
127    }
128
129    let shared_imports: Vec<_> = import_patterns
130        .iter()
131        .filter(|(_, files)| files.len() >= 2)
132        .collect();
133
134    let shared_blocks: Vec<_> = boilerplate_blocks
135        .iter()
136        .filter(|(_, files)| {
137            let unique: std::collections::HashSet<_> = files.iter().collect();
138            unique.len() >= 2
139        })
140        .collect();
141
142    let mut result = Vec::new();
143    result.push(format!(
144        "Cross-file deduplication analysis ({} cached files):",
145        entries.len()
146    ));
147
148    if !shared_imports.is_empty() {
149        let total_import_tokens: usize = shared_imports
150            .iter()
151            .map(|(imp, files)| count_tokens(imp) * (files.len() - 1))
152            .sum();
153
154        result.push(format!(
155            "\nShared imports ({}, ~{total_import_tokens} redundant tokens):",
156            shared_imports.len()
157        ));
158        for (imp, files) in shared_imports.iter().take(10) {
159            let short_files: Vec<String> = files
160                .iter()
161                .map(|f| crate::core::protocol::shorten_path(f))
162                .collect();
163            result.push(format!("  {imp}"));
164            result.push(format!("    in: {}", short_files.join(", ")));
165        }
166        if shared_imports.len() > 10 {
167            result.push(format!("  ... +{} more", shared_imports.len() - 10));
168        }
169    }
170
171    if !shared_blocks.is_empty() {
172        let total_block_tokens: usize = shared_blocks
173            .iter()
174            .map(|(block, files)| {
175                let unique: std::collections::HashSet<_> = files.iter().collect();
176                count_tokens(block) * (unique.len() - 1)
177            })
178            .sum();
179
180        result.push(format!(
181            "\nShared code blocks ({}, ~{total_block_tokens} redundant tokens):",
182            shared_blocks.len()
183        ));
184        for (block, files) in shared_blocks.iter().take(5) {
185            let unique: std::collections::HashSet<_> = files.iter().collect();
186            let preview = block.lines().next().unwrap_or("...");
187            result.push(format!("  \"{preview}...\" (in {} files)", unique.len()));
188        }
189    }
190
191    // TF-IDF cosine similarity analysis for semantic duplicates
192    let file_pairs: Vec<(String, String)> = entries
193        .iter()
194        .map(|(path, entry)| ((*path).clone(), entry.content()))
195        .collect();
196    let semantic_dups = codebook::find_semantic_duplicates(&file_pairs, 0.75);
197    if !semantic_dups.is_empty() {
198        result.push(format!(
199            "\nSemantic duplicates (TF-IDF cosine > 0.75, {} pairs):",
200            semantic_dups.len()
201        ));
202        for (a, b, sim) in semantic_dups.iter().take(8) {
203            result.push(format!(
204                "  {:.0}% similar: {} ↔ {}",
205                sim * 100.0,
206                crate::core::protocol::shorten_path(a),
207                crate::core::protocol::shorten_path(b)
208            ));
209        }
210        if semantic_dups.len() > 8 {
211            result.push(format!("  ... +{} more pairs", semantic_dups.len() - 8));
212        }
213    }
214
215    if shared_imports.is_empty() && shared_blocks.is_empty() && semantic_dups.is_empty() {
216        result.push("\nNo significant cross-file duplication detected.".to_string());
217    } else {
218        let total_savings: usize = shared_imports
219            .iter()
220            .map(|(imp, files)| count_tokens(imp) * (files.len() - 1))
221            .sum::<usize>()
222            + shared_blocks
223                .iter()
224                .map(|(block, files)| {
225                    let unique: std::collections::HashSet<_> = files.iter().collect();
226                    count_tokens(block) * (unique.len() - 1)
227                })
228                .sum::<usize>();
229
230        result.push(format!(
231            "\nTotal potential savings: ~{total_savings} tokens"
232        ));
233    }
234
235    result.join("\n")
236}