1use super::{Recommendation, RecommendationKind, RiskLevel};
9use crate::cleaners::calculate_dir_size;
10use crate::error::Result;
11use serde::Deserialize;
12use std::collections::HashMap;
13use std::path::{Path, PathBuf};
14use walkdir::WalkDir;
15
/// Scans a directory tree for duplicated dependency installations
/// (npm packages, Python virtualenvs, Rust `target` dirs) and turns
/// them into cleanup [`Recommendation`]s.
pub struct DuplicateFinder {
    // Minimum combined size (bytes) a duplicate group must reach
    // before it is reported.
    pub min_duplicate_size: u64,
    // Minimum number of copies before a package counts as duplicated.
    pub min_duplicate_count: usize,
}
23
24impl Default for DuplicateFinder {
25 fn default() -> Self {
26 Self {
27 min_duplicate_size: 10_000_000, min_duplicate_count: 2,
29 }
30 }
31}
32
/// A set of installations of the same dependency found during a scan.
///
/// NOTE(review): not constructed anywhere in this file — presumably
/// consumed by other modules; confirm before removing.
#[derive(Debug, Clone)]
pub struct DuplicateGroup {
    // Dependency name (e.g. npm package name).
    pub name: String,
    // Distinct version strings seen across all copies.
    pub versions: Vec<String>,
    // Directory of each installed copy.
    pub locations: Vec<PathBuf>,
    // Combined on-disk size of all copies, in bytes.
    pub total_size: u64,
    // Estimated bytes reclaimable by deduplicating.
    pub potential_savings: u64,
}
47
/// Minimal mirror of an npm `package.json` — only the fields needed to
/// identify a package; everything else in the file is ignored by serde.
#[derive(Debug, Deserialize)]
struct PackageJson {
    // Both fields are optional because malformed or partial manifests
    // exist inside node_modules; entries missing either are skipped.
    name: Option<String>,
    version: Option<String>,
}
54
55impl DuplicateFinder {
56 pub fn new() -> Self {
58 Self::default()
59 }
60
61 pub fn scan(&self, root: &Path, max_depth: usize) -> Result<Vec<Recommendation>> {
63 let mut recommendations = Vec::new();
64
65 recommendations.extend(self.find_node_duplicates(root, max_depth)?);
67
68 recommendations.extend(self.find_python_duplicates(root, max_depth)?);
70
71 recommendations.extend(self.find_rust_duplicates(root, max_depth)?);
73
74 Ok(recommendations)
75 }
76
77 fn find_node_duplicates(&self, root: &Path, max_depth: usize) -> Result<Vec<Recommendation>> {
79 let mut package_locations: HashMap<String, Vec<(PathBuf, String, u64)>> = HashMap::new();
80
81 for entry in WalkDir::new(root)
83 .max_depth(max_depth + 5) .follow_links(false)
85 .into_iter()
86 .filter_entry(|e| {
87 let name = e.file_name().to_string_lossy();
88 name != ".git" && name != "target" && name != "venv"
89 })
90 .filter_map(|e| e.ok())
91 {
92 let path = entry.path();
93
94 if path.is_file() && path.file_name().map(|n| n == "package.json").unwrap_or(false) {
96 let path_str = path.to_string_lossy();
98 if !path_str.contains("node_modules") {
99 continue;
100 }
101
102 if let Ok(content) = std::fs::read_to_string(path) {
104 if let Ok(pkg) = serde_json::from_str::<PackageJson>(&content) {
105 if let (Some(name), Some(version)) = (pkg.name, pkg.version) {
106 if name.starts_with('@') && name.contains('/') {
108 continue;
109 }
110
111 if let Some(pkg_dir) = path.parent() {
113 let size = calculate_dir_size(pkg_dir)
114 .map(|(s, _)| s)
115 .unwrap_or(0);
116
117 package_locations
118 .entry(name)
119 .or_default()
120 .push((pkg_dir.to_path_buf(), version, size));
121 }
122 }
123 }
124 }
125 }
126 }
127
128 let mut recommendations = Vec::new();
130
131 for (name, locations) in package_locations {
132 if locations.len() < self.min_duplicate_count {
133 continue;
134 }
135
136 let total_size: u64 = locations.iter().map(|(_, _, s)| s).sum();
137
138 if total_size < self.min_duplicate_size {
139 continue;
140 }
141
142 let mut version_map: HashMap<String, Vec<PathBuf>> = HashMap::new();
144 for (path, version, _) in &locations {
145 version_map
146 .entry(version.clone())
147 .or_default()
148 .push(path.clone());
149 }
150
151 let versions: Vec<String> = version_map.keys().cloned().collect();
152 let unique_versions = versions.len();
153
154 let potential_savings = if unique_versions == 1 {
156 total_size - locations.first().map(|(_, _, s)| *s).unwrap_or(0)
158 } else {
159 0
161 };
162
163 if potential_savings < 1_000_000 {
164 continue;
165 }
166
167 let locations_paths: Vec<PathBuf> = locations.iter().map(|(p, _, _)| p.clone()).collect();
168
169 recommendations.push(Recommendation {
170 kind: RecommendationKind::DuplicateDependency,
171 title: format!(
172 "📦 {} ({} copies, {})",
173 name,
174 locations.len(),
175 format_size(total_size)
176 ),
177 description: if unique_versions == 1 {
178 format!(
179 "Same version ({}) installed {} times. Could save {} with deduplication.",
180 versions.first().unwrap_or(&"?".to_string()),
181 locations.len(),
182 format_size(potential_savings)
183 )
184 } else {
185 format!(
186 "{} different versions across {} installations. Consider using pnpm or yarn workspaces.",
187 unique_versions,
188 locations.len()
189 )
190 },
191 path: locations_paths.first().cloned().unwrap_or_default(),
192 potential_savings,
193 fix_command: Some("Consider using pnpm or yarn workspaces for deduplication".to_string()),
194 risk: RiskLevel::Low,
195 });
196 }
197
198 recommendations.sort_by(|a, b| b.potential_savings.cmp(&a.potential_savings));
200
201 recommendations.truncate(20);
203
204 Ok(recommendations)
205 }
206
207 fn find_python_duplicates(&self, root: &Path, max_depth: usize) -> Result<Vec<Recommendation>> {
209 let mut venvs: Vec<(PathBuf, u64)> = Vec::new();
210
211 for entry in WalkDir::new(root)
213 .max_depth(max_depth)
214 .follow_links(false)
215 .into_iter()
216 .filter_entry(|e| {
217 let name = e.file_name().to_string_lossy();
218 name != ".git" && name != "node_modules" && name != "target"
219 })
220 .filter_map(|e| e.ok())
221 {
222 let path = entry.path();
223 let name = path.file_name().map(|n| n.to_string_lossy()).unwrap_or_default();
224
225 if path.is_dir() && (name == "venv" || name == ".venv" || name == "env") {
227 let pyvenv_cfg = path.join("pyvenv.cfg");
228 if pyvenv_cfg.exists() {
229 let size = calculate_dir_size(path).map(|(s, _)| s).unwrap_or(0);
230 venvs.push((path.to_path_buf(), size));
231 }
232 }
233 }
234
235 let mut recommendations = Vec::new();
236
237 if venvs.len() >= 3 {
238 let total_size: u64 = venvs.iter().map(|(_, s)| s).sum();
239 let avg_size = total_size / venvs.len() as u64;
240
241 let potential_savings = (total_size as f64 * 0.4) as u64;
244
245 if total_size > 500_000_000 {
246 recommendations.push(Recommendation {
247 kind: RecommendationKind::DuplicateDependency,
248 title: format!(
249 "🐍 {} Python venvs ({})",
250 venvs.len(),
251 format_size(total_size)
252 ),
253 description: format!(
254 "Found {} virtual environments averaging {}. Consider using uv, poetry, or conda for better dependency management.",
255 venvs.len(),
256 format_size(avg_size)
257 ),
258 path: venvs.first().map(|(p, _)| p.clone()).unwrap_or_default(),
259 potential_savings,
260 fix_command: Some("Consider using uv or poetry with centralized cache".to_string()),
261 risk: RiskLevel::Low,
262 });
263 }
264 }
265
266 Ok(recommendations)
267 }
268
269 fn find_rust_duplicates(&self, root: &Path, max_depth: usize) -> Result<Vec<Recommendation>> {
271 let mut targets: Vec<(PathBuf, u64)> = Vec::new();
272
273 for entry in WalkDir::new(root)
275 .max_depth(max_depth)
276 .follow_links(false)
277 .into_iter()
278 .filter_entry(|e| {
279 let name = e.file_name().to_string_lossy();
280 name != ".git" && name != "node_modules" && name != "venv"
281 })
282 .filter_map(|e| e.ok())
283 {
284 let path = entry.path();
285
286 if path.is_dir() && path.file_name().map(|n| n == "target").unwrap_or(false) {
288 if let Some(parent) = path.parent() {
289 if parent.join("Cargo.toml").exists() {
290 let size = calculate_dir_size(path).map(|(s, _)| s).unwrap_or(0);
291 if size > 50_000_000 {
292 targets.push((path.to_path_buf(), size));
293 }
294 }
295 }
296 }
297 }
298
299 let mut recommendations = Vec::new();
300
301 if targets.len() >= 2 {
302 let total_size: u64 = targets.iter().map(|(_, s)| s).sum();
303
304 let potential_savings = (total_size as f64 * 0.35) as u64;
307
308 if total_size > 1_000_000_000 {
309 recommendations.push(Recommendation {
310 kind: RecommendationKind::DuplicateDependency,
311 title: format!(
312 "🦀 {} Rust targets ({})",
313 targets.len(),
314 format_size(total_size)
315 ),
316 description: format!(
317 "Found {} Rust projects with separate target directories. Consider using CARGO_TARGET_DIR for shared compilation cache.",
318 targets.len()
319 ),
320 path: targets.first().map(|(p, _)| p.clone()).unwrap_or_default(),
321 potential_savings,
322 fix_command: Some("export CARGO_TARGET_DIR=~/.cargo/target".to_string()),
323 risk: RiskLevel::None,
324 });
325 }
326 }
327
328 Ok(recommendations)
329 }
330}
331
/// Local shim delegating to the parent module's shared size formatter,
/// so call sites in this file stay short.
fn format_size(bytes: u64) -> String {
    super::format_size(bytes)
}
336
#[cfg(test)]
mod tests {
    use super::*;

    /// Defaults must match the documented thresholds.
    #[test]
    fn test_duplicate_finder_creation() {
        let finder = DuplicateFinder::new();
        assert_eq!(finder.min_duplicate_count, 2);
        assert_eq!(finder.min_duplicate_size, 10_000_000);
    }

    /// Scanning an empty directory must succeed and yield nothing.
    ///
    /// The previous version scanned the real working directory (slow,
    /// environment-dependent) and asserted nothing, so it could never
    /// fail; a fresh temp dir makes the test fast and deterministic.
    #[test]
    fn test_duplicate_scan_empty_dir() {
        let dir = std::env::temp_dir()
            .join(format!("duplicate_finder_test_{}", std::process::id()));
        std::fs::create_dir_all(&dir).expect("create temp dir");

        let finder = DuplicateFinder::new();
        let recommendations = finder.scan(&dir, 3).expect("scan should succeed");
        assert!(recommendations.is_empty());

        // Best-effort cleanup; failure to remove is not a test failure.
        let _ = std::fs::remove_dir_all(&dir);
    }
}