// null_e/analysis/duplicates.rs
1//! Duplicate dependency detection
2//!
3//! Finds duplicate dependencies across projects:
4//! - Same npm package in multiple node_modules
5//! - Multiple Python venvs with similar packages
6//! - Duplicate cargo dependencies in different targets
7
8use super::{Recommendation, RecommendationKind, RiskLevel};
9use crate::cleaners::calculate_dir_size;
10use crate::error::Result;
11use serde::Deserialize;
12use std::collections::HashMap;
13use std::path::{Path, PathBuf};
14use walkdir::WalkDir;
15
/// Duplicate dependency finder
///
/// Walks a directory tree looking for duplicated dependency payloads
/// (npm packages under `node_modules`, Python virtualenvs, Rust `target`
/// directories) and produces consolidation `Recommendation`s. Both
/// thresholds below are public so callers can tune sensitivity.
pub struct DuplicateFinder {
    /// Minimum size to report duplicates (bytes) — groups whose combined
    /// on-disk size is below this are ignored.
    pub min_duplicate_size: u64,
    /// Minimum number of duplicates to report — a package must appear in
    /// at least this many locations to count as a duplicate group.
    pub min_duplicate_count: usize,
}
23
24impl Default for DuplicateFinder {
25    fn default() -> Self {
26        Self {
27            min_duplicate_size: 10_000_000, // 10MB total across duplicates
28            min_duplicate_count: 2,
29        }
30    }
31}
32
/// A detected duplicate
///
/// Aggregated view of one package name found in several places.
/// NOTE(review): this type is not constructed anywhere in this file —
/// confirm it is used by other modules before relying on (or removing) it.
#[derive(Debug, Clone)]
pub struct DuplicateGroup {
    /// Package name
    pub name: String,
    /// Versions found
    pub versions: Vec<String>,
    /// Locations where this package is duplicated
    pub locations: Vec<PathBuf>,
    /// Total size across all duplicates
    pub total_size: u64,
    /// Potential savings if consolidated
    pub potential_savings: u64,
}
47
/// Package info extracted from package.json
///
/// Only the two fields needed for duplicate grouping are deserialized;
/// serde ignores all other manifest keys. Both are `Option` because
/// partial or malformed manifests occur in real node_modules trees —
/// entries missing either field are skipped by the caller.
#[derive(Debug, Deserialize)]
struct PackageJson {
    // `"name"` field, e.g. `"lodash"` or `"@scope/pkg"`
    name: Option<String>,
    // `"version"` field, e.g. `"4.17.21"`
    version: Option<String>,
}
54
impl DuplicateFinder {
    /// Create a new duplicate finder with the default thresholds
    /// (10 MB combined-size floor, at least 2 copies).
    pub fn new() -> Self {
        Self::default()
    }

    /// Scan for duplicate dependencies
    ///
    /// Runs the npm, Python, and Rust finders under `root` and
    /// concatenates their recommendations in that order.
    ///
    /// `max_depth` bounds the directory traversal depth (the npm pass
    /// internally walks `max_depth + 5` to reach nested node_modules).
    ///
    /// # Errors
    /// Propagates any error returned by the individual finders.
    pub fn scan(&self, root: &Path, max_depth: usize) -> Result<Vec<Recommendation>> {
        let mut recommendations = Vec::new();

        // Find node_modules duplicates
        recommendations.extend(self.find_node_duplicates(root, max_depth)?);

        // Find Python venv duplicates
        recommendations.extend(self.find_python_duplicates(root, max_depth)?);

        // Find Rust target duplicates
        recommendations.extend(self.find_rust_duplicates(root, max_depth)?);

        Ok(recommendations)
    }

    /// Find duplicate packages in node_modules
    ///
    /// Collects every `package.json` located under a `node_modules`
    /// directory, groups packages by name, and emits one recommendation
    /// per name that clears both configured thresholds plus a ~1 MB
    /// savings floor. Results are sorted by potential savings and capped
    /// at the top 20.
    fn find_node_duplicates(&self, root: &Path, max_depth: usize) -> Result<Vec<Recommendation>> {
        // name -> every (package dir, version, on-disk size) seen for it
        let mut package_locations: HashMap<String, Vec<(PathBuf, String, u64)>> = HashMap::new();

        // Find all node_modules directories
        for entry in WalkDir::new(root)
            .max_depth(max_depth + 5) // Go deeper to find nested node_modules
            .follow_links(false)
            .into_iter()
            .filter_entry(|e| {
                // Prune VCS and other-ecosystem trees before descending.
                let name = e.file_name().to_string_lossy();
                name != ".git" && name != "target" && name != "venv"
            })
            .filter_map(|e| e.ok()) // unreadable entries are silently skipped
        {
            let path = entry.path();

            // Look for package.json files in node_modules
            if path.is_file() && path.file_name().map(|n| n == "package.json").unwrap_or(false) {
                // Check if inside node_modules
                // (substring test over the full path string, so any
                // ancestor component named node_modules qualifies)
                let path_str = path.to_string_lossy();
                if !path_str.contains("node_modules") {
                    continue;
                }

                // Read package.json; unreadable or unparsable manifests
                // are ignored rather than aborting the scan.
                if let Ok(content) = std::fs::read_to_string(path) {
                    if let Ok(pkg) = serde_json::from_str::<PackageJson>(&content) {
                        if let (Some(name), Some(version)) = (pkg.name, pkg.version) {
                            // Skip scoped packages that are likely unique
                            // NOTE(review): this skips *every* "@scope/pkg"
                            // name, including very common scoped packages —
                            // confirm this blanket exclusion is intended.
                            if name.starts_with('@') && name.contains('/') {
                                continue;
                            }

                            // Get package directory
                            if let Some(pkg_dir) = path.parent() {
                                // Size errors degrade to 0; the package is
                                // still recorded as a location.
                                let size = calculate_dir_size(pkg_dir)
                                    .map(|(s, _)| s)
                                    .unwrap_or(0);

                                package_locations
                                    .entry(name)
                                    .or_default()
                                    .push((pkg_dir.to_path_buf(), version, size));
                            }
                        }
                    }
                }
            }
        }

        // Find packages that appear multiple times
        let mut recommendations = Vec::new();

        for (name, locations) in package_locations {
            // Below the configured copy-count threshold: not a group.
            if locations.len() < self.min_duplicate_count {
                continue;
            }

            let total_size: u64 = locations.iter().map(|(_, _, s)| s).sum();

            // Combined size too small to be worth reporting.
            if total_size < self.min_duplicate_size {
                continue;
            }

            // Group by version
            let mut version_map: HashMap<String, Vec<PathBuf>> = HashMap::new();
            for (path, version, _) in &locations {
                version_map
                    .entry(version.clone())
                    .or_default()
                    .push(path.clone());
            }

            let versions: Vec<String> = version_map.keys().cloned().collect();
            let unique_versions = versions.len();

            // Potential savings: keep only one copy of each version
            let potential_savings = if unique_versions == 1 {
                // Same version everywhere - could theoretically use symlinks.
                // The retained copy is arbitrarily the first one recorded.
                total_size - locations.first().map(|(_, _, s)| *s).unwrap_or(0)
            } else {
                // Different versions - harder to dedupe
                0
            };

            // Require at least ~1 MB of estimated savings.
            if potential_savings < 1_000_000 {
                continue;
            }

            let locations_paths: Vec<PathBuf> = locations.iter().map(|(p, _, _)| p.clone()).collect();

            recommendations.push(Recommendation {
                kind: RecommendationKind::DuplicateDependency,
                title: format!(
                    "📦 {} ({} copies, {})",
                    name,
                    locations.len(),
                    format_size(total_size)
                ),
                description: if unique_versions == 1 {
                    format!(
                        "Same version ({}) installed {} times. Could save {} with deduplication.",
                        versions.first().unwrap_or(&"?".to_string()),
                        locations.len(),
                        format_size(potential_savings)
                    )
                } else {
                    format!(
                        "{} different versions across {} installations. Consider using pnpm or yarn workspaces.",
                        unique_versions,
                        locations.len()
                    )
                },
                // Representative path: the first recorded location.
                path: locations_paths.first().cloned().unwrap_or_default(),
                potential_savings,
                fix_command: Some("Consider using pnpm or yarn workspaces for deduplication".to_string()),
                risk: RiskLevel::Low,
            });
        }

        // Sort by potential savings (largest first)
        recommendations.sort_by(|a, b| b.potential_savings.cmp(&a.potential_savings));

        // Limit to top 20
        recommendations.truncate(20);

        Ok(recommendations)
    }

    /// Find duplicate Python venvs
    ///
    /// Collects directories named `venv`, `.venv`, or `env` that contain a
    /// `pyvenv.cfg`. Emits at most one aggregate recommendation, and only
    /// when three or more venvs together exceed 500 MB; savings are a
    /// rough 40% estimate of the combined size.
    fn find_python_duplicates(&self, root: &Path, max_depth: usize) -> Result<Vec<Recommendation>> {
        let mut venvs: Vec<(PathBuf, u64)> = Vec::new();

        // Find all venv/virtualenv directories
        for entry in WalkDir::new(root)
            .max_depth(max_depth)
            .follow_links(false)
            .into_iter()
            .filter_entry(|e| {
                // Prune VCS and other-ecosystem trees.
                let name = e.file_name().to_string_lossy();
                name != ".git" && name != "node_modules" && name != "target"
            })
            .filter_map(|e| e.ok())
        {
            let path = entry.path();
            let name = path.file_name().map(|n| n.to_string_lossy()).unwrap_or_default();

            // Check for venv markers
            if path.is_dir() && (name == "venv" || name == ".venv" || name == "env") {
                // pyvenv.cfg distinguishes a real virtualenv from a
                // directory that merely shares the name.
                let pyvenv_cfg = path.join("pyvenv.cfg");
                if pyvenv_cfg.exists() {
                    let size = calculate_dir_size(path).map(|(s, _)| s).unwrap_or(0);
                    venvs.push((path.to_path_buf(), size));
                }
            }
        }

        let mut recommendations = Vec::new();

        if venvs.len() >= 3 {
            let total_size: u64 = venvs.iter().map(|(_, s)| s).sum();
            let avg_size = total_size / venvs.len() as u64; // len >= 3, no div-by-zero

            // Many venvs often have duplicate packages
            // Estimate ~40% could be shared
            let potential_savings = (total_size as f64 * 0.4) as u64;

            // Only surface once the combined footprint passes ~500 MB.
            if total_size > 500_000_000 {
                recommendations.push(Recommendation {
                    kind: RecommendationKind::DuplicateDependency,
                    title: format!(
                        "🐍 {} Python venvs ({})",
                        venvs.len(),
                        format_size(total_size)
                    ),
                    description: format!(
                        "Found {} virtual environments averaging {}. Consider using uv, poetry, or conda for better dependency management.",
                        venvs.len(),
                        format_size(avg_size)
                    ),
                    path: venvs.first().map(|(p, _)| p.clone()).unwrap_or_default(),
                    potential_savings,
                    fix_command: Some("Consider using uv or poetry with centralized cache".to_string()),
                    risk: RiskLevel::Low,
                });
            }
        }

        Ok(recommendations)
    }

    /// Find duplicate Rust target directories
    ///
    /// Collects `target` directories sitting next to a `Cargo.toml` that
    /// each exceed 50 MB. Emits at most one aggregate recommendation, and
    /// only when two or more targets together exceed 1 GB; savings are a
    /// rough 35% estimate of the combined size.
    fn find_rust_duplicates(&self, root: &Path, max_depth: usize) -> Result<Vec<Recommendation>> {
        let mut targets: Vec<(PathBuf, u64)> = Vec::new();

        // Find all Rust target directories
        for entry in WalkDir::new(root)
            .max_depth(max_depth)
            .follow_links(false)
            .into_iter()
            .filter_entry(|e| {
                // Prune VCS and other-ecosystem trees.
                let name = e.file_name().to_string_lossy();
                name != ".git" && name != "node_modules" && name != "venv"
            })
            .filter_map(|e| e.ok())
        {
            let path = entry.path();

            // Check for target with Cargo.toml sibling
            if path.is_dir() && path.file_name().map(|n| n == "target").unwrap_or(false) {
                if let Some(parent) = path.parent() {
                    if parent.join("Cargo.toml").exists() {
                        let size = calculate_dir_size(path).map(|(s, _)| s).unwrap_or(0);
                        // Ignore small build dirs (< 50 MB each).
                        if size > 50_000_000 {
                            targets.push((path.to_path_buf(), size));
                        }
                    }
                }
            }
        }

        let mut recommendations = Vec::new();

        if targets.len() >= 2 {
            let total_size: u64 = targets.iter().map(|(_, s)| s).sum();

            // Rust target directories often share compiled dependencies
            // With a shared target directory, could save ~30-50%
            let potential_savings = (total_size as f64 * 0.35) as u64;

            // Only surface once the combined footprint passes ~1 GB.
            if total_size > 1_000_000_000 {
                recommendations.push(Recommendation {
                    kind: RecommendationKind::DuplicateDependency,
                    title: format!(
                        "🦀 {} Rust targets ({})",
                        targets.len(),
                        format_size(total_size)
                    ),
                    description: format!(
                        "Found {} Rust projects with separate target directories. Consider using CARGO_TARGET_DIR for shared compilation cache.",
                        targets.len()
                    ),
                    path: targets.first().map(|(p, _)| p.clone()).unwrap_or_default(),
                    potential_savings,
                    fix_command: Some("export CARGO_TARGET_DIR=~/.cargo/target".to_string()),
                    risk: RiskLevel::None,
                });
            }
        }

        Ok(recommendations)
    }
}
331
/// Format bytes as human-readable size
///
/// Thin local alias so call sites in this module stay short; delegates
/// to the shared formatter in the parent module.
fn format_size(bytes: u64) -> String {
    super::format_size(bytes)
}
336
#[cfg(test)]
mod tests {
    use super::*;

    /// `new` must agree with the documented `Default` thresholds.
    #[test]
    fn test_duplicate_finder_creation() {
        let finder = DuplicateFinder::new();
        assert_eq!(finder.min_duplicate_count, 2);
        assert_eq!(finder.min_duplicate_size, 10_000_000);
    }

    /// A shallow scan must succeed (all inner I/O errors degrade to
    /// empty/zero results) and every emitted recommendation must clear
    /// the smallest savings floor any finder applies (1 MB for node
    /// duplicates; the venv/target aggregates require far more).
    #[test]
    fn test_duplicate_scan() {
        let finder = DuplicateFinder::new();
        let recommendations = finder
            .scan(Path::new("."), 3)
            .expect("shallow scan of the working directory should not fail");
        for rec in &recommendations {
            assert!(
                rec.potential_savings >= 1_000_000,
                "recommendation below savings floor: {}",
                rec.title
            );
        }
    }
}