1use std::collections::HashMap;
2use std::fs::File;
3use std::io::Read;
4use std::os::unix::fs::MetadataExt;
5use std::path::{Path, PathBuf};
6use std::sync::atomic::{AtomicU64, Ordering};
7use std::sync::{Arc, Mutex};
8use std::time::SystemTime;
9
10use anyhow::Result;
11use ignore::WalkBuilder;
12use rayon::prelude::*;
13
/// Configuration for a duplicate-file scan.
#[derive(Debug, Clone, Default)]
pub struct DupeOptions {
    /// Directories to scan; roots that do not exist are silently skipped.
    pub root_paths: Vec<PathBuf>,
    /// Minimum file size in bytes to consider (defaults to 1, i.e. non-empty).
    pub min_size: Option<u64>,
    /// Minimum number of identical files required to report a group
    /// (defaults to 2).
    pub min_group_size: Option<usize>,
}
24
/// A single file that is a candidate for (or a member of) a duplicate group.
#[derive(Debug, Clone)]
pub struct DuplicateFile {
    /// Absolute path of the file as seen during the walk.
    pub path: PathBuf,
    /// File size in bytes.
    pub size: u64,
    /// Modification time, if the platform reported one.
    pub mtime: Option<SystemTime>,
    /// Inode number; combined with `device` to detect hard links.
    pub inode: u64,
    /// Device id; combined with `inode` to detect hard links.
    pub device: u64,
}
34
/// A set of files confirmed to have identical content (same size, same
/// full-content hash).
#[derive(Debug, Clone)]
pub struct DuplicateGroup {
    /// Hex-encoded BLAKE3 hash of the files' full contents.
    pub hash: String,
    /// Size in bytes of each file in the group.
    pub size: u64,
    /// The identical files.
    pub files: Vec<DuplicateFile>,
    /// True when every file in the group appears to already share storage
    /// blocks (APFS copy-on-write clones on macOS); always false elsewhere.
    pub is_clone_group: bool,
}
47
48impl DuplicateGroup {
49 pub fn wasted_space(&self) -> u64 {
51 if self.files.len() <= 1 {
52 0
53 } else {
54 self.size * (self.files.len() as u64 - 1)
55 }
56 }
57}
58
/// Outcome of a completed duplicate scan.
#[derive(Debug)]
pub struct DupeResult {
    /// Confirmed duplicate groups, sorted by wasted space (largest first).
    pub groups: Vec<DuplicateGroup>,
    /// Sum of `wasted_space()` over all groups, in bytes.
    pub total_wasted: u64,
    /// Number of regular files visited by the walk; 0 when no progress
    /// tracker was passed to `find_duplicates`.
    pub files_scanned: u64,
    /// Zero-byte files encountered and excluded from grouping.
    pub empty_files_skipped: u64,
}
67
/// Live counters for an in-flight scan; safe to read from another thread
/// while the scan runs.
pub struct DupeProgress {
    /// Regular files visited by the directory walk.
    pub files_walked: AtomicU64,
    /// Number of distinct file sizes seen (size buckets before filtering).
    pub size_groups: AtomicU64,
    /// Files queued for hashing after size-grouping and group filtering.
    pub files_to_hash: AtomicU64,
    /// Hash operations completed. Incremented in both the partial- and
    /// full-hash passes, so it can exceed `files_to_hash`.
    pub files_hashed: AtomicU64,
    /// Total bytes fed to the hashers across both passes.
    pub bytes_hashed: AtomicU64,
}
76
77impl Default for DupeProgress {
78 fn default() -> Self {
79 Self {
80 files_walked: AtomicU64::new(0),
81 size_groups: AtomicU64::new(0),
82 files_to_hash: AtomicU64::new(0),
83 files_hashed: AtomicU64::new(0),
84 bytes_hashed: AtomicU64::new(0),
85 }
86 }
87}
88
89impl DupeProgress {
90 pub fn new() -> Self {
91 Self::default()
92 }
93}
94
/// Absolute path prefixes that are never scanned (OS-managed trees).
const DEFAULT_EXCLUSIONS: &[&str] = &[
    "/System",
    "/usr/bin",
    "/usr/lib",
    "/usr/libexec",
    "/usr/sbin",
    "/usr/share",
    "/bin",
    "/sbin",
    "/private/var/db",
];

/// Suffixes of path components that mark a subtree to skip: git metadata
/// directories and macOS application bundles (e.g. `Foo.app`).
const EXCLUDED_COMPONENTS: &[&str] = &[".git", ".app"];

/// Returns true when `path` must be excluded from scanning.
///
/// Fix: prefix matching now uses `Path::starts_with`, which compares whole
/// path components, so `/usr/binary` is no longer excluded by the
/// `/usr/bin` rule (the previous string-level `starts_with` matched it).
fn should_exclude(path: &Path) -> bool {
    // Component-wise prefix match against the fixed exclusion list.
    if DEFAULT_EXCLUSIONS.iter().any(|excl| path.starts_with(excl)) {
        return true;
    }

    // Any single component ending in an excluded suffix hides the subtree.
    path.components().any(|component| {
        let s = component.as_os_str().to_string_lossy();
        EXCLUDED_COMPONENTS.iter().any(|excl| s.ends_with(excl))
    })
}
129
130const PARTIAL_HASH_SIZE: usize = 4096;
132
133const MMAP_THRESHOLD: u64 = 1024 * 1024; pub fn find_duplicates(
138 options: &DupeOptions,
139 progress: Option<&DupeProgress>,
140) -> Result<DupeResult> {
141 let min_size = options.min_size.unwrap_or(1);
142 let min_group = options.min_group_size.unwrap_or(2);
143
144 let (size_groups, empty_count) = walk_and_group_by_size(options, min_size, progress)?;
146
147 if let Some(p) = progress {
148 p.size_groups
149 .store(size_groups.len() as u64, Ordering::Relaxed);
150 }
151
152 let size_groups: Vec<Vec<DuplicateFile>> = size_groups
154 .into_values()
155 .filter(|group| group.len() >= min_group)
156 .collect();
157
158 let files_to_hash: u64 = size_groups.iter().map(|g| g.len() as u64).sum();
160 if let Some(p) = progress {
161 p.files_to_hash.store(files_to_hash, Ordering::Relaxed);
162 }
163
164 let deduped_groups: Vec<Vec<DuplicateFile>> = size_groups
166 .into_iter()
167 .map(dedup_by_inode)
168 .filter(|g| g.len() >= min_group)
169 .collect();
170
171 let partial_groups: Vec<Vec<DuplicateFile>> = deduped_groups
173 .into_par_iter()
174 .flat_map(|group| partial_hash_group(group, progress))
175 .filter(|g| g.len() >= min_group)
176 .collect();
177
178 let final_groups: Vec<DuplicateGroup> = partial_groups
180 .into_par_iter()
181 .flat_map(|group| full_hash_group(group, progress))
182 .filter(|g| g.files.len() >= min_group)
183 .collect();
184
185 let mut groups = final_groups;
187 groups.sort_by_key(|g| std::cmp::Reverse(g.wasted_space()));
188
189 let total_wasted: u64 = groups.iter().map(|g| g.wasted_space()).sum();
190 let files_scanned = progress
191 .map(|p| p.files_walked.load(Ordering::Relaxed))
192 .unwrap_or(0);
193
194 Ok(DupeResult {
195 groups,
196 total_wasted,
197 files_scanned,
198 empty_files_skipped: empty_count,
199 })
200}
201
/// Walk every root in `options.root_paths` and bucket regular files by
/// size, returning `(size -> files, empty_files_skipped)`.
///
/// Uses the `ignore` crate's parallel walker; worker threads merge their
/// finds into a single map behind a mutex. Zero-byte files are counted
/// separately and files below `min_size` are dropped.
fn walk_and_group_by_size(
    options: &DupeOptions,
    min_size: u64,
    progress: Option<&DupeProgress>,
) -> Result<(HashMap<u64, Vec<DuplicateFile>>, u64)> {
    let collector: Arc<Mutex<HashMap<u64, Vec<DuplicateFile>>>> =
        Arc::new(Mutex::new(HashMap::new()));
    let empty_count = Arc::new(AtomicU64::new(0));

    for root in &options.root_paths {
        // Missing roots are skipped silently rather than treated as errors.
        if !root.exists() {
            continue;
        }

        // Respect per-repo .gitignore but not global/exclude files; do not
        // follow symlinks; include hidden files.
        let mut builder = WalkBuilder::new(root);
        builder
            .hidden(false)
            .git_ignore(true)
            .git_global(false)
            .git_exclude(false)
            .follow_links(false)
            .threads(num_cpus());

        let walker = builder.build_parallel();
        let collector_ref = Arc::clone(&collector);
        let empty_ref = Arc::clone(&empty_count);

        walker.run(|| {
            // Each worker thread gets its own clones of the shared handles.
            let collector = Arc::clone(&collector_ref);
            let empty_count = Arc::clone(&empty_ref);

            Box::new(move |entry| {
                // Unreadable entries are skipped, not treated as fatal.
                let Ok(entry) = entry else {
                    return ignore::WalkState::Continue;
                };

                let path = entry.path();

                // `Skip` prunes the whole subtree when `entry` is a directory.
                if should_exclude(path) {
                    return ignore::WalkState::Skip;
                }

                let Some(file_type) = entry.file_type() else {
                    return ignore::WalkState::Continue;
                };
                if !file_type.is_file() {
                    return ignore::WalkState::Continue;
                }

                // Counts every regular file seen, before any size filtering.
                if let Some(p) = progress {
                    p.files_walked.fetch_add(1, Ordering::Relaxed);
                }

                let Ok(meta) = entry.metadata() else {
                    return ignore::WalkState::Continue;
                };

                let size = meta.len();

                // Zero-byte files are all trivially identical; count and skip.
                if size == 0 {
                    empty_count.fetch_add(1, Ordering::Relaxed);
                    return ignore::WalkState::Continue;
                }

                if size < min_size {
                    return ignore::WalkState::Continue;
                }

                let file = DuplicateFile {
                    path: path.to_path_buf(),
                    size,
                    mtime: meta.modified().ok(),
                    inode: meta.ino(),
                    device: meta.dev(),
                };

                if let Ok(mut map) = collector.lock() {
                    map.entry(size).or_default().push(file);
                }

                ignore::WalkState::Continue
            })
        });
    }

    // After `run` returns, the worker closures are dropped, so the Arc is
    // normally unique here; the lock-and-clone path is a defensive fallback.
    let map = Arc::try_unwrap(collector)
        .map(|mutex| mutex.into_inner().unwrap_or_else(|e| e.into_inner()))
        .unwrap_or_else(|arc| arc.lock().unwrap().clone());

    let empty = empty_count.load(Ordering::Relaxed);

    Ok((map, empty))
}
299
300fn dedup_by_inode(group: Vec<DuplicateFile>) -> Vec<DuplicateFile> {
302 let mut seen: HashMap<(u64, u64), usize> = HashMap::new();
303 let mut result = Vec::new();
304
305 for file in group {
306 let key = (file.device, file.inode);
307 if seen.contains_key(&key) {
308 continue; }
310 seen.insert(key, result.len());
311 result.push(file);
312 }
313
314 result
315}
316
317fn partial_hash_group(
320 group: Vec<DuplicateFile>,
321 progress: Option<&DupeProgress>,
322) -> Vec<Vec<DuplicateFile>> {
323 let mut hash_groups: HashMap<[u8; 32], Vec<DuplicateFile>> = HashMap::new();
324
325 for file in group {
326 match partial_hash(&file.path) {
327 Ok(hash) => {
328 if let Some(p) = progress {
329 p.files_hashed.fetch_add(1, Ordering::Relaxed);
330 let hashed = PARTIAL_HASH_SIZE.min(file.size as usize) as u64;
331 p.bytes_hashed.fetch_add(hashed, Ordering::Relaxed);
332 }
333 hash_groups.entry(hash).or_default().push(file);
334 }
335 Err(_) => {
336 }
338 }
339 }
340
341 hash_groups.into_values().collect()
342}
343
344fn partial_hash(path: &Path) -> Result<[u8; 32]> {
346 let mut file = File::open(path)?;
347 let mut buf = vec![0u8; PARTIAL_HASH_SIZE];
348 let n = file.read(&mut buf)?;
349 buf.truncate(n);
350 Ok(*blake3::hash(&buf).as_bytes())
351}
352
353fn full_hash_group(
355 group: Vec<DuplicateFile>,
356 progress: Option<&DupeProgress>,
357) -> Vec<DuplicateGroup> {
358 let mut hash_groups: HashMap<String, Vec<DuplicateFile>> = HashMap::new();
359
360 for file in &group {
361 match full_hash(&file.path, file.size) {
362 Ok(hash) => {
363 if let Some(p) = progress {
364 p.files_hashed.fetch_add(1, Ordering::Relaxed);
365 p.bytes_hashed.fetch_add(file.size, Ordering::Relaxed);
366 }
367 hash_groups.entry(hash).or_default().push(file.clone());
368 }
369 Err(_) => {
370 }
372 }
373 }
374
375 let size = group.first().map(|f| f.size).unwrap_or(0);
376
377 hash_groups
378 .into_iter()
379 .map(|(hash, files)| {
380 let is_clone_group = check_apfs_clones(&files);
381 DuplicateGroup {
382 hash,
383 size,
384 files,
385 is_clone_group,
386 }
387 })
388 .collect()
389}
390
391fn full_hash(path: &Path, size: u64) -> Result<String> {
394 if size > MMAP_THRESHOLD {
395 full_hash_mmap(path)
396 } else {
397 full_hash_buffered(path)
398 }
399}
400
401fn full_hash_buffered(path: &Path) -> Result<String> {
403 let mut file = File::open(path)?;
404 let mut hasher = blake3::Hasher::new();
405 let mut buf = vec![0u8; 65536];
406 loop {
407 let n = file.read(&mut buf)?;
408 if n == 0 {
409 break;
410 }
411 hasher.update(&buf[..n]);
412 }
413 Ok(hasher.finalize().to_hex().to_string())
414}
415
/// Hash a file's full contents through a read-only memory map (used for
/// files above `MMAP_THRESHOLD`).
fn full_hash_mmap(path: &Path) -> Result<String> {
    let file = File::open(path)?;
    // SAFETY: mapping a file is unsound if another process truncates or
    // rewrites it while mapped (contents may change underneath us, or the
    // process may receive SIGBUS). Accepted for a read-only scan tool;
    // worst case is a wrong or failed hash for a file being modified.
    let mmap = unsafe { memmap2::MmapOptions::new().map(&file)? };
    let hash = blake3::hash(&mmap);
    Ok(hash.to_hex().to_string())
}
425
/// Best-effort check whether every file in a group is already an APFS
/// copy-on-write clone sharing storage blocks (i.e. deduplicating them
/// would reclaim nothing).
#[cfg(target_os = "macos")]
fn check_apfs_clones(files: &[DuplicateFile]) -> bool {
    use std::os::macos::fs::MetadataExt;

    // NOTE(review): assumed to be the "may share blocks" bit of st_flags;
    // confirm the value against <sys/stat.h> on macOS — 0x8 collides with
    // UF_OPAQUE in some BSD headers.
    const EF_MAY_SHARE_BLOCKS: u32 = 0x0000_0008;

    // An empty group cannot be a clone group.
    if files.is_empty() {
        return false;
    }

    // True only when every member advertises the shared-blocks flag;
    // metadata errors count as "not a clone".
    files.iter().all(|f| {
        std::fs::metadata(&f.path)
            .map(|m| (m.st_flags() & EF_MAY_SHARE_BLOCKS) != 0)
            .unwrap_or(false)
    })
}
444
/// Clone detection is APFS-specific; on non-macOS platforms no group is
/// ever reported as a clone group.
#[cfg(not(target_os = "macos"))]
fn check_apfs_clones(_files: &[DuplicateFile]) -> bool {
    false
}
449
450fn num_cpus() -> usize {
452 std::thread::available_parallelism()
453 .map(|n| n.get())
454 .unwrap_or(4)
455}
456
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    // Create a fresh scratch directory under the system temp dir,
    // namespaced by `name` so tests do not interfere with each other.
    fn setup_test_dir(name: &str) -> PathBuf {
        let tmp = std::env::temp_dir().join(format!("diskforge_test_dupe_{name}"));
        let _ = fs::remove_dir_all(&tmp);
        fs::create_dir_all(&tmp).unwrap();
        tmp
    }

    // Write `content` to `dir/name`, creating any parent directories.
    fn create_file(dir: &Path, name: &str, content: &[u8]) -> PathBuf {
        let path = dir.join(name);
        if let Some(parent) = path.parent() {
            fs::create_dir_all(parent).unwrap();
        }
        fs::write(&path, content).unwrap();
        path
    }

    // Distinct sizes can never collide, so no groups should be produced.
    #[test]
    fn unique_sizes_no_duplicates() {
        let tmp = setup_test_dir("unique_sizes");
        create_file(&tmp, "a.txt", &[1u8; 100]);
        create_file(&tmp, "b.txt", &[2u8; 200]);
        create_file(&tmp, "c.txt", &vec![3u8; 300]);

        let opts = DupeOptions {
            root_paths: vec![tmp.clone()],
            min_size: Some(1),
            ..Default::default()
        };

        let result = find_duplicates(&opts, None).unwrap();
        assert!(
            result.groups.is_empty(),
            "Files with unique sizes should produce no groups"
        );

        fs::remove_dir_all(&tmp).ok();
    }

    // Equal sizes but different bytes must be rejected by the hash passes.
    #[test]
    fn same_size_different_content_no_duplicates() {
        let tmp = setup_test_dir("same_size_diff_content");
        create_file(&tmp, "a.txt", b"hello world!");
        create_file(&tmp, "b.txt", b"goodbye now!");

        let opts = DupeOptions {
            root_paths: vec![tmp.clone()],
            min_size: Some(1),
            ..Default::default()
        };

        let result = find_duplicates(&opts, None).unwrap();
        assert!(
            result.groups.is_empty(),
            "Same size but different content should not be grouped"
        );

        fs::remove_dir_all(&tmp).ok();
    }

    // Three byte-identical files should form one group of three, and
    // wasted space should count every copy beyond the first.
    #[test]
    fn identical_files_grouped() {
        let tmp = setup_test_dir("identical_files");
        let content = b"this is duplicate content for testing";
        create_file(&tmp, "a.txt", content);
        create_file(&tmp, "b.txt", content);
        create_file(&tmp, "c.txt", content);

        let opts = DupeOptions {
            root_paths: vec![tmp.clone()],
            min_size: Some(1),
            ..Default::default()
        };

        let result = find_duplicates(&opts, None).unwrap();
        assert_eq!(result.groups.len(), 1, "Should find 1 duplicate group");
        assert_eq!(
            result.groups[0].files.len(),
            3,
            "Group should contain 3 files"
        );
        assert_eq!(
            result.groups[0].wasted_space(),
            content.len() as u64 * 2,
            "Wasted = size * (count - 1)"
        );

        fs::remove_dir_all(&tmp).ok();
    }

    // Hard links share one inode; inode dedup should collapse them before
    // hashing, leaving no duplicate group to report.
    #[test]
    fn hard_links_deduplicated() {
        let tmp = setup_test_dir("hard_links");
        let content = b"content for hard link test that is long enough";
        let original = create_file(&tmp, "original.txt", content);
        let link_path = tmp.join("hardlink.txt");
        fs::hard_link(&original, &link_path).unwrap();

        let opts = DupeOptions {
            root_paths: vec![tmp.clone()],
            min_size: Some(1),
            ..Default::default()
        };

        let result = find_duplicates(&opts, None).unwrap();
        assert!(
            result.groups.is_empty(),
            "Hard links to the same inode should not be reported as duplicates"
        );

        fs::remove_dir_all(&tmp).ok();
    }

    // Zero-byte files are counted in `empty_files_skipped` but never
    // grouped, even with no explicit min_size.
    #[test]
    fn empty_files_excluded() {
        let tmp = setup_test_dir("empty_files");
        create_file(&tmp, "a.txt", b"");
        create_file(&tmp, "b.txt", b"");
        create_file(&tmp, "c.txt", b"");

        let opts = DupeOptions {
            root_paths: vec![tmp.clone()],
            min_size: None,
            ..Default::default()
        };

        let result = find_duplicates(&opts, None).unwrap();
        assert!(result.groups.is_empty(), "Empty files should be excluded");
        assert_eq!(result.empty_files_skipped, 3);

        fs::remove_dir_all(&tmp).ok();
    }

    // Duplicate pairs below `min_size` are ignored; only the large pair
    // should survive the size filter and form a group.
    #[test]
    fn min_size_filter() {
        let tmp = setup_test_dir("min_size");
        let small = vec![1u8; 50];
        let big = vec![2u8; 500];
        create_file(&tmp, "small1.txt", &small);
        create_file(&tmp, "small2.txt", &small);
        create_file(&tmp, "big1.txt", &big);
        create_file(&tmp, "big2.txt", &big);

        let opts = DupeOptions {
            root_paths: vec![tmp.clone()],
            min_size: Some(100),
            ..Default::default()
        };

        let result = find_duplicates(&opts, None).unwrap();
        assert_eq!(
            result.groups.len(),
            1,
            "Only the big files should form a group"
        );
        assert_eq!(result.groups[0].size, 500);

        fs::remove_dir_all(&tmp).ok();
    }

    // Two files of equal size whose bytes differ inside the first 4 KiB
    // (but match afterwards) must be separated by the partial-hash pass.
    #[test]
    fn partial_hash_eliminates_different_content() {
        let tmp = setup_test_dir("partial_hash");
        let mut content_a = vec![0xAAu8; 5000];
        let mut content_b = vec![0xBBu8; 5000];
        // Make the tails identical so only the hashed prefix differs.
        content_a[4096..].fill(0xFF);
        content_b[4096..].fill(0xFF);

        create_file(&tmp, "a.bin", &content_a);
        create_file(&tmp, "b.bin", &content_b);

        let opts = DupeOptions {
            root_paths: vec![tmp.clone()],
            min_size: Some(1),
            ..Default::default()
        };

        let result = find_duplicates(&opts, None).unwrap();
        assert!(
            result.groups.is_empty(),
            "Different first 4KB should be eliminated by partial hash"
        );

        fs::remove_dir_all(&tmp).ok();
    }

    // The progress tracker should observe walked files and hash work.
    #[test]
    fn progress_counters_work() {
        let tmp = setup_test_dir("progress");
        let content = b"duplicate content for progress test!!";
        create_file(&tmp, "a.txt", content);
        create_file(&tmp, "b.txt", content);
        create_file(&tmp, "unique.txt", b"something unique here");

        let progress = DupeProgress::new();
        let opts = DupeOptions {
            root_paths: vec![tmp.clone()],
            min_size: Some(1),
            ..Default::default()
        };

        find_duplicates(&opts, Some(&progress)).unwrap();
        assert!(progress.files_walked.load(Ordering::Relaxed) >= 3);
        assert!(progress.files_hashed.load(Ordering::Relaxed) > 0);

        fs::remove_dir_all(&tmp).ok();
    }

    // The walker does not follow symlinks, so a link to a real file must
    // not be counted as a second copy.
    #[test]
    fn symlinks_not_followed() {
        let tmp = setup_test_dir("symlinks");
        let content = b"real file content for symlink test";
        let real_file = create_file(&tmp, "real.txt", content);
        let link_path = tmp.join("link.txt");
        std::os::unix::fs::symlink(&real_file, &link_path).unwrap();

        let opts = DupeOptions {
            root_paths: vec![tmp.clone()],
            min_size: Some(1),
            ..Default::default()
        };

        let result = find_duplicates(&opts, None).unwrap();
        assert!(
            result.groups.is_empty(),
            "Symlinks should not be treated as duplicate candidates"
        );

        fs::remove_dir_all(&tmp).ok();
    }

    // Two unrelated duplicate pairs should produce two separate groups.
    #[test]
    fn multiple_duplicate_groups() {
        let tmp = setup_test_dir("multi_groups");
        let content_a = b"group A duplicate content here!";
        let content_b = vec![0xFFu8; 200];

        create_file(&tmp, "a1.txt", content_a);
        create_file(&tmp, "a2.txt", content_a);
        create_file(&tmp, "b1.bin", &content_b);
        create_file(&tmp, "b2.bin", &content_b);

        let opts = DupeOptions {
            root_paths: vec![tmp.clone()],
            min_size: Some(1),
            ..Default::default()
        };

        let result = find_duplicates(&opts, None).unwrap();
        assert_eq!(result.groups.len(), 2, "Should find 2 duplicate groups");

        fs::remove_dir_all(&tmp).ok();
    }

    // `cp -c` produces an APFS copy-on-write clone; the scan should still
    // group the pair, and the clone flag is exercised (advisory only —
    // the test tolerates non-APFS volumes where `cp -c` fails).
    #[cfg(target_os = "macos")]
    #[test]
    fn apfs_clone_detection() {
        use std::process::Command;

        let tmp = setup_test_dir("apfs_clones");
        let content = vec![0u8; 8192];
        let original = create_file(&tmp, "original.bin", &content);
        let clone_path = tmp.join("clone.bin");

        // `-c` requests a clonefile(2)-based copy (APFS only).
        let status = Command::new("cp")
            .args([
                "-c",
                &original.to_string_lossy(),
                &clone_path.to_string_lossy(),
            ])
            .status();

        match status {
            Ok(s) if s.success() => {
                let opts = DupeOptions {
                    root_paths: vec![tmp.clone()],
                    min_size: Some(1),
                    ..Default::default()
                };

                let result = find_duplicates(&opts, None).unwrap();
                // Clone flagging is best-effort; just exercise the path.
                if !result.groups.is_empty() {
                    let _is_clone = result.groups[0].is_clone_group;
                }
            }
            _ => {
                // `cp -c` unavailable or failed (e.g. non-APFS volume): skip.
            }
        }

        fs::remove_dir_all(&tmp).ok();
    }
}