1use crate::binary::is_binary;
17use crate::config::Config;
18use crate::error::{FsearchError, FsearchResult};
19use glob::Pattern;
20use md5::{Digest as _, Md5};
21use rayon::prelude::*;
22use serde::{Deserialize, Serialize};
23use sha2::Sha256;
24use std::collections::{HashMap, HashSet};
25use std::fs;
26use std::io::{BufReader, Read};
27use std::path::{Path, PathBuf};
28use std::sync::atomic::{AtomicBool, Ordering};
29use std::sync::{Arc, Mutex};
30use walkdir::WalkDir;
31
32#[derive(Debug, Clone, Serialize, Deserialize)]
36pub struct DuplicateGroup {
37 pub hash: String,
40 pub size: u64,
42 pub paths: Vec<PathBuf>,
44 pub wasted_bytes: u64,
46}
47
48impl DuplicateGroup {
49 pub fn wasted_human(&self) -> String {
51 human_bytes(self.wasted_bytes)
52 }
53 pub fn size_human(&self) -> String {
55 human_bytes(self.size)
56 }
57}
58
59#[derive(Debug, Clone, Default, Serialize, Deserialize)]
61pub struct DuplicateSummary {
62 pub files_scanned: usize,
63 pub dirs_scanned: usize,
64 pub groups_found: usize,
65 pub duplicate_files: usize,
67 pub wasted_bytes: u64,
68}
69
70impl DuplicateSummary {
71 pub fn wasted_human(&self) -> String {
72 human_bytes(self.wasted_bytes)
73 }
74}
75
76#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
80pub enum HashAlgorithm {
81 Md5,
82 #[default]
83 Sha256,
84}
85
86impl HashAlgorithm {
87 pub fn as_str(&self) -> &'static str {
88 match self {
89 Self::Md5 => "md5",
90 Self::Sha256 => "sha256",
91 }
92 }
93}
94
95impl std::str::FromStr for HashAlgorithm {
96 type Err = FsearchError;
97 fn from_str(s: &str) -> Result<Self, Self::Err> {
98 match s.to_lowercase().as_str() {
99 "md5" => Ok(Self::Md5),
100 "sha256" => Ok(Self::Sha256),
101 other => Err(FsearchError::UnsupportedHashAlgorithm(other.into())),
102 }
103 }
104}
105
106#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
108pub enum DuplicateMode {
109 #[default]
111 Content,
112 Name,
114 Size,
116}
117
118#[derive(Debug, Clone)]
120pub struct DuplicateOptions {
121 pub base_dirs: Vec<PathBuf>,
125
126 pub max_depth: u32,
128 pub mode: DuplicateMode,
130 pub algorithm: HashAlgorithm,
132 pub buffer_size: usize,
134 pub min_size: u64,
136 pub max_size: u64,
138 pub skip_binary: bool,
140 pub binary_check_bytes: usize,
142 pub include_patterns: Vec<String>,
144 pub exclude_dirs: Vec<String>,
146 pub max_results: usize,
148}
149
150impl DuplicateOptions {
151 pub fn from_config(cfg: &Config, base_dirs: Vec<PathBuf>) -> FsearchResult<Self> {
153 Ok(Self {
154 base_dirs,
155 max_depth: cfg.default_depth,
156 mode: DuplicateMode::Content,
157 algorithm: cfg.hash_algorithm.parse::<HashAlgorithm>()?,
158 buffer_size: cfg.hash_buffer_size,
159 min_size: cfg.dup_min_size,
160 max_size: cfg.dup_max_size,
161 skip_binary: false,
162 binary_check_bytes: cfg.binary_check_bytes,
163 include_patterns: crate::config::split_csv(&cfg.default_include),
164 exclude_dirs: cfg.excluded_dirs(),
165 max_results: cfg.max_results,
166 })
167 }
168
169 pub fn builder(base_dirs: impl IntoDirs) -> DuplicateOptionsBuilder {
171 DuplicateOptionsBuilder::new(base_dirs.into_dirs())
172 }
173}
174
175pub trait IntoDirs {
178 fn into_dirs(self) -> Vec<PathBuf>;
179}
180
181impl IntoDirs for PathBuf {
182 fn into_dirs(self) -> Vec<PathBuf> {
183 vec![self]
184 }
185}
186impl IntoDirs for &str {
187 fn into_dirs(self) -> Vec<PathBuf> {
188 vec![PathBuf::from(self)]
189 }
190}
191impl IntoDirs for String {
192 fn into_dirs(self) -> Vec<PathBuf> {
193 vec![PathBuf::from(self)]
194 }
195}
196impl IntoDirs for &Path {
197 fn into_dirs(self) -> Vec<PathBuf> {
198 vec![self.to_path_buf()]
199 }
200}
201#[doc(hidden)]
203impl IntoDirs for &PathBuf {
204 fn into_dirs(self) -> Vec<PathBuf> {
205 vec![self.into()]
206 }
207}
208
209impl<P: Into<PathBuf>> IntoDirs for Vec<P> {
210 fn into_dirs(self) -> Vec<PathBuf> {
211 self.into_iter().map(Into::into).collect()
212 }
213}
214
215pub struct DuplicateOptionsBuilder(DuplicateOptions);
217
218impl DuplicateOptionsBuilder {
219 fn new(base_dirs: Vec<PathBuf>) -> Self {
220 Self(DuplicateOptions {
221 base_dirs,
222 max_depth: 10,
223 mode: DuplicateMode::Content,
224 algorithm: HashAlgorithm::Sha256,
225 buffer_size: 65_536,
226 min_size: 1,
227 max_size: 0,
228 skip_binary: false,
229 binary_check_bytes: 1024,
230 include_patterns: vec![],
231 exclude_dirs: vec![
232 ".git".into(),
233 "node_modules".into(),
234 "target".into(),
235 ".svn".into(),
236 "__pycache__".into(),
237 ".hg".into(),
238 ".cache".into(),
239 ],
240 max_results: 0,
241 })
242 }
243
244 pub fn add_dir(mut self, d: impl Into<PathBuf>) -> Self {
246 self.0.base_dirs.push(d.into());
247 self
248 }
249
250 pub fn max_depth(mut self, d: u32) -> Self {
251 self.0.max_depth = d;
252 self
253 }
254 pub fn mode(mut self, m: DuplicateMode) -> Self {
255 self.0.mode = m;
256 self
257 }
258 pub fn algorithm(mut self, a: HashAlgorithm) -> Self {
259 self.0.algorithm = a;
260 self
261 }
262 pub fn buffer_size(mut self, b: usize) -> Self {
263 self.0.buffer_size = b;
264 self
265 }
266 pub fn min_size(mut self, s: u64) -> Self {
267 self.0.min_size = s;
268 self
269 }
270 pub fn max_size(mut self, s: u64) -> Self {
271 self.0.max_size = s;
272 self
273 }
274 pub fn skip_binary(mut self, v: bool) -> Self {
275 self.0.skip_binary = v;
276 self
277 }
278 pub fn include_patterns(mut self, p: Vec<String>) -> Self {
279 self.0.include_patterns = p;
280 self
281 }
282 pub fn exclude_dirs(mut self, d: Vec<String>) -> Self {
283 self.0.exclude_dirs = d;
284 self
285 }
286 pub fn max_results(mut self, n: usize) -> Self {
287 self.0.max_results = n;
288 self
289 }
290 pub fn build(self) -> DuplicateOptions {
291 self.0
292 }
293}
294
295pub fn find_duplicates(
301 opts: &DuplicateOptions,
302 interrupted: Arc<AtomicBool>,
303) -> FsearchResult<(Vec<DuplicateGroup>, DuplicateSummary)> {
304 if opts.base_dirs.is_empty() {
306 return Err(FsearchError::DirectoryNotFound(
307 "(no directories specified)".into(),
308 ));
309 }
310 for d in &opts.base_dirs {
311 if !d.exists() {
312 return Err(FsearchError::DirectoryNotFound(d.display().to_string()));
313 }
314 if !d.is_dir() {
315 return Err(FsearchError::NotADirectory(d.display().to_string()));
316 }
317 }
318
319 let dirs_scanned = opts.base_dirs.len();
320
321 let files = collect_files(opts, &interrupted);
323 let total_scanned = files.len();
324
325 if interrupted.load(Ordering::Relaxed) {
326 return Err(FsearchError::Interrupted);
327 }
328
329 let mut groups = match opts.mode {
330 DuplicateMode::Content => by_content(files, opts, &interrupted)?,
331 DuplicateMode::Name => by_name(files, opts),
332 DuplicateMode::Size => by_size_only(files),
333 };
334
335 groups.sort_unstable_by(|a, b| b.wasted_bytes.cmp(&a.wasted_bytes));
337
338 if opts.max_results > 0 && groups.len() > opts.max_results {
339 groups.truncate(opts.max_results);
340 }
341
342 let summary = DuplicateSummary {
343 files_scanned: total_scanned,
344 dirs_scanned,
345 groups_found: groups.len(),
346 duplicate_files: groups.iter().map(|g| g.paths.len() - 1).sum(),
347 wasted_bytes: groups.iter().map(|g| g.wasted_bytes).sum(),
348 };
349
350 Ok((groups, summary))
351}
352
353struct FileEntry {
356 path: PathBuf,
357 size: u64,
358}
359
360fn collect_files(opts: &DuplicateOptions, interrupted: &AtomicBool) -> Vec<FileEntry> {
361 let seen: Arc<Mutex<HashSet<PathBuf>>> = Arc::new(Mutex::new(HashSet::new()));
364
365 opts.base_dirs
366 .par_iter()
367 .flat_map(|base| {
368 if interrupted.load(Ordering::Relaxed) {
369 return vec![];
370 }
371 WalkDir::new(base)
372 .max_depth(opts.max_depth as usize + 1)
373 .follow_links(false)
374 .into_iter()
375 .filter_entry(|e| {
376 if e.file_type().is_dir() && e.depth() > 0 {
377 let name = e.file_name().to_string_lossy().to_string();
378 if is_excluded_dir_local(&name, &opts.exclude_dirs) {
379 return false;
380 }
381 }
382 true
383 })
384 .filter_map(|e| e.ok())
385 .filter(|e| {
386 if interrupted.load(Ordering::Relaxed) {
387 return false;
388 }
389 if !e.file_type().is_file() {
390 return false;
391 }
392
393 let name = e.file_name().to_string_lossy().to_string();
394 if !matches_include_local(&name, &opts.include_patterns) {
395 return false;
396 }
397
398 if let Ok(meta) = e.metadata() {
399 let sz = meta.len();
400 if opts.min_size > 0 && sz < opts.min_size {
401 return false;
402 }
403 if opts.max_size > 0 && sz > opts.max_size {
404 return false;
405 }
406 }
407
408 if opts.skip_binary && is_binary(e.path(), opts.binary_check_bytes) {
409 return false;
410 }
411 true
412 })
413 .filter_map(|e| {
414 let path = e.path().to_path_buf();
415 {
417 let mut guard = seen.lock().unwrap();
418 if !guard.insert(path.clone()) {
419 return None;
420 }
421 }
422 let size = e.metadata().ok()?.len();
423 Some(FileEntry { path, size })
424 })
425 .collect::<Vec<_>>()
426 })
427 .collect()
428}
429
430fn by_content(
433 files: Vec<FileEntry>,
434 opts: &DuplicateOptions,
435 interrupted: &AtomicBool,
436) -> FsearchResult<Vec<DuplicateGroup>> {
437 let mut size_count: HashMap<u64, usize> = HashMap::new();
439 for f in &files {
440 *size_count.entry(f.size).or_default() += 1;
441 }
442
443 let candidates: Vec<(u64, PathBuf)> = files
444 .into_iter()
445 .filter(|f| size_count.get(&f.size).copied().unwrap_or(0) > 1)
446 .map(|f| (f.size, f.path))
447 .collect();
448
449 if candidates.is_empty() {
450 return Ok(vec![]);
451 }
452
453 let buf = opts.buffer_size;
454 let algo = opts.algorithm;
455
456 let hashed: Vec<(String, u64, PathBuf)> = candidates
457 .into_par_iter()
458 .filter_map(|(size, path)| {
459 if interrupted.load(Ordering::Relaxed) {
460 return None;
461 }
462 let digest = hash_file(&path, algo, buf).ok()?;
463 Some((digest, size, path))
464 })
465 .collect();
466
467 let mut hash_map: HashMap<String, (u64, Vec<PathBuf>)> = HashMap::new();
468 for (hash, size, path) in hashed {
469 hash_map
470 .entry(hash)
471 .or_insert_with(|| (size, vec![]))
472 .1
473 .push(path);
474 }
475
476 Ok(hash_map
477 .into_iter()
478 .filter(|(_, (_, paths))| paths.len() > 1)
479 .map(|(hash, (size, mut paths))| {
480 paths.sort(); let wasted = size * (paths.len() as u64 - 1);
482 DuplicateGroup {
483 hash,
484 size,
485 paths,
486 wasted_bytes: wasted,
487 }
488 })
489 .collect())
490}
491
492fn by_name(files: Vec<FileEntry>, opts: &DuplicateOptions) -> Vec<DuplicateGroup> {
495 let mut name_map: HashMap<String, Vec<(PathBuf, u64)>> = HashMap::new();
496 for f in files {
497 let name = f
498 .path
499 .file_name()
500 .map(|n| n.to_string_lossy().to_string())
501 .unwrap_or_default();
502 name_map.entry(name).or_default().push((f.path, f.size));
503 }
504 name_map
505 .into_iter()
506 .filter(|(_, v)| v.len() > 1)
507 .map(|(name, entries)| {
508 let size = entries.first().map(|(_, s)| *s).unwrap_or(0);
509 let mut paths: Vec<PathBuf> = entries.into_iter().map(|(p, _)| p).collect();
510 paths.sort();
511 let wasted = if opts.mode == DuplicateMode::Name {
512 size.saturating_mul(paths.len() as u64 - 1)
513 } else {
514 0
515 };
516 DuplicateGroup {
517 hash: format!("name:{name}"),
518 size,
519 wasted_bytes: wasted,
520 paths,
521 }
522 })
523 .collect()
524}
525
526fn by_size_only(files: Vec<FileEntry>) -> Vec<DuplicateGroup> {
529 let mut size_map: HashMap<u64, Vec<PathBuf>> = HashMap::new();
530 for f in files {
531 size_map.entry(f.size).or_default().push(f.path);
532 }
533 size_map
534 .into_iter()
535 .filter(|(_, v)| v.len() > 1)
536 .map(|(size, mut paths)| {
537 paths.sort();
538 let wasted = size * (paths.len() as u64 - 1);
539 DuplicateGroup {
540 hash: format!("size:{size}"),
541 size,
542 paths,
543 wasted_bytes: wasted,
544 }
545 })
546 .collect()
547}
548
549pub fn hash_file(path: &Path, algo: HashAlgorithm, buf_size: usize) -> FsearchResult<String> {
553 let file = fs::File::open(path).map_err(|e| FsearchError::Io {
554 path: path.display().to_string(),
555 source: e,
556 })?;
557 let mut reader = BufReader::with_capacity(buf_size, file);
558 let mut buf = vec![0u8; buf_size];
559
560 match algo {
561 HashAlgorithm::Md5 => {
562 let mut h = Md5::new();
563 loop {
564 let n = reader.read(&mut buf).map_err(|e| FsearchError::Io {
565 path: path.display().to_string(),
566 source: e,
567 })?;
568 if n == 0 {
569 break;
570 }
571 md5::Digest::update(&mut h, &buf[..n]);
572 }
573 Ok(format!("{:x}", md5::Digest::finalize(h)))
574 }
575 HashAlgorithm::Sha256 => {
576 let mut h = Sha256::new();
577 loop {
578 let n = reader.read(&mut buf).map_err(|e| FsearchError::Io {
579 path: path.display().to_string(),
580 source: e,
581 })?;
582 if n == 0 {
583 break;
584 }
585 sha2::Digest::update(&mut h, &buf[..n]);
586 }
587 Ok(format!("{:x}", sha2::Digest::finalize(h)))
588 }
589 }
590}
591
592fn is_excluded_dir_local(name: &str, excludes: &[String]) -> bool {
595 excludes
596 .iter()
597 .any(|ex| Pattern::new(ex).map(|p| p.matches(name)).unwrap_or(false) || ex == name)
598}
599
600fn matches_include_local(name: &str, patterns: &[String]) -> bool {
601 if patterns.is_empty() {
602 return true;
603 }
604 patterns.iter().any(|p| {
605 Pattern::new(p)
606 .map(|pat| pat.matches(name))
607 .unwrap_or(false)
608 })
609}
610
611pub fn human_bytes(bytes: u64) -> String {
613 const UNITS: &[&str] = &["B", "KiB", "MiB", "GiB", "TiB"];
614 if bytes == 0 {
615 return "0 B".into();
616 }
617 let exp = ((bytes as f64).log(1024.0).floor() as usize).min(UNITS.len() - 1);
618 let val = bytes as f64 / 1024_f64.powi(exp as i32);
619 if exp == 0 {
620 format!("{bytes} B")
621 } else {
622 format!("{val:.1} {}", UNITS[exp])
623 }
624}
625
626#[cfg(test)]
629mod tests {
630 use super::*;
631 use std::io::Write;
632 use tempfile::TempDir;
633
634 fn make_file(dir: &Path, name: &str, content: &[u8]) -> PathBuf {
635 let p = dir.join(name);
636 fs::File::create(&p).unwrap().write_all(content).unwrap();
637 p
638 }
639
640 #[test]
641 fn human_bytes_formatting() {
642 assert_eq!(human_bytes(0), "0 B");
643 assert_eq!(human_bytes(512), "512 B");
644 assert_eq!(human_bytes(1024), "1.0 KiB");
645 assert_eq!(human_bytes(1024 * 1024), "1.0 MiB");
646 }
647
648 #[test]
649 fn detects_identical_files_single_dir() {
650 let tmp = TempDir::new().unwrap();
651 make_file(tmp.path(), "a.txt", b"hello world");
652 make_file(tmp.path(), "b.txt", b"hello world");
653 make_file(tmp.path(), "c.txt", b"different");
654
655 let opts = DuplicateOptions::builder(tmp.path()).max_depth(1).build();
656 let (groups, summary) = find_duplicates(&opts, Arc::new(AtomicBool::new(false))).unwrap();
657 assert_eq!(groups.len(), 1);
658 assert_eq!(groups[0].paths.len(), 2);
659 assert_eq!(summary.duplicate_files, 1);
660 assert_eq!(summary.dirs_scanned, 1);
661 }
662
663 #[test]
664 fn detects_duplicates_across_multiple_dirs() {
665 let dir1 = TempDir::new().unwrap();
666 let dir2 = TempDir::new().unwrap();
667 make_file(dir1.path(), "x.txt", b"shared content");
668 make_file(dir2.path(), "y.txt", b"shared content");
669 make_file(dir1.path(), "unique.txt", b"only here");
670
671 let opts = DuplicateOptions::builder(vec![dir1.path(), dir2.path()])
672 .max_depth(1)
673 .build();
674 let (groups, summary) = find_duplicates(&opts, Arc::new(AtomicBool::new(false))).unwrap();
675 assert_eq!(groups.len(), 1, "cross-dir duplicates should be detected");
676 assert_eq!(summary.dirs_scanned, 2);
677 }
678
679 #[test]
680 fn no_false_positives_across_dirs() {
681 let dir1 = TempDir::new().unwrap();
682 let dir2 = TempDir::new().unwrap();
683 make_file(dir1.path(), "a.txt", b"aaa");
684 make_file(dir2.path(), "b.txt", b"bbb");
685
686 let opts = DuplicateOptions::builder(vec![dir1.path(), dir2.path()]).build();
687 let (groups, _) = find_duplicates(&opts, Arc::new(AtomicBool::new(false))).unwrap();
688 assert!(groups.is_empty());
689 }
690
691 #[test]
692 fn overlapping_dirs_no_self_duplicates() {
693 let root = TempDir::new().unwrap();
695 let sub = root.path().join("sub");
696 fs::create_dir_all(&sub).unwrap();
697 make_file(root.path(), "top.txt", b"hello world");
698 make_file(&sub, "sub.txt", b"hello world");
699
700 let opts = DuplicateOptions::builder(vec![root.path(), sub.as_path()])
701 .max_depth(5)
702 .build();
703 let (groups, _) = find_duplicates(&opts, Arc::new(AtomicBool::new(false))).unwrap();
704 assert_eq!(groups.len(), 1);
706 assert_eq!(groups[0].paths.len(), 2);
707 }
708
709 #[test]
710 fn name_mode_across_dirs() {
711 let dir1 = TempDir::new().unwrap();
712 let dir2 = TempDir::new().unwrap();
713 make_file(dir1.path(), "readme.txt", b"v1");
714 make_file(dir2.path(), "readme.txt", b"v2");
715
716 let opts = DuplicateOptions::builder(vec![dir1.path(), dir2.path()])
717 .mode(DuplicateMode::Name)
718 .build();
719 let (groups, _) = find_duplicates(&opts, Arc::new(AtomicBool::new(false))).unwrap();
720 assert!(!groups.is_empty());
721 }
722
723 #[test]
724 fn hash_file_md5_and_sha256() {
725 let tmp = TempDir::new().unwrap();
726 let p = make_file(tmp.path(), "f.txt", b"test content");
727 let md5 = hash_file(&p, HashAlgorithm::Md5, 4096).unwrap();
728 let sha256 = hash_file(&p, HashAlgorithm::Sha256, 4096).unwrap();
729 assert_eq!(md5.len(), 32);
730 assert_eq!(sha256.len(), 64);
731 }
732
733 #[test]
734 fn size_filter_excludes_small_files() {
735 let tmp = TempDir::new().unwrap();
736 make_file(tmp.path(), "s1.txt", b"hi");
737 make_file(tmp.path(), "s2.txt", b"hi");
738
739 let opts = DuplicateOptions::builder(tmp.path()).min_size(1000).build();
740 let (groups, _) = find_duplicates(&opts, Arc::new(AtomicBool::new(false))).unwrap();
741 assert!(groups.is_empty());
742 }
743}