Skip to main content

fsearch/
duplicates.rs

1//! Duplicate-file detection across one or more directories.
2//!
3//! # Strategy
4//!
5//! 1. **Collect** — walk every supplied root, applying size/binary/glob filters.
6//! 2. **Size pass** — group files by byte count; only groups with ≥ 2 members
7//!    proceed to the hash pass (makes zero extra IO for unique-size files).
8//! 3. **Hash pass** — within each size group, compute the configured digest
9//!    (MD5 for speed, SHA-256 for integrity) in parallel with Rayon.
10//! 4. **Group** — files sharing the same digest form a [`DuplicateGroup`].
11//!
12//! Multiple root directories are walked in parallel; results are
13//! de-duplicated by canonical path so overlapping trees never produce
14//! false duplicates.
15
16use crate::binary::is_binary;
17use crate::config::Config;
18use crate::error::{FsearchError, FsearchResult};
19use glob::Pattern;
20use md5::{Digest as _, Md5};
21use rayon::prelude::*;
22use serde::{Deserialize, Serialize};
23use sha2::Sha256;
24use std::collections::{HashMap, HashSet};
25use std::fs;
26use std::io::{BufReader, Read};
27use std::path::{Path, PathBuf};
28use std::sync::atomic::{AtomicBool, Ordering};
29use std::sync::{Arc, Mutex};
30use walkdir::WalkDir;
31
32// ── Public types ──────────────────────────────────────────────────────────────
33
34/// A group of files that are byte-for-byte identical.
35#[derive(Debug, Clone, Serialize, Deserialize)]
36pub struct DuplicateGroup {
37    /// Hex-encoded content digest (or `"name:<name>"` / `"size:<n>"` for
38    /// the corresponding detection modes).
39    pub hash: String,
40    /// File size shared by all members.
41    pub size: u64,
42    /// All paths in this group (≥ 2).
43    pub paths: Vec<PathBuf>,
44    /// Total wasted space = `size × (paths.len() − 1)`.
45    pub wasted_bytes: u64,
46}
47
48impl DuplicateGroup {
49    /// Human-readable wasted space, e.g. `"4.2 MiB"`.
50    pub fn wasted_human(&self) -> String {
51        human_bytes(self.wasted_bytes)
52    }
53    /// Human-readable file size.
54    pub fn size_human(&self) -> String {
55        human_bytes(self.size)
56    }
57}
58
59/// Summary statistics for a duplicate scan.
60#[derive(Debug, Clone, Default, Serialize, Deserialize)]
61pub struct DuplicateSummary {
62    pub files_scanned: usize,
63    pub dirs_scanned: usize,
64    pub groups_found: usize,
65    /// Duplicate files across all groups, not counting one "original" per group.
66    pub duplicate_files: usize,
67    pub wasted_bytes: u64,
68}
69
70impl DuplicateSummary {
71    pub fn wasted_human(&self) -> String {
72        human_bytes(self.wasted_bytes)
73    }
74}
75
76// ── Options ───────────────────────────────────────────────────────────────────
77
78/// Hashing algorithm for content comparison.
79#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
80pub enum HashAlgorithm {
81    Md5,
82    #[default]
83    Sha256,
84}
85
86impl HashAlgorithm {
87    pub fn as_str(&self) -> &'static str {
88        match self {
89            Self::Md5 => "md5",
90            Self::Sha256 => "sha256",
91        }
92    }
93}
94
95impl std::str::FromStr for HashAlgorithm {
96    type Err = FsearchError;
97    fn from_str(s: &str) -> Result<Self, Self::Err> {
98        match s.to_lowercase().as_str() {
99            "md5" => Ok(Self::Md5),
100            "sha256" => Ok(Self::Sha256),
101            other => Err(FsearchError::UnsupportedHashAlgorithm(other.into())),
102        }
103    }
104}
105
106/// Controls how duplicates are identified.
107#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
108pub enum DuplicateMode {
109    /// Hash-based: files with identical content (default).
110    #[default]
111    Content,
112    /// Name-based: files with the same filename (different directories).
113    Name,
114    /// Size-based: files that share the same byte count (fast, imprecise).
115    Size,
116}
117
118/// All parameters for a duplicate-detection run.
119#[derive(Debug, Clone)]
120pub struct DuplicateOptions {
121    /// One or more root directories to scan. At least one must be given.
122    /// Files found across **all** roots are pooled together before grouping,
123    /// so duplicates that live in different trees are detected correctly.
124    pub base_dirs: Vec<PathBuf>,
125
126    /// Maximum recursion depth per directory (0 = only that directory).
127    pub max_depth: u32,
128    /// Detection mode.
129    pub mode: DuplicateMode,
130    /// Hashing algorithm (only used in [`DuplicateMode::Content`]).
131    pub algorithm: HashAlgorithm,
132    /// Read buffer size for hashing (bytes).
133    pub buffer_size: usize,
134    /// Skip files smaller than this (bytes). 0 = include all.
135    pub min_size: u64,
136    /// Skip files larger than this (bytes). 0 = no limit.
137    pub max_size: u64,
138    /// Skip binary files.
139    pub skip_binary: bool,
140    /// Binary-check probe length.
141    pub binary_check_bytes: usize,
142    /// Only include files matching these glob patterns (empty = all).
143    pub include_patterns: Vec<String>,
144    /// Directory names to skip entirely.
145    pub exclude_dirs: Vec<String>,
146    /// Cap results (0 = unlimited).
147    pub max_results: usize,
148}
149
150impl DuplicateOptions {
151    /// Construct from a [`Config`] with explicit root directories.
152    pub fn from_config(cfg: &Config, base_dirs: Vec<PathBuf>) -> FsearchResult<Self> {
153        Ok(Self {
154            base_dirs,
155            max_depth: cfg.default_depth,
156            mode: DuplicateMode::Content,
157            algorithm: cfg.hash_algorithm.parse::<HashAlgorithm>()?,
158            buffer_size: cfg.hash_buffer_size,
159            min_size: cfg.dup_min_size,
160            max_size: cfg.dup_max_size,
161            skip_binary: false,
162            binary_check_bytes: cfg.binary_check_bytes,
163            include_patterns: crate::config::split_csv(&cfg.default_include),
164            exclude_dirs: cfg.excluded_dirs(),
165            max_results: cfg.max_results,
166        })
167    }
168
169    /// Fluent builder — start with one or more directories.
170    pub fn builder(base_dirs: impl IntoDirs) -> DuplicateOptionsBuilder {
171        DuplicateOptionsBuilder::new(base_dirs.into_dirs())
172    }
173}
174
175/// Trait that lets [`DuplicateOptions::builder`] accept either a single path
176/// or a `Vec` of paths.
177pub trait IntoDirs {
178    fn into_dirs(self) -> Vec<PathBuf>;
179}
180
181impl IntoDirs for PathBuf {
182    fn into_dirs(self) -> Vec<PathBuf> {
183        vec![self]
184    }
185}
186impl IntoDirs for &str {
187    fn into_dirs(self) -> Vec<PathBuf> {
188        vec![PathBuf::from(self)]
189    }
190}
191impl IntoDirs for String {
192    fn into_dirs(self) -> Vec<PathBuf> {
193        vec![PathBuf::from(self)]
194    }
195}
196impl IntoDirs for &Path {
197    fn into_dirs(self) -> Vec<PathBuf> {
198        vec![self.to_path_buf()]
199    }
200}
201// hidden — only the Vec impl below is the "real" multi-path entry
202#[doc(hidden)]
203impl IntoDirs for &PathBuf {
204    fn into_dirs(self) -> Vec<PathBuf> {
205        vec![self.into()]
206    }
207}
208
209impl<P: Into<PathBuf>> IntoDirs for Vec<P> {
210    fn into_dirs(self) -> Vec<PathBuf> {
211        self.into_iter().map(Into::into).collect()
212    }
213}
214
215/// Fluent builder for [`DuplicateOptions`].
216pub struct DuplicateOptionsBuilder(DuplicateOptions);
217
218impl DuplicateOptionsBuilder {
219    fn new(base_dirs: Vec<PathBuf>) -> Self {
220        Self(DuplicateOptions {
221            base_dirs,
222            max_depth: 10,
223            mode: DuplicateMode::Content,
224            algorithm: HashAlgorithm::Sha256,
225            buffer_size: 65_536,
226            min_size: 1,
227            max_size: 0,
228            skip_binary: false,
229            binary_check_bytes: 1024,
230            include_patterns: vec![],
231            exclude_dirs: vec![
232                ".git".into(),
233                "node_modules".into(),
234                "target".into(),
235                ".svn".into(),
236                "__pycache__".into(),
237                ".hg".into(),
238                ".cache".into(),
239            ],
240            max_results: 0,
241        })
242    }
243
244    /// Add another root directory to the scan.
245    pub fn add_dir(mut self, d: impl Into<PathBuf>) -> Self {
246        self.0.base_dirs.push(d.into());
247        self
248    }
249
250    pub fn max_depth(mut self, d: u32) -> Self {
251        self.0.max_depth = d;
252        self
253    }
254    pub fn mode(mut self, m: DuplicateMode) -> Self {
255        self.0.mode = m;
256        self
257    }
258    pub fn algorithm(mut self, a: HashAlgorithm) -> Self {
259        self.0.algorithm = a;
260        self
261    }
262    pub fn buffer_size(mut self, b: usize) -> Self {
263        self.0.buffer_size = b;
264        self
265    }
266    pub fn min_size(mut self, s: u64) -> Self {
267        self.0.min_size = s;
268        self
269    }
270    pub fn max_size(mut self, s: u64) -> Self {
271        self.0.max_size = s;
272        self
273    }
274    pub fn skip_binary(mut self, v: bool) -> Self {
275        self.0.skip_binary = v;
276        self
277    }
278    pub fn include_patterns(mut self, p: Vec<String>) -> Self {
279        self.0.include_patterns = p;
280        self
281    }
282    pub fn exclude_dirs(mut self, d: Vec<String>) -> Self {
283        self.0.exclude_dirs = d;
284        self
285    }
286    pub fn max_results(mut self, n: usize) -> Self {
287        self.0.max_results = n;
288        self
289    }
290    pub fn build(self) -> DuplicateOptions {
291        self.0
292    }
293}
294
295// ── Main entry point ──────────────────────────────────────────────────────────
296
297/// Find duplicate files across all directories in `opts.base_dirs`.
298///
299/// Returns `(groups, summary)`. Groups are sorted by `wasted_bytes` descending.
300pub fn find_duplicates(
301    opts: &DuplicateOptions,
302    interrupted: Arc<AtomicBool>,
303) -> FsearchResult<(Vec<DuplicateGroup>, DuplicateSummary)> {
304    // Validate all roots up-front so we never start a partial scan.
305    if opts.base_dirs.is_empty() {
306        return Err(FsearchError::DirectoryNotFound(
307            "(no directories specified)".into(),
308        ));
309    }
310    for d in &opts.base_dirs {
311        if !d.exists() {
312            return Err(FsearchError::DirectoryNotFound(d.display().to_string()));
313        }
314        if !d.is_dir() {
315            return Err(FsearchError::NotADirectory(d.display().to_string()));
316        }
317    }
318
319    let dirs_scanned = opts.base_dirs.len();
320
321    // Collect all candidate files from every root, de-duplicated by path.
322    let files = collect_files(opts, &interrupted);
323    let total_scanned = files.len();
324
325    if interrupted.load(Ordering::Relaxed) {
326        return Err(FsearchError::Interrupted);
327    }
328
329    let mut groups = match opts.mode {
330        DuplicateMode::Content => by_content(files, opts, &interrupted)?,
331        DuplicateMode::Name => by_name(files, opts),
332        DuplicateMode::Size => by_size_only(files),
333    };
334
335    // Sort: most wasteful groups first
336    groups.sort_unstable_by(|a, b| b.wasted_bytes.cmp(&a.wasted_bytes));
337
338    if opts.max_results > 0 && groups.len() > opts.max_results {
339        groups.truncate(opts.max_results);
340    }
341
342    let summary = DuplicateSummary {
343        files_scanned: total_scanned,
344        dirs_scanned,
345        groups_found: groups.len(),
346        duplicate_files: groups.iter().map(|g| g.paths.len() - 1).sum(),
347        wasted_bytes: groups.iter().map(|g| g.wasted_bytes).sum(),
348    };
349
350    Ok((groups, summary))
351}
352
353// ── File collection ───────────────────────────────────────────────────────────
354
355struct FileEntry {
356    path: PathBuf,
357    size: u64,
358}
359
360fn collect_files(opts: &DuplicateOptions, interrupted: &AtomicBool) -> Vec<FileEntry> {
361    // Shared seen-set prevents counting the same file twice when the caller
362    // supplies overlapping directories (e.g. `/home` and `/home/user`).
363    let seen: Arc<Mutex<HashSet<PathBuf>>> = Arc::new(Mutex::new(HashSet::new()));
364
365    opts.base_dirs
366        .par_iter()
367        .flat_map(|base| {
368            if interrupted.load(Ordering::Relaxed) {
369                return vec![];
370            }
371            WalkDir::new(base)
372                .max_depth(opts.max_depth as usize + 1)
373                .follow_links(false)
374                .into_iter()
375                .filter_entry(|e| {
376                    if e.file_type().is_dir() && e.depth() > 0 {
377                        let name = e.file_name().to_string_lossy().to_string();
378                        if is_excluded_dir_local(&name, &opts.exclude_dirs) {
379                            return false;
380                        }
381                    }
382                    true
383                })
384                .filter_map(|e| e.ok())
385                .filter(|e| {
386                    if interrupted.load(Ordering::Relaxed) {
387                        return false;
388                    }
389                    if !e.file_type().is_file() {
390                        return false;
391                    }
392
393                    let name = e.file_name().to_string_lossy().to_string();
394                    if !matches_include_local(&name, &opts.include_patterns) {
395                        return false;
396                    }
397
398                    if let Ok(meta) = e.metadata() {
399                        let sz = meta.len();
400                        if opts.min_size > 0 && sz < opts.min_size {
401                            return false;
402                        }
403                        if opts.max_size > 0 && sz > opts.max_size {
404                            return false;
405                        }
406                    }
407
408                    if opts.skip_binary && is_binary(e.path(), opts.binary_check_bytes) {
409                        return false;
410                    }
411                    true
412                })
413                .filter_map(|e| {
414                    let path = e.path().to_path_buf();
415                    // Deduplicate across roots
416                    {
417                        let mut guard = seen.lock().unwrap();
418                        if !guard.insert(path.clone()) {
419                            return None;
420                        }
421                    }
422                    let size = e.metadata().ok()?.len();
423                    Some(FileEntry { path, size })
424                })
425                .collect::<Vec<_>>()
426        })
427        .collect()
428}
429
430// ── Content mode ──────────────────────────────────────────────────────────────
431
432fn by_content(
433    files: Vec<FileEntry>,
434    opts: &DuplicateOptions,
435    interrupted: &AtomicBool,
436) -> FsearchResult<Vec<DuplicateGroup>> {
437    // Size pre-filter: skip files whose size is unique — they cannot be dups.
438    let mut size_count: HashMap<u64, usize> = HashMap::new();
439    for f in &files {
440        *size_count.entry(f.size).or_default() += 1;
441    }
442
443    let candidates: Vec<(u64, PathBuf)> = files
444        .into_iter()
445        .filter(|f| size_count.get(&f.size).copied().unwrap_or(0) > 1)
446        .map(|f| (f.size, f.path))
447        .collect();
448
449    if candidates.is_empty() {
450        return Ok(vec![]);
451    }
452
453    let buf = opts.buffer_size;
454    let algo = opts.algorithm;
455
456    let hashed: Vec<(String, u64, PathBuf)> = candidates
457        .into_par_iter()
458        .filter_map(|(size, path)| {
459            if interrupted.load(Ordering::Relaxed) {
460                return None;
461            }
462            let digest = hash_file(&path, algo, buf).ok()?;
463            Some((digest, size, path))
464        })
465        .collect();
466
467    let mut hash_map: HashMap<String, (u64, Vec<PathBuf>)> = HashMap::new();
468    for (hash, size, path) in hashed {
469        hash_map
470            .entry(hash)
471            .or_insert_with(|| (size, vec![]))
472            .1
473            .push(path);
474    }
475
476    Ok(hash_map
477        .into_iter()
478        .filter(|(_, (_, paths))| paths.len() > 1)
479        .map(|(hash, (size, mut paths))| {
480            paths.sort(); // stable ordering within each group
481            let wasted = size * (paths.len() as u64 - 1);
482            DuplicateGroup {
483                hash,
484                size,
485                paths,
486                wasted_bytes: wasted,
487            }
488        })
489        .collect())
490}
491
492// ── Name mode ─────────────────────────────────────────────────────────────────
493
494fn by_name(files: Vec<FileEntry>, opts: &DuplicateOptions) -> Vec<DuplicateGroup> {
495    let mut name_map: HashMap<String, Vec<(PathBuf, u64)>> = HashMap::new();
496    for f in files {
497        let name = f
498            .path
499            .file_name()
500            .map(|n| n.to_string_lossy().to_string())
501            .unwrap_or_default();
502        name_map.entry(name).or_default().push((f.path, f.size));
503    }
504    name_map
505        .into_iter()
506        .filter(|(_, v)| v.len() > 1)
507        .map(|(name, entries)| {
508            let size = entries.first().map(|(_, s)| *s).unwrap_or(0);
509            let mut paths: Vec<PathBuf> = entries.into_iter().map(|(p, _)| p).collect();
510            paths.sort();
511            let wasted = if opts.mode == DuplicateMode::Name {
512                size.saturating_mul(paths.len() as u64 - 1)
513            } else {
514                0
515            };
516            DuplicateGroup {
517                hash: format!("name:{name}"),
518                size,
519                wasted_bytes: wasted,
520                paths,
521            }
522        })
523        .collect()
524}
525
526// ── Size-only mode ────────────────────────────────────────────────────────────
527
528fn by_size_only(files: Vec<FileEntry>) -> Vec<DuplicateGroup> {
529    let mut size_map: HashMap<u64, Vec<PathBuf>> = HashMap::new();
530    for f in files {
531        size_map.entry(f.size).or_default().push(f.path);
532    }
533    size_map
534        .into_iter()
535        .filter(|(_, v)| v.len() > 1)
536        .map(|(size, mut paths)| {
537            paths.sort();
538            let wasted = size * (paths.len() as u64 - 1);
539            DuplicateGroup {
540                hash: format!("size:{size}"),
541                size,
542                paths,
543                wasted_bytes: wasted,
544            }
545        })
546        .collect()
547}
548
549// ── Hashing ───────────────────────────────────────────────────────────────────
550
551/// Hash a file with the chosen algorithm; returns a lower-hex digest string.
552pub fn hash_file(path: &Path, algo: HashAlgorithm, buf_size: usize) -> FsearchResult<String> {
553    let file = fs::File::open(path).map_err(|e| FsearchError::Io {
554        path: path.display().to_string(),
555        source: e,
556    })?;
557    let mut reader = BufReader::with_capacity(buf_size, file);
558    let mut buf = vec![0u8; buf_size];
559
560    match algo {
561        HashAlgorithm::Md5 => {
562            let mut h = Md5::new();
563            loop {
564                let n = reader.read(&mut buf).map_err(|e| FsearchError::Io {
565                    path: path.display().to_string(),
566                    source: e,
567                })?;
568                if n == 0 {
569                    break;
570                }
571                md5::Digest::update(&mut h, &buf[..n]);
572            }
573            Ok(format!("{:x}", md5::Digest::finalize(h)))
574        }
575        HashAlgorithm::Sha256 => {
576            let mut h = Sha256::new();
577            loop {
578                let n = reader.read(&mut buf).map_err(|e| FsearchError::Io {
579                    path: path.display().to_string(),
580                    source: e,
581                })?;
582                if n == 0 {
583                    break;
584                }
585                sha2::Digest::update(&mut h, &buf[..n]);
586            }
587            Ok(format!("{:x}", sha2::Digest::finalize(h)))
588        }
589    }
590}
591
592// ── Private helpers ───────────────────────────────────────────────────────────
593
594fn is_excluded_dir_local(name: &str, excludes: &[String]) -> bool {
595    excludes
596        .iter()
597        .any(|ex| Pattern::new(ex).map(|p| p.matches(name)).unwrap_or(false) || ex == name)
598}
599
600fn matches_include_local(name: &str, patterns: &[String]) -> bool {
601    if patterns.is_empty() {
602        return true;
603    }
604    patterns.iter().any(|p| {
605        Pattern::new(p)
606            .map(|pat| pat.matches(name))
607            .unwrap_or(false)
608    })
609}
610
611/// Format bytes as a human-readable IEC string.
612pub fn human_bytes(bytes: u64) -> String {
613    const UNITS: &[&str] = &["B", "KiB", "MiB", "GiB", "TiB"];
614    if bytes == 0 {
615        return "0 B".into();
616    }
617    let exp = ((bytes as f64).log(1024.0).floor() as usize).min(UNITS.len() - 1);
618    let val = bytes as f64 / 1024_f64.powi(exp as i32);
619    if exp == 0 {
620        format!("{bytes} B")
621    } else {
622        format!("{val:.1} {}", UNITS[exp])
623    }
624}
625
626// ── Tests ─────────────────────────────────────────────────────────────────────
627
628#[cfg(test)]
629mod tests {
630    use super::*;
631    use std::io::Write;
632    use tempfile::TempDir;
633
634    fn make_file(dir: &Path, name: &str, content: &[u8]) -> PathBuf {
635        let p = dir.join(name);
636        fs::File::create(&p).unwrap().write_all(content).unwrap();
637        p
638    }
639
640    #[test]
641    fn human_bytes_formatting() {
642        assert_eq!(human_bytes(0), "0 B");
643        assert_eq!(human_bytes(512), "512 B");
644        assert_eq!(human_bytes(1024), "1.0 KiB");
645        assert_eq!(human_bytes(1024 * 1024), "1.0 MiB");
646    }
647
648    #[test]
649    fn detects_identical_files_single_dir() {
650        let tmp = TempDir::new().unwrap();
651        make_file(tmp.path(), "a.txt", b"hello world");
652        make_file(tmp.path(), "b.txt", b"hello world");
653        make_file(tmp.path(), "c.txt", b"different");
654
655        let opts = DuplicateOptions::builder(tmp.path()).max_depth(1).build();
656        let (groups, summary) = find_duplicates(&opts, Arc::new(AtomicBool::new(false))).unwrap();
657        assert_eq!(groups.len(), 1);
658        assert_eq!(groups[0].paths.len(), 2);
659        assert_eq!(summary.duplicate_files, 1);
660        assert_eq!(summary.dirs_scanned, 1);
661    }
662
663    #[test]
664    fn detects_duplicates_across_multiple_dirs() {
665        let dir1 = TempDir::new().unwrap();
666        let dir2 = TempDir::new().unwrap();
667        make_file(dir1.path(), "x.txt", b"shared content");
668        make_file(dir2.path(), "y.txt", b"shared content");
669        make_file(dir1.path(), "unique.txt", b"only here");
670
671        let opts = DuplicateOptions::builder(vec![dir1.path(), dir2.path()])
672            .max_depth(1)
673            .build();
674        let (groups, summary) = find_duplicates(&opts, Arc::new(AtomicBool::new(false))).unwrap();
675        assert_eq!(groups.len(), 1, "cross-dir duplicates should be detected");
676        assert_eq!(summary.dirs_scanned, 2);
677    }
678
679    #[test]
680    fn no_false_positives_across_dirs() {
681        let dir1 = TempDir::new().unwrap();
682        let dir2 = TempDir::new().unwrap();
683        make_file(dir1.path(), "a.txt", b"aaa");
684        make_file(dir2.path(), "b.txt", b"bbb");
685
686        let opts = DuplicateOptions::builder(vec![dir1.path(), dir2.path()]).build();
687        let (groups, _) = find_duplicates(&opts, Arc::new(AtomicBool::new(false))).unwrap();
688        assert!(groups.is_empty());
689    }
690
691    #[test]
692    fn overlapping_dirs_no_self_duplicates() {
693        // Supplying a dir and one of its subdirs should not produce self-duplicates.
694        let root = TempDir::new().unwrap();
695        let sub = root.path().join("sub");
696        fs::create_dir_all(&sub).unwrap();
697        make_file(root.path(), "top.txt", b"hello world");
698        make_file(&sub, "sub.txt", b"hello world");
699
700        let opts = DuplicateOptions::builder(vec![root.path(), sub.as_path()])
701            .max_depth(5)
702            .build();
703        let (groups, _) = find_duplicates(&opts, Arc::new(AtomicBool::new(false))).unwrap();
704        // Only 1 group with exactly 2 members — not 3
705        assert_eq!(groups.len(), 1);
706        assert_eq!(groups[0].paths.len(), 2);
707    }
708
709    #[test]
710    fn name_mode_across_dirs() {
711        let dir1 = TempDir::new().unwrap();
712        let dir2 = TempDir::new().unwrap();
713        make_file(dir1.path(), "readme.txt", b"v1");
714        make_file(dir2.path(), "readme.txt", b"v2");
715
716        let opts = DuplicateOptions::builder(vec![dir1.path(), dir2.path()])
717            .mode(DuplicateMode::Name)
718            .build();
719        let (groups, _) = find_duplicates(&opts, Arc::new(AtomicBool::new(false))).unwrap();
720        assert!(!groups.is_empty());
721    }
722
723    #[test]
724    fn hash_file_md5_and_sha256() {
725        let tmp = TempDir::new().unwrap();
726        let p = make_file(tmp.path(), "f.txt", b"test content");
727        let md5 = hash_file(&p, HashAlgorithm::Md5, 4096).unwrap();
728        let sha256 = hash_file(&p, HashAlgorithm::Sha256, 4096).unwrap();
729        assert_eq!(md5.len(), 32);
730        assert_eq!(sha256.len(), 64);
731    }
732
733    #[test]
734    fn size_filter_excludes_small_files() {
735        let tmp = TempDir::new().unwrap();
736        make_file(tmp.path(), "s1.txt", b"hi");
737        make_file(tmp.path(), "s2.txt", b"hi");
738
739        let opts = DuplicateOptions::builder(tmp.path()).min_size(1000).build();
740        let (groups, _) = find_duplicates(&opts, Arc::new(AtomicBool::new(false))).unwrap();
741        assert!(groups.is_empty());
742    }
743}