Skip to main content

fsearch/
duplicates.rs

1//! Duplicate-file detection.
2//!
3//! # Strategy
4//!
5//! 1. **Size pass** — group all files by byte size (free, no IO beyond `stat`).
6//! 2. **Hash pass** — within each size group, compute the configured hash
7//!    (MD5 for speed, SHA-256 for integrity) and group by digest.
8//! 3. Result is a `Vec<DuplicateGroup>` — each group holds two or more paths
9//!    that are byte-for-byte identical.
10//!
11//! The expensive hash pass is parallelised with Rayon.  Both passes respect
12//! the `interrupted` flag so Ctrl-C works cleanly.
13
14use crate::binary::is_binary;
15use crate::config::Config;
16use crate::error::{FsearchError, FsearchResult};
17use glob::Pattern;
18use md5::{Digest as _, Md5};
19use rayon::prelude::*;
20use serde::{Deserialize, Serialize};
21use sha2::Sha256;
22use std::collections::HashMap;
23use std::fs;
24use std::io::{BufReader, Read};
25use std::path::{Path, PathBuf};
26use std::sync::atomic::{AtomicBool, Ordering};
27use std::sync::Arc;
28use walkdir::WalkDir;
29
30// ── Public types ──────────────────────────────────────────────────────────────
31
32/// A group of files that are byte-for-byte identical.
33#[derive(Debug, Clone, Serialize, Deserialize)]
34pub struct DuplicateGroup {
35    /// Hex-encoded content digest.
36    pub hash: String,
37    /// File size shared by all members.
38    pub size: u64,
39    /// All paths in this group (≥ 2).
40    pub paths: Vec<PathBuf>,
41    /// Total wasted space = `size * (paths.len() - 1)`.
42    pub wasted_bytes: u64,
43}
44
45impl DuplicateGroup {
46    /// Human-readable wasted space (e.g. `"4.2 MiB"`).
47    pub fn wasted_human(&self) -> String {
48        human_bytes(self.wasted_bytes)
49    }
50
51    /// Human-readable file size.
52    pub fn size_human(&self) -> String {
53        human_bytes(self.size)
54    }
55}
56
57/// Summary statistics for a duplicate scan.
58#[derive(Debug, Clone, Default, Serialize, Deserialize)]
59pub struct DuplicateSummary {
60    /// Number of files scanned.
61    pub files_scanned: usize,
62    /// Number of duplicate groups found.
63    pub groups_found: usize,
64    /// Total duplicate files (across all groups, not counting originals).
65    pub duplicate_files: usize,
66    /// Total wasted bytes.
67    pub wasted_bytes: u64,
68}
69
70impl DuplicateSummary {
71    pub fn wasted_human(&self) -> String {
72        human_bytes(self.wasted_bytes)
73    }
74}
75
76// ── Options ───────────────────────────────────────────────────────────────────
77
78/// Hashing algorithm for content comparison.
79#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
80pub enum HashAlgorithm {
81    Md5,
82    #[default]
83    Sha256,
84}
85
86impl HashAlgorithm {
87    pub fn as_str(&self) -> &'static str {
88        match self {
89            Self::Md5 => "md5",
90            Self::Sha256 => "sha256",
91        }
92    }
93}
94
95impl std::str::FromStr for HashAlgorithm {
96    type Err = FsearchError;
97
98    fn from_str(s: &str) -> Result<Self, Self::Err> {
99        match s.to_lowercase().as_str() {
100            "md5" => Ok(Self::Md5),
101            "sha256" => Ok(Self::Sha256),
102            other => Err(FsearchError::UnsupportedHashAlgorithm(other.into())),
103        }
104    }
105}
106
107/// Controls how duplicates are identified.
108#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
109pub enum DuplicateMode {
110    /// Hash-based: files with identical content (default).
111    #[default]
112    Content,
113    /// Name-based: files with the same filename (different directories).
114    Name,
115    /// Size-based: files that share the same byte count (fast but imprecise).
116    Size,
117}
118
119/// All parameters for a single duplicate-detection run.
120#[derive(Debug, Clone)]
121pub struct DuplicateOptions {
122    /// Root directory to scan.
123    pub base_dir: PathBuf,
124    /// Maximum recursion depth (0 = only `base_dir`).
125    pub max_depth: u32,
126    /// Detection mode.
127    pub mode: DuplicateMode,
128    /// Hashing algorithm (only used in [`DuplicateMode::Content`]).
129    pub algorithm: HashAlgorithm,
130    /// Read buffer size for hashing (bytes).
131    pub buffer_size: usize,
132    /// Skip files smaller than this (bytes). 0 = include all.
133    pub min_size: u64,
134    /// Skip files larger than this (bytes). 0 = no limit.
135    pub max_size: u64,
136    /// Skip binary files.
137    pub skip_binary: bool,
138    /// Binary-check probe length.
139    pub binary_check_bytes: usize,
140    /// Only include files matching these glob patterns (empty = all).
141    pub include_patterns: Vec<String>,
142    /// Directory names to skip entirely.
143    pub exclude_dirs: Vec<String>,
144    /// Cap results (0 = unlimited).
145    pub max_results: usize,
146}
147
148impl DuplicateOptions {
149    /// Construct from a [`Config`].
150    pub fn from_config(cfg: &Config, base_dir: PathBuf) -> FsearchResult<Self> {
151        Ok(Self {
152            base_dir,
153            max_depth: cfg.default_depth,
154            mode: DuplicateMode::Content,
155            algorithm: cfg.hash_algorithm.parse::<HashAlgorithm>()?,
156            buffer_size: cfg.hash_buffer_size,
157            min_size: cfg.dup_min_size,
158            max_size: cfg.dup_max_size,
159            skip_binary: false,
160            binary_check_bytes: cfg.binary_check_bytes,
161            include_patterns: crate::config::split_csv(&cfg.default_include),
162            exclude_dirs: cfg.excluded_dirs(),
163            max_results: cfg.max_results,
164        })
165    }
166
167    /// Fluent builder entry-point.
168    pub fn builder(base_dir: impl Into<PathBuf>) -> DuplicateOptionsBuilder {
169        DuplicateOptionsBuilder::new(base_dir.into())
170    }
171}
172
173/// Fluent builder for [`DuplicateOptions`].
174pub struct DuplicateOptionsBuilder(DuplicateOptions);
175
176impl DuplicateOptionsBuilder {
177    fn new(base_dir: PathBuf) -> Self {
178        Self(DuplicateOptions {
179            base_dir,
180            max_depth: 10,
181            mode: DuplicateMode::Content,
182            algorithm: HashAlgorithm::Sha256,
183            buffer_size: 65_536,
184            min_size: 1,
185            max_size: 0,
186            skip_binary: false,
187            binary_check_bytes: 1024,
188            include_patterns: vec![],
189            exclude_dirs: vec![
190                ".git".into(),
191                "node_modules".into(),
192                "target".into(),
193                ".svn".into(),
194                "__pycache__".into(),
195                ".hg".into(),
196                ".cache".into(),
197            ],
198            max_results: 0,
199        })
200    }
201    pub fn max_depth(mut self, d: u32) -> Self {
202        self.0.max_depth = d;
203        self
204    }
205    pub fn mode(mut self, m: DuplicateMode) -> Self {
206        self.0.mode = m;
207        self
208    }
209    pub fn algorithm(mut self, a: HashAlgorithm) -> Self {
210        self.0.algorithm = a;
211        self
212    }
213    pub fn buffer_size(mut self, b: usize) -> Self {
214        self.0.buffer_size = b;
215        self
216    }
217    pub fn min_size(mut self, s: u64) -> Self {
218        self.0.min_size = s;
219        self
220    }
221    pub fn max_size(mut self, s: u64) -> Self {
222        self.0.max_size = s;
223        self
224    }
225    pub fn skip_binary(mut self, v: bool) -> Self {
226        self.0.skip_binary = v;
227        self
228    }
229    pub fn include_patterns(mut self, p: Vec<String>) -> Self {
230        self.0.include_patterns = p;
231        self
232    }
233    pub fn exclude_dirs(mut self, d: Vec<String>) -> Self {
234        self.0.exclude_dirs = d;
235        self
236    }
237    pub fn max_results(mut self, n: usize) -> Self {
238        self.0.max_results = n;
239        self
240    }
241    pub fn build(self) -> DuplicateOptions {
242        self.0
243    }
244}
245
246// ── Main entry point ──────────────────────────────────────────────────────────
247
248/// Find duplicate files under `opts.base_dir`.
249///
250/// Returns a `(Vec<DuplicateGroup>, DuplicateSummary)` pair.
251/// Groups are sorted by `wasted_bytes` descending so the most wasteful
252/// duplicates appear first.
253pub fn find_duplicates(
254    opts: &DuplicateOptions,
255    interrupted: Arc<AtomicBool>,
256) -> FsearchResult<(Vec<DuplicateGroup>, DuplicateSummary)> {
257    if !opts.base_dir.exists() {
258        return Err(FsearchError::DirectoryNotFound(
259            opts.base_dir.display().to_string(),
260        ));
261    }
262    if !opts.base_dir.is_dir() {
263        return Err(FsearchError::NotADirectory(
264            opts.base_dir.display().to_string(),
265        ));
266    }
267
268    // ── Collect all candidate files ───────────────────────────────────────────
269    let files = collect_files(opts, &interrupted);
270    let total_scanned = files.len();
271
272    if interrupted.load(Ordering::Relaxed) {
273        return Err(FsearchError::Interrupted);
274    }
275
276    // ── Dispatch to the chosen mode ───────────────────────────────────────────
277    let mut groups = match opts.mode {
278        DuplicateMode::Content => by_content(files, opts, &interrupted)?,
279        DuplicateMode::Name => by_name(files, opts),
280        DuplicateMode::Size => by_size_only(files, opts),
281    };
282
283    // Sort groups: most wasteful first
284    groups.sort_unstable_by(|a, b| b.wasted_bytes.cmp(&a.wasted_bytes));
285
286    // Cap results
287    if opts.max_results > 0 && groups.len() > opts.max_results {
288        groups.truncate(opts.max_results);
289    }
290
291    let summary = DuplicateSummary {
292        files_scanned: total_scanned,
293        groups_found: groups.len(),
294        duplicate_files: groups.iter().map(|g| g.paths.len() - 1).sum(),
295        wasted_bytes: groups.iter().map(|g| g.wasted_bytes).sum(),
296    };
297
298    Ok((groups, summary))
299}
300
301// ── File collection ───────────────────────────────────────────────────────────
302
303struct FileEntry {
304    path: PathBuf,
305    size: u64,
306}
307
308fn collect_files(opts: &DuplicateOptions, interrupted: &AtomicBool) -> Vec<FileEntry> {
309    WalkDir::new(&opts.base_dir)
310        .max_depth(opts.max_depth as usize + 1)
311        .follow_links(false)
312        .into_iter()
313        .filter_entry(|e| {
314            if e.file_type().is_dir() && e.depth() > 0 {
315                let name = e.file_name().to_string_lossy().to_string();
316                if is_excluded_dir_dup(&name, &opts.exclude_dirs) {
317                    return false;
318                }
319            }
320            true
321        })
322        .filter_map(|e| e.ok())
323        .filter(|e| {
324            if interrupted.load(Ordering::Relaxed) {
325                return false;
326            }
327            if !e.file_type().is_file() {
328                return false;
329            }
330
331            let name = e.file_name().to_string_lossy().to_string();
332            if !matches_include_dup(&name, &opts.include_patterns) {
333                return false;
334            }
335
336            if let Ok(meta) = e.metadata() {
337                let sz = meta.len();
338                if opts.min_size > 0 && sz < opts.min_size {
339                    return false;
340                }
341                if opts.max_size > 0 && sz > opts.max_size {
342                    return false;
343                }
344            }
345
346            if opts.skip_binary && is_binary(e.path(), opts.binary_check_bytes) {
347                return false;
348            }
349            true
350        })
351        .filter_map(|e| {
352            let size = e.metadata().ok()?.len();
353            Some(FileEntry {
354                path: e.path().to_path_buf(),
355                size,
356            })
357        })
358        .collect()
359}
360
361// ── Content mode ──────────────────────────────────────────────────────────────
362
363fn by_content(
364    files: Vec<FileEntry>,
365    opts: &DuplicateOptions,
366    interrupted: &AtomicBool,
367) -> FsearchResult<Vec<DuplicateGroup>> {
368    // ── Size pass: only hash files whose size appears more than once ──────────
369    let mut size_map: HashMap<u64, Vec<PathBuf>> = HashMap::new();
370    for f in files {
371        size_map.entry(f.size).or_default().push(f.path);
372    }
373    let candidates: Vec<(u64, PathBuf)> = size_map
374        .into_iter()
375        .filter(|(_, v)| v.len() > 1)
376        .flat_map(|(sz, paths)| paths.into_iter().map(move |p| (sz, p)))
377        .collect();
378
379    if candidates.is_empty() {
380        return Ok(vec![]);
381    }
382
383    // ── Hash pass (parallel) ──────────────────────────────────────────────────
384    let buf = opts.buffer_size;
385    let algo = opts.algorithm;
386
387    let hashed: Vec<(String, u64, PathBuf)> = candidates
388        .into_par_iter()
389        .filter_map(|(size, path)| {
390            if interrupted.load(Ordering::Relaxed) {
391                return None;
392            }
393            let digest = hash_file(&path, algo, buf).ok()?;
394            Some((digest, size, path))
395        })
396        .collect();
397
398    // ── Group by hash ─────────────────────────────────────────────────────────
399    let mut hash_map: HashMap<String, (u64, Vec<PathBuf>)> = HashMap::new();
400    for (hash, size, path) in hashed {
401        let entry = hash_map.entry(hash).or_insert_with(|| (size, vec![]));
402        entry.1.push(path);
403    }
404
405    Ok(hash_map
406        .into_iter()
407        .filter(|(_, (_, paths))| paths.len() > 1)
408        .map(|(hash, (size, paths))| {
409            let wasted = size * (paths.len() as u64 - 1);
410            DuplicateGroup {
411                hash,
412                size,
413                paths,
414                wasted_bytes: wasted,
415            }
416        })
417        .collect())
418}
419
420// ── Name mode ─────────────────────────────────────────────────────────────────
421
422fn by_name(files: Vec<FileEntry>, opts: &DuplicateOptions) -> Vec<DuplicateGroup> {
423    let mut name_map: HashMap<String, Vec<(PathBuf, u64)>> = HashMap::new();
424    for f in files {
425        let name = f
426            .path
427            .file_name()
428            .map(|n| n.to_string_lossy().to_string())
429            .unwrap_or_default();
430        name_map.entry(name).or_default().push((f.path, f.size));
431    }
432    name_map
433        .into_iter()
434        .filter(|(_, v)| v.len() > 1)
435        .map(|(name, entries)| {
436            let size = entries.first().map(|(_, s)| *s).unwrap_or(0);
437            let paths: Vec<PathBuf> = entries.into_iter().map(|(p, _)| p).collect();
438            let wasted = size.saturating_mul(paths.len() as u64 - 1);
439            DuplicateGroup {
440                hash: format!("name:{}", name),
441                size,
442                wasted_bytes: if opts.mode == DuplicateMode::Name {
443                    wasted
444                } else {
445                    0
446                },
447                paths,
448            }
449        })
450        .collect()
451}
452
453// ── Size-only mode ────────────────────────────────────────────────────────────
454
455fn by_size_only(files: Vec<FileEntry>, _opts: &DuplicateOptions) -> Vec<DuplicateGroup> {
456    let mut size_map: HashMap<u64, Vec<PathBuf>> = HashMap::new();
457    for f in files {
458        size_map.entry(f.size).or_default().push(f.path);
459    }
460    size_map
461        .into_iter()
462        .filter(|(_, v)| v.len() > 1)
463        .map(|(size, paths)| {
464            let wasted = size * (paths.len() as u64 - 1);
465            DuplicateGroup {
466                hash: format!("size:{}", size),
467                size,
468                paths,
469                wasted_bytes: wasted,
470            }
471        })
472        .collect()
473}
474
475// ── Hashing ───────────────────────────────────────────────────────────────────
476
477/// Hash a file with the chosen algorithm; returns a lower-hex digest string.
478pub fn hash_file(path: &Path, algo: HashAlgorithm, buf_size: usize) -> FsearchResult<String> {
479    let file = fs::File::open(path).map_err(|e| FsearchError::Io {
480        path: path.display().to_string(),
481        source: e,
482    })?;
483    let mut reader = BufReader::with_capacity(buf_size, file);
484    let mut buf = vec![0u8; buf_size];
485
486    match algo {
487        HashAlgorithm::Md5 => {
488            let mut h = Md5::new();
489            loop {
490                let n = reader.read(&mut buf).map_err(|e| FsearchError::Io {
491                    path: path.display().to_string(),
492                    source: e,
493                })?;
494                if n == 0 {
495                    break;
496                }
497                md5::Digest::update(&mut h, &buf[..n]);
498            }
499            Ok(format!("{:x}", md5::Digest::finalize(h)))
500        }
501        HashAlgorithm::Sha256 => {
502            let mut h = Sha256::new();
503            loop {
504                let n = reader.read(&mut buf).map_err(|e| FsearchError::Io {
505                    path: path.display().to_string(),
506                    source: e,
507                })?;
508                if n == 0 {
509                    break;
510                }
511                sha2::Digest::update(&mut h, &buf[..n]);
512            }
513            Ok(format!("{:x}", sha2::Digest::finalize(h)))
514        }
515    }
516}
517
518// ── Utilities ─────────────────────────────────────────────────────────────────
519
520fn is_excluded_dir_dup(name: &str, excludes: &[String]) -> bool {
521    excludes
522        .iter()
523        .any(|ex| Pattern::new(ex).map(|p| p.matches(name)).unwrap_or(false) || ex == name)
524}
525
526fn matches_include_dup(name: &str, patterns: &[String]) -> bool {
527    if patterns.is_empty() {
528        return true;
529    }
530    patterns.iter().any(|p| {
531        Pattern::new(p)
532            .map(|pat| pat.matches(name))
533            .unwrap_or(false)
534    })
535}
536
537/// Format bytes as a human-readable string (IEC units).
538pub fn human_bytes(bytes: u64) -> String {
539    const UNITS: &[&str] = &["B", "KiB", "MiB", "GiB", "TiB"];
540    if bytes == 0 {
541        return "0 B".into();
542    }
543    let exp = (bytes as f64).log(1024.0).floor() as usize;
544    let exp = exp.min(UNITS.len() - 1);
545    let val = bytes as f64 / 1024_f64.powi(exp as i32);
546    if exp == 0 {
547        format!("{} {}", bytes, UNITS[0])
548    } else {
549        format!("{:.1} {}", val, UNITS[exp])
550    }
551}
552
553#[cfg(test)]
554mod tests {
555    use super::*;
556    use std::io::Write;
557    use tempfile::TempDir;
558
559    fn make_file(dir: &Path, name: &str, content: &[u8]) -> PathBuf {
560        let p = dir.join(name);
561        let mut f = fs::File::create(&p).unwrap();
562        f.write_all(content).unwrap();
563        p
564    }
565
566    #[test]
567    fn human_bytes_formatting() {
568        assert_eq!(human_bytes(0), "0 B");
569        assert_eq!(human_bytes(512), "512 B");
570        assert_eq!(human_bytes(1024), "1.0 KiB");
571        assert_eq!(human_bytes(1024 * 1024), "1.0 MiB");
572    }
573
574    #[test]
575    fn detects_identical_files() {
576        let tmp = TempDir::new().unwrap();
577        make_file(tmp.path(), "a.txt", b"hello world");
578        make_file(tmp.path(), "b.txt", b"hello world");
579        make_file(tmp.path(), "c.txt", b"different content");
580
581        let opts = DuplicateOptions::builder(tmp.path()).max_depth(1).build();
582        let interrupted = Arc::new(AtomicBool::new(false));
583        let (groups, summary) = find_duplicates(&opts, interrupted).unwrap();
584
585        assert_eq!(groups.len(), 1, "one duplicate group expected");
586        assert_eq!(groups[0].paths.len(), 2);
587        assert_eq!(summary.duplicate_files, 1);
588    }
589
590    #[test]
591    fn no_false_positives_for_unique_files() {
592        let tmp = TempDir::new().unwrap();
593        make_file(tmp.path(), "x.txt", b"aaa");
594        make_file(tmp.path(), "y.txt", b"bbb");
595
596        let opts = DuplicateOptions::builder(tmp.path()).build();
597        let interrupted = Arc::new(AtomicBool::new(false));
598        let (groups, _) = find_duplicates(&opts, interrupted).unwrap();
599        assert!(groups.is_empty());
600    }
601
602    #[test]
603    fn name_mode_groups_by_filename() {
604        let tmp = TempDir::new().unwrap();
605        let sub = tmp.path().join("sub");
606        fs::create_dir_all(&sub).unwrap();
607        make_file(tmp.path(), "readme.txt", b"v1");
608        make_file(&sub, "readme.txt", b"v2");
609
610        let opts = DuplicateOptions::builder(tmp.path())
611            .max_depth(2)
612            .mode(DuplicateMode::Name)
613            .build();
614        let interrupted = Arc::new(AtomicBool::new(false));
615        let (groups, _) = find_duplicates(&opts, interrupted).unwrap();
616        assert!(!groups.is_empty(), "name duplicates expected");
617    }
618
619    #[test]
620    fn hash_file_md5_and_sha256() {
621        let tmp = TempDir::new().unwrap();
622        let p = make_file(tmp.path(), "f.txt", b"test content");
623        let md5 = hash_file(&p, HashAlgorithm::Md5, 4096).unwrap();
624        let sha256 = hash_file(&p, HashAlgorithm::Sha256, 4096).unwrap();
625        assert_eq!(md5.len(), 32, "md5 should be 32 hex chars");
626        assert_eq!(sha256.len(), 64, "sha256 should be 64 hex chars");
627    }
628
629    #[test]
630    fn size_filter_excludes_small_files() {
631        let tmp = TempDir::new().unwrap();
632        make_file(tmp.path(), "small1.txt", b"hi");
633        make_file(tmp.path(), "small2.txt", b"hi");
634
635        let opts = DuplicateOptions::builder(tmp.path())
636            .min_size(1000) // bigger than our test files
637            .build();
638        let interrupted = Arc::new(AtomicBool::new(false));
639        let (groups, _) = find_duplicates(&opts, interrupted).unwrap();
640        assert!(groups.is_empty(), "small files should be filtered out");
641    }
642}