Skip to main content

compare_dir/
file_hasher.rs

1use crate::{
2    ColumnFormatter, DirectoryComparer, FileComparer, FileHashCache, FileIterator, OutputFormat,
3    Progress, ProgressBuilder,
4};
5use globset::GlobSet;
6use indicatif::FormattedDuration;
7use rayon::prelude::*;
8use simple_path::SimplePath;
9use std::{
10    collections::HashMap,
11    fs,
12    io::{self, Read, stdout},
13    path::{Path, PathBuf},
14    sync::{
15        Arc,
16        atomic::{AtomicUsize, Ordering},
17        mpsc,
18    },
19    time,
20};
21
/// A file produced by the directory scan: its path and the index (into
/// `FileHasher::dirs`) of the directory it was found under.
type FileItem = (PathBuf, usize);
23
/// Progress events sent from the duplicate-finding worker to `find_duplicates`.
#[derive(Debug, Clone)]
enum DupEvent {
    /// The worker has started consuming scanned files.
    StartHashing,
    /// Number of files that were submitted for hashing (sent once the scan is done).
    NumFiles(usize),
    /// A file's path, size in bytes, and hash (either cached or freshly computed).
    Result(PathBuf, u64, blake3::Hash),
    /// Hashing a file failed; the details are logged by the sender.
    Error,
}
31
/// Outcome of comparing a file's freshly computed hash with the cached one.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
enum CheckStatus {
    /// The computed hash equals the cached hash.
    Unchanged,
    /// The cache has no entry for the file's path.
    New,
    /// The computed hash differs from the cached hash.
    Modified,
}
38
/// Progress events sent from `check_streaming` to its consumer.
#[derive(Debug, PartialEq)]
enum CheckEvent {
    /// The directory scan has been started and checking is about to begin.
    StartChecking,
    /// Total number of files queued for checking (sent after the scan is exhausted).
    TotalFiles(usize),
    /// A new or modified file: its path relative to the checked directory and its status.
    Result(PathBuf, CheckStatus),
    /// A file was checked and found unchanged.
    FileDone,
    /// Checking a file failed; the details are logged by the sender.
    Error,
}
47
/// Per-file-size bookkeeping used while looking for duplicate candidates.
enum DupState {
    /// Only one file of this size has been seen so far: its path, modification
    /// time, and directory index. It has not been hashed yet.
    Single(PathBuf, time::SystemTime, usize),
    /// At least two files of this size were seen; every file of this size is
    /// now submitted for hashing as soon as it is encountered.
    Hashing,
}
52
/// A tool for finding duplicated files in a directory.
///
/// It also backs the check/update mode, which compares file hashes against
/// the hash cache.
pub struct FileHasher {
    /// Directories to scan.
    dirs: Vec<PathBuf>,
    /// Size in bytes of the read buffer used when hashing; `0` selects
    /// memory-mapped hashing instead of buffered reads.
    pub buffer_size: usize,
    /// Hash cache used by `get_hash` and the cache-management helpers; created lazily.
    cache: Option<Arc<FileHashCache>>,
    /// Number of hashes computed from file contents.
    num_hashed: AtomicUsize,
    /// Number of hashes served from the cache.
    num_hash_looked_up: AtomicUsize,
    /// Optional glob set handed to the file iterator as its exclusion filter.
    pub exclude: Option<GlobSet>,
    /// Optional progress reporting; when `None`, a no-op progress is used.
    pub progress: Option<Arc<ProgressBuilder>>,
    /// Output format used when printing results.
    pub output_format: OutputFormat,
    /// Job count passed to `crate::build_thread_pool` for the hashing pool.
    pub jobs: usize,
}
65
66impl FileHasher {
    /// Default value for `jobs`, shared with `DirectoryComparer`.
    const DEFAULT_JOBS: usize = DirectoryComparer::DEFAULT_JOBS;
68
69    /// Creates a new `FileHasher` for the given directories.
70    pub fn new<P: AsRef<Path>>(dirs: &[P]) -> anyhow::Result<Self> {
71        if dirs.is_empty() {
72            anyhow::bail!("At least one directory must be specified.");
73        }
74        Ok(Self {
75            dirs: dirs.iter().map(|p| p.as_ref().to_path_buf()).collect(),
76            buffer_size: FileComparer::DEFAULT_BUFFER_SIZE,
77            cache: None,
78            num_hashed: AtomicUsize::new(0),
79            num_hash_looked_up: AtomicUsize::new(0),
80            exclude: None,
81            progress: None,
82            output_format: OutputFormat::Default,
83            jobs: Self::DEFAULT_JOBS,
84        })
85    }
86
87    pub(crate) fn new_with_cache<P: AsRef<Path>>(dirs: &[P]) -> anyhow::Result<Self> {
88        let mut hasher = Self::new(dirs)?;
89        hasher.cache = Some(hasher.new_cache()?);
90        Ok(hasher)
91    }
92
93    fn new_cache(&self) -> anyhow::Result<Arc<FileHashCache>> {
94        let common_ancestor = crate::common_ancestor(&self.dirs)
95            .ok_or_else(|| anyhow::anyhow!("No common ancestor found"))?;
96        Ok(FileHashCache::find_or_new(&common_ancestor))
97    }
98
99    /// Gets the hash cache.
100    pub(crate) fn cache(&mut self) -> anyhow::Result<Arc<FileHashCache>> {
101        if self.cache.is_none() {
102            self.cache = Some(self.new_cache()?);
103        }
104        Ok(Arc::clone(self.cache.as_ref().unwrap()))
105    }
106
107    /// Remove a cache entry if it exists.
108    pub(crate) fn remove_cache_entry(&mut self, path: &Path) -> anyhow::Result<()> {
109        let cache = self.cache()?;
110        let relative = SimplePath::strip_prefix(path, cache.base_dir())?;
111        cache.remove(relative);
112        Ok(())
113    }
114
115    /// Save the hash cache if it is dirty.
116    pub fn save_cache(&self) -> anyhow::Result<()> {
117        log::info!(
118            "Hash stats for {:?}: {} computed, {} looked up",
119            self.dirs,
120            self.num_hashed.load(Ordering::Relaxed),
121            self.num_hash_looked_up.load(Ordering::Relaxed)
122        );
123        if let Some(cache) = &self.cache {
124            cache.save()?;
125        }
126        Ok(())
127    }
128
129    /// Clears the loaded hashes in the cache.
130    pub(crate) fn clear_cache(&mut self) -> anyhow::Result<()> {
131        let cache = self.cache()?;
132        for dir in &self.dirs {
133            let relative = SimplePath::strip_prefix(dir, cache.base_dir())?;
134            cache.clear(relative);
135        }
136        Ok(())
137    }
138
    /// Executes the check/update process.
    ///
    /// Runs `check_streaming` on a scoped thread and consumes its events on
    /// the calling thread: new files are printed to stdout with a `+` prefix,
    /// modified files with a `!` prefix, and a summary is written afterwards.
    ///
    /// `update` is forwarded to `check_streaming`; when `true`, computed
    /// hashes are written back into the cache.
    ///
    /// # Errors
    ///
    /// Fails if the output format is neither default nor symbol, if more than
    /// one directory is configured, or if writing the summary fails. An error
    /// returned by `check_streaming` itself is only logged, not returned.
    pub fn check(&self, update: bool) -> anyhow::Result<()> {
        match self.output_format {
            OutputFormat::Default | OutputFormat::Symbol => {}
            _ => anyhow::bail!("Check mode only supports default or symbol output format."),
        }
        if self.dirs.len() > 1 {
            anyhow::bail!("Check mode only supports one directory.");
        }
        let start_time = time::Instant::now();
        let progress = self
            .progress
            .as_ref()
            .map(|progress| progress.add_spinner())
            .unwrap_or_else(Progress::none);
        progress.set_message("Scanning directory...");
        let mut num_new = 0;
        let mut num_modified = 0;
        let mut num_error = 0;
        std::thread::scope(|scope| {
            let (tx, rx) = mpsc::channel();
            // The producer takes ownership of `tx`; once it (and every clone
            // it made) is dropped, `rx.recv()` fails and the loop below ends.
            scope.spawn(|| {
                if let Err(e) = self.check_streaming(tx, update) {
                    log::error!("Error during check: {}", e);
                }
            });
            while let Ok(event) = rx.recv() {
                match event {
                    CheckEvent::StartChecking => {
                        progress.set_message("Checking files...");
                    }
                    CheckEvent::TotalFiles(total) => {
                        progress.set_length(total as u64);
                        progress.set_message("");
                    }
                    CheckEvent::Result(path, status) => {
                        let symbol = match status {
                            CheckStatus::New => {
                                num_new += 1;
                                '+'
                            }
                            CheckStatus::Modified => {
                                num_modified += 1;
                                '!'
                            }
                            // `check_streaming` reports unchanged files as
                            // `FileDone`, never as `Result`.
                            CheckStatus::Unchanged => unreachable!(),
                        };
                        progress.inc(1);
                        progress.suspend_for(stdout(), || {
                            println!("{} {}", symbol, path.display());
                        });
                    }
                    CheckEvent::FileDone => {
                        progress.inc(1);
                    }
                    CheckEvent::Error => {
                        progress.inc(1);
                        num_error += 1;
                    }
                }
            }
        });
        progress.finish();
        self.print_check_summary(&start_time, num_new, num_modified, num_error)?;
        Ok(())
    }
205
206    fn print_check_summary(
207        &self,
208        start_time: &time::Instant,
209        num_new: usize,
210        num_modified: usize,
211        num_error: usize,
212    ) -> io::Result<()> {
213        let summary = [
214            ("Elapsed:", 0),
215            ("Hash computed:", self.num_hashed.load(Ordering::Relaxed)),
216            ("New files:", num_new),
217            ("Modified files:", num_modified),
218            ("Errors:", num_error),
219        ];
220        let formatter = ColumnFormatter::new(summary.iter().map(|(s, _)| *s));
221        let mut writer = std::io::stderr();
222        formatter.write_value(
223            &mut writer,
224            summary[0].0,
225            FormattedDuration(start_time.elapsed()),
226        )?;
227        formatter.write_values(&mut writer, &summary[1..])
228    }
229
    /// Checks every file under the single configured directory against the
    /// hash cache, reporting progress through `tx`.
    ///
    /// Sends `StartChecking` once the scan is running, one event per file
    /// (`Result` for new/modified files, `FileDone` for unchanged ones,
    /// `Error` on failure), and `TotalFiles` after the scan is exhausted.
    /// The cache is saved after all workers have finished.
    ///
    /// # Panics
    ///
    /// Panics if the number of configured directories is not exactly one.
    fn check_streaming(&self, tx: mpsc::Sender<CheckEvent>, update: bool) -> anyhow::Result<()> {
        assert_eq!(self.dirs.len(), 1);
        let cache = self.new_cache()?;
        let base_dir = &self.dirs[0];
        let relative = SimplePath::strip_prefix(base_dir, cache.base_dir())?;
        // NOTE(review): presumably marks entries under `relative` for removal
        // when they are not accessed during this run — confirm in FileHashCache.
        cache.set_remove_if_no_access(relative);
        // `cache` is moved into the pool scope below; keep a handle for saving.
        let cache_clone = Arc::clone(&cache);
        std::thread::scope(|global_scope| {
            let mut it = FileIterator::new(base_dir);
            it.cache = Some(Arc::clone(&cache));
            it.exclude = self.exclude.as_ref();
            let it_rx = it.spawn_in_scope(global_scope);
            tx.send(CheckEvent::StartChecking)?;
            let pool = crate::build_thread_pool(self.jobs)?;
            // `tx` is moved into the pool scope, so the channel closes once
            // this scope and all spawned tasks are done.
            pool.scope(move |scope| -> anyhow::Result<()> {
                let mut total_files = 0;
                for path in it_rx {
                    total_files += 1;
                    let tx = tx.clone();
                    let cache = Arc::clone(&cache);
                    scope.spawn(move |_| {
                        let status = self.check_file(&path, &cache, update);
                        let event = match status {
                            Ok(CheckStatus::New) | Ok(CheckStatus::Modified) => {
                                // Report paths relative to the checked directory.
                                let rel_path = SimplePath::strip_prefix(&path, base_dir).unwrap();
                                CheckEvent::Result(rel_path.into(), status.unwrap())
                            }
                            Ok(CheckStatus::Unchanged) => CheckEvent::FileDone,
                            Err(e) => {
                                log::error!("Failed to check file {:?}: {}", path, e);
                                CheckEvent::Error
                            }
                        };
                        if tx.send(event).is_err() {
                            log::error!("Send failed");
                        }
                    });
                }
                // The total is only known after the scan has been exhausted.
                tx.send(CheckEvent::TotalFiles(total_files))?;
                Ok(())
            })
        })?;
        cache_clone.save()?;
        Ok(())
    }
275
276    fn check_file(
277        &self,
278        abs_path: &Path,
279        cache: &FileHashCache,
280        update: bool,
281    ) -> anyhow::Result<CheckStatus> {
282        assert!(abs_path.is_absolute());
283        let computed_hash = self.compute_hash(abs_path)?;
284        let rel_path = SimplePath::strip_prefix(abs_path, cache.base_dir())?;
285        let cached_hash = cache.get_by_path(rel_path);
286        let status = match cached_hash {
287            None => CheckStatus::New,
288            Some(cached) => {
289                if computed_hash != cached {
290                    CheckStatus::Modified
291                } else {
292                    CheckStatus::Unchanged
293                }
294            }
295        };
296        if update {
297            let modified = fs::metadata(abs_path)?.modified()?;
298            match status {
299                CheckStatus::New | CheckStatus::Modified => {
300                    cache.insert(rel_path, modified, computed_hash);
301                }
302                CheckStatus::Unchanged => {
303                    if cache.get(rel_path, modified).is_none() {
304                        cache.insert(rel_path, modified, computed_hash);
305                    }
306                }
307            }
308        }
309        Ok(status)
310    }
311
312    /// Executes the duplicate file finding process and prints results.
313    pub fn run(&self) -> anyhow::Result<()> {
314        let start_time = time::Instant::now();
315        let mut duplicates = self.find_duplicates()?;
316        let mut total_wasted_space = 0;
317        if !duplicates.is_empty() {
318            duplicates.sort_by_key(|a| a.size);
319            total_wasted_space = self.print_duplicates_results(&duplicates)?;
320        }
321        self.print_duplicates_summary(&start_time, total_wasted_space)?;
322        Ok(())
323    }
324
325    fn print_duplicates_results(&self, duplicates: &Vec<DuplicatedFiles>) -> anyhow::Result<u64> {
326        let mut total_wasted_space = 0;
327        for dupes in duplicates {
328            dupes.print(self.output_format)?;
329            total_wasted_space += dupes.wasted_size();
330        }
331        Ok(total_wasted_space)
332    }
333
334    fn print_duplicates_summary(
335        &self,
336        start_time: &time::Instant,
337        total_wasted_space: u64,
338    ) -> io::Result<()> {
339        let elapsed = FormattedDuration(start_time.elapsed()).to_string();
340        let num_hashed = self.num_hashed.load(Ordering::Relaxed).to_string();
341        let total_wasted_space = crate::human_readable_size(total_wasted_space);
342        let summary = [
343            ("Elapsed:", elapsed),
344            ("Hash computed:", num_hashed),
345            ("Total wasted space:", total_wasted_space),
346        ];
347        let formatter = ColumnFormatter::new(summary.iter().map(|(s, _)| *s));
348        formatter.write_values(&mut io::stderr(), &summary)
349    }
350
351    /// Finds duplicated files and returns a list of duplicate groups.
352    pub fn find_duplicates(&self) -> anyhow::Result<Vec<DuplicatedFiles>> {
353        let progress = self
354            .progress
355            .as_ref()
356            .map(|progress| progress.add_spinner())
357            .unwrap_or_else(Progress::none);
358        progress.set_message("Scanning directories...");
359
360        let (tx, rx) = mpsc::channel();
361        let mut by_hash: HashMap<blake3::Hash, DuplicatedFiles> = HashMap::new();
362        std::thread::scope(|scope| {
363            scope.spawn(|| {
364                if let Err(e) = self.find_duplicates_streaming(tx) {
365                    log::error!("Error during duplicate finding: {}", e);
366                }
367            });
368
369            while let Ok(event) = rx.recv() {
370                match event {
371                    DupEvent::StartHashing => progress.set_message("Hashing files..."),
372                    DupEvent::NumFiles(num) => progress.set_length(num as u64),
373                    DupEvent::Result(path, size, hash) => {
374                        progress.inc(1);
375                        let entry = by_hash.entry(hash).or_insert_with(|| DuplicatedFiles {
376                            paths: Vec::new(),
377                            size,
378                        });
379                        // Hash collisions shouldn't happen, but if they do, sizes shouldn't mismatch.
380                        assert_eq!(entry.size, size, "Hash collision: sizes do not match");
381                        entry.paths.push(path);
382                    }
383                    DupEvent::Error => progress.inc(1),
384                }
385            }
386        });
387        progress.finish();
388
389        let mut duplicates = Vec::new();
390        for (_, mut dupes) in by_hash {
391            if dupes.paths.len() > 1 {
392                dupes.paths.sort();
393                duplicates.push(dupes);
394            }
395        }
396        Ok(duplicates)
397    }
398
    /// Streams scanned files, hashes every file whose size matches another
    /// file's size, and reports results through `tx`.
    ///
    /// Files of size zero are skipped. A file is only hashed once a second
    /// file of the same size shows up; at that point both are submitted.
    /// `NumFiles` is sent after the scan is exhausted, and all per-directory
    /// caches are saved once hashing has finished.
    ///
    /// NOTE(review): a failure of `fs::metadata` or `modified()` on any single
    /// file aborts the whole scan via `?` — confirm this is intended.
    fn find_duplicates_streaming(&self, tx: mpsc::Sender<DupEvent>) -> anyhow::Result<()> {
        std::thread::scope(|global_scope| {
            let (it_rx, caches) = self.stream_file_items(global_scope)?;
            // Shadow as a reference so the `move` closure below copies the
            // reference instead of taking the vector.
            let caches = &caches;
            let pool = crate::build_thread_pool(self.jobs)?;
            pool.scope(move |scope| -> anyhow::Result<()> {
                let mut by_size: HashMap<u64, DupState> = HashMap::new();
                let mut num_hashed = 0;
                tx.send(DupEvent::StartHashing)?;
                for (path, dir_index) in it_rx {
                    let meta = fs::metadata(&path)?;
                    let size = meta.len();
                    if size == 0 {
                        continue;
                    }
                    let modified = meta.modified()?;
                    let cache = &caches[dir_index];
                    match by_size.entry(size) {
                        std::collections::hash_map::Entry::Occupied(mut occ) => match occ.get_mut()
                        {
                            DupState::Single(path0, modified0, dir_index0) => {
                                // We found a second file of identical size.
                                // Time to start hashing both the *original* matching file and the *new* one!
                                let cache0 = &caches[*dir_index0];
                                self.send_hash(path0, size, *modified0, cache0, &tx, scope);
                                self.send_hash(&path, size, modified, cache, &tx, scope);

                                // Modify the state to indicate we are now fully hashing this size bucket.
                                *occ.get_mut() = DupState::Hashing;
                                num_hashed += 2;
                            }
                            DupState::Hashing => {
                                // File size bucket already hashing; just dynamically spawn the new file immediately.
                                self.send_hash(&path, size, modified, cache, &tx, scope);
                                num_hashed += 1;
                            }
                        },
                        std::collections::hash_map::Entry::Vacant(vac) => {
                            // First file of this size: remember it, but do not hash yet.
                            vac.insert(DupState::Single(path, modified, dir_index));
                        }
                    }
                }
                // The number of submitted files is only known after the scan ends.
                tx.send(DupEvent::NumFiles(num_hashed))?;
                Ok(())
            })?;
            // Save the per-directory caches in parallel on the same pool.
            pool.install(|| caches.into_par_iter().try_for_each(|cache| cache.save()))?;
            Ok::<(), anyhow::Error>(())
        })?;
        Ok(())
    }
449
450    fn stream_file_items<'scope, 'env>(
451        &'env self,
452        scope: &'scope std::thread::Scope<'scope, 'env>,
453    ) -> anyhow::Result<(mpsc::Receiver<FileItem>, Vec<Arc<FileHashCache>>)> {
454        let (it_tx, it_rx) = mpsc::channel();
455        let mut caches = Vec::with_capacity(self.dirs.len());
456        for (dir_index, dir) in self.dirs.iter().enumerate() {
457            let mut it = FileIterator::new(dir);
458            let cache = FileHashCache::find_or_new(dir);
459            it.cache = Some(Arc::clone(&cache));
460            it.exclude = self.exclude.as_ref();
461            let it_tx = it_tx.clone();
462            scope.spawn(move || it.send_to_as(it_tx, |path| (path, dir_index)));
463            caches.push(cache);
464        }
465        Ok((it_rx, caches))
466    }
467
468    fn send_hash<'scope>(
469        &'scope self,
470        path: &Path,
471        size: u64,
472        modified: time::SystemTime,
473        cache: &Arc<FileHashCache>,
474        tx: &mpsc::Sender<DupEvent>,
475        scope: &rayon::Scope<'scope>,
476    ) {
477        let (hash, relative) = self
478            .get_hash_from_cache(path, modified, cache)
479            .expect("path should be in cache base_dir");
480        if let Some(hash) = hash {
481            let _ = tx.send(DupEvent::Result(path.to_path_buf(), size, hash));
482            return;
483        }
484
485        let path = path.to_path_buf();
486        let relative = relative.to_path_buf();
487        let tx = tx.clone();
488        let cache = Arc::clone(cache);
489        scope.spawn(move |_| {
490            if let Ok(hash) = self.compute_hash(&path) {
491                cache.insert(&relative, modified, hash);
492                let _ = tx.send(DupEvent::Result(path, size, hash));
493            } else {
494                log::error!("Failed to hash file: {:?}", path);
495                let _ = tx.send(DupEvent::Error);
496            }
497        });
498    }
499
500    /// Gets the hash of a file, using the cache if available.
501    pub fn get_hash(&self, path: &Path) -> io::Result<blake3::Hash> {
502        let cache = self.cache.as_ref().expect("cache should be initialized");
503        let meta = fs::metadata(path)?;
504        let modified = meta.modified()?;
505        let (hash, relative) = self.get_hash_from_cache(path, modified, cache)?;
506        if let Some(hash) = hash {
507            return Ok(hash);
508        }
509
510        let hash = self.compute_hash(path)?;
511        cache.insert(relative, modified, hash);
512        Ok(hash)
513    }
514
515    fn get_hash_from_cache<'a>(
516        &self,
517        path: &'a Path,
518        modified: time::SystemTime,
519        cache: &FileHashCache,
520    ) -> io::Result<(Option<blake3::Hash>, &'a Path)> {
521        let relative = SimplePath::strip_prefix(path, cache.base_dir())
522            .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;
523        if let Some(hash) = cache.get(relative, modified) {
524            self.num_hash_looked_up.fetch_add(1, Ordering::Relaxed);
525            return Ok((Some(hash), relative));
526        }
527        Ok((None, relative))
528    }
529
530    fn compute_hash(&self, path: &Path) -> io::Result<blake3::Hash> {
531        let start_time = time::Instant::now();
532        let mut f = fs::File::open(path)?;
533        let len = f.metadata()?.len();
534        let progress = self
535            .progress
536            .as_ref()
537            .map(|progress| progress.add_file(path, len))
538            .unwrap_or_else(Progress::none);
539        let mut hasher = blake3::Hasher::new();
540        if self.buffer_size == 0 {
541            if len > 0 {
542                let mmap = unsafe { memmap2::MmapOptions::new().map(&f)? };
543                hasher.update(&mmap[..]);
544                progress.inc(len);
545            }
546        } else {
547            let mut buf = vec![0u8; self.buffer_size];
548            loop {
549                let n = f.read(&mut buf)?;
550                if n == 0 {
551                    break;
552                }
553                hasher.update(&buf[..n]);
554                progress.inc(n as u64);
555            }
556        }
557        progress.finish();
558        self.num_hashed.fetch_add(1, Ordering::Relaxed);
559        let hash = hasher.finalize();
560        log::debug!(
561            "Computed hash in {}: {:?}",
562            FormattedDuration(start_time.elapsed()),
563            path
564        );
565        Ok(hash)
566    }
567}
568
/// A group of duplicated files and their size.
#[derive(Clone, Debug)]
pub struct DuplicatedFiles {
    /// Paths of the files in this group.
    pub paths: Vec<PathBuf>,
    /// Size in bytes shared by every file in the group.
    pub size: u64,
}
575
576impl DuplicatedFiles {
577    fn print(&self, output_format: OutputFormat) -> anyhow::Result<()> {
578        match output_format {
579            OutputFormat::Default => self.write_human(stdout())?,
580            OutputFormat::Yaml | OutputFormat::Symbol => self.write_yaml(stdout())?,
581        }
582        Ok(())
583    }
584
585    fn write_human(&self, mut writer: impl io::Write) -> anyhow::Result<()> {
586        writeln!(
587            writer,
588            "Identical {} files of {}:",
589            self.paths.len(),
590            crate::human_readable_size(self.size)
591        )?;
592        for path in &self.paths {
593            writeln!(writer, "  {}", path.display())?;
594        }
595        Ok(())
596    }
597
598    fn write_yaml(&self, mut writer: impl io::Write) -> anyhow::Result<()> {
599        writeln!(writer, "- paths:")?;
600        for path in &self.paths {
601            writeln!(writer, "  - {:?}", path)?;
602        }
603        writeln!(writer, "  size: {}", self.size)?;
604        Ok(())
605    }
606
607    fn wasted_size(&self) -> u64 {
608        self.size * (self.paths.len() as u64 - 1)
609    }
610}
611
612#[cfg(test)]
613mod tests {
614    use super::*;
615
616    fn default_exclude() -> globset::GlobSet {
617        let mut builder = globset::GlobSetBuilder::new();
618        builder.add(
619            globset::GlobBuilder::new(".hash_cache")
620                .case_insensitive(true)
621                .build()
622                .unwrap(),
623        );
624        builder.build().unwrap()
625    }
626
627    #[test]
628    fn find_duplicates() -> anyhow::Result<()> {
629        let dir = tempfile::tempdir()?;
630
631        let file1_path = dir.path().join("same1.txt");
632        fs::write(&file1_path, "same content")?;
633
634        let file2_path = dir.path().join("same2.txt");
635        fs::write(&file2_path, "same content")?;
636
637        let diff_path = dir.path().join("diff.txt");
638        fs::write(&diff_path, "different content")?;
639
640        let mut hasher = FileHasher::new(&[dir.path()])?;
641        hasher.buffer_size = 8192;
642        let duplicates = hasher.find_duplicates()?;
643
644        assert_eq!(hasher.num_hashed.load(Ordering::Relaxed), 2);
645        assert_eq!(hasher.num_hash_looked_up.load(Ordering::Relaxed), 0);
646
647        assert_eq!(duplicates.len(), 1);
648        let group = &duplicates[0];
649        assert_eq!(group.paths.len(), 2);
650        assert_eq!(group.size, 12); // "same content" is 12 bytes
651
652        assert!(group.paths.contains(&file1_path));
653        assert!(group.paths.contains(&file2_path));
654
655        Ok(())
656    }
657
    /// Running on a parent directory after a child directory has its own cache
    /// reuses the child's hashes and leaves only the parent cache file on disk.
    #[test]
    fn find_duplicates_merge_cache() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;
        let dir_path = dir.path();

        let sub_dir = dir_path.join("a").join("a");
        fs::create_dir_all(&sub_dir)?;

        let file1_path = sub_dir.join("1");
        fs::write(&file1_path, "same content")?;

        let file2_path = sub_dir.join("2");
        fs::write(&file2_path, "same content")?;

        // Create empty cache file in a/a to force it to be the cache base
        let cache_aa_path = sub_dir.join(FileHashCache::FILE_NAME);
        fs::File::create(&cache_aa_path)?;

        // Run find_duplicates on a/a
        let hasher_aa = FileHasher::new(&[&sub_dir])?;
        let duplicates_aa = hasher_aa.find_duplicates()?;
        assert_eq!(duplicates_aa.len(), 1);
        assert!(cache_aa_path.exists());
        // First run: both hashes are computed, none come from the cache.
        assert_eq!(hasher_aa.num_hashed.load(Ordering::Relaxed), 2);
        assert_eq!(hasher_aa.num_hash_looked_up.load(Ordering::Relaxed), 0);

        // Create empty cache file in a to force it to be the cache base
        let root_a = dir_path.join("a");
        let cache_a_path = root_a.join(FileHashCache::FILE_NAME);
        fs::File::create(&cache_a_path)?;

        // Run find_duplicates on a
        let hasher_a = FileHasher::new(&[&root_a])?;
        let duplicates_a = hasher_a.find_duplicates()?;
        assert_eq!(duplicates_a.len(), 1);
        // Second run: both hashes are looked up, none are recomputed.
        assert_eq!(hasher_a.num_hashed.load(Ordering::Relaxed), 0);
        assert_eq!(hasher_a.num_hash_looked_up.load(Ordering::Relaxed), 2);

        // The merged child cache should be removed.
        assert!(cache_a_path.exists());
        assert!(!cache_aa_path.exists());

        Ok(())
    }
702
703    #[test]
704    fn find_duplicates_with_exclude() -> anyhow::Result<()> {
705        let dir = tempfile::tempdir()?;
706
707        let file1_path = dir.path().join("same1.txt");
708        fs::write(&file1_path, "same content")?;
709
710        let file2_path = dir.path().join("same2.txt");
711        fs::write(&file2_path, "same content")?;
712
713        let exclude_path = dir.path().join("exclude.txt");
714        fs::write(&exclude_path, "same content")?;
715
716        let mut hasher = FileHasher::new(&[dir.path()])?;
717        hasher.buffer_size = 8192;
718        let mut builder = globset::GlobSetBuilder::new();
719        builder.add(
720            globset::GlobBuilder::new("exclude.txt")
721                .case_insensitive(true)
722                .build()?,
723        );
724        let filter = builder.build()?;
725        hasher.exclude = Some(filter);
726
727        let duplicates = hasher.find_duplicates()?;
728        assert_eq!(duplicates.len(), 1);
729        let group = &duplicates[0];
730        assert_eq!(group.paths.len(), 2);
731        assert!(group.paths.contains(&file1_path));
732        assert!(group.paths.contains(&file2_path));
733        assert!(!group.paths.contains(&exclude_path));
734        Ok(())
735    }
736
737    #[test]
738    fn check_mode_empty_cache() -> anyhow::Result<()> {
739        let dir = tempfile::tempdir()?;
740        let dir_path = dir.path().to_path_buf();
741        println!("{:?}", dir_path);
742        let file1_path = dir.path().join("file1.txt");
743        fs::write(&file1_path, "content 1")?;
744        let file2_path = dir.path().join("file2.txt");
745        fs::write(&file2_path, "content 2")?;
746
747        let mut hasher = FileHasher::new(&[&dir_path])?;
748        hasher.exclude = Some(default_exclude());
749        let (tx, rx) = mpsc::channel();
750        hasher.check_streaming(tx, false)?;
751        let mut results = Vec::new();
752        let mut start_seen = false;
753        let mut total_files = None;
754        let mut file_done_count = 0;
755        let mut num_error = 0;
756        while let Ok(event) = rx.recv() {
757            match event {
758                CheckEvent::StartChecking => start_seen = true,
759                CheckEvent::TotalFiles(total) => total_files = Some(total),
760                CheckEvent::Result(path, status) => results.push((path, status)),
761                CheckEvent::FileDone => file_done_count += 1,
762                CheckEvent::Error => num_error += 1,
763            }
764        }
765        assert!(start_seen);
766        assert_eq!(total_files, Some(2));
767        assert_eq!(file_done_count, 0);
768        assert_eq!(num_error, 0);
769
770        results.sort_by(|a, b| a.0.cmp(&b.0));
771        assert_eq!(results.len(), 2);
772        assert_eq!(results[0], (PathBuf::from("file1.txt"), CheckStatus::New));
773        assert_eq!(results[1], (PathBuf::from("file2.txt"), CheckStatus::New));
774
775        assert!(!dir.path().join(FileHashCache::FILE_NAME).exists());
776        Ok(())
777    }
778
    /// With a populated cache, unchanged files produce `FileDone`; a content
    /// change is reported as `Modified`, while a rewrite with identical
    /// content (newer mtime only) is still treated as unchanged.
    #[test]
    fn check_mode_with_cache() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;
        let dir_path = dir.path().to_path_buf();
        let file1_path = dir.path().join("file1.txt");
        let file2_path = dir.path().join("file2.txt");
        fs::write(&file1_path, "content 1")?;
        fs::write(&file2_path, "content 2")?;

        // Populate and persist the cache for both files.
        let mut hasher = FileHasher::new_with_cache(&[&dir_path])?;
        hasher.exclude = Some(default_exclude());
        let _hash1 = hasher.get_hash(&file1_path)?;
        let _hash2 = hasher.get_hash(&file2_path)?;
        hasher.save_cache()?;
        assert!(dir.path().join(FileHashCache::FILE_NAME).exists());

        // First check: nothing changed, so both files are reported as done.
        let mut hasher = FileHasher::new(&[&dir_path])?;
        hasher.exclude = Some(default_exclude());
        let (tx, rx) = mpsc::channel();
        hasher.check_streaming(tx, false)?;
        let mut results = Vec::new();
        let mut file_done_count = 0;
        while let Ok(event) = rx.recv() {
            match event {
                CheckEvent::Result(path, status) => results.push((path, status)),
                CheckEvent::FileDone => file_done_count += 1,
                _ => {}
            }
        }
        assert_eq!(results.len(), 0);
        assert_eq!(file_done_count, 2);

        // Change the content of file1.
        fs::write(&file1_path, "content 1 modified")?;

        // Rewrite file2 with the same content so only its mtime advances.
        let file2_meta_before = fs::metadata(&file2_path)?;
        let mtime_before = file2_meta_before.modified()?;
        std::thread::sleep(time::Duration::from_millis(10));
        fs::write(&file2_path, "content 2")?;
        let file2_meta_after = fs::metadata(&file2_path)?;
        let mtime_after = file2_meta_after.modified()?;
        assert!(mtime_after > mtime_before);

        // Second check: only file1 is reported as modified.
        let mut hasher = FileHasher::new(&[&dir_path])?;
        hasher.exclude = Some(default_exclude());
        let (tx, rx) = mpsc::channel();
        hasher.check_streaming(tx, false)?;
        let mut results = Vec::new();
        let mut file_done_count = 0;
        while let Ok(event) = rx.recv() {
            match event {
                CheckEvent::Result(path, status) => results.push((path, status)),
                CheckEvent::FileDone => file_done_count += 1,
                _ => {}
            }
        }
        assert_eq!(results.len(), 1);
        assert_eq!(
            results[0],
            (PathBuf::from("file1.txt"), CheckStatus::Modified)
        );
        assert_eq!(file_done_count, 1);
        Ok(())
    }
842
/// Runs `check_streaming` with its second argument set to `true`, saves the
/// cache, and verifies that after every such pass the saved cache holds an
/// entry for `file1.txt` under the file's current modification time.
#[test]
fn check_update_mode() -> anyhow::Result<()> {
    let dir = tempfile::tempdir()?;
    let root = dir.path().to_path_buf();
    let target = dir.path().join("file1.txt");
    let rel = PathBuf::from("file1.txt");
    fs::write(&target, "content 1")?;

    // One pass: check with the flag set, drain every event, persist the cache.
    // The hasher is handed back so the caller can keep it alive until the test
    // ends.
    let refresh_cache = || -> anyhow::Result<FileHasher> {
        let mut hasher = FileHasher::new(&[&root])?;
        hasher.exclude = Some(default_exclude());
        let (tx, rx) = mpsc::channel();
        hasher.check_streaming(tx, true)?;
        rx.iter().for_each(drop);
        hasher.save_cache()?;
        Ok(hasher)
    };

    // Pass 1: the cache file appears and contains the original file.
    let _pass1 = refresh_cache()?;
    assert!(dir.path().join(FileHashCache::FILE_NAME).exists());

    let cache = FileHashCache::new(&root);
    let mtime1 = fs::metadata(&target)?.modified()?;
    let hash1 = cache.get(&rel, mtime1);
    assert!(hash1.is_some());

    // Change the content, then run pass 2: the stored hash must differ.
    std::thread::sleep(time::Duration::from_millis(10));
    fs::write(&target, "content 1 modified")?;
    let mtime_changed = fs::metadata(&target)?.modified()?;

    let _pass2 = refresh_cache()?;

    let cache = FileHashCache::new(&root);
    let hash_changed = cache.get(&rel, mtime_changed);
    assert!(hash_changed.is_some());
    assert_ne!(hash1, hash_changed);

    // Rewrite the same bytes so that only the modification time advances.
    std::thread::sleep(time::Duration::from_millis(10));
    fs::write(&target, "content 1 modified")?;
    let mtime_touched = fs::metadata(&target)?.modified()?;
    assert!(mtime_touched > mtime_changed);

    // The cache loaded before the rewrite has nothing under the new mtime.
    assert!(cache.get(&rel, mtime_touched).is_none());

    // Pass 3: a freshly loaded cache answers for the new mtime.
    let _pass3 = refresh_cache()?;

    let cache = FileHashCache::new(&root);
    assert!(cache.get(&rel, mtime_touched).is_some());
    Ok(())
}
905
/// Verifies that a check-and-save pass run after a file was deleted from disk
/// leaves no entry for that file in the saved cache, while the entry of the
/// surviving file remains.
#[test]
fn check_cleanup_deleted_files() -> anyhow::Result<()> {
    let dir = tempfile::tempdir()?;
    let root = dir.path().to_path_buf();
    let kept = dir.path().join("file1.txt");
    let removed = dir.path().join("file2.txt");
    let kept_rel = PathBuf::from("file1.txt");
    let removed_rel = PathBuf::from("file2.txt");
    fs::write(&kept, "content 1")?;
    fs::write(&removed, "content 2")?;
    let mtime1 = fs::metadata(&kept)?.modified()?;
    let mtime2 = fs::metadata(&removed)?.modified()?;

    // One pass: check with the flag set to `true`, drain every event, persist
    // the cache. The hasher is handed back so it can stay alive until the test
    // ends.
    let refresh_cache = || -> anyhow::Result<FileHasher> {
        let mut hasher = FileHasher::new(&[&root])?;
        hasher.exclude = Some(default_exclude());
        let (tx, rx) = mpsc::channel();
        hasher.check_streaming(tx, true)?;
        rx.iter().for_each(drop);
        hasher.save_cache()?;
        Ok(hasher)
    };

    let _pass1 = refresh_cache()?;

    // Both files must be present in the saved cache.
    let cache = FileHashCache::new(&root);
    assert!(cache.get(&kept_rel, mtime1).is_some());
    assert!(cache.get(&removed_rel, mtime2).is_some());

    // Delete file2 from disk, then check and save once more.
    fs::remove_file(&removed)?;
    let _pass2 = refresh_cache()?;

    // file2 is gone from the cache; file1 is still there.
    let cache = FileHashCache::new(&root);
    assert!(cache.get(&removed_rel, mtime2).is_none());
    assert!(cache.get(&kept_rel, mtime1).is_some());
    Ok(())
}
946
/// Two files with identical content placed in two separate root directories
/// must come back from `find_duplicates` as one group listing both paths.
#[test]
fn find_duplicates_multiple_dirs() -> anyhow::Result<()> {
    let tmp = tempfile::tempdir()?;
    let dir1 = tmp.path().join("dir1");
    let dir2 = tmp.path().join("dir2");
    for root in [&dir1, &dir2] {
        fs::create_dir(root)?;
    }

    let in_first = dir1.join("file1.txt");
    let in_second = dir2.join("file2.txt");
    for file in [&in_first, &in_second] {
        fs::write(file, "same content")?;
    }

    let hasher = FileHasher::new(&[&dir1, &dir2])?;
    let duplicates = hasher.find_duplicates()?;
    assert_eq!(duplicates.len(), 1);

    let group = &duplicates[0];
    assert_eq!(group.paths.len(), 2);
    // "same content" is 12 bytes long.
    assert_eq!(group.size, 12);
    for expected in [&in_first, &in_second] {
        assert!(group.paths.contains(expected));
    }

    Ok(())
}
969
/// `check` must return an error when the hasher was built from more than one
/// directory.
#[test]
fn check_fails_with_multiple_dirs() -> anyhow::Result<()> {
    let tmp = tempfile::tempdir()?;
    let [dir1, dir2] = ["dir1", "dir2"].map(|name| tmp.path().join(name));
    fs::create_dir(&dir1)?;
    fs::create_dir(&dir2)?;

    let hasher = FileHasher::new(&[&dir1, &dir2])?;
    let outcome = hasher.check(false);
    assert!(matches!(outcome, Err(_)));
    Ok(())
}
981}