Skip to main content

compare_dir/
file_hasher.rs

1use crate::{
2    ColumnFormatter, DirectoryComparer, FileComparer, FileHashCache, FileIterator, OutputFormat,
3    Progress, ProgressBuilder,
4};
5use globset::GlobSet;
6use indicatif::FormattedDuration;
7use rayon::prelude::*;
8use std::{
9    collections::HashMap,
10    fs,
11    io::{self, Read, stdout},
12    path::{Path, PathBuf},
13    sync::{
14        Arc,
15        atomic::{AtomicUsize, Ordering},
16        mpsc,
17    },
18    time,
19};
20
/// A discovered file path paired with the index (into `FileHasher::dirs`) of
/// the directory whose scan produced it.
type FileItem = (PathBuf, usize);
22
/// Messages sent from the duplicate-finding worker to the consumer loop in
/// `FileHasher::find_duplicates`.
#[derive(Debug, Clone)]
enum DupEvent {
    /// Sent once, before any file is dispatched for hashing.
    StartHashing,
    /// Number of files that were dispatched for hashing; sent after the scan
    /// loop has finished.
    NumFiles(usize),
    /// A file's path, its size in bytes, and its BLAKE3 hash.
    Result(PathBuf, u64, blake3::Hash),
    /// Hashing one file failed.
    Error,
}
30
/// Outcome of comparing a file's freshly computed hash with the hash cache.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
enum CheckStatus {
    /// The computed hash equals the cached hash.
    Unchanged,
    /// The cache holds no hash for the file.
    New,
    /// The computed hash differs from the cached hash.
    Modified,
}
37
/// Messages sent from `FileHasher::check_streaming` to its consumer.
#[derive(Debug, PartialEq)]
enum CheckEvent {
    /// Sent once, before files start being checked.
    StartChecking,
    /// Total number of files dispatched for checking; sent after the file
    /// listing is exhausted.
    TotalFiles(usize),
    /// A new or modified file: its path relative to the checked directory and
    /// its status.
    Result(PathBuf, CheckStatus),
    /// A file was checked and found unchanged.
    FileDone,
    /// Checking one file failed.
    Error,
}
46
/// Per-file-size bookkeeping used while scanning for duplicates.
enum DupState {
    /// Exactly one file of this size has been seen so far and it has not been
    /// hashed yet: its path, modification time, and directory index.
    Single(PathBuf, time::SystemTime, usize),
    /// At least two files of this size have been seen; every file of this
    /// size is dispatched for hashing.
    Hashing,
}
51
/// A tool for finding duplicated files in a directory.
///
/// Also provides a check mode (`check`) that compares file hashes against the
/// hash cache.
pub struct FileHasher {
    /// Directories to process; never empty when built through `new`.
    dirs: Vec<PathBuf>,
    /// Size in bytes of the read buffer used for hashing. `0` selects
    /// memory-mapped reading instead of buffered reads.
    pub buffer_size: usize,
    /// Hash cache used by `get_hash` and the cache-maintenance helpers;
    /// created on demand by `cache()` or eagerly by `new_with_cache`.
    cache: Option<Arc<FileHashCache>>,
    /// Number of hashes computed from file contents.
    num_hashed: AtomicUsize,
    /// Number of hashes served from the cache.
    num_hash_looked_up: AtomicUsize,
    /// Glob set handed to each `FileIterator` as its `exclude` filter.
    pub exclude: Option<GlobSet>,
    /// Source of progress indicators; when `None`, `Progress::none()` is used.
    pub progress: Option<Arc<ProgressBuilder>>,
    /// Output format for printed results.
    pub output_format: OutputFormat,
    /// Job count passed to `crate::build_thread_pool`.
    pub jobs: usize,
}
64
65impl FileHasher {
    /// Default for the `jobs` field; mirrors `DirectoryComparer::DEFAULT_JOBS`.
    const DEFAULT_JOBS: usize = DirectoryComparer::DEFAULT_JOBS;
67
68    /// Creates a new `FileHasher` for the given directories.
69    pub fn new<P: AsRef<Path>>(dirs: &[P]) -> anyhow::Result<Self> {
70        if dirs.is_empty() {
71            anyhow::bail!("At least one directory must be specified.");
72        }
73        Ok(Self {
74            dirs: dirs.iter().map(|p| p.as_ref().to_path_buf()).collect(),
75            buffer_size: FileComparer::DEFAULT_BUFFER_SIZE,
76            cache: None,
77            num_hashed: AtomicUsize::new(0),
78            num_hash_looked_up: AtomicUsize::new(0),
79            exclude: None,
80            progress: None,
81            output_format: OutputFormat::Default,
82            jobs: Self::DEFAULT_JOBS,
83        })
84    }
85
86    pub(crate) fn new_with_cache<P: AsRef<Path>>(dirs: &[P]) -> anyhow::Result<Self> {
87        let mut hasher = Self::new(dirs)?;
88        hasher.cache = Some(hasher.new_cache()?);
89        Ok(hasher)
90    }
91
92    fn new_cache(&self) -> anyhow::Result<Arc<FileHashCache>> {
93        let common_ancestor = crate::common_ancestor(&self.dirs)
94            .ok_or_else(|| anyhow::anyhow!("No common ancestor found"))?;
95        Ok(FileHashCache::find_or_new(&common_ancestor))
96    }
97
98    /// Gets the hash cache.
99    pub(crate) fn cache(&mut self) -> anyhow::Result<Arc<FileHashCache>> {
100        if self.cache.is_none() {
101            self.cache = Some(self.new_cache()?);
102        }
103        Ok(Arc::clone(self.cache.as_ref().unwrap()))
104    }
105
106    /// Remove a cache entry if it exists.
107    pub(crate) fn remove_cache_entry(&mut self, path: &Path) -> anyhow::Result<()> {
108        let cache = self.cache()?;
109        let relative = crate::strip_prefix(path, cache.base_dir())?;
110        cache.remove(relative);
111        Ok(())
112    }
113
114    /// Save the hash cache if it is dirty.
115    pub fn save_cache(&self) -> anyhow::Result<()> {
116        log::info!(
117            "Hash stats for {:?}: {} computed, {} looked up",
118            self.dirs,
119            self.num_hashed.load(Ordering::Relaxed),
120            self.num_hash_looked_up.load(Ordering::Relaxed)
121        );
122        if let Some(cache) = &self.cache {
123            cache.save()?;
124        }
125        Ok(())
126    }
127
128    /// Clears the loaded hashes in the cache.
129    pub(crate) fn clear_cache(&mut self) -> anyhow::Result<()> {
130        let cache = self.cache()?;
131        for dir in &self.dirs {
132            let relative = crate::strip_prefix(dir, cache.base_dir())?;
133            cache.clear(relative);
134        }
135        Ok(())
136    }
137
138    /// Executes the check/update process.
139    pub fn check(&self, update: bool) -> anyhow::Result<()> {
140        match self.output_format {
141            OutputFormat::Default | OutputFormat::Symbol => {}
142            _ => anyhow::bail!("Check mode only supports default or symbol output format."),
143        }
144        if self.dirs.len() > 1 {
145            anyhow::bail!("Check mode only supports one directory.");
146        }
147        let start_time = time::Instant::now();
148        let progress = self
149            .progress
150            .as_ref()
151            .map(|progress| progress.add_spinner())
152            .unwrap_or_else(Progress::none);
153        progress.set_message("Scanning directory...");
154        let mut num_new = 0;
155        let mut num_modified = 0;
156        let mut num_error = 0;
157        std::thread::scope(|scope| {
158            let (tx, rx) = mpsc::channel();
159            scope.spawn(|| {
160                if let Err(e) = self.check_streaming(tx, update) {
161                    log::error!("Error during check: {}", e);
162                }
163            });
164            while let Ok(event) = rx.recv() {
165                match event {
166                    CheckEvent::StartChecking => {
167                        progress.set_message("Checking files...");
168                    }
169                    CheckEvent::TotalFiles(total) => {
170                        progress.set_length(total as u64);
171                        progress.set_message("");
172                    }
173                    CheckEvent::Result(path, status) => {
174                        let symbol = match status {
175                            CheckStatus::New => {
176                                num_new += 1;
177                                '+'
178                            }
179                            CheckStatus::Modified => {
180                                num_modified += 1;
181                                '!'
182                            }
183                            CheckStatus::Unchanged => unreachable!(),
184                        };
185                        progress.inc(1);
186                        progress.suspend_for(stdout(), || {
187                            println!("{} {}", symbol, path.display());
188                        });
189                    }
190                    CheckEvent::FileDone => {
191                        progress.inc(1);
192                    }
193                    CheckEvent::Error => {
194                        progress.inc(1);
195                        num_error += 1;
196                    }
197                }
198            }
199        });
200        progress.finish();
201        self.print_check_summary(&start_time, num_new, num_modified, num_error)?;
202        Ok(())
203    }
204
205    fn print_check_summary(
206        &self,
207        start_time: &time::Instant,
208        num_new: usize,
209        num_modified: usize,
210        num_error: usize,
211    ) -> io::Result<()> {
212        let summary = [
213            ("Elapsed:", 0),
214            ("Hash computed:", self.num_hashed.load(Ordering::Relaxed)),
215            ("New files:", num_new),
216            ("Modified files:", num_modified),
217            ("Errors:", num_error),
218        ];
219        let formatter = ColumnFormatter::new(summary.iter().map(|(s, _)| *s));
220        let mut writer = std::io::stderr();
221        formatter.write_value(
222            &mut writer,
223            summary[0].0,
224            FormattedDuration(start_time.elapsed()),
225        )?;
226        formatter.write_values(&mut writer, &summary[1..])
227    }
228
229    fn check_streaming(&self, tx: mpsc::Sender<CheckEvent>, update: bool) -> anyhow::Result<()> {
230        assert_eq!(self.dirs.len(), 1);
231        let cache = self.new_cache()?;
232        let base_dir = &self.dirs[0];
233        let relative = crate::strip_prefix(base_dir, cache.base_dir())?;
234        cache.set_remove_if_no_access(relative);
235        let cache_clone = Arc::clone(&cache);
236        std::thread::scope(|global_scope| {
237            let mut it = FileIterator::new(base_dir);
238            it.cache = Some(Arc::clone(&cache));
239            it.exclude = self.exclude.as_ref();
240            let it_rx = it.spawn_in_scope(global_scope);
241            tx.send(CheckEvent::StartChecking)?;
242            let pool = crate::build_thread_pool(self.jobs)?;
243            pool.scope(move |scope| -> anyhow::Result<()> {
244                let mut total_files = 0;
245                for path in it_rx {
246                    total_files += 1;
247                    let tx = tx.clone();
248                    let cache = Arc::clone(&cache);
249                    scope.spawn(move |_| {
250                        let status = self.check_file(&path, &cache, update);
251                        let event = match status {
252                            Ok(CheckStatus::New) | Ok(CheckStatus::Modified) => {
253                                let rel_path = crate::strip_prefix(&path, base_dir).unwrap();
254                                CheckEvent::Result(rel_path.into(), status.unwrap())
255                            }
256                            Ok(CheckStatus::Unchanged) => CheckEvent::FileDone,
257                            Err(e) => {
258                                log::error!("Failed to check file {:?}: {}", path, e);
259                                CheckEvent::Error
260                            }
261                        };
262                        if tx.send(event).is_err() {
263                            log::error!("Send failed");
264                        }
265                    });
266                }
267                tx.send(CheckEvent::TotalFiles(total_files))?;
268                Ok(())
269            })
270        })?;
271        cache_clone.save()?;
272        Ok(())
273    }
274
275    fn check_file(
276        &self,
277        abs_path: &Path,
278        cache: &FileHashCache,
279        update: bool,
280    ) -> anyhow::Result<CheckStatus> {
281        assert!(abs_path.is_absolute());
282        let computed_hash = self.compute_hash(abs_path)?;
283        let rel_path = crate::strip_prefix(abs_path, cache.base_dir())?;
284        let cached_hash = cache.get_by_path(rel_path);
285        let status = match cached_hash {
286            None => CheckStatus::New,
287            Some(cached) => {
288                if computed_hash != cached {
289                    CheckStatus::Modified
290                } else {
291                    CheckStatus::Unchanged
292                }
293            }
294        };
295        if update {
296            let modified = fs::metadata(abs_path)?.modified()?;
297            match status {
298                CheckStatus::New | CheckStatus::Modified => {
299                    cache.insert(rel_path, modified, computed_hash);
300                }
301                CheckStatus::Unchanged => {
302                    if cache.get(rel_path, modified).is_none() {
303                        cache.insert(rel_path, modified, computed_hash);
304                    }
305                }
306            }
307        }
308        Ok(status)
309    }
310
311    /// Executes the duplicate file finding process and prints results.
312    pub fn run(&self) -> anyhow::Result<()> {
313        let start_time = time::Instant::now();
314        let mut duplicates = self.find_duplicates()?;
315        let mut total_wasted_space = 0;
316        if !duplicates.is_empty() {
317            duplicates.sort_by_key(|a| a.size);
318            total_wasted_space = self.print_duplicates_results(&duplicates)?;
319        }
320        self.print_duplicates_summary(&start_time, total_wasted_space)?;
321        Ok(())
322    }
323
324    fn print_duplicates_results(&self, duplicates: &Vec<DuplicatedFiles>) -> anyhow::Result<u64> {
325        let mut total_wasted_space = 0;
326        for dupes in duplicates {
327            dupes.print(self.output_format)?;
328            total_wasted_space += dupes.wasted_size();
329        }
330        Ok(total_wasted_space)
331    }
332
333    fn print_duplicates_summary(
334        &self,
335        start_time: &time::Instant,
336        total_wasted_space: u64,
337    ) -> io::Result<()> {
338        let elapsed = FormattedDuration(start_time.elapsed()).to_string();
339        let num_hashed = self.num_hashed.load(Ordering::Relaxed).to_string();
340        let total_wasted_space = crate::human_readable_size(total_wasted_space);
341        let summary = [
342            ("Elapsed:", elapsed),
343            ("Hash computed:", num_hashed),
344            ("Total wasted space:", total_wasted_space),
345        ];
346        let formatter = ColumnFormatter::new(summary.iter().map(|(s, _)| *s));
347        formatter.write_values(&mut io::stderr(), &summary)
348    }
349
350    /// Finds duplicated files and returns a list of duplicate groups.
351    pub fn find_duplicates(&self) -> anyhow::Result<Vec<DuplicatedFiles>> {
352        let progress = self
353            .progress
354            .as_ref()
355            .map(|progress| progress.add_spinner())
356            .unwrap_or_else(Progress::none);
357        progress.set_message("Scanning directories...");
358
359        let (tx, rx) = mpsc::channel();
360        let mut by_hash: HashMap<blake3::Hash, DuplicatedFiles> = HashMap::new();
361        std::thread::scope(|scope| {
362            scope.spawn(|| {
363                if let Err(e) = self.find_duplicates_streaming(tx) {
364                    log::error!("Error during duplicate finding: {}", e);
365                }
366            });
367
368            while let Ok(event) = rx.recv() {
369                match event {
370                    DupEvent::StartHashing => progress.set_message("Hashing files..."),
371                    DupEvent::NumFiles(num) => progress.set_length(num as u64),
372                    DupEvent::Result(path, size, hash) => {
373                        progress.inc(1);
374                        let entry = by_hash.entry(hash).or_insert_with(|| DuplicatedFiles {
375                            paths: Vec::new(),
376                            size,
377                        });
378                        // Hash collisions shouldn't happen, but if they do, sizes shouldn't mismatch.
379                        assert_eq!(entry.size, size, "Hash collision: sizes do not match");
380                        entry.paths.push(path);
381                    }
382                    DupEvent::Error => progress.inc(1),
383                }
384            }
385        });
386        progress.finish();
387
388        let mut duplicates = Vec::new();
389        for (_, mut dupes) in by_hash {
390            if dupes.paths.len() > 1 {
391                dupes.paths.sort();
392                duplicates.push(dupes);
393            }
394        }
395        Ok(duplicates)
396    }
397
398    fn find_duplicates_streaming(&self, tx: mpsc::Sender<DupEvent>) -> anyhow::Result<()> {
399        std::thread::scope(|global_scope| {
400            let (it_rx, caches) = self.stream_file_items(global_scope)?;
401            let caches = &caches;
402            let pool = crate::build_thread_pool(self.jobs)?;
403            pool.scope(move |scope| -> anyhow::Result<()> {
404                let mut by_size: HashMap<u64, DupState> = HashMap::new();
405                let mut num_hashed = 0;
406                tx.send(DupEvent::StartHashing)?;
407                for (path, dir_index) in it_rx {
408                    let meta = fs::metadata(&path)?;
409                    let size = meta.len();
410                    if size == 0 {
411                        continue;
412                    }
413                    let modified = meta.modified()?;
414                    let cache = &caches[dir_index];
415                    match by_size.entry(size) {
416                        std::collections::hash_map::Entry::Occupied(mut occ) => match occ.get_mut()
417                        {
418                            DupState::Single(path0, modified0, dir_index0) => {
419                                // We found a second file of identical size.
420                                // Time to start hashing both the *original* matching file and the *new* one!
421                                let cache0 = &caches[*dir_index0];
422                                self.send_hash(path0, size, *modified0, cache0, &tx, scope);
423                                self.send_hash(&path, size, modified, cache, &tx, scope);
424
425                                // Modify the state to indicate we are now fully hashing this size bucket.
426                                *occ.get_mut() = DupState::Hashing;
427                                num_hashed += 2;
428                            }
429                            DupState::Hashing => {
430                                // File size bucket already hashing; just dynamically spawn the new file immediately.
431                                self.send_hash(&path, size, modified, cache, &tx, scope);
432                                num_hashed += 1;
433                            }
434                        },
435                        std::collections::hash_map::Entry::Vacant(vac) => {
436                            vac.insert(DupState::Single(path, modified, dir_index));
437                        }
438                    }
439                }
440                tx.send(DupEvent::NumFiles(num_hashed))?;
441                Ok(())
442            })?;
443            pool.install(|| caches.into_par_iter().try_for_each(|cache| cache.save()))?;
444            Ok::<(), anyhow::Error>(())
445        })?;
446        Ok(())
447    }
448
449    fn stream_file_items<'scope, 'env>(
450        &'env self,
451        scope: &'scope std::thread::Scope<'scope, 'env>,
452    ) -> anyhow::Result<(mpsc::Receiver<FileItem>, Vec<Arc<FileHashCache>>)> {
453        let (it_tx, it_rx) = mpsc::channel();
454        let mut caches = Vec::with_capacity(self.dirs.len());
455        for (dir_index, dir) in self.dirs.iter().enumerate() {
456            let mut it = FileIterator::new(dir);
457            let cache = FileHashCache::find_or_new(dir);
458            it.cache = Some(Arc::clone(&cache));
459            it.exclude = self.exclude.as_ref();
460            let it_tx = it_tx.clone();
461            scope.spawn(move || it.send_to_as(it_tx, |path| (path, dir_index)));
462            caches.push(cache);
463        }
464        Ok((it_rx, caches))
465    }
466
467    fn send_hash<'scope>(
468        &'scope self,
469        path: &Path,
470        size: u64,
471        modified: time::SystemTime,
472        cache: &Arc<FileHashCache>,
473        tx: &mpsc::Sender<DupEvent>,
474        scope: &rayon::Scope<'scope>,
475    ) {
476        let (hash, relative) = self
477            .get_hash_from_cache(path, modified, cache)
478            .expect("path should be in cache base_dir");
479        if let Some(hash) = hash {
480            let _ = tx.send(DupEvent::Result(path.to_path_buf(), size, hash));
481            return;
482        }
483
484        let path = path.to_path_buf();
485        let relative = relative.to_path_buf();
486        let tx = tx.clone();
487        let cache = Arc::clone(cache);
488        scope.spawn(move |_| {
489            if let Ok(hash) = self.compute_hash(&path) {
490                cache.insert(&relative, modified, hash);
491                let _ = tx.send(DupEvent::Result(path, size, hash));
492            } else {
493                log::error!("Failed to hash file: {:?}", path);
494                let _ = tx.send(DupEvent::Error);
495            }
496        });
497    }
498
499    /// Gets the hash of a file, using the cache if available.
500    pub fn get_hash(&self, path: &Path) -> io::Result<blake3::Hash> {
501        let cache = self.cache.as_ref().expect("cache should be initialized");
502        let meta = fs::metadata(path)?;
503        let modified = meta.modified()?;
504        let (hash, relative) = self.get_hash_from_cache(path, modified, cache)?;
505        if let Some(hash) = hash {
506            return Ok(hash);
507        }
508
509        let hash = self.compute_hash(path)?;
510        cache.insert(relative, modified, hash);
511        Ok(hash)
512    }
513
514    fn get_hash_from_cache<'a>(
515        &self,
516        path: &'a Path,
517        modified: time::SystemTime,
518        cache: &FileHashCache,
519    ) -> io::Result<(Option<blake3::Hash>, &'a Path)> {
520        let relative = crate::strip_prefix(path, cache.base_dir())
521            .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;
522        if let Some(hash) = cache.get(relative, modified) {
523            self.num_hash_looked_up.fetch_add(1, Ordering::Relaxed);
524            return Ok((Some(hash), relative));
525        }
526        Ok((None, relative))
527    }
528
529    fn compute_hash(&self, path: &Path) -> io::Result<blake3::Hash> {
530        let start_time = time::Instant::now();
531        let mut f = fs::File::open(path)?;
532        let len = f.metadata()?.len();
533        let progress = self
534            .progress
535            .as_ref()
536            .map(|progress| progress.add_file(path, len))
537            .unwrap_or_else(Progress::none);
538        let mut hasher = blake3::Hasher::new();
539        if self.buffer_size == 0 {
540            if len > 0 {
541                let mmap = unsafe { memmap2::MmapOptions::new().map(&f)? };
542                hasher.update(&mmap[..]);
543                progress.inc(len);
544            }
545        } else {
546            let mut buf = vec![0u8; self.buffer_size];
547            loop {
548                let n = f.read(&mut buf)?;
549                if n == 0 {
550                    break;
551                }
552                hasher.update(&buf[..n]);
553                progress.inc(n as u64);
554            }
555        }
556        progress.finish();
557        self.num_hashed.fetch_add(1, Ordering::Relaxed);
558        let hash = hasher.finalize();
559        log::debug!(
560            "Computed hash in {}: {:?}",
561            FormattedDuration(start_time.elapsed()),
562            path
563        );
564        Ok(hash)
565    }
566}
567
/// A group of duplicated files and their size.
#[derive(Clone, Debug)]
pub struct DuplicatedFiles {
    /// Paths of the files in the group.
    pub paths: Vec<PathBuf>,
    /// Size in bytes of each file in the group.
    pub size: u64,
}
574
575impl DuplicatedFiles {
576    fn print(&self, output_format: OutputFormat) -> anyhow::Result<()> {
577        match output_format {
578            OutputFormat::Default => self.write_human(stdout())?,
579            OutputFormat::Yaml | OutputFormat::Symbol => self.write_yaml(stdout())?,
580        }
581        Ok(())
582    }
583
584    fn write_human(&self, mut writer: impl io::Write) -> anyhow::Result<()> {
585        writeln!(
586            writer,
587            "Identical {} files of {}:",
588            self.paths.len(),
589            crate::human_readable_size(self.size)
590        )?;
591        for path in &self.paths {
592            writeln!(writer, "  {}", path.display())?;
593        }
594        Ok(())
595    }
596
597    fn write_yaml(&self, mut writer: impl io::Write) -> anyhow::Result<()> {
598        writeln!(writer, "- paths:")?;
599        for path in &self.paths {
600            writeln!(writer, "  - {:?}", path)?;
601        }
602        writeln!(writer, "  size: {}", self.size)?;
603        Ok(())
604    }
605
606    fn wasted_size(&self) -> u64 {
607        self.size * (self.paths.len() as u64 - 1)
608    }
609}
610
611#[cfg(test)]
612mod tests {
613    use super::*;
614
615    fn default_exclude() -> globset::GlobSet {
616        let mut builder = globset::GlobSetBuilder::new();
617        builder.add(
618            globset::GlobBuilder::new(".hash_cache")
619                .case_insensitive(true)
620                .build()
621                .unwrap(),
622        );
623        builder.build().unwrap()
624    }
625
626    #[test]
627    fn find_duplicates() -> anyhow::Result<()> {
628        let dir = tempfile::tempdir()?;
629
630        let file1_path = dir.path().join("same1.txt");
631        fs::write(&file1_path, "same content")?;
632
633        let file2_path = dir.path().join("same2.txt");
634        fs::write(&file2_path, "same content")?;
635
636        let diff_path = dir.path().join("diff.txt");
637        fs::write(&diff_path, "different content")?;
638
639        let mut hasher = FileHasher::new(&[dir.path()])?;
640        hasher.buffer_size = 8192;
641        let duplicates = hasher.find_duplicates()?;
642
643        assert_eq!(hasher.num_hashed.load(Ordering::Relaxed), 2);
644        assert_eq!(hasher.num_hash_looked_up.load(Ordering::Relaxed), 0);
645
646        assert_eq!(duplicates.len(), 1);
647        let group = &duplicates[0];
648        assert_eq!(group.paths.len(), 2);
649        assert_eq!(group.size, 12); // "same content" is 12 bytes
650
651        assert!(group.paths.contains(&file1_path));
652        assert!(group.paths.contains(&file2_path));
653
654        Ok(())
655    }
656
657    #[test]
658    fn find_duplicates_merge_cache() -> anyhow::Result<()> {
659        let dir = tempfile::tempdir()?;
660        let dir_path = dir.path();
661
662        let sub_dir = dir_path.join("a").join("a");
663        fs::create_dir_all(&sub_dir)?;
664
665        let file1_path = sub_dir.join("1");
666        fs::write(&file1_path, "same content")?;
667
668        let file2_path = sub_dir.join("2");
669        fs::write(&file2_path, "same content")?;
670
671        // Create empty cache file in a/a to force it to be the cache base
672        let cache_aa_path = sub_dir.join(FileHashCache::FILE_NAME);
673        fs::File::create(&cache_aa_path)?;
674
675        // Run find_duplicates on a/a
676        let hasher_aa = FileHasher::new(&[&sub_dir])?;
677        let duplicates_aa = hasher_aa.find_duplicates()?;
678        assert_eq!(duplicates_aa.len(), 1);
679        assert!(cache_aa_path.exists());
680        assert_eq!(hasher_aa.num_hashed.load(Ordering::Relaxed), 2);
681        assert_eq!(hasher_aa.num_hash_looked_up.load(Ordering::Relaxed), 0);
682
683        // Create empty cache file in a to force it to be the cache base
684        let root_a = dir_path.join("a");
685        let cache_a_path = root_a.join(FileHashCache::FILE_NAME);
686        fs::File::create(&cache_a_path)?;
687
688        // Run find_duplicates on a
689        let hasher_a = FileHasher::new(&[&root_a])?;
690        let duplicates_a = hasher_a.find_duplicates()?;
691        assert_eq!(duplicates_a.len(), 1);
692        assert_eq!(hasher_a.num_hashed.load(Ordering::Relaxed), 0);
693        assert_eq!(hasher_a.num_hash_looked_up.load(Ordering::Relaxed), 2);
694
695        // The merged child cache should be removed.
696        assert!(cache_a_path.exists());
697        assert!(!cache_aa_path.exists());
698
699        Ok(())
700    }
701
702    #[test]
703    fn find_duplicates_with_exclude() -> anyhow::Result<()> {
704        let dir = tempfile::tempdir()?;
705
706        let file1_path = dir.path().join("same1.txt");
707        fs::write(&file1_path, "same content")?;
708
709        let file2_path = dir.path().join("same2.txt");
710        fs::write(&file2_path, "same content")?;
711
712        let exclude_path = dir.path().join("exclude.txt");
713        fs::write(&exclude_path, "same content")?;
714
715        let mut hasher = FileHasher::new(&[dir.path()])?;
716        hasher.buffer_size = 8192;
717        let mut builder = globset::GlobSetBuilder::new();
718        builder.add(
719            globset::GlobBuilder::new("exclude.txt")
720                .case_insensitive(true)
721                .build()?,
722        );
723        let filter = builder.build()?;
724        hasher.exclude = Some(filter);
725
726        let duplicates = hasher.find_duplicates()?;
727        assert_eq!(duplicates.len(), 1);
728        let group = &duplicates[0];
729        assert_eq!(group.paths.len(), 2);
730        assert!(group.paths.contains(&file1_path));
731        assert!(group.paths.contains(&file2_path));
732        assert!(!group.paths.contains(&exclude_path));
733        Ok(())
734    }
735
736    #[test]
737    fn check_mode_empty_cache() -> anyhow::Result<()> {
738        let dir = tempfile::tempdir()?;
739        let dir_path = dir.path().to_path_buf();
740        println!("{:?}", dir_path);
741        let file1_path = dir.path().join("file1.txt");
742        fs::write(&file1_path, "content 1")?;
743        let file2_path = dir.path().join("file2.txt");
744        fs::write(&file2_path, "content 2")?;
745
746        let mut hasher = FileHasher::new(&[&dir_path])?;
747        hasher.exclude = Some(default_exclude());
748        let (tx, rx) = mpsc::channel();
749        hasher.check_streaming(tx, false)?;
750        let mut results = Vec::new();
751        let mut start_seen = false;
752        let mut total_files = None;
753        let mut file_done_count = 0;
754        let mut num_error = 0;
755        while let Ok(event) = rx.recv() {
756            match event {
757                CheckEvent::StartChecking => start_seen = true,
758                CheckEvent::TotalFiles(total) => total_files = Some(total),
759                CheckEvent::Result(path, status) => results.push((path, status)),
760                CheckEvent::FileDone => file_done_count += 1,
761                CheckEvent::Error => num_error += 1,
762            }
763        }
764        assert!(start_seen);
765        assert_eq!(total_files, Some(2));
766        assert_eq!(file_done_count, 0);
767        assert_eq!(num_error, 0);
768
769        results.sort_by(|a, b| a.0.cmp(&b.0));
770        assert_eq!(results.len(), 2);
771        assert_eq!(results[0], (PathBuf::from("file1.txt"), CheckStatus::New));
772        assert_eq!(results[1], (PathBuf::from("file2.txt"), CheckStatus::New));
773
774        assert!(!dir.path().join(FileHashCache::FILE_NAME).exists());
775        Ok(())
776    }
777
778    #[test]
779    fn check_mode_with_cache() -> anyhow::Result<()> {
780        let dir = tempfile::tempdir()?;
781        let dir_path = dir.path().to_path_buf();
782        let file1_path = dir.path().join("file1.txt");
783        let file2_path = dir.path().join("file2.txt");
784        fs::write(&file1_path, "content 1")?;
785        fs::write(&file2_path, "content 2")?;
786
787        let mut hasher = FileHasher::new_with_cache(&[&dir_path])?;
788        hasher.exclude = Some(default_exclude());
789        let _hash1 = hasher.get_hash(&file1_path)?;
790        let _hash2 = hasher.get_hash(&file2_path)?;
791        hasher.save_cache()?;
792        assert!(dir.path().join(FileHashCache::FILE_NAME).exists());
793
794        let mut hasher = FileHasher::new(&[&dir_path])?;
795        hasher.exclude = Some(default_exclude());
796        let (tx, rx) = mpsc::channel();
797        hasher.check_streaming(tx, false)?;
798        let mut results = Vec::new();
799        let mut file_done_count = 0;
800        while let Ok(event) = rx.recv() {
801            match event {
802                CheckEvent::Result(path, status) => results.push((path, status)),
803                CheckEvent::FileDone => file_done_count += 1,
804                _ => {}
805            }
806        }
807        assert_eq!(results.len(), 0);
808        assert_eq!(file_done_count, 2);
809
810        fs::write(&file1_path, "content 1 modified")?;
811
812        let file2_meta_before = fs::metadata(&file2_path)?;
813        let mtime_before = file2_meta_before.modified()?;
814        std::thread::sleep(time::Duration::from_millis(10));
815        fs::write(&file2_path, "content 2")?;
816        let file2_meta_after = fs::metadata(&file2_path)?;
817        let mtime_after = file2_meta_after.modified()?;
818        assert!(mtime_after > mtime_before);
819
820        let mut hasher = FileHasher::new(&[&dir_path])?;
821        hasher.exclude = Some(default_exclude());
822        let (tx, rx) = mpsc::channel();
823        hasher.check_streaming(tx, false)?;
824        let mut results = Vec::new();
825        let mut file_done_count = 0;
826        while let Ok(event) = rx.recv() {
827            match event {
828                CheckEvent::Result(path, status) => results.push((path, status)),
829                CheckEvent::FileDone => file_done_count += 1,
830                _ => {}
831            }
832        }
833        assert_eq!(results.len(), 1);
834        assert_eq!(
835            results[0],
836            (PathBuf::from("file1.txt"), CheckStatus::Modified)
837        );
838        assert_eq!(file_done_count, 1);
839        Ok(())
840    }
841
842    #[test]
843    fn check_update_mode() -> anyhow::Result<()> {
844        let dir = tempfile::tempdir()?;
845        let dir_path = dir.path().to_path_buf();
846        let file1_path = dir.path().join("file1.txt");
847        fs::write(&file1_path, "content 1")?;
848
849        let mut hasher = FileHasher::new(&[&dir_path])?;
850        hasher.exclude = Some(default_exclude());
851        let (tx, rx) = mpsc::channel();
852        hasher.check_streaming(tx, true)?;
853        while rx.recv().is_ok() {}
854        hasher.save_cache()?;
855        assert!(dir.path().join(FileHashCache::FILE_NAME).exists());
856
857        let cache = FileHashCache::new(&dir_path);
858        let mtime1 = fs::metadata(&file1_path)?.modified()?;
859        let hash1 = cache.get(&PathBuf::from("file1.txt"), mtime1);
860        assert!(hash1.is_some());
861
862        std::thread::sleep(time::Duration::from_millis(10));
863        fs::write(&file1_path, "content 1 modified")?;
864        let mtime1_mod = fs::metadata(&file1_path)?.modified()?;
865
866        let mut hasher = FileHasher::new(&[&dir_path])?;
867        hasher.exclude = Some(default_exclude());
868        let (tx, rx) = mpsc::channel();
869        hasher.check_streaming(tx, true)?;
870        while rx.recv().is_ok() {}
871        hasher.save_cache()?;
872
873        let cache = FileHashCache::new(&dir_path);
874        let hash_mod = cache.get(&PathBuf::from("file1.txt"), mtime1_mod);
875        assert!(hash_mod.is_some());
876        assert_ne!(hash1, hash_mod);
877
878        std::thread::sleep(time::Duration::from_millis(10));
879        fs::write(&file1_path, "content 1 modified")?;
880        let mtime1_mod2 = fs::metadata(&file1_path)?.modified()?;
881        assert!(mtime1_mod2 > mtime1_mod);
882
883        assert!(
884            cache
885                .get(&PathBuf::from("file1.txt"), mtime1_mod2)
886                .is_none()
887        );
888
889        let mut hasher = FileHasher::new(&[&dir_path])?;
890        hasher.exclude = Some(default_exclude());
891        let (tx, rx) = mpsc::channel();
892        hasher.check_streaming(tx, true)?;
893        while rx.recv().is_ok() {}
894        hasher.save_cache()?;
895
896        let cache = FileHashCache::new(&dir_path);
897        assert!(
898            cache
899                .get(&PathBuf::from("file1.txt"), mtime1_mod2)
900                .is_some()
901        );
902        Ok(())
903    }
904
905    #[test]
906    fn check_cleanup_deleted_files() -> anyhow::Result<()> {
907        let dir = tempfile::tempdir()?;
908        let dir_path = dir.path().to_path_buf();
909        let file1_path = dir.path().join("file1.txt");
910        let file2_path = dir.path().join("file2.txt");
911        fs::write(&file1_path, "content 1")?;
912        fs::write(&file2_path, "content 2")?;
913        let mtime1 = fs::metadata(&file1_path)?.modified()?;
914        let mtime2 = fs::metadata(&file2_path)?.modified()?;
915
916        let mut hasher = FileHasher::new(&[&dir_path])?;
917        hasher.exclude = Some(default_exclude());
918        let (tx, rx) = mpsc::channel();
919        hasher.check_streaming(tx, true)?;
920        while rx.recv().is_ok() {}
921        hasher.save_cache()?;
922
923        // Verify both are in the cache
924        let cache = FileHashCache::new(&dir_path);
925        assert!(cache.get(&PathBuf::from("file1.txt"), mtime1).is_some());
926        assert!(cache.get(&PathBuf::from("file2.txt"), mtime2).is_some());
927
928        // Now delete file2 from disk
929        fs::remove_file(&file2_path)?;
930
931        // Run check and save again
932        let mut hasher = FileHasher::new(&[&dir_path])?;
933        hasher.exclude = Some(default_exclude());
934        let (tx, rx) = mpsc::channel();
935        hasher.check_streaming(tx, true)?;
936        while rx.recv().is_ok() {}
937        hasher.save_cache()?;
938
939        // Verify file2 is removed from cache, but file1 is still there
940        let cache = FileHashCache::new(&dir_path);
941        assert!(cache.get(&PathBuf::from("file2.txt"), mtime2).is_none());
942        assert!(cache.get(&PathBuf::from("file1.txt"), mtime1).is_some());
943        Ok(())
944    }
945
946    #[test]
947    fn find_duplicates_multiple_dirs() -> anyhow::Result<()> {
948        let tmp = tempfile::tempdir()?;
949        let dir1 = tmp.path().join("dir1");
950        let dir2 = tmp.path().join("dir2");
951        fs::create_dir(&dir1)?;
952        fs::create_dir(&dir2)?;
953        let file1_path = dir1.join("file1.txt");
954        fs::write(&file1_path, "same content")?;
955        let file2_path = dir2.join("file2.txt");
956        fs::write(&file2_path, "same content")?;
957        let hasher = FileHasher::new(&[&dir1, &dir2])?;
958        let duplicates = hasher.find_duplicates()?;
959        assert_eq!(duplicates.len(), 1);
960        let group = &duplicates[0];
961        assert_eq!(group.paths.len(), 2);
962        assert_eq!(group.size, 12);
963        assert!(group.paths.contains(&file1_path));
964        assert!(group.paths.contains(&file2_path));
965
966        Ok(())
967    }
968
969    #[test]
970    fn check_fails_with_multiple_dirs() -> anyhow::Result<()> {
971        let tmp = tempfile::tempdir()?;
972        let dir1 = tmp.path().join("dir1");
973        let dir2 = tmp.path().join("dir2");
974        fs::create_dir(&dir1)?;
975        fs::create_dir(&dir2)?;
976        let hasher = FileHasher::new(&[&dir1, &dir2])?;
977        assert!(hasher.check(false).is_err());
978        Ok(())
979    }
980}