Skip to main content

compare_dir/
file_hasher.rs

1use crate::{
2    ColumnFormatter, DirectoryComparer, FileComparer, FileHashCache, FileIterator, Progress,
3    ProgressBuilder,
4};
5use globset::GlobSet;
6use indicatif::FormattedDuration;
7use std::{
8    collections::HashMap,
9    fs,
10    io::{self, Read, stdout},
11    path::{Path, PathBuf},
12    sync::{
13        Arc,
14        atomic::{AtomicUsize, Ordering},
15        mpsc,
16    },
17    time,
18};
19
20#[derive(Debug, Clone)]
21enum HashProgress {
22    StartDiscovering,
23    TotalFiles(usize),
24    Result(PathBuf, u64, blake3::Hash, bool),
25    Error,
26}
27
/// Outcome of comparing a file's freshly computed hash with the cached hash.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
enum CheckStatus {
    /// The cached hash equals the computed hash.
    Unchanged,
    /// The cache has no hash for this path.
    New,
    /// The cached hash differs from the computed hash.
    Modified,
}
34
35#[derive(Debug, PartialEq)]
36enum CheckEvent {
37    StartChecking,
38    TotalFiles(usize),
39    Result(PathBuf, CheckStatus),
40    FileDone,
41    Error,
42}
43
/// State of one file-size bucket while searching for duplicates.
///
/// Files are only hashed once a second file of the same size shows up.
enum EntryState {
    /// Exactly one file of this size has been seen so far; its path and
    /// modification time are kept so it can be hashed later.
    Single(PathBuf, time::SystemTime),
    /// At least two files of this size were seen; every file of this size is
    /// hashed as soon as it is discovered.
    Hashing,
}
48
49/// A tool for finding duplicated files in a directory.
50pub struct FileHasher {
51    dirs: Vec<PathBuf>,
52    pub buffer_size: usize,
53    pub(crate) cache: Arc<FileHashCache>,
54    pub(crate) num_hashed: AtomicUsize,
55    pub(crate) num_hash_looked_up: AtomicUsize,
56    pub exclude: Option<GlobSet>,
57    pub progress: Option<Arc<ProgressBuilder>>,
58    pub is_yaml_format: bool,
59    pub jobs: usize,
60}
61
62impl FileHasher {
63    const DEFAULT_JOBS: usize = DirectoryComparer::DEFAULT_JOBS;
64
65    /// Creates a new `FileHasher` for the given directories.
66    pub fn new<P: AsRef<Path>>(dirs: &[P]) -> anyhow::Result<Self> {
67        if dirs.is_empty() {
68            anyhow::bail!("At least one directory must be specified.");
69        }
70        let common_ancestor = crate::common_ancestor(dirs)
71            .ok_or_else(|| anyhow::anyhow!("No common ancestor found"))?;
72        Ok(Self {
73            dirs: dirs.iter().map(|p| p.as_ref().to_path_buf()).collect(),
74            buffer_size: FileComparer::DEFAULT_BUFFER_SIZE,
75            cache: FileHashCache::find_or_new(&common_ancestor),
76            num_hashed: AtomicUsize::new(0),
77            num_hash_looked_up: AtomicUsize::new(0),
78            exclude: None,
79            progress: None,
80            is_yaml_format: false,
81            jobs: Self::DEFAULT_JOBS,
82        })
83    }
84
85    /// Remove a cache entry if it exists.
86    pub fn remove_cache_entry(&self, path: &Path) -> anyhow::Result<()> {
87        let relative = crate::strip_prefix(path, self.cache.base_dir())?;
88        self.cache.remove(relative);
89        Ok(())
90    }
91
92    /// Save the hash cache if it is dirty.
93    pub fn save_cache(&self) -> anyhow::Result<()> {
94        log::info!(
95            "Hash stats for {:?}: {} computed, {} looked up",
96            self.dirs,
97            self.num_hashed.load(Ordering::Relaxed),
98            self.num_hash_looked_up.load(Ordering::Relaxed)
99        );
100        Ok(self.cache.save()?)
101    }
102
103    /// Merges another cache into this hasher's cache.
104    pub(crate) fn merge_cache(&self, other_cache: &FileHashCache) {
105        self.cache.merge(other_cache);
106    }
107
108    /// Clears the loaded hashes in the cache.
109    pub fn clear_cache(&self) -> anyhow::Result<()> {
110        for dir in &self.dirs {
111            let relative = crate::strip_prefix(dir, self.cache.base_dir())?;
112            self.cache.clear(relative);
113        }
114        Ok(())
115    }
116
117    /// Executes the check/update process.
118    pub fn check(&self, update: bool) -> anyhow::Result<()> {
119        if self.dirs.len() > 1 {
120            anyhow::bail!("Check mode only supports one directory.");
121        }
122        let start_time = time::Instant::now();
123        let progress = self
124            .progress
125            .as_ref()
126            .map(|progress| progress.add_spinner())
127            .unwrap_or_else(Progress::none);
128        progress.set_message("Scanning directory...");
129        let mut num_new = 0;
130        let mut num_modified = 0;
131        let mut num_error = 0;
132        std::thread::scope(|scope| {
133            let (tx, rx) = mpsc::channel();
134            scope.spawn(|| {
135                if let Err(e) = self.check_streaming(tx, update) {
136                    log::error!("Error during check: {}", e);
137                }
138            });
139            while let Ok(event) = rx.recv() {
140                match event {
141                    CheckEvent::StartChecking => {
142                        progress.set_message("Checking files...");
143                    }
144                    CheckEvent::TotalFiles(total) => {
145                        progress.set_length(total as u64);
146                        progress.set_message("");
147                    }
148                    CheckEvent::Result(path, status) => {
149                        let symbol = match status {
150                            CheckStatus::New => {
151                                num_new += 1;
152                                '+'
153                            }
154                            CheckStatus::Modified => {
155                                num_modified += 1;
156                                '!'
157                            }
158                            CheckStatus::Unchanged => unreachable!(),
159                        };
160                        progress.inc(1);
161                        progress.suspend_for(stdout(), || {
162                            println!("{} {}", symbol, path.display());
163                        });
164                    }
165                    CheckEvent::FileDone => {
166                        progress.inc(1);
167                    }
168                    CheckEvent::Error => {
169                        progress.inc(1);
170                        num_error += 1;
171                    }
172                }
173            }
174        });
175        progress.finish();
176        if update {
177            self.save_cache()?;
178        }
179        self.print_check_summary(&start_time, num_new, num_modified, num_error)?;
180        Ok(())
181    }
182
183    fn print_check_summary(
184        &self,
185        start_time: &time::Instant,
186        num_new: usize,
187        num_modified: usize,
188        num_error: usize,
189    ) -> io::Result<()> {
190        let summary = [
191            ("Elapsed:", 0),
192            ("Hash computed:", self.num_hashed.load(Ordering::Relaxed)),
193            ("New files:", num_new),
194            ("Modified files:", num_modified),
195            ("Errors:", num_error),
196        ];
197        let formatter = ColumnFormatter::new(summary.iter().map(|(s, _)| *s));
198        let mut writer = std::io::stderr();
199        formatter.write_value(
200            &mut writer,
201            summary[0].0,
202            FormattedDuration(start_time.elapsed()),
203        )?;
204        formatter.write_values(&mut writer, &summary[1..])
205    }
206
207    fn check_streaming(&self, tx: mpsc::Sender<CheckEvent>, update: bool) -> anyhow::Result<()> {
208        std::thread::scope(|global_scope| {
209            let mut it = FileIterator::new(self.dirs[0].clone());
210            it.hasher = Some(self);
211            it.exclude = self.exclude.as_ref();
212            let it_rx = it.spawn_in_scope(global_scope);
213            tx.send(CheckEvent::StartChecking)?;
214            let pool = crate::build_thread_pool(self.jobs)?;
215            pool.scope(move |scope| -> anyhow::Result<()> {
216                let mut total_files = 0;
217                for (rel_path, abs_path) in it_rx {
218                    total_files += 1;
219                    let tx = tx.clone();
220                    scope.spawn(move |_| {
221                        let status = self.check_file(&abs_path, update);
222                        let event = match status {
223                            Ok(CheckStatus::New) | Ok(CheckStatus::Modified) => {
224                                CheckEvent::Result(rel_path, status.unwrap())
225                            }
226                            Ok(CheckStatus::Unchanged) => CheckEvent::FileDone,
227                            Err(e) => {
228                                log::error!("Failed to check file {:?}: {}", rel_path, e);
229                                CheckEvent::Error
230                            }
231                        };
232                        if tx.send(event).is_err() {
233                            log::error!("Send failed");
234                        }
235                    });
236                }
237                tx.send(CheckEvent::TotalFiles(total_files))?;
238                Ok(())
239            })
240        })?;
241        Ok(())
242    }
243
244    fn check_file(&self, abs_path: &Path, update: bool) -> anyhow::Result<CheckStatus> {
245        assert!(abs_path.is_absolute());
246        let computed_hash = self.compute_hash(abs_path)?;
247        let rel_path = crate::strip_prefix(abs_path, self.cache.base_dir())?;
248        let cached_hash = self.cache.get_by_path(rel_path);
249        let status = match cached_hash {
250            None => CheckStatus::New,
251            Some(cached) => {
252                if computed_hash != cached {
253                    CheckStatus::Modified
254                } else {
255                    CheckStatus::Unchanged
256                }
257            }
258        };
259        if update {
260            let modified = fs::metadata(abs_path)?.modified()?;
261            match status {
262                CheckStatus::New | CheckStatus::Modified => {
263                    self.cache.insert(rel_path, modified, computed_hash);
264                }
265                CheckStatus::Unchanged => {
266                    if self.cache.get(rel_path, modified).is_none() {
267                        self.cache.insert(rel_path, modified, computed_hash);
268                    }
269                }
270            }
271        }
272        Ok(status)
273    }
274
275    /// Executes the duplicate file finding process and prints results.
276    pub fn run(&self) -> anyhow::Result<()> {
277        let start_time = time::Instant::now();
278        let mut duplicates = self.find_duplicates()?;
279        let mut total_wasted_space = 0;
280        if !duplicates.is_empty() {
281            duplicates.sort_by_key(|a| a.size);
282            for dupes in &duplicates {
283                if self.is_yaml_format {
284                    dupes.write_yaml(std::io::stdout())?;
285                } else {
286                    dupes.write_human(std::io::stdout())?;
287                }
288                total_wasted_space += dupes.wasted_size();
289            }
290        }
291        self.print_duplicates_summary(&start_time, total_wasted_space)?;
292        Ok(())
293    }
294
295    fn print_duplicates_summary(
296        &self,
297        start_time: &time::Instant,
298        total_wasted_space: u64,
299    ) -> io::Result<()> {
300        let summary = [
301            (
302                "Elapsed:",
303                FormattedDuration(start_time.elapsed()).to_string(),
304            ),
305            (
306                "Total wasted space:",
307                crate::human_readable_size(total_wasted_space),
308            ),
309        ];
310        let formatter = ColumnFormatter::new(summary.iter().map(|(s, _)| *s));
311        formatter.write_values(&mut io::stderr(), &summary)
312    }
313
314    /// Finds duplicated files and returns a list of duplicate groups.
315    pub fn find_duplicates(&self) -> anyhow::Result<Vec<DuplicatedFiles>> {
316        let progress = self
317            .progress
318            .as_ref()
319            .map(|progress| progress.add_spinner())
320            .unwrap_or_else(Progress::none);
321        progress.set_message("Scanning directories...");
322
323        let (tx, rx) = mpsc::channel();
324        let mut by_hash: HashMap<blake3::Hash, DuplicatedFiles> = HashMap::new();
325        let mut num_cache_hits = 0;
326        std::thread::scope(|scope| {
327            scope.spawn(|| {
328                if let Err(e) = self.find_duplicates_streaming(tx) {
329                    log::error!("Error during duplicate finding: {}", e);
330                }
331            });
332
333            while let Ok(event) = rx.recv() {
334                match event {
335                    HashProgress::StartDiscovering => {
336                        progress.set_message("Hashing files...");
337                    }
338                    HashProgress::TotalFiles(total) => {
339                        progress.set_length(total as u64);
340                        if num_cache_hits > 0 {
341                            progress.set_message(format!(" ({} cache hits)", num_cache_hits));
342                        }
343                    }
344                    HashProgress::Result(path, size, hash, is_cache_hit) => {
345                        if is_cache_hit {
346                            num_cache_hits += 1;
347                            if progress.length().is_none() {
348                                progress.set_message(format!(
349                                    "Hashing files... ({} cache hits)",
350                                    num_cache_hits
351                                ));
352                            } else {
353                                progress.set_message(format!(" ({} cache hits)", num_cache_hits));
354                            }
355                        }
356
357                        progress.inc(1);
358                        let entry = by_hash.entry(hash).or_insert_with(|| DuplicatedFiles {
359                            paths: Vec::new(),
360                            size,
361                        });
362                        // Hash collisions shouldn't happen, but if they do, sizes shouldn't mismatch.
363                        assert_eq!(entry.size, size, "Hash collision: sizes do not match");
364                        entry.paths.push(path);
365                    }
366                    HashProgress::Error => {
367                        progress.inc(1);
368                    }
369                }
370            }
371        });
372        progress.finish();
373
374        let mut duplicates = Vec::new();
375        for (_, mut dupes) in by_hash {
376            if dupes.paths.len() > 1 {
377                dupes.paths.sort();
378                duplicates.push(dupes);
379            }
380        }
381        Ok(duplicates)
382    }
383
384    fn find_duplicates_streaming(&self, tx: mpsc::Sender<HashProgress>) -> anyhow::Result<()> {
385        tx.send(HashProgress::StartDiscovering)?;
386        let mut by_size: HashMap<u64, EntryState> = HashMap::new();
387        let mut total_hashed = 0;
388        std::thread::scope(|global_scope| {
389            let (it_tx, it_rx) = mpsc::channel();
390            for dir in &self.dirs {
391                let it_tx = it_tx.clone();
392                let mut it = FileIterator::new(dir.clone());
393                it.hasher = Some(self);
394                it.exclude = self.exclude.as_ref();
395                it.spawn_in_scope_with_sender(global_scope, it_tx);
396            }
397            drop(it_tx);
398
399            let pool = crate::build_thread_pool(self.jobs)?;
400            pool.scope(move |scope| -> anyhow::Result<()> {
401                for (_, current_path) in it_rx {
402                    let meta = fs::metadata(&current_path)?;
403                    let size = meta.len();
404                    let modified = meta.modified()?;
405
406                    // Small optimization: If file size is 0, it's not really worth treating
407                    // as wasted space duplicates in the same way, but keeping it unified for now.
408                    match by_size.entry(size) {
409                        std::collections::hash_map::Entry::Occupied(mut occ) => match occ.get_mut()
410                        {
411                            EntryState::Single(first_path, first_modified) => {
412                                // We found a second file of identical size.
413                                // Time to start hashing both the *original* matching file and the *new* one!
414                                self.spawn_hash_task(scope, first_path, size, *first_modified, &tx);
415                                self.spawn_hash_task(scope, &current_path, size, modified, &tx);
416
417                                // Modify the state to indicate we are now fully hashing this size bucket.
418                                *occ.get_mut() = EntryState::Hashing;
419                                total_hashed += 2;
420                            }
421                            EntryState::Hashing => {
422                                // File size bucket already hashing; just dynamically spawn the new file immediately.
423                                self.spawn_hash_task(scope, &current_path, size, modified, &tx);
424                                total_hashed += 1;
425                            }
426                        },
427                        std::collections::hash_map::Entry::Vacant(vac) => {
428                            vac.insert(EntryState::Single(current_path, modified));
429                        }
430                    }
431                }
432                tx.send(HashProgress::TotalFiles(total_hashed))?;
433                Ok(())
434            })
435        })?;
436
437        // The scope waits for all spawned tasks to complete.
438        // Channel `tx` gets naturally closed when it drops at the end of this function.
439        self.save_cache()
440    }
441
442    fn spawn_hash_task<'scope>(
443        &'scope self,
444        scope: &rayon::Scope<'scope>,
445        path: &Path,
446        size: u64,
447        modified: time::SystemTime,
448        tx: &mpsc::Sender<HashProgress>,
449    ) {
450        let relative = crate::strip_prefix(path, self.cache.base_dir())
451            .expect("path should be in cache base_dir");
452        if let Some(hash) = self.cache.get(relative, modified) {
453            self.num_hash_looked_up.fetch_add(1, Ordering::Relaxed);
454            let _ = tx.send(HashProgress::Result(path.to_path_buf(), size, hash, true));
455            return;
456        }
457
458        let path_owned = path.to_path_buf();
459        let relative_owned = relative.to_path_buf();
460        let tx_owned = tx.clone();
461        scope.spawn(move |_| {
462            if let Ok(hash) = self.compute_hash(&path_owned) {
463                self.cache.insert(&relative_owned, modified, hash);
464                let _ = tx_owned.send(HashProgress::Result(path_owned, size, hash, false));
465            } else {
466                log::error!("Failed to hash file: {:?}", path_owned);
467                let _ = tx_owned.send(HashProgress::Error);
468            }
469        });
470    }
471
472    /// Gets the hash of a file, using the cache if available.
473    pub fn get_hash(&self, path: &Path) -> io::Result<blake3::Hash> {
474        let meta = fs::metadata(path)?;
475        let modified = meta.modified()?;
476        let relative = crate::strip_prefix(path, self.cache.base_dir())
477            .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;
478        if let Some(hash) = self.cache.get(relative, modified) {
479            self.num_hash_looked_up.fetch_add(1, Ordering::Relaxed);
480            return Ok(hash);
481        }
482
483        let hash = self.compute_hash(path)?;
484        self.cache.insert(relative, modified, hash);
485        Ok(hash)
486    }
487
488    fn compute_hash(&self, path: &Path) -> io::Result<blake3::Hash> {
489        let start_time = time::Instant::now();
490        let mut f = fs::File::open(path)?;
491        let len = f.metadata()?.len();
492        let progress = self
493            .progress
494            .as_ref()
495            .map(|progress| progress.add_file(path, len))
496            .unwrap_or_else(Progress::none);
497        let mut hasher = blake3::Hasher::new();
498        if self.buffer_size == 0 {
499            if len > 0 {
500                let mmap = unsafe { memmap2::MmapOptions::new().map(&f)? };
501                hasher.update(&mmap[..]);
502                progress.inc(len);
503            }
504        } else {
505            let mut buf = vec![0u8; self.buffer_size];
506            loop {
507                let n = f.read(&mut buf)?;
508                if n == 0 {
509                    break;
510                }
511                hasher.update(&buf[..n]);
512                progress.inc(n as u64);
513            }
514        }
515        progress.finish();
516        self.num_hashed.fetch_add(1, Ordering::Relaxed);
517        let hash = hasher.finalize();
518        log::debug!(
519            "Computed hash in {}: {:?}",
520            FormattedDuration(start_time.elapsed()),
521            path
522        );
523        Ok(hash)
524    }
525}
526
/// A group of duplicated files and their size.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct DuplicatedFiles {
    /// Paths of the files that share the same content hash.
    pub paths: Vec<PathBuf>,
    /// Size in bytes of each file in the group.
    pub size: u64,
}
533
534impl DuplicatedFiles {
535    fn write_human(&self, mut writer: impl io::Write) -> anyhow::Result<()> {
536        writeln!(
537            writer,
538            "Identical {} files of {}:",
539            self.paths.len(),
540            crate::human_readable_size(self.size)
541        )?;
542        for path in &self.paths {
543            writeln!(writer, "  {}", path.display())?;
544        }
545        Ok(())
546    }
547
548    fn write_yaml(&self, mut writer: impl io::Write) -> anyhow::Result<()> {
549        writeln!(writer, "- paths:")?;
550        for path in &self.paths {
551            writeln!(writer, "  - {:?}", path)?;
552        }
553        writeln!(writer, "  size: {}", self.size)?;
554        Ok(())
555    }
556
557    fn wasted_size(&self) -> u64 {
558        self.size * (self.paths.len() as u64 - 1)
559    }
560}
561
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a glob set that excludes the `.hash_cache` file itself.
    fn default_exclude() -> globset::GlobSet {
        let mut builder = globset::GlobSetBuilder::new();
        builder.add(
            globset::GlobBuilder::new(".hash_cache")
                .case_insensitive(true)
                .build()
                .unwrap(),
        );
        builder.build().unwrap()
    }

    /// Two identical files form one group; the file with a unique size is never hashed.
    #[test]
    fn find_duplicates() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;

        let file1_path = dir.path().join("same1.txt");
        fs::write(&file1_path, "same content")?;

        let file2_path = dir.path().join("same2.txt");
        fs::write(&file2_path, "same content")?;

        let diff_path = dir.path().join("diff.txt");
        fs::write(&diff_path, "different content")?;

        let mut hasher = FileHasher::new(&[dir.path()])?;
        hasher.buffer_size = 8192;
        let duplicates = hasher.find_duplicates()?;

        assert_eq!(hasher.num_hashed.load(Ordering::Relaxed), 2);
        assert_eq!(hasher.num_hash_looked_up.load(Ordering::Relaxed), 0);

        assert_eq!(duplicates.len(), 1);
        let group = &duplicates[0];
        assert_eq!(group.paths.len(), 2);
        assert_eq!(group.size, 12); // "same content" is 12 bytes

        assert!(group.paths.contains(&file1_path));
        assert!(group.paths.contains(&file2_path));

        Ok(())
    }

    /// Hashes cached in a child directory are reused by a scan of the parent.
    #[test]
    fn find_duplicates_merge_cache() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;
        let dir_path = dir.path();

        let sub_dir = dir_path.join("a").join("a");
        fs::create_dir_all(&sub_dir)?;

        let file1_path = sub_dir.join("1");
        fs::write(&file1_path, "same content")?;

        let file2_path = sub_dir.join("2");
        fs::write(&file2_path, "same content")?;

        // Create empty cache file in a/a to force it to be the cache base
        let cache_aa_path = sub_dir.join(FileHashCache::FILE_NAME);
        fs::File::create(&cache_aa_path)?;

        // Run find_duplicates on a/a
        let hasher_aa = FileHasher::new(&[&sub_dir])?;
        let duplicates_aa = hasher_aa.find_duplicates()?;
        assert_eq!(duplicates_aa.len(), 1);
        assert!(cache_aa_path.exists());
        assert_eq!(hasher_aa.num_hashed.load(Ordering::Relaxed), 2);
        assert_eq!(hasher_aa.num_hash_looked_up.load(Ordering::Relaxed), 0);

        // Create empty cache file in a to force it to be the cache base
        let root_a = dir_path.join("a");
        let cache_a_path = root_a.join(FileHashCache::FILE_NAME);
        fs::File::create(&cache_a_path)?;

        // Run find_duplicates on a
        let hasher_a = FileHasher::new(&[&root_a])?;
        let duplicates_a = hasher_a.find_duplicates()?;
        assert_eq!(duplicates_a.len(), 1);
        assert_eq!(hasher_a.num_hashed.load(Ordering::Relaxed), 0);
        assert_eq!(hasher_a.num_hash_looked_up.load(Ordering::Relaxed), 2);

        // The merged child cache should be removed.
        assert!(cache_a_path.exists());
        assert!(!cache_aa_path.exists());

        Ok(())
    }

    /// Files matching the exclude glob are left out of duplicate groups.
    #[test]
    fn find_duplicates_with_exclude() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;

        let file1_path = dir.path().join("same1.txt");
        fs::write(&file1_path, "same content")?;

        let file2_path = dir.path().join("same2.txt");
        fs::write(&file2_path, "same content")?;

        let exclude_path = dir.path().join("exclude.txt");
        fs::write(&exclude_path, "same content")?;

        let mut hasher = FileHasher::new(&[dir.path()])?;
        hasher.buffer_size = 8192;
        let mut builder = globset::GlobSetBuilder::new();
        builder.add(
            globset::GlobBuilder::new("exclude.txt")
                .case_insensitive(true)
                .build()?,
        );
        let filter = builder.build()?;
        hasher.exclude = Some(filter);

        let duplicates = hasher.find_duplicates()?;
        assert_eq!(duplicates.len(), 1);
        let group = &duplicates[0];
        assert_eq!(group.paths.len(), 2);
        assert!(group.paths.contains(&file1_path));
        assert!(group.paths.contains(&file2_path));
        assert!(!group.paths.contains(&exclude_path));
        Ok(())
    }

    /// Without a cache every file is reported as new, and check mode without
    /// `update` does not create a cache file.
    #[test]
    fn check_mode_empty_cache() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;
        let dir_path = dir.path().to_path_buf();
        let file1_path = dir.path().join("file1.txt");
        fs::write(&file1_path, "content 1")?;
        let file2_path = dir.path().join("file2.txt");
        fs::write(&file2_path, "content 2")?;

        let mut hasher = FileHasher::new(&[&dir_path])?;
        hasher.exclude = Some(default_exclude());
        let (tx, rx) = mpsc::channel();
        hasher.check_streaming(tx, false)?;
        let mut results = Vec::new();
        let mut start_seen = false;
        let mut total_files = None;
        let mut file_done_count = 0;
        let mut num_error = 0;
        while let Ok(event) = rx.recv() {
            match event {
                CheckEvent::StartChecking => start_seen = true,
                CheckEvent::TotalFiles(total) => total_files = Some(total),
                CheckEvent::Result(path, status) => results.push((path, status)),
                CheckEvent::FileDone => file_done_count += 1,
                CheckEvent::Error => num_error += 1,
            }
        }
        assert!(start_seen);
        assert_eq!(total_files, Some(2));
        assert_eq!(file_done_count, 0);
        assert_eq!(num_error, 0);

        results.sort_by(|a, b| a.0.cmp(&b.0));
        assert_eq!(results.len(), 2);
        assert_eq!(results[0], (PathBuf::from("file1.txt"), CheckStatus::New));
        assert_eq!(results[1], (PathBuf::from("file2.txt"), CheckStatus::New));

        assert!(!dir.path().join(FileHashCache::FILE_NAME).exists());
        Ok(())
    }

    /// With a populated cache only content changes are reported; a rewrite
    /// with identical content (new mtime only) counts as unchanged.
    #[test]
    fn check_mode_with_cache() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;
        let dir_path = dir.path().to_path_buf();
        let file1_path = dir.path().join("file1.txt");
        fs::write(&file1_path, "content 1")?;
        let file2_path = dir.path().join("file2.txt");
        fs::write(&file2_path, "content 2")?;

        let mut hasher = FileHasher::new(&[&dir_path])?;
        hasher.exclude = Some(default_exclude());
        let _hash1 = hasher.get_hash(&file1_path)?;
        let _hash2 = hasher.get_hash(&file2_path)?;
        hasher.save_cache()?;
        assert!(dir.path().join(FileHashCache::FILE_NAME).exists());

        let mut hasher = FileHasher::new(&[&dir_path])?;
        hasher.exclude = Some(default_exclude());
        let (tx, rx) = mpsc::channel();
        hasher.check_streaming(tx, false)?;
        let mut results = Vec::new();
        let mut file_done_count = 0;
        while let Ok(event) = rx.recv() {
            match event {
                CheckEvent::Result(path, status) => results.push((path, status)),
                CheckEvent::FileDone => file_done_count += 1,
                _ => {}
            }
        }
        assert_eq!(results.len(), 0);
        assert_eq!(file_done_count, 2);

        fs::write(&file1_path, "content 1 modified")?;

        let file2_meta_before = fs::metadata(&file2_path)?;
        let mtime_before = file2_meta_before.modified()?;
        std::thread::sleep(time::Duration::from_millis(10));
        fs::write(&file2_path, "content 2")?;
        let file2_meta_after = fs::metadata(&file2_path)?;
        let mtime_after = file2_meta_after.modified()?;
        assert!(mtime_after > mtime_before);

        let mut hasher = FileHasher::new(&[&dir_path])?;
        hasher.exclude = Some(default_exclude());
        let (tx, rx) = mpsc::channel();
        hasher.check_streaming(tx, false)?;
        let mut results = Vec::new();
        let mut file_done_count = 0;
        while let Ok(event) = rx.recv() {
            match event {
                CheckEvent::Result(path, status) => results.push((path, status)),
                CheckEvent::FileDone => file_done_count += 1,
                _ => {}
            }
        }
        assert_eq!(results.len(), 1);
        assert_eq!(
            results[0],
            (PathBuf::from("file1.txt"), CheckStatus::Modified)
        );
        assert_eq!(file_done_count, 1);
        Ok(())
    }

    /// Update mode records new hashes, replaces changed ones, and refreshes
    /// the cached mtime of files whose content is unchanged.
    #[test]
    fn check_update_mode() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;
        let dir_path = dir.path().to_path_buf();
        let file1_path = dir.path().join("file1.txt");
        fs::write(&file1_path, "content 1")?;

        let mut hasher = FileHasher::new(&[&dir_path])?;
        hasher.exclude = Some(default_exclude());
        let (tx, rx) = mpsc::channel();
        hasher.check_streaming(tx, true)?;
        while rx.recv().is_ok() {}
        hasher.save_cache()?;
        assert!(dir.path().join(FileHashCache::FILE_NAME).exists());

        let cache = FileHashCache::new(&dir_path);
        let mtime1 = fs::metadata(&file1_path)?.modified()?;
        let hash1 = cache.get(&PathBuf::from("file1.txt"), mtime1);
        assert!(hash1.is_some());

        std::thread::sleep(time::Duration::from_millis(10));
        fs::write(&file1_path, "content 1 modified")?;
        let mtime1_mod = fs::metadata(&file1_path)?.modified()?;

        let mut hasher = FileHasher::new(&[&dir_path])?;
        hasher.exclude = Some(default_exclude());
        let (tx, rx) = mpsc::channel();
        hasher.check_streaming(tx, true)?;
        while rx.recv().is_ok() {}
        hasher.save_cache()?;

        let cache = FileHashCache::new(&dir_path);
        let hash_mod = cache.get(&PathBuf::from("file1.txt"), mtime1_mod);
        assert!(hash_mod.is_some());
        assert_ne!(hash1, hash_mod);

        // Rewrite the same content: only the modification time changes.
        std::thread::sleep(time::Duration::from_millis(10));
        fs::write(&file1_path, "content 1 modified")?;
        let mtime1_mod2 = fs::metadata(&file1_path)?.modified()?;
        assert!(mtime1_mod2 > mtime1_mod);

        assert!(
            cache
                .get(&PathBuf::from("file1.txt"), mtime1_mod2)
                .is_none()
        );

        let mut hasher = FileHasher::new(&[&dir_path])?;
        hasher.exclude = Some(default_exclude());
        let (tx, rx) = mpsc::channel();
        hasher.check_streaming(tx, true)?;
        while rx.recv().is_ok() {}
        hasher.save_cache()?;

        let cache = FileHashCache::new(&dir_path);
        assert!(
            cache
                .get(&PathBuf::from("file1.txt"), mtime1_mod2)
                .is_some()
        );
        Ok(())
    }

    /// Duplicates are found across separately specified directories.
    #[test]
    fn find_duplicates_multiple_dirs() -> anyhow::Result<()> {
        let tmp = tempfile::tempdir()?;
        let dir1 = tmp.path().join("dir1");
        let dir2 = tmp.path().join("dir2");
        fs::create_dir(&dir1)?;
        fs::create_dir(&dir2)?;
        let file1_path = dir1.join("file1.txt");
        fs::write(&file1_path, "same content")?;
        let file2_path = dir2.join("file2.txt");
        fs::write(&file2_path, "same content")?;
        let hasher = FileHasher::new(&[&dir1, &dir2])?;
        let duplicates = hasher.find_duplicates()?;
        assert_eq!(duplicates.len(), 1);
        let group = &duplicates[0];
        assert_eq!(group.paths.len(), 2);
        assert_eq!(group.size, 12);
        assert!(group.paths.contains(&file1_path));
        assert!(group.paths.contains(&file2_path));

        Ok(())
    }

    /// Check mode rejects more than one directory.
    #[test]
    fn check_fails_with_multiple_dirs() -> anyhow::Result<()> {
        let tmp = tempfile::tempdir()?;
        let dir1 = tmp.path().join("dir1");
        let dir2 = tmp.path().join("dir2");
        fs::create_dir(&dir1)?;
        fs::create_dir(&dir2)?;
        let hasher = FileHasher::new(&[&dir1, &dir2])?;
        assert!(hasher.check(false).is_err());
        Ok(())
    }
}
890}