// compare_dir/file_hasher.rs

1use crate::{
2    ColumnFormatter, DirectoryComparer, FileComparer, FileHashCache, FileIterator, Progress,
3    ProgressBuilder,
4};
5use globset::GlobSet;
6use indicatif::FormattedDuration;
7use std::{
8    collections::HashMap,
9    fs,
10    io::{self, Read, stdout},
11    path::{Path, PathBuf},
12    sync::{
13        Arc,
14        atomic::{AtomicUsize, Ordering},
15        mpsc,
16    },
17    time,
18};
19
20#[derive(Debug, Clone)]
21enum HashProgress {
22    StartDiscovering,
23    TotalFiles(usize),
24    Result(PathBuf, u64, blake3::Hash, bool),
25    Error,
26}
27
/// Outcome of comparing a file's freshly computed hash against the cache.
///
/// The derived ordering follows declaration order:
/// `Unchanged < New < Modified`.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
enum CheckStatus {
    /// The computed hash equals the cached hash.
    Unchanged,
    /// The cache has no hash for this path.
    New,
    /// The computed hash differs from the cached hash.
    Modified,
}
34
35#[derive(Debug, PartialEq)]
36enum CheckEvent {
37    StartChecking,
38    TotalFiles(usize),
39    Result(PathBuf, CheckStatus),
40    FileDone,
41    Error,
42}
43
/// State of one file-size bucket while scanning for duplicates.
enum EntryState {
    /// Exactly one file of this size has been seen so far: its path and
    /// modification time. It is not hashed until a second file of the same
    /// size shows up.
    Single(PathBuf, time::SystemTime),
    /// Two or more files of this size have been seen; every file in the bucket
    /// is submitted for hashing as soon as it is discovered.
    Hashing,
}
48
49/// A tool for finding duplicated files in a directory.
50pub struct FileHasher {
51    dirs: Vec<PathBuf>,
52    pub buffer_size: usize,
53    cache: Arc<FileHashCache>,
54    num_hashed: AtomicUsize,
55    num_hash_looked_up: AtomicUsize,
56    pub exclude: Option<GlobSet>,
57    pub progress: Option<Arc<ProgressBuilder>>,
58    pub is_yaml_format: bool,
59    pub jobs: usize,
60}
61
62impl FileHasher {
63    const DEFAULT_JOBS: usize = DirectoryComparer::DEFAULT_JOBS;
64
65    /// Creates a new `FileHasher` for the given directories.
66    pub fn new<P: AsRef<Path>>(dirs: &[P]) -> anyhow::Result<Self> {
67        if dirs.is_empty() {
68            anyhow::bail!("At least one directory must be specified.");
69        }
70        let common_ancestor = crate::common_ancestor(dirs)
71            .ok_or_else(|| anyhow::anyhow!("No common ancestor found"))?;
72        Ok(Self {
73            dirs: dirs.iter().map(|p| p.as_ref().to_path_buf()).collect(),
74            buffer_size: FileComparer::DEFAULT_BUFFER_SIZE,
75            cache: FileHashCache::find_or_new(&common_ancestor),
76            num_hashed: AtomicUsize::new(0),
77            num_hash_looked_up: AtomicUsize::new(0),
78            exclude: None,
79            progress: None,
80            is_yaml_format: false,
81            jobs: Self::DEFAULT_JOBS,
82        })
83    }
84
85    /// Remove a cache entry if it exists.
86    pub fn remove_cache_entry(&self, path: &Path) -> anyhow::Result<()> {
87        let relative = crate::strip_prefix(path, self.cache.base_dir())?;
88        self.cache.remove(relative);
89        Ok(())
90    }
91
92    /// Save the hash cache if it is dirty.
93    pub fn save_cache(&self) -> anyhow::Result<()> {
94        log::info!(
95            "Hash stats for {:?}: {} computed, {} looked up",
96            self.dirs,
97            self.num_hashed.load(Ordering::Relaxed),
98            self.num_hash_looked_up.load(Ordering::Relaxed)
99        );
100        Ok(self.cache.save()?)
101    }
102
103    /// Merges another cache into this hasher's cache.
104    pub(crate) fn merge_cache(&self, other_cache: &FileHashCache) {
105        self.cache.merge(other_cache);
106    }
107
108    /// Clears the loaded hashes in the cache.
109    pub fn clear_cache(&self) -> anyhow::Result<()> {
110        for dir in &self.dirs {
111            let relative = crate::strip_prefix(dir, self.cache.base_dir())?;
112            self.cache.clear(relative);
113        }
114        Ok(())
115    }
116
117    /// Executes the check/update process.
118    pub fn check(&self, update: bool) -> anyhow::Result<()> {
119        if self.dirs.len() > 1 {
120            anyhow::bail!("Check mode only supports one directory.");
121        }
122        let start_time = time::Instant::now();
123        let progress = self
124            .progress
125            .as_ref()
126            .map(|progress| progress.add_spinner())
127            .unwrap_or_else(Progress::none);
128        progress.set_message("Scanning directory...");
129        let mut num_new = 0;
130        let mut num_modified = 0;
131        let mut num_error = 0;
132        std::thread::scope(|scope| {
133            let (tx, rx) = mpsc::channel();
134            scope.spawn(|| {
135                if let Err(e) = self.check_streaming(tx, update) {
136                    log::error!("Error during check: {}", e);
137                }
138            });
139            while let Ok(event) = rx.recv() {
140                match event {
141                    CheckEvent::StartChecking => {
142                        progress.set_message("Checking files...");
143                    }
144                    CheckEvent::TotalFiles(total) => {
145                        progress.set_length(total as u64);
146                        progress.set_message("");
147                    }
148                    CheckEvent::Result(path, status) => {
149                        let symbol = match status {
150                            CheckStatus::New => {
151                                num_new += 1;
152                                '+'
153                            }
154                            CheckStatus::Modified => {
155                                num_modified += 1;
156                                '!'
157                            }
158                            CheckStatus::Unchanged => unreachable!(),
159                        };
160                        progress.inc(1);
161                        progress.suspend_for(stdout(), || {
162                            println!("{} {}", symbol, path.display());
163                        });
164                    }
165                    CheckEvent::FileDone => {
166                        progress.inc(1);
167                    }
168                    CheckEvent::Error => {
169                        progress.inc(1);
170                        num_error += 1;
171                    }
172                }
173            }
174        });
175        progress.finish();
176        self.print_check_summary(&start_time, num_new, num_modified, num_error)?;
177        Ok(())
178    }
179
180    fn print_check_summary(
181        &self,
182        start_time: &time::Instant,
183        num_new: usize,
184        num_modified: usize,
185        num_error: usize,
186    ) -> io::Result<()> {
187        let summary = [
188            ("Elapsed:", 0),
189            ("Hash computed:", self.num_hashed.load(Ordering::Relaxed)),
190            ("New files:", num_new),
191            ("Modified files:", num_modified),
192            ("Errors:", num_error),
193        ];
194        let formatter = ColumnFormatter::new(summary.iter().map(|(s, _)| *s));
195        let mut writer = std::io::stderr();
196        formatter.write_value(
197            &mut writer,
198            summary[0].0,
199            FormattedDuration(start_time.elapsed()),
200        )?;
201        formatter.write_values(&mut writer, &summary[1..])
202    }
203
204    fn check_streaming(&self, tx: mpsc::Sender<CheckEvent>, update: bool) -> anyhow::Result<()> {
205        let base_dir = &self.dirs[0];
206        let relative = crate::strip_prefix(base_dir, self.cache.base_dir())?;
207        self.cache.set_remove_if_no_access(relative);
208        std::thread::scope(|global_scope| {
209            let mut it = FileIterator::new(base_dir.clone());
210            it.hasher = Some(self);
211            it.exclude = self.exclude.as_ref();
212            let it_rx = it.spawn_in_scope(global_scope);
213            tx.send(CheckEvent::StartChecking)?;
214            let pool = crate::build_thread_pool(self.jobs)?;
215            pool.scope(move |scope| -> anyhow::Result<()> {
216                let mut total_files = 0;
217                for path in it_rx {
218                    total_files += 1;
219                    let tx = tx.clone();
220                    scope.spawn(move |_| {
221                        let status = self.check_file(&path, update);
222                        let event = match status {
223                            Ok(CheckStatus::New) | Ok(CheckStatus::Modified) => {
224                                let rel_path = crate::strip_prefix(&path, base_dir).unwrap();
225                                CheckEvent::Result(rel_path.into(), status.unwrap())
226                            }
227                            Ok(CheckStatus::Unchanged) => CheckEvent::FileDone,
228                            Err(e) => {
229                                log::error!("Failed to check file {:?}: {}", path, e);
230                                CheckEvent::Error
231                            }
232                        };
233                        if tx.send(event).is_err() {
234                            log::error!("Send failed");
235                        }
236                    });
237                }
238                tx.send(CheckEvent::TotalFiles(total_files))?;
239                Ok(())
240            })
241        })?;
242        self.save_cache()?;
243        Ok(())
244    }
245
246    fn check_file(&self, abs_path: &Path, update: bool) -> anyhow::Result<CheckStatus> {
247        assert!(abs_path.is_absolute());
248        let computed_hash = self.compute_hash(abs_path)?;
249        let rel_path = crate::strip_prefix(abs_path, self.cache.base_dir())?;
250        let cached_hash = self.cache.get_by_path(rel_path);
251        let status = match cached_hash {
252            None => CheckStatus::New,
253            Some(cached) => {
254                if computed_hash != cached {
255                    CheckStatus::Modified
256                } else {
257                    CheckStatus::Unchanged
258                }
259            }
260        };
261        if update {
262            let modified = fs::metadata(abs_path)?.modified()?;
263            match status {
264                CheckStatus::New | CheckStatus::Modified => {
265                    self.cache.insert(rel_path, modified, computed_hash);
266                }
267                CheckStatus::Unchanged => {
268                    if self.cache.get(rel_path, modified).is_none() {
269                        self.cache.insert(rel_path, modified, computed_hash);
270                    }
271                }
272            }
273        }
274        Ok(status)
275    }
276
277    /// Executes the duplicate file finding process and prints results.
278    pub fn run(&self) -> anyhow::Result<()> {
279        let start_time = time::Instant::now();
280        let mut duplicates = self.find_duplicates()?;
281        let mut total_wasted_space = 0;
282        if !duplicates.is_empty() {
283            duplicates.sort_by_key(|a| a.size);
284            for dupes in &duplicates {
285                if self.is_yaml_format {
286                    dupes.write_yaml(std::io::stdout())?;
287                } else {
288                    dupes.write_human(std::io::stdout())?;
289                }
290                total_wasted_space += dupes.wasted_size();
291            }
292        }
293        self.print_duplicates_summary(&start_time, total_wasted_space)?;
294        Ok(())
295    }
296
297    fn print_duplicates_summary(
298        &self,
299        start_time: &time::Instant,
300        total_wasted_space: u64,
301    ) -> io::Result<()> {
302        let elapsed = FormattedDuration(start_time.elapsed()).to_string();
303        let num_hashed = self.num_hashed.load(Ordering::Relaxed).to_string();
304        let total_wasted_space = crate::human_readable_size(total_wasted_space);
305        let summary = [
306            ("Elapsed:", elapsed),
307            ("Hash computed:", num_hashed),
308            ("Total wasted space:", total_wasted_space),
309        ];
310        let formatter = ColumnFormatter::new(summary.iter().map(|(s, _)| *s));
311        formatter.write_values(&mut io::stderr(), &summary)
312    }
313
314    /// Finds duplicated files and returns a list of duplicate groups.
315    pub fn find_duplicates(&self) -> anyhow::Result<Vec<DuplicatedFiles>> {
316        let progress = self
317            .progress
318            .as_ref()
319            .map(|progress| progress.add_spinner())
320            .unwrap_or_else(Progress::none);
321        progress.set_message("Scanning directories...");
322
323        let (tx, rx) = mpsc::channel();
324        let mut by_hash: HashMap<blake3::Hash, DuplicatedFiles> = HashMap::new();
325        let mut num_cache_hits = 0;
326        std::thread::scope(|scope| {
327            scope.spawn(|| {
328                if let Err(e) = self.find_duplicates_streaming(tx) {
329                    log::error!("Error during duplicate finding: {}", e);
330                }
331            });
332
333            while let Ok(event) = rx.recv() {
334                match event {
335                    HashProgress::StartDiscovering => {
336                        progress.set_message("Hashing files...");
337                    }
338                    HashProgress::TotalFiles(total) => {
339                        progress.set_length(total as u64);
340                        if num_cache_hits > 0 {
341                            progress.set_message(format!(" ({} cache hits)", num_cache_hits));
342                        }
343                    }
344                    HashProgress::Result(path, size, hash, is_cache_hit) => {
345                        if is_cache_hit {
346                            num_cache_hits += 1;
347                            if progress.length().is_none() {
348                                progress.set_message(format!(
349                                    "Hashing files... ({} cache hits)",
350                                    num_cache_hits
351                                ));
352                            } else {
353                                progress.set_message(format!(" ({} cache hits)", num_cache_hits));
354                            }
355                        }
356
357                        progress.inc(1);
358                        let entry = by_hash.entry(hash).or_insert_with(|| DuplicatedFiles {
359                            paths: Vec::new(),
360                            size,
361                        });
362                        // Hash collisions shouldn't happen, but if they do, sizes shouldn't mismatch.
363                        assert_eq!(entry.size, size, "Hash collision: sizes do not match");
364                        entry.paths.push(path);
365                    }
366                    HashProgress::Error => {
367                        progress.inc(1);
368                    }
369                }
370            }
371        });
372        progress.finish();
373
374        let mut duplicates = Vec::new();
375        for (_, mut dupes) in by_hash {
376            if dupes.paths.len() > 1 {
377                dupes.paths.sort();
378                duplicates.push(dupes);
379            }
380        }
381        Ok(duplicates)
382    }
383
384    fn find_duplicates_streaming(&self, tx: mpsc::Sender<HashProgress>) -> anyhow::Result<()> {
385        tx.send(HashProgress::StartDiscovering)?;
386        let mut by_size: HashMap<u64, EntryState> = HashMap::new();
387        let mut total_hashed = 0;
388        std::thread::scope(|global_scope| {
389            let (it_tx, it_rx) = mpsc::channel();
390            for dir in &self.dirs {
391                let it_tx = it_tx.clone();
392                let mut it = FileIterator::new(dir.clone());
393                it.hasher = Some(self);
394                it.exclude = self.exclude.as_ref();
395                it.spawn_in_scope_with_sender(global_scope, it_tx);
396            }
397            drop(it_tx);
398
399            let pool = crate::build_thread_pool(self.jobs)?;
400            pool.scope(move |scope| -> anyhow::Result<()> {
401                for current_path in it_rx {
402                    let meta = fs::metadata(&current_path)?;
403                    let size = meta.len();
404                    let modified = meta.modified()?;
405
406                    // Small optimization: If file size is 0, it's not really worth treating
407                    // as wasted space duplicates in the same way, but keeping it unified for now.
408                    match by_size.entry(size) {
409                        std::collections::hash_map::Entry::Occupied(mut occ) => match occ.get_mut()
410                        {
411                            EntryState::Single(first_path, first_modified) => {
412                                // We found a second file of identical size.
413                                // Time to start hashing both the *original* matching file and the *new* one!
414                                self.spawn_hash_task(first_path, size, *first_modified, scope, &tx);
415                                self.spawn_hash_task(&current_path, size, modified, scope, &tx);
416
417                                // Modify the state to indicate we are now fully hashing this size bucket.
418                                *occ.get_mut() = EntryState::Hashing;
419                                total_hashed += 2;
420                            }
421                            EntryState::Hashing => {
422                                // File size bucket already hashing; just dynamically spawn the new file immediately.
423                                self.spawn_hash_task(&current_path, size, modified, scope, &tx);
424                                total_hashed += 1;
425                            }
426                        },
427                        std::collections::hash_map::Entry::Vacant(vac) => {
428                            vac.insert(EntryState::Single(current_path, modified));
429                        }
430                    }
431                }
432                tx.send(HashProgress::TotalFiles(total_hashed))?;
433                Ok(())
434            })
435        })?;
436
437        // The scope waits for all spawned tasks to complete.
438        // Channel `tx` gets naturally closed when it drops at the end of this function.
439        self.save_cache()
440    }
441
442    fn spawn_hash_task<'scope>(
443        &'scope self,
444        path: &Path,
445        size: u64,
446        modified: time::SystemTime,
447        scope: &rayon::Scope<'scope>,
448        tx: &mpsc::Sender<HashProgress>,
449    ) {
450        let (hash, relative) = self
451            .get_hash_from_cache(path, modified)
452            .expect("path should be in cache base_dir");
453        if let Some(hash) = hash {
454            let _ = tx.send(HashProgress::Result(path.to_path_buf(), size, hash, true));
455            return;
456        }
457
458        let path = path.to_path_buf();
459        let relative = relative.to_path_buf();
460        let tx = tx.clone();
461        scope.spawn(move |_| {
462            if let Ok(hash) = self.compute_hash(&path) {
463                self.cache.insert(&relative, modified, hash);
464                let _ = tx.send(HashProgress::Result(path, size, hash, false));
465            } else {
466                log::error!("Failed to hash file: {:?}", path);
467                let _ = tx.send(HashProgress::Error);
468            }
469        });
470    }
471
472    /// Gets the hash of a file, using the cache if available.
473    pub fn get_hash(&self, path: &Path) -> io::Result<blake3::Hash> {
474        let meta = fs::metadata(path)?;
475        let modified = meta.modified()?;
476        let (hash, relative) = self.get_hash_from_cache(path, modified)?;
477        if let Some(hash) = hash {
478            return Ok(hash);
479        }
480
481        let hash = self.compute_hash(path)?;
482        self.cache.insert(relative, modified, hash);
483        Ok(hash)
484    }
485
486    fn get_hash_from_cache<'a>(
487        &self,
488        path: &'a Path,
489        modified: time::SystemTime,
490    ) -> io::Result<(Option<blake3::Hash>, &'a Path)> {
491        let relative = crate::strip_prefix(path, self.cache.base_dir())
492            .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;
493        if let Some(hash) = self.cache.get(relative, modified) {
494            self.num_hash_looked_up.fetch_add(1, Ordering::Relaxed);
495            return Ok((Some(hash), relative));
496        }
497        Ok((None, relative))
498    }
499
500    fn compute_hash(&self, path: &Path) -> io::Result<blake3::Hash> {
501        let start_time = time::Instant::now();
502        let mut f = fs::File::open(path)?;
503        let len = f.metadata()?.len();
504        let progress = self
505            .progress
506            .as_ref()
507            .map(|progress| progress.add_file(path, len))
508            .unwrap_or_else(Progress::none);
509        let mut hasher = blake3::Hasher::new();
510        if self.buffer_size == 0 {
511            if len > 0 {
512                let mmap = unsafe { memmap2::MmapOptions::new().map(&f)? };
513                hasher.update(&mmap[..]);
514                progress.inc(len);
515            }
516        } else {
517            let mut buf = vec![0u8; self.buffer_size];
518            loop {
519                let n = f.read(&mut buf)?;
520                if n == 0 {
521                    break;
522                }
523                hasher.update(&buf[..n]);
524                progress.inc(n as u64);
525            }
526        }
527        progress.finish();
528        self.num_hashed.fetch_add(1, Ordering::Relaxed);
529        let hash = hasher.finalize();
530        log::debug!(
531            "Computed hash in {}: {:?}",
532            FormattedDuration(start_time.elapsed()),
533            path
534        );
535        Ok(hash)
536    }
537}
538
/// A group of duplicated files and their size.
#[derive(Clone, Debug)]
pub struct DuplicatedFiles {
    /// Paths of the files that share the same content hash.
    pub paths: Vec<PathBuf>,
    /// Size in bytes of each file in the group.
    pub size: u64,
}
545
546impl DuplicatedFiles {
547    fn write_human(&self, mut writer: impl io::Write) -> anyhow::Result<()> {
548        writeln!(
549            writer,
550            "Identical {} files of {}:",
551            self.paths.len(),
552            crate::human_readable_size(self.size)
553        )?;
554        for path in &self.paths {
555            writeln!(writer, "  {}", path.display())?;
556        }
557        Ok(())
558    }
559
560    fn write_yaml(&self, mut writer: impl io::Write) -> anyhow::Result<()> {
561        writeln!(writer, "- paths:")?;
562        for path in &self.paths {
563            writeln!(writer, "  - {:?}", path)?;
564        }
565        writeln!(writer, "  size: {}", self.size)?;
566        Ok(())
567    }
568
569    fn wasted_size(&self) -> u64 {
570        self.size * (self.paths.len() as u64 - 1)
571    }
572}
573
574#[cfg(test)]
575mod tests {
576    use super::*;
577
578    fn default_exclude() -> globset::GlobSet {
579        let mut builder = globset::GlobSetBuilder::new();
580        builder.add(
581            globset::GlobBuilder::new(".hash_cache")
582                .case_insensitive(true)
583                .build()
584                .unwrap(),
585        );
586        builder.build().unwrap()
587    }
588
589    #[test]
590    fn find_duplicates() -> anyhow::Result<()> {
591        let dir = tempfile::tempdir()?;
592
593        let file1_path = dir.path().join("same1.txt");
594        fs::write(&file1_path, "same content")?;
595
596        let file2_path = dir.path().join("same2.txt");
597        fs::write(&file2_path, "same content")?;
598
599        let diff_path = dir.path().join("diff.txt");
600        fs::write(&diff_path, "different content")?;
601
602        let mut hasher = FileHasher::new(&[dir.path()])?;
603        hasher.buffer_size = 8192;
604        let duplicates = hasher.find_duplicates()?;
605
606        assert_eq!(hasher.num_hashed.load(Ordering::Relaxed), 2);
607        assert_eq!(hasher.num_hash_looked_up.load(Ordering::Relaxed), 0);
608
609        assert_eq!(duplicates.len(), 1);
610        let group = &duplicates[0];
611        assert_eq!(group.paths.len(), 2);
612        assert_eq!(group.size, 12); // "same content" is 12 bytes
613
614        assert!(group.paths.contains(&file1_path));
615        assert!(group.paths.contains(&file2_path));
616
617        Ok(())
618    }
619
620    #[test]
621    fn find_duplicates_merge_cache() -> anyhow::Result<()> {
622        let dir = tempfile::tempdir()?;
623        let dir_path = dir.path();
624
625        let sub_dir = dir_path.join("a").join("a");
626        fs::create_dir_all(&sub_dir)?;
627
628        let file1_path = sub_dir.join("1");
629        fs::write(&file1_path, "same content")?;
630
631        let file2_path = sub_dir.join("2");
632        fs::write(&file2_path, "same content")?;
633
634        // Create empty cache file in a/a to force it to be the cache base
635        let cache_aa_path = sub_dir.join(FileHashCache::FILE_NAME);
636        fs::File::create(&cache_aa_path)?;
637
638        // Run find_duplicates on a/a
639        let hasher_aa = FileHasher::new(&[&sub_dir])?;
640        let duplicates_aa = hasher_aa.find_duplicates()?;
641        assert_eq!(duplicates_aa.len(), 1);
642        assert!(cache_aa_path.exists());
643        assert_eq!(hasher_aa.num_hashed.load(Ordering::Relaxed), 2);
644        assert_eq!(hasher_aa.num_hash_looked_up.load(Ordering::Relaxed), 0);
645
646        // Create empty cache file in a to force it to be the cache base
647        let root_a = dir_path.join("a");
648        let cache_a_path = root_a.join(FileHashCache::FILE_NAME);
649        fs::File::create(&cache_a_path)?;
650
651        // Run find_duplicates on a
652        let hasher_a = FileHasher::new(&[&root_a])?;
653        let duplicates_a = hasher_a.find_duplicates()?;
654        assert_eq!(duplicates_a.len(), 1);
655        assert_eq!(hasher_a.num_hashed.load(Ordering::Relaxed), 0);
656        assert_eq!(hasher_a.num_hash_looked_up.load(Ordering::Relaxed), 2);
657
658        // The merged child cache should be removed.
659        assert!(cache_a_path.exists());
660        assert!(!cache_aa_path.exists());
661
662        Ok(())
663    }
664
665    #[test]
666    fn find_duplicates_with_exclude() -> anyhow::Result<()> {
667        let dir = tempfile::tempdir()?;
668
669        let file1_path = dir.path().join("same1.txt");
670        fs::write(&file1_path, "same content")?;
671
672        let file2_path = dir.path().join("same2.txt");
673        fs::write(&file2_path, "same content")?;
674
675        let exclude_path = dir.path().join("exclude.txt");
676        fs::write(&exclude_path, "same content")?;
677
678        let mut hasher = FileHasher::new(&[dir.path()])?;
679        hasher.buffer_size = 8192;
680        let mut builder = globset::GlobSetBuilder::new();
681        builder.add(
682            globset::GlobBuilder::new("exclude.txt")
683                .case_insensitive(true)
684                .build()?,
685        );
686        let filter = builder.build()?;
687        hasher.exclude = Some(filter);
688
689        let duplicates = hasher.find_duplicates()?;
690        assert_eq!(duplicates.len(), 1);
691        let group = &duplicates[0];
692        assert_eq!(group.paths.len(), 2);
693        assert!(group.paths.contains(&file1_path));
694        assert!(group.paths.contains(&file2_path));
695        assert!(!group.paths.contains(&exclude_path));
696        Ok(())
697    }
698
699    #[test]
700    fn check_mode_empty_cache() -> anyhow::Result<()> {
701        let dir = tempfile::tempdir()?;
702        let dir_path = dir.path().to_path_buf();
703        println!("{:?}", dir_path);
704        let file1_path = dir.path().join("file1.txt");
705        fs::write(&file1_path, "content 1")?;
706        let file2_path = dir.path().join("file2.txt");
707        fs::write(&file2_path, "content 2")?;
708
709        let mut hasher = FileHasher::new(&[&dir_path])?;
710        hasher.exclude = Some(default_exclude());
711        let (tx, rx) = mpsc::channel();
712        hasher.check_streaming(tx, false)?;
713        let mut results = Vec::new();
714        let mut start_seen = false;
715        let mut total_files = None;
716        let mut file_done_count = 0;
717        let mut num_error = 0;
718        while let Ok(event) = rx.recv() {
719            match event {
720                CheckEvent::StartChecking => start_seen = true,
721                CheckEvent::TotalFiles(total) => total_files = Some(total),
722                CheckEvent::Result(path, status) => results.push((path, status)),
723                CheckEvent::FileDone => file_done_count += 1,
724                CheckEvent::Error => num_error += 1,
725            }
726        }
727        assert!(start_seen);
728        assert_eq!(total_files, Some(2));
729        assert_eq!(file_done_count, 0);
730        assert_eq!(num_error, 0);
731
732        results.sort_by(|a, b| a.0.cmp(&b.0));
733        assert_eq!(results.len(), 2);
734        assert_eq!(results[0], (PathBuf::from("file1.txt"), CheckStatus::New));
735        assert_eq!(results[1], (PathBuf::from("file2.txt"), CheckStatus::New));
736
737        assert!(!dir.path().join(FileHashCache::FILE_NAME).exists());
738        Ok(())
739    }
740
741    #[test]
742    fn check_mode_with_cache() -> anyhow::Result<()> {
743        let dir = tempfile::tempdir()?;
744        let dir_path = dir.path().to_path_buf();
745        let file1_path = dir.path().join("file1.txt");
746        fs::write(&file1_path, "content 1")?;
747        let file2_path = dir.path().join("file2.txt");
748        fs::write(&file2_path, "content 2")?;
749
750        let mut hasher = FileHasher::new(&[&dir_path])?;
751        hasher.exclude = Some(default_exclude());
752        let _hash1 = hasher.get_hash(&file1_path)?;
753        let _hash2 = hasher.get_hash(&file2_path)?;
754        hasher.save_cache()?;
755        assert!(dir.path().join(FileHashCache::FILE_NAME).exists());
756
757        let mut hasher = FileHasher::new(&[&dir_path])?;
758        hasher.exclude = Some(default_exclude());
759        let (tx, rx) = mpsc::channel();
760        hasher.check_streaming(tx, false)?;
761        let mut results = Vec::new();
762        let mut file_done_count = 0;
763        while let Ok(event) = rx.recv() {
764            match event {
765                CheckEvent::Result(path, status) => results.push((path, status)),
766                CheckEvent::FileDone => file_done_count += 1,
767                _ => {}
768            }
769        }
770        assert_eq!(results.len(), 0);
771        assert_eq!(file_done_count, 2);
772
773        fs::write(&file1_path, "content 1 modified")?;
774
775        let file2_meta_before = fs::metadata(&file2_path)?;
776        let mtime_before = file2_meta_before.modified()?;
777        std::thread::sleep(time::Duration::from_millis(10));
778        fs::write(&file2_path, "content 2")?;
779        let file2_meta_after = fs::metadata(&file2_path)?;
780        let mtime_after = file2_meta_after.modified()?;
781        assert!(mtime_after > mtime_before);
782
783        let mut hasher = FileHasher::new(&[&dir_path])?;
784        hasher.exclude = Some(default_exclude());
785        let (tx, rx) = mpsc::channel();
786        hasher.check_streaming(tx, false)?;
787        let mut results = Vec::new();
788        let mut file_done_count = 0;
789        while let Ok(event) = rx.recv() {
790            match event {
791                CheckEvent::Result(path, status) => results.push((path, status)),
792                CheckEvent::FileDone => file_done_count += 1,
793                _ => {}
794            }
795        }
796        assert_eq!(results.len(), 1);
797        assert_eq!(
798            results[0],
799            (PathBuf::from("file1.txt"), CheckStatus::Modified)
800        );
801        assert_eq!(file_done_count, 1);
802        Ok(())
803    }
804
805    #[test]
806    fn check_update_mode() -> anyhow::Result<()> {
807        let dir = tempfile::tempdir()?;
808        let dir_path = dir.path().to_path_buf();
809        let file1_path = dir.path().join("file1.txt");
810        fs::write(&file1_path, "content 1")?;
811
812        let mut hasher = FileHasher::new(&[&dir_path])?;
813        hasher.exclude = Some(default_exclude());
814        let (tx, rx) = mpsc::channel();
815        hasher.check_streaming(tx, true)?;
816        while rx.recv().is_ok() {}
817        hasher.save_cache()?;
818        assert!(dir.path().join(FileHashCache::FILE_NAME).exists());
819
820        let cache = FileHashCache::new(&dir_path);
821        let mtime1 = fs::metadata(&file1_path)?.modified()?;
822        let hash1 = cache.get(&PathBuf::from("file1.txt"), mtime1);
823        assert!(hash1.is_some());
824
825        std::thread::sleep(time::Duration::from_millis(10));
826        fs::write(&file1_path, "content 1 modified")?;
827        let mtime1_mod = fs::metadata(&file1_path)?.modified()?;
828
829        let mut hasher = FileHasher::new(&[&dir_path])?;
830        hasher.exclude = Some(default_exclude());
831        let (tx, rx) = mpsc::channel();
832        hasher.check_streaming(tx, true)?;
833        while rx.recv().is_ok() {}
834        hasher.save_cache()?;
835
836        let cache = FileHashCache::new(&dir_path);
837        let hash_mod = cache.get(&PathBuf::from("file1.txt"), mtime1_mod);
838        assert!(hash_mod.is_some());
839        assert_ne!(hash1, hash_mod);
840
841        std::thread::sleep(time::Duration::from_millis(10));
842        fs::write(&file1_path, "content 1 modified")?;
843        let mtime1_mod2 = fs::metadata(&file1_path)?.modified()?;
844        assert!(mtime1_mod2 > mtime1_mod);
845
846        assert!(
847            cache
848                .get(&PathBuf::from("file1.txt"), mtime1_mod2)
849                .is_none()
850        );
851
852        let mut hasher = FileHasher::new(&[&dir_path])?;
853        hasher.exclude = Some(default_exclude());
854        let (tx, rx) = mpsc::channel();
855        hasher.check_streaming(tx, true)?;
856        while rx.recv().is_ok() {}
857        hasher.save_cache()?;
858
859        let cache = FileHashCache::new(&dir_path);
860        assert!(
861            cache
862                .get(&PathBuf::from("file1.txt"), mtime1_mod2)
863                .is_some()
864        );
865        Ok(())
866    }
867
868    #[test]
869    fn check_cleanup_deleted_files() -> anyhow::Result<()> {
870        let dir = tempfile::tempdir()?;
871        let dir_path = dir.path().to_path_buf();
872        let file1_path = dir.path().join("file1.txt");
873        let file2_path = dir.path().join("file2.txt");
874        fs::write(&file1_path, "content 1")?;
875        fs::write(&file2_path, "content 2")?;
876        let mtime1 = fs::metadata(&file1_path)?.modified()?;
877        let mtime2 = fs::metadata(&file2_path)?.modified()?;
878
879        let mut hasher = FileHasher::new(&[&dir_path])?;
880        hasher.exclude = Some(default_exclude());
881        let (tx, rx) = mpsc::channel();
882        hasher.check_streaming(tx, true)?;
883        while rx.recv().is_ok() {}
884        hasher.save_cache()?;
885
886        // Verify both are in the cache
887        let cache = FileHashCache::new(&dir_path);
888        assert!(cache.get(&PathBuf::from("file1.txt"), mtime1).is_some());
889        assert!(cache.get(&PathBuf::from("file2.txt"), mtime2).is_some());
890
891        // Now delete file2 from disk
892        fs::remove_file(&file2_path)?;
893
894        // Run check and save again
895        let mut hasher = FileHasher::new(&[&dir_path])?;
896        hasher.exclude = Some(default_exclude());
897        let (tx, rx) = mpsc::channel();
898        hasher.check_streaming(tx, true)?;
899        while rx.recv().is_ok() {}
900        hasher.save_cache()?;
901
902        // Verify file2 is removed from cache, but file1 is still there
903        let cache = FileHashCache::new(&dir_path);
904        assert!(cache.get(&PathBuf::from("file2.txt"), mtime2).is_none());
905        assert!(cache.get(&PathBuf::from("file1.txt"), mtime1).is_some());
906        Ok(())
907    }
908
909    #[test]
910    fn find_duplicates_multiple_dirs() -> anyhow::Result<()> {
911        let tmp = tempfile::tempdir()?;
912        let dir1 = tmp.path().join("dir1");
913        let dir2 = tmp.path().join("dir2");
914        fs::create_dir(&dir1)?;
915        fs::create_dir(&dir2)?;
916        let file1_path = dir1.join("file1.txt");
917        fs::write(&file1_path, "same content")?;
918        let file2_path = dir2.join("file2.txt");
919        fs::write(&file2_path, "same content")?;
920        let hasher = FileHasher::new(&[&dir1, &dir2])?;
921        let duplicates = hasher.find_duplicates()?;
922        assert_eq!(duplicates.len(), 1);
923        let group = &duplicates[0];
924        assert_eq!(group.paths.len(), 2);
925        assert_eq!(group.size, 12);
926        assert!(group.paths.contains(&file1_path));
927        assert!(group.paths.contains(&file2_path));
928
929        Ok(())
930    }
931
932    #[test]
933    fn check_fails_with_multiple_dirs() -> anyhow::Result<()> {
934        let tmp = tempfile::tempdir()?;
935        let dir1 = tmp.path().join("dir1");
936        let dir2 = tmp.path().join("dir2");
937        fs::create_dir(&dir1)?;
938        fs::create_dir(&dir2)?;
939        let hasher = FileHasher::new(&[&dir1, &dir2])?;
940        assert!(hasher.check(false).is_err());
941        Ok(())
942    }
943}