Skip to main content

compare_dir/
file_hasher.rs

1use crate::{
2    ColumnFormatter, DirectoryComparer, FileComparer, FileHashCache, FileIterator, Progress,
3    ProgressBuilder,
4};
5use globset::GlobSet;
6use indicatif::FormattedDuration;
7use std::collections::HashMap;
8use std::fs;
9use std::io::{self, Read, stdout};
10use std::path::{Path, PathBuf};
11use std::sync::atomic::{AtomicUsize, Ordering};
12use std::sync::{Arc, mpsc};
13
/// Events sent by `find_duplicates_streaming` to the thread that drives the progress display
/// and collects results.
#[derive(Debug, Clone)]
enum HashProgress {
    /// Sent once, before any file has been examined.
    StartDiscovering,
    /// Number of files that were scheduled for hashing; sent after the directory walk ends.
    TotalFiles(usize),
    /// One hashed file: its path, its size in bytes, its BLAKE3 hash, and whether the hash
    /// was served from the cache (`true`) or freshly computed (`false`).
    Result(PathBuf, u64, blake3::Hash, bool),
}
20
/// Outcome of comparing a file's freshly computed hash with the cached one.
///
/// The derived ordering follows declaration order: `Unchanged < New < Modified`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
enum CheckStatus {
    /// The cache holds an entry for the path and its hash equals the computed hash.
    Unchanged,
    /// The cache holds no entry for the path.
    New,
    /// The cache holds an entry for the path but its hash differs from the computed hash.
    Modified,
}
27
/// Events sent by `check_streaming` to the consumer of a check run.
#[derive(Debug, PartialEq)]
enum CheckEvent {
    /// Sent once, before files are dispatched for checking.
    StartChecking,
    /// Number of files that were dispatched; sent after the directory walk has finished.
    TotalFiles(usize),
    /// A file that needs reporting: the relative path yielded by the file iterator and its
    /// status (`check_streaming` only sends `New` or `Modified` here).
    Result(PathBuf, CheckStatus),
    /// A file was processed with nothing to report (it was unchanged, or checking it failed).
    FileDone,
}
35
/// Per-size bookkeeping used while looking for duplicates.
///
/// A file only needs hashing once a second file of the same size shows up, so the first file
/// of each size is parked here until then.
#[derive(Debug)]
enum EntryState {
    /// Exactly one file of this size has been seen so far: its path and modification time.
    Single(PathBuf, std::time::SystemTime),
    /// At least two files of this size were seen; every file of this size is hashed.
    Hashing,
}
40
/// A group of duplicated files and their size.
///
/// Two groups compare equal when they list the same paths in the same order and have the
/// same size.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DuplicatedFiles {
    /// Paths of the files that share identical content.
    pub paths: Vec<PathBuf>,
    /// Size in bytes of each file in the group.
    pub size: u64,
}
47
/// A tool for finding duplicated files in a directory.
///
/// The same type also drives check/update mode (`check`), which compares freshly computed
/// hashes against the hash cache.
pub struct FileHasher {
    /// Directories to scan, as passed to `new`.
    dirs: Vec<PathBuf>,
    /// Read buffer size in bytes used when hashing; `0` selects memory-mapped hashing.
    pub buffer_size: usize,
    /// Hash cache used by all operations of this hasher.
    pub(crate) cache: Arc<FileHashCache>,
    /// Number of hashes computed from file contents.
    pub(crate) num_hashed: AtomicUsize,
    /// Number of hashes served from the cache.
    pub(crate) num_hash_looked_up: AtomicUsize,
    /// Optional glob set handed to the file iterator as its exclusion filter.
    pub exclude: Option<GlobSet>,
    /// Optional progress reporter; when `None`, `Progress::none()` is used instead.
    pub progress: Option<Arc<ProgressBuilder>>,
    /// Job count passed to `crate::build_thread_pool`.
    pub jobs: usize,
}
59
60impl FileHasher {
61    const DEFAULT_JOBS: usize = DirectoryComparer::DEFAULT_JOBS;
62
63    /// Creates a new `FileHasher` for the given directories.
64    pub fn new<P: AsRef<Path>>(dirs: &[P]) -> anyhow::Result<Self> {
65        if dirs.is_empty() {
66            anyhow::bail!("At least one directory must be specified.");
67        }
68        let common_ancestor = crate::common_ancestor(dirs)
69            .ok_or_else(|| anyhow::anyhow!("No common ancestor found"))?;
70        Ok(Self {
71            dirs: dirs.iter().map(|p| p.as_ref().to_path_buf()).collect(),
72            buffer_size: FileComparer::DEFAULT_BUFFER_SIZE,
73            cache: FileHashCache::find_or_new(&common_ancestor),
74            num_hashed: AtomicUsize::new(0),
75            num_hash_looked_up: AtomicUsize::new(0),
76            exclude: None,
77            progress: None,
78            jobs: Self::DEFAULT_JOBS,
79        })
80    }
81
82    /// Remove a cache entry if it exists.
83    pub fn remove_cache_entry(&self, path: &Path) -> anyhow::Result<()> {
84        let relative = crate::strip_prefix(path, self.cache.base_dir())?;
85        self.cache.remove(relative);
86        Ok(())
87    }
88
89    /// Save the hash cache if it is dirty.
90    pub fn save_cache(&self) -> anyhow::Result<()> {
91        log::info!(
92            "Hash stats for {:?}: {} computed, {} looked up",
93            self.dirs,
94            self.num_hashed.load(Ordering::Relaxed),
95            self.num_hash_looked_up.load(Ordering::Relaxed)
96        );
97        Ok(self.cache.save()?)
98    }
99
100    /// Merges another cache into this hasher's cache.
101    pub(crate) fn merge_cache(&self, other_cache: &FileHashCache) {
102        self.cache.merge(other_cache);
103    }
104
105    /// Clears the loaded hashes in the cache.
106    pub fn clear_cache(&self) -> anyhow::Result<()> {
107        for dir in &self.dirs {
108            let relative = crate::strip_prefix(dir, self.cache.base_dir())?;
109            self.cache.clear(relative);
110        }
111        Ok(())
112    }
113
114    /// Executes the check/update process.
115    pub fn check(&self, update: bool) -> anyhow::Result<()> {
116        if self.dirs.len() > 1 {
117            anyhow::bail!("Check mode only supports one directory.");
118        }
119        let start_time = std::time::Instant::now();
120        let progress = self
121            .progress
122            .as_ref()
123            .map(|progress| progress.add_spinner())
124            .unwrap_or_else(Progress::none);
125        progress.set_message("Scanning directory...");
126        let mut num_new = 0;
127        let mut num_modified = 0;
128        std::thread::scope(|scope| {
129            let (tx, rx) = mpsc::channel();
130            scope.spawn(|| {
131                if let Err(e) = self.check_streaming(tx, update) {
132                    log::error!("Error during check: {}", e);
133                }
134            });
135            while let Ok(event) = rx.recv() {
136                match event {
137                    CheckEvent::StartChecking => {
138                        progress.set_message("Checking files...");
139                    }
140                    CheckEvent::TotalFiles(total) => {
141                        progress.set_length(total as u64);
142                        progress.set_message("");
143                    }
144                    CheckEvent::Result(path, status) => {
145                        let symbol = match status {
146                            CheckStatus::New => {
147                                num_new += 1;
148                                '+'
149                            }
150                            CheckStatus::Modified => {
151                                num_modified += 1;
152                                '!'
153                            }
154                            CheckStatus::Unchanged => unreachable!(),
155                        };
156                        progress.inc(1);
157                        progress.suspend_for(stdout(), || {
158                            println!("{} {}", symbol, path.display());
159                        });
160                    }
161                    CheckEvent::FileDone => {
162                        progress.inc(1);
163                    }
164                }
165            }
166        });
167        progress.finish();
168        if update {
169            self.save_cache()?;
170        }
171        let summary = [
172            ("Elapsed:", 0),
173            ("Hash computed:", self.num_hashed.load(Ordering::Relaxed)),
174            ("New files:", num_new),
175            ("Modified files:", num_modified),
176        ];
177        let formatter = ColumnFormatter::new(summary.iter().map(|(s, _)| *s));
178        let mut writer = std::io::stderr();
179        formatter.write_value(
180            &mut writer,
181            summary[0].0,
182            FormattedDuration(start_time.elapsed()),
183        )?;
184        formatter.write_values(&mut writer, &summary[1..])?;
185        Ok(())
186    }
187
188    fn check_streaming(&self, tx: mpsc::Sender<CheckEvent>, update: bool) -> anyhow::Result<()> {
189        std::thread::scope(|global_scope| {
190            let mut it = FileIterator::new(self.dirs[0].clone());
191            it.hasher = Some(self);
192            it.exclude = self.exclude.as_ref();
193            let it_rx = it.spawn_in_scope(global_scope);
194            tx.send(CheckEvent::StartChecking)?;
195            let pool = crate::build_thread_pool(self.jobs)?;
196            pool.scope(move |scope| -> anyhow::Result<()> {
197                let mut total_files = 0;
198                for (rel_path, abs_path) in it_rx {
199                    total_files += 1;
200                    let tx = tx.clone();
201                    scope.spawn(move |_| {
202                        let status = self.check_file(&abs_path, update);
203                        let event = match status {
204                            Ok(CheckStatus::New) | Ok(CheckStatus::Modified) => {
205                                CheckEvent::Result(rel_path, status.unwrap())
206                            }
207                            Ok(CheckStatus::Unchanged) => CheckEvent::FileDone,
208                            Err(e) => {
209                                log::warn!("Failed to check file {:?}: {}", rel_path, e);
210                                CheckEvent::FileDone
211                            }
212                        };
213                        if tx.send(event).is_err() {
214                            log::error!("Send failed");
215                        }
216                    });
217                }
218                tx.send(CheckEvent::TotalFiles(total_files))?;
219                Ok(())
220            })
221        })?;
222        Ok(())
223    }
224
225    fn check_file(&self, abs_path: &Path, update: bool) -> anyhow::Result<CheckStatus> {
226        assert!(abs_path.is_absolute());
227        let computed_hash = self.compute_hash(abs_path)?;
228        let rel_path = crate::strip_prefix(abs_path, self.cache.base_dir())?;
229        let cached_hash = self.cache.get_by_path(rel_path);
230        let status = match cached_hash {
231            None => CheckStatus::New,
232            Some(cached) => {
233                if computed_hash != cached {
234                    CheckStatus::Modified
235                } else {
236                    CheckStatus::Unchanged
237                }
238            }
239        };
240        if update {
241            let modified = fs::metadata(abs_path)?.modified()?;
242            match status {
243                CheckStatus::New | CheckStatus::Modified => {
244                    self.cache.insert(rel_path, modified, computed_hash);
245                }
246                CheckStatus::Unchanged => {
247                    if self.cache.get(rel_path, modified).is_none() {
248                        self.cache.insert(rel_path, modified, computed_hash);
249                    }
250                }
251            }
252        }
253        Ok(status)
254    }
255
256    /// Executes the duplicate file finding process and prints results.
257    pub fn run(&self) -> anyhow::Result<()> {
258        let start_time = std::time::Instant::now();
259        let mut duplicates = self.find_duplicates()?;
260        if duplicates.is_empty() {
261            println!("No duplicates found.");
262        } else {
263            duplicates.sort_by_key(|a| a.size);
264            let mut total_wasted_space = 0;
265            for dupes in &duplicates {
266                let paths = &dupes.paths;
267                let file_size = dupes.size;
268                println!(
269                    "Identical {} files of {}:",
270                    paths.len(),
271                    crate::human_readable_size(file_size)
272                );
273                for path in paths {
274                    println!("  {}", path.display());
275                }
276                total_wasted_space += file_size * (paths.len() as u64 - 1);
277            }
278            eprintln!(
279                "Total wasted space: {}",
280                crate::human_readable_size(total_wasted_space)
281            );
282        }
283        eprintln!("Finished in {}.", FormattedDuration(start_time.elapsed()));
284        Ok(())
285    }
286
287    /// Finds duplicated files and returns a list of duplicate groups.
288    pub fn find_duplicates(&self) -> anyhow::Result<Vec<DuplicatedFiles>> {
289        let progress = self
290            .progress
291            .as_ref()
292            .map(|progress| progress.add_spinner())
293            .unwrap_or_else(Progress::none);
294        progress.set_message("Scanning directories...");
295
296        let (tx, rx) = mpsc::channel();
297        let mut by_hash: HashMap<blake3::Hash, DuplicatedFiles> = HashMap::new();
298        let mut num_cache_hits = 0;
299        std::thread::scope(|scope| {
300            scope.spawn(|| {
301                if let Err(e) = self.find_duplicates_streaming(tx) {
302                    log::error!("Error during duplicate finding: {}", e);
303                }
304            });
305
306            while let Ok(event) = rx.recv() {
307                match event {
308                    HashProgress::StartDiscovering => {
309                        progress.set_message("Hashing files...");
310                    }
311                    HashProgress::TotalFiles(total) => {
312                        progress.set_length(total as u64);
313                        if num_cache_hits > 0 {
314                            progress.set_message(format!(" ({} cache hits)", num_cache_hits));
315                        }
316                    }
317                    HashProgress::Result(path, size, hash, is_cache_hit) => {
318                        if is_cache_hit {
319                            num_cache_hits += 1;
320                            if progress.length().is_none() {
321                                progress.set_message(format!(
322                                    "Hashing files... ({} cache hits)",
323                                    num_cache_hits
324                                ));
325                            } else {
326                                progress.set_message(format!(" ({} cache hits)", num_cache_hits));
327                            }
328                        }
329
330                        progress.inc(1);
331                        let entry = by_hash.entry(hash).or_insert_with(|| DuplicatedFiles {
332                            paths: Vec::new(),
333                            size,
334                        });
335                        // Hash collisions shouldn't happen, but if they do, sizes shouldn't mismatch.
336                        assert_eq!(entry.size, size, "Hash collision: sizes do not match");
337                        entry.paths.push(path);
338                    }
339                }
340            }
341        });
342        progress.finish();
343
344        let mut duplicates = Vec::new();
345        for (_, mut dupes) in by_hash {
346            if dupes.paths.len() > 1 {
347                dupes.paths.sort();
348                duplicates.push(dupes);
349            }
350        }
351        Ok(duplicates)
352    }
353
354    fn find_duplicates_streaming(&self, tx: mpsc::Sender<HashProgress>) -> anyhow::Result<()> {
355        tx.send(HashProgress::StartDiscovering)?;
356        let mut by_size: HashMap<u64, EntryState> = HashMap::new();
357        let mut total_hashed = 0;
358        std::thread::scope(|global_scope| {
359            let (it_tx, it_rx) = mpsc::channel();
360            for dir in &self.dirs {
361                let it_tx = it_tx.clone();
362                let mut it = FileIterator::new(dir.clone());
363                it.hasher = Some(self);
364                it.exclude = self.exclude.as_ref();
365                it.spawn_in_scope_with_sender(global_scope, it_tx);
366            }
367            drop(it_tx);
368
369            let pool = crate::build_thread_pool(self.jobs)?;
370            pool.scope(move |scope| -> anyhow::Result<()> {
371                for (_, current_path) in it_rx {
372                    let meta = fs::metadata(&current_path)?;
373                    let size = meta.len();
374                    let modified = meta.modified()?;
375
376                    // Small optimization: If file size is 0, it's not really worth treating
377                    // as wasted space duplicates in the same way, but keeping it unified for now.
378                    match by_size.entry(size) {
379                        std::collections::hash_map::Entry::Occupied(mut occ) => match occ.get_mut()
380                        {
381                            EntryState::Single(first_path, first_modified) => {
382                                // We found a second file of identical size.
383                                // Time to start hashing both the *original* matching file and the *new* one!
384                                self.spawn_hash_task(scope, first_path, size, *first_modified, &tx);
385                                self.spawn_hash_task(scope, &current_path, size, modified, &tx);
386
387                                // Modify the state to indicate we are now fully hashing this size bucket.
388                                *occ.get_mut() = EntryState::Hashing;
389                                total_hashed += 2;
390                            }
391                            EntryState::Hashing => {
392                                // File size bucket already hashing; just dynamically spawn the new file immediately.
393                                self.spawn_hash_task(scope, &current_path, size, modified, &tx);
394                                total_hashed += 1;
395                            }
396                        },
397                        std::collections::hash_map::Entry::Vacant(vac) => {
398                            vac.insert(EntryState::Single(current_path, modified));
399                        }
400                    }
401                }
402                tx.send(HashProgress::TotalFiles(total_hashed))?;
403                Ok(())
404            })
405        })?;
406
407        // The scope waits for all spawned tasks to complete.
408        // Channel `tx` gets naturally closed when it drops at the end of this function.
409        self.save_cache()
410    }
411
412    fn spawn_hash_task<'scope>(
413        &'scope self,
414        scope: &rayon::Scope<'scope>,
415        path: &Path,
416        size: u64,
417        modified: std::time::SystemTime,
418        tx: &mpsc::Sender<HashProgress>,
419    ) {
420        let relative = crate::strip_prefix(path, self.cache.base_dir())
421            .expect("path should be in cache base_dir");
422        if let Some(hash) = self.cache.get(relative, modified) {
423            self.num_hash_looked_up.fetch_add(1, Ordering::Relaxed);
424            let _ = tx.send(HashProgress::Result(path.to_path_buf(), size, hash, true));
425            return;
426        }
427
428        let path_owned = path.to_path_buf();
429        let relative_owned = relative.to_path_buf();
430        let tx_owned = tx.clone();
431        scope.spawn(move |_| {
432            if let Ok(hash) = self.compute_hash(&path_owned) {
433                self.cache.insert(&relative_owned, modified, hash);
434                let _ = tx_owned.send(HashProgress::Result(path_owned, size, hash, false));
435            } else {
436                log::warn!("Failed to hash file: {:?}", path_owned);
437            }
438        });
439    }
440
441    /// Gets the hash of a file, using the cache if available.
442    pub fn get_hash(&self, path: &Path) -> io::Result<blake3::Hash> {
443        let meta = fs::metadata(path)?;
444        let modified = meta.modified()?;
445        let relative = crate::strip_prefix(path, self.cache.base_dir())
446            .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;
447        if let Some(hash) = self.cache.get(relative, modified) {
448            self.num_hash_looked_up.fetch_add(1, Ordering::Relaxed);
449            return Ok(hash);
450        }
451
452        let hash = self.compute_hash(path)?;
453        self.cache.insert(relative, modified, hash);
454        Ok(hash)
455    }
456
457    fn compute_hash(&self, path: &Path) -> io::Result<blake3::Hash> {
458        let start_time = std::time::Instant::now();
459        let mut f = fs::File::open(path)?;
460        let len = f.metadata()?.len();
461        let progress = self
462            .progress
463            .as_ref()
464            .map(|progress| progress.add_file(path, len))
465            .unwrap_or_else(Progress::none);
466        let mut hasher = blake3::Hasher::new();
467        if self.buffer_size == 0 {
468            if len > 0 {
469                let mmap = unsafe { memmap2::MmapOptions::new().map(&f)? };
470                hasher.update(&mmap[..]);
471                progress.inc(len);
472            }
473        } else {
474            let mut buf = vec![0u8; self.buffer_size];
475            loop {
476                let n = f.read(&mut buf)?;
477                if n == 0 {
478                    break;
479                }
480                hasher.update(&buf[..n]);
481                progress.inc(n as u64);
482            }
483        }
484        progress.finish();
485        self.num_hashed.fetch_add(1, Ordering::Relaxed);
486        let hash = hasher.finalize();
487        log::debug!(
488            "Computed hash in {}: {:?}",
489            FormattedDuration(start_time.elapsed()),
490            path
491        );
492        Ok(hash)
493    }
494}
495
496#[cfg(test)]
497mod tests {
498    use super::*;
499
500    fn default_exclude() -> globset::GlobSet {
501        let mut builder = globset::GlobSetBuilder::new();
502        builder.add(
503            globset::GlobBuilder::new(".hash_cache")
504                .case_insensitive(true)
505                .build()
506                .unwrap(),
507        );
508        builder.build().unwrap()
509    }
510
511    #[test]
512    fn find_duplicates() -> anyhow::Result<()> {
513        let dir = tempfile::tempdir()?;
514
515        let file1_path = dir.path().join("same1.txt");
516        fs::write(&file1_path, "same content")?;
517
518        let file2_path = dir.path().join("same2.txt");
519        fs::write(&file2_path, "same content")?;
520
521        let diff_path = dir.path().join("diff.txt");
522        fs::write(&diff_path, "different content")?;
523
524        let mut hasher = FileHasher::new(&[dir.path()])?;
525        hasher.buffer_size = 8192;
526        let duplicates = hasher.find_duplicates()?;
527
528        assert_eq!(hasher.num_hashed.load(Ordering::Relaxed), 2);
529        assert_eq!(hasher.num_hash_looked_up.load(Ordering::Relaxed), 0);
530
531        assert_eq!(duplicates.len(), 1);
532        let group = &duplicates[0];
533        assert_eq!(group.paths.len(), 2);
534        assert_eq!(group.size, 12); // "same content" is 12 bytes
535
536        assert!(group.paths.contains(&file1_path));
537        assert!(group.paths.contains(&file2_path));
538
539        Ok(())
540    }
541
542    #[test]
543    fn find_duplicates_merge_cache() -> anyhow::Result<()> {
544        let dir = tempfile::tempdir()?;
545        let dir_path = dir.path();
546
547        let sub_dir = dir_path.join("a").join("a");
548        fs::create_dir_all(&sub_dir)?;
549
550        let file1_path = sub_dir.join("1");
551        fs::write(&file1_path, "same content")?;
552
553        let file2_path = sub_dir.join("2");
554        fs::write(&file2_path, "same content")?;
555
556        // Create empty cache file in a/a to force it to be the cache base
557        let cache_aa_path = sub_dir.join(FileHashCache::FILE_NAME);
558        fs::File::create(&cache_aa_path)?;
559
560        // Run find_duplicates on a/a
561        let hasher_aa = FileHasher::new(&[&sub_dir])?;
562        let duplicates_aa = hasher_aa.find_duplicates()?;
563        assert_eq!(duplicates_aa.len(), 1);
564        assert!(cache_aa_path.exists());
565        assert_eq!(hasher_aa.num_hashed.load(Ordering::Relaxed), 2);
566        assert_eq!(hasher_aa.num_hash_looked_up.load(Ordering::Relaxed), 0);
567
568        // Create empty cache file in a to force it to be the cache base
569        let root_a = dir_path.join("a");
570        let cache_a_path = root_a.join(FileHashCache::FILE_NAME);
571        fs::File::create(&cache_a_path)?;
572
573        // Run find_duplicates on a
574        let hasher_a = FileHasher::new(&[&root_a])?;
575        let duplicates_a = hasher_a.find_duplicates()?;
576        assert_eq!(duplicates_a.len(), 1);
577        assert_eq!(hasher_a.num_hashed.load(Ordering::Relaxed), 0);
578        assert_eq!(hasher_a.num_hash_looked_up.load(Ordering::Relaxed), 2);
579
580        // The merged child cache should be removed.
581        assert!(cache_a_path.exists());
582        assert!(!cache_aa_path.exists());
583
584        Ok(())
585    }
586
587    #[test]
588    fn find_duplicates_with_exclude() -> anyhow::Result<()> {
589        let dir = tempfile::tempdir()?;
590
591        let file1_path = dir.path().join("same1.txt");
592        fs::write(&file1_path, "same content")?;
593
594        let file2_path = dir.path().join("same2.txt");
595        fs::write(&file2_path, "same content")?;
596
597        let exclude_path = dir.path().join("exclude.txt");
598        fs::write(&exclude_path, "same content")?;
599
600        let mut hasher = FileHasher::new(&[dir.path()])?;
601        hasher.buffer_size = 8192;
602        let mut builder = globset::GlobSetBuilder::new();
603        builder.add(
604            globset::GlobBuilder::new("exclude.txt")
605                .case_insensitive(true)
606                .build()?,
607        );
608        let filter = builder.build()?;
609        hasher.exclude = Some(filter);
610
611        let duplicates = hasher.find_duplicates()?;
612        assert_eq!(duplicates.len(), 1);
613        let group = &duplicates[0];
614        assert_eq!(group.paths.len(), 2);
615        assert!(group.paths.contains(&file1_path));
616        assert!(group.paths.contains(&file2_path));
617        assert!(!group.paths.contains(&exclude_path));
618        Ok(())
619    }
620
621    #[test]
622    fn check_mode_empty_cache() -> anyhow::Result<()> {
623        let dir = tempfile::tempdir()?;
624        let dir_path = dir.path().to_path_buf();
625        println!("{:?}", dir_path);
626        let file1_path = dir.path().join("file1.txt");
627        fs::write(&file1_path, "content 1")?;
628        let file2_path = dir.path().join("file2.txt");
629        fs::write(&file2_path, "content 2")?;
630
631        let mut hasher = FileHasher::new(&[&dir_path])?;
632        hasher.exclude = Some(default_exclude());
633        let (tx, rx) = mpsc::channel();
634        hasher.check_streaming(tx, false)?;
635        let mut results = Vec::new();
636        let mut start_seen = false;
637        let mut total_files = None;
638        let mut file_done_count = 0;
639        while let Ok(event) = rx.recv() {
640            match event {
641                CheckEvent::StartChecking => start_seen = true,
642                CheckEvent::TotalFiles(total) => total_files = Some(total),
643                CheckEvent::Result(path, status) => results.push((path, status)),
644                CheckEvent::FileDone => file_done_count += 1,
645            }
646        }
647        assert!(start_seen);
648        assert_eq!(total_files, Some(2));
649        assert_eq!(file_done_count, 0);
650
651        results.sort_by(|a, b| a.0.cmp(&b.0));
652        assert_eq!(results.len(), 2);
653        assert_eq!(results[0], (PathBuf::from("file1.txt"), CheckStatus::New));
654        assert_eq!(results[1], (PathBuf::from("file2.txt"), CheckStatus::New));
655
656        assert!(!dir.path().join(FileHashCache::FILE_NAME).exists());
657        Ok(())
658    }
659
660    #[test]
661    fn check_mode_with_cache() -> anyhow::Result<()> {
662        let dir = tempfile::tempdir()?;
663        let dir_path = dir.path().to_path_buf();
664        let file1_path = dir.path().join("file1.txt");
665        fs::write(&file1_path, "content 1")?;
666        let file2_path = dir.path().join("file2.txt");
667        fs::write(&file2_path, "content 2")?;
668
669        let mut hasher = FileHasher::new(&[&dir_path])?;
670        hasher.exclude = Some(default_exclude());
671        let _hash1 = hasher.get_hash(&file1_path)?;
672        let _hash2 = hasher.get_hash(&file2_path)?;
673        hasher.save_cache()?;
674        assert!(dir.path().join(FileHashCache::FILE_NAME).exists());
675
676        let mut hasher = FileHasher::new(&[&dir_path])?;
677        hasher.exclude = Some(default_exclude());
678        let (tx, rx) = mpsc::channel();
679        hasher.check_streaming(tx, false)?;
680        let mut results = Vec::new();
681        let mut file_done_count = 0;
682        while let Ok(event) = rx.recv() {
683            match event {
684                CheckEvent::Result(path, status) => results.push((path, status)),
685                CheckEvent::FileDone => file_done_count += 1,
686                _ => {}
687            }
688        }
689        assert_eq!(results.len(), 0);
690        assert_eq!(file_done_count, 2);
691
692        fs::write(&file1_path, "content 1 modified")?;
693
694        let file2_meta_before = fs::metadata(&file2_path)?;
695        let mtime_before = file2_meta_before.modified()?;
696        std::thread::sleep(std::time::Duration::from_millis(10));
697        fs::write(&file2_path, "content 2")?;
698        let file2_meta_after = fs::metadata(&file2_path)?;
699        let mtime_after = file2_meta_after.modified()?;
700        assert!(mtime_after > mtime_before);
701
702        let mut hasher = FileHasher::new(&[&dir_path])?;
703        hasher.exclude = Some(default_exclude());
704        let (tx, rx) = mpsc::channel();
705        hasher.check_streaming(tx, false)?;
706        let mut results = Vec::new();
707        let mut file_done_count = 0;
708        while let Ok(event) = rx.recv() {
709            match event {
710                CheckEvent::Result(path, status) => results.push((path, status)),
711                CheckEvent::FileDone => file_done_count += 1,
712                _ => {}
713            }
714        }
715        assert_eq!(results.len(), 1);
716        assert_eq!(
717            results[0],
718            (PathBuf::from("file1.txt"), CheckStatus::Modified)
719        );
720        assert_eq!(file_done_count, 1);
721        Ok(())
722    }
723
724    #[test]
725    fn check_update_mode() -> anyhow::Result<()> {
726        let dir = tempfile::tempdir()?;
727        let dir_path = dir.path().to_path_buf();
728        let file1_path = dir.path().join("file1.txt");
729        fs::write(&file1_path, "content 1")?;
730
731        let mut hasher = FileHasher::new(&[&dir_path])?;
732        hasher.exclude = Some(default_exclude());
733        let (tx, rx) = mpsc::channel();
734        hasher.check_streaming(tx, true)?;
735        while rx.recv().is_ok() {}
736        hasher.save_cache()?;
737        assert!(dir.path().join(FileHashCache::FILE_NAME).exists());
738
739        let cache = FileHashCache::new(&dir_path);
740        let mtime1 = fs::metadata(&file1_path)?.modified()?;
741        let hash1 = cache.get(&PathBuf::from("file1.txt"), mtime1);
742        assert!(hash1.is_some());
743
744        std::thread::sleep(std::time::Duration::from_millis(10));
745        fs::write(&file1_path, "content 1 modified")?;
746        let mtime1_mod = fs::metadata(&file1_path)?.modified()?;
747
748        let mut hasher = FileHasher::new(&[&dir_path])?;
749        hasher.exclude = Some(default_exclude());
750        let (tx, rx) = mpsc::channel();
751        hasher.check_streaming(tx, true)?;
752        while rx.recv().is_ok() {}
753        hasher.save_cache()?;
754
755        let cache = FileHashCache::new(&dir_path);
756        let hash_mod = cache.get(&PathBuf::from("file1.txt"), mtime1_mod);
757        assert!(hash_mod.is_some());
758        assert_ne!(hash1, hash_mod);
759
760        std::thread::sleep(std::time::Duration::from_millis(10));
761        fs::write(&file1_path, "content 1 modified")?;
762        let mtime1_mod2 = fs::metadata(&file1_path)?.modified()?;
763        assert!(mtime1_mod2 > mtime1_mod);
764
765        assert!(
766            cache
767                .get(&PathBuf::from("file1.txt"), mtime1_mod2)
768                .is_none()
769        );
770
771        let mut hasher = FileHasher::new(&[&dir_path])?;
772        hasher.exclude = Some(default_exclude());
773        let (tx, rx) = mpsc::channel();
774        hasher.check_streaming(tx, true)?;
775        while rx.recv().is_ok() {}
776        hasher.save_cache()?;
777
778        let cache = FileHashCache::new(&dir_path);
779        assert!(
780            cache
781                .get(&PathBuf::from("file1.txt"), mtime1_mod2)
782                .is_some()
783        );
784        Ok(())
785    }
786
787    #[test]
788    fn find_duplicates_multiple_dirs() -> anyhow::Result<()> {
789        let tmp = tempfile::tempdir()?;
790        let dir1 = tmp.path().join("dir1");
791        let dir2 = tmp.path().join("dir2");
792        fs::create_dir(&dir1)?;
793        fs::create_dir(&dir2)?;
794        let file1_path = dir1.join("file1.txt");
795        fs::write(&file1_path, "same content")?;
796        let file2_path = dir2.join("file2.txt");
797        fs::write(&file2_path, "same content")?;
798        let hasher = FileHasher::new(&[&dir1, &dir2])?;
799        let duplicates = hasher.find_duplicates()?;
800        assert_eq!(duplicates.len(), 1);
801        let group = &duplicates[0];
802        assert_eq!(group.paths.len(), 2);
803        assert_eq!(group.size, 12);
804        assert!(group.paths.contains(&file1_path));
805        assert!(group.paths.contains(&file2_path));
806
807        Ok(())
808    }
809
810    #[test]
811    fn check_fails_with_multiple_dirs() -> anyhow::Result<()> {
812        let tmp = tempfile::tempdir()?;
813        let dir1 = tmp.path().join("dir1");
814        let dir2 = tmp.path().join("dir2");
815        fs::create_dir(&dir1)?;
816        fs::create_dir(&dir2)?;
817        let hasher = FileHasher::new(&[&dir1, &dir2])?;
818        assert!(hasher.check(false).is_err());
819        Ok(())
820    }
821}