Skip to main content

compare_dir/
file_hasher.rs

1use crate::{
2    ColumnFormatter, DirectoryComparer, FileComparer, FileHashCache, FileIterator, Progress,
3    ProgressBuilder,
4};
5use globset::GlobSet;
6use indicatif::FormattedDuration;
7use rayon::prelude::*;
8use std::{
9    collections::HashMap,
10    fs,
11    io::{self, Read, stdout},
12    path::{Path, PathBuf},
13    sync::{
14        Arc,
15        atomic::{AtomicUsize, Ordering},
16        mpsc,
17    },
18    time,
19};
20
/// A file discovered during scanning: its path and the index of the entry in
/// `FileHasher::dirs` (and of the matching per-directory cache) it was found under.
type FileItem = (PathBuf, usize);
22
/// Events sent by the duplicate-finding worker to the thread that updates
/// progress and groups results by hash.
#[derive(Debug, Clone)]
enum DupEvent {
    /// Sent once, before files start being dispatched for hashing.
    StartHashing,
    /// Number of files that were dispatched for hashing; sent after the scan finishes.
    NumFiles(usize),
    /// A hashed file: its path, its size in bytes, and its content hash.
    Result(PathBuf, u64, blake3::Hash),
    /// Hashing a file failed.
    Error,
}
30
/// Result of comparing a file's freshly computed hash with the cached hash.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
enum CheckStatus {
    /// The cached hash equals the computed hash.
    Unchanged,
    /// The cache has no hash for this path.
    New,
    /// The cached hash differs from the computed hash.
    Modified,
}
37
/// Events sent by the check worker to the thread that prints results and
/// updates progress.
#[derive(Debug, PartialEq)]
enum CheckEvent {
    /// Sent once, before files start being dispatched for checking.
    StartChecking,
    /// Total number of files dispatched; sent after the scan finishes.
    TotalFiles(usize),
    /// A new or modified file; the path is relative to the checked directory.
    Result(PathBuf, CheckStatus),
    /// A file was checked and found unchanged.
    FileDone,
    /// Checking a file failed.
    Error,
}
46
/// Per-file-size state used while looking for duplicate candidates.
enum DupState {
    /// Exactly one file of this size has been seen so far; it has not been
    /// hashed yet. Holds its path, modification time, and directory index.
    Single(PathBuf, time::SystemTime, usize),
    /// At least two files of this size have been seen; every file of this
    /// size is dispatched for hashing as it arrives.
    Hashing,
}
51
/// A tool for finding duplicated files in a directory.
pub struct FileHasher {
    /// Directories to scan.
    dirs: Vec<PathBuf>,
    /// Size in bytes of the read buffer used when hashing a file.
    /// `0` selects memory-mapped hashing instead of buffered reads.
    pub buffer_size: usize,
    /// Hash cache used by `get_hash`; created on demand by `cache()`.
    cache: Option<Arc<FileHashCache>>,
    /// Number of hashes computed from file contents.
    num_hashed: AtomicUsize,
    /// Number of hashes served from a cache instead of being computed.
    num_hash_looked_up: AtomicUsize,
    /// Optional glob set handed to the file iterator as its exclude filter.
    pub exclude: Option<GlobSet>,
    /// Optional progress reporter; no progress is shown when `None`.
    pub progress: Option<Arc<ProgressBuilder>>,
    /// When `true`, `run` prints duplicate groups as YAML instead of the
    /// human-readable format.
    pub is_yaml_format: bool,
    /// Value passed to `crate::build_thread_pool` when building the worker pool.
    pub jobs: usize,
}
64
65impl FileHasher {
66    const DEFAULT_JOBS: usize = DirectoryComparer::DEFAULT_JOBS;
67
68    /// Creates a new `FileHasher` for the given directories.
69    pub fn new<P: AsRef<Path>>(dirs: &[P]) -> anyhow::Result<Self> {
70        if dirs.is_empty() {
71            anyhow::bail!("At least one directory must be specified.");
72        }
73        Ok(Self {
74            dirs: dirs.iter().map(|p| p.as_ref().to_path_buf()).collect(),
75            buffer_size: FileComparer::DEFAULT_BUFFER_SIZE,
76            cache: None,
77            num_hashed: AtomicUsize::new(0),
78            num_hash_looked_up: AtomicUsize::new(0),
79            exclude: None,
80            progress: None,
81            is_yaml_format: false,
82            jobs: Self::DEFAULT_JOBS,
83        })
84    }
85
86    pub(crate) fn new_with_cache<P: AsRef<Path>>(dirs: &[P]) -> anyhow::Result<Self> {
87        let mut hasher = Self::new(dirs)?;
88        hasher.cache = Some(hasher.new_cache()?);
89        Ok(hasher)
90    }
91
92    fn new_cache(&self) -> anyhow::Result<Arc<FileHashCache>> {
93        let common_ancestor = crate::common_ancestor(&self.dirs)
94            .ok_or_else(|| anyhow::anyhow!("No common ancestor found"))?;
95        Ok(FileHashCache::find_or_new(&common_ancestor))
96    }
97
98    /// Gets the hash cache.
99    pub(crate) fn cache(&mut self) -> anyhow::Result<Arc<FileHashCache>> {
100        if self.cache.is_none() {
101            self.cache = Some(self.new_cache()?);
102        }
103        Ok(Arc::clone(self.cache.as_ref().unwrap()))
104    }
105
106    /// Remove a cache entry if it exists.
107    pub(crate) fn remove_cache_entry(&mut self, path: &Path) -> anyhow::Result<()> {
108        let cache = self.cache()?;
109        let relative = crate::strip_prefix(path, cache.base_dir())?;
110        cache.remove(relative);
111        Ok(())
112    }
113
114    /// Save the hash cache if it is dirty.
115    pub fn save_cache(&self) -> anyhow::Result<()> {
116        log::info!(
117            "Hash stats for {:?}: {} computed, {} looked up",
118            self.dirs,
119            self.num_hashed.load(Ordering::Relaxed),
120            self.num_hash_looked_up.load(Ordering::Relaxed)
121        );
122        if let Some(cache) = &self.cache {
123            cache.save()?;
124        }
125        Ok(())
126    }
127
128    /// Clears the loaded hashes in the cache.
129    pub(crate) fn clear_cache(&mut self) -> anyhow::Result<()> {
130        let cache = self.cache()?;
131        for dir in &self.dirs {
132            let relative = crate::strip_prefix(dir, cache.base_dir())?;
133            cache.clear(relative);
134        }
135        Ok(())
136    }
137
138    /// Executes the check/update process.
139    pub fn check(&self, update: bool) -> anyhow::Result<()> {
140        if self.dirs.len() > 1 {
141            anyhow::bail!("Check mode only supports one directory.");
142        }
143        let start_time = time::Instant::now();
144        let progress = self
145            .progress
146            .as_ref()
147            .map(|progress| progress.add_spinner())
148            .unwrap_or_else(Progress::none);
149        progress.set_message("Scanning directory...");
150        let mut num_new = 0;
151        let mut num_modified = 0;
152        let mut num_error = 0;
153        std::thread::scope(|scope| {
154            let (tx, rx) = mpsc::channel();
155            scope.spawn(|| {
156                if let Err(e) = self.check_streaming(tx, update) {
157                    log::error!("Error during check: {}", e);
158                }
159            });
160            while let Ok(event) = rx.recv() {
161                match event {
162                    CheckEvent::StartChecking => {
163                        progress.set_message("Checking files...");
164                    }
165                    CheckEvent::TotalFiles(total) => {
166                        progress.set_length(total as u64);
167                        progress.set_message("");
168                    }
169                    CheckEvent::Result(path, status) => {
170                        let symbol = match status {
171                            CheckStatus::New => {
172                                num_new += 1;
173                                '+'
174                            }
175                            CheckStatus::Modified => {
176                                num_modified += 1;
177                                '!'
178                            }
179                            CheckStatus::Unchanged => unreachable!(),
180                        };
181                        progress.inc(1);
182                        progress.suspend_for(stdout(), || {
183                            println!("{} {}", symbol, path.display());
184                        });
185                    }
186                    CheckEvent::FileDone => {
187                        progress.inc(1);
188                    }
189                    CheckEvent::Error => {
190                        progress.inc(1);
191                        num_error += 1;
192                    }
193                }
194            }
195        });
196        progress.finish();
197        self.print_check_summary(&start_time, num_new, num_modified, num_error)?;
198        Ok(())
199    }
200
201    fn print_check_summary(
202        &self,
203        start_time: &time::Instant,
204        num_new: usize,
205        num_modified: usize,
206        num_error: usize,
207    ) -> io::Result<()> {
208        let summary = [
209            ("Elapsed:", 0),
210            ("Hash computed:", self.num_hashed.load(Ordering::Relaxed)),
211            ("New files:", num_new),
212            ("Modified files:", num_modified),
213            ("Errors:", num_error),
214        ];
215        let formatter = ColumnFormatter::new(summary.iter().map(|(s, _)| *s));
216        let mut writer = std::io::stderr();
217        formatter.write_value(
218            &mut writer,
219            summary[0].0,
220            FormattedDuration(start_time.elapsed()),
221        )?;
222        formatter.write_values(&mut writer, &summary[1..])
223    }
224
225    fn check_streaming(&self, tx: mpsc::Sender<CheckEvent>, update: bool) -> anyhow::Result<()> {
226        assert_eq!(self.dirs.len(), 1);
227        let cache = self.new_cache()?;
228        let base_dir = &self.dirs[0];
229        let relative = crate::strip_prefix(base_dir, cache.base_dir())?;
230        cache.set_remove_if_no_access(relative);
231        let cache_clone = Arc::clone(&cache);
232        std::thread::scope(|global_scope| {
233            let mut it = FileIterator::new(base_dir);
234            it.cache = Some(Arc::clone(&cache));
235            it.exclude = self.exclude.as_ref();
236            let it_rx = it.spawn_in_scope(global_scope);
237            tx.send(CheckEvent::StartChecking)?;
238            let pool = crate::build_thread_pool(self.jobs)?;
239            pool.scope(move |scope| -> anyhow::Result<()> {
240                let mut total_files = 0;
241                for path in it_rx {
242                    total_files += 1;
243                    let tx = tx.clone();
244                    let cache = Arc::clone(&cache);
245                    scope.spawn(move |_| {
246                        let status = self.check_file(&path, &cache, update);
247                        let event = match status {
248                            Ok(CheckStatus::New) | Ok(CheckStatus::Modified) => {
249                                let rel_path = crate::strip_prefix(&path, base_dir).unwrap();
250                                CheckEvent::Result(rel_path.into(), status.unwrap())
251                            }
252                            Ok(CheckStatus::Unchanged) => CheckEvent::FileDone,
253                            Err(e) => {
254                                log::error!("Failed to check file {:?}: {}", path, e);
255                                CheckEvent::Error
256                            }
257                        };
258                        if tx.send(event).is_err() {
259                            log::error!("Send failed");
260                        }
261                    });
262                }
263                tx.send(CheckEvent::TotalFiles(total_files))?;
264                Ok(())
265            })
266        })?;
267        cache_clone.save()?;
268        Ok(())
269    }
270
271    fn check_file(
272        &self,
273        abs_path: &Path,
274        cache: &FileHashCache,
275        update: bool,
276    ) -> anyhow::Result<CheckStatus> {
277        assert!(abs_path.is_absolute());
278        let computed_hash = self.compute_hash(abs_path)?;
279        let rel_path = crate::strip_prefix(abs_path, cache.base_dir())?;
280        let cached_hash = cache.get_by_path(rel_path);
281        let status = match cached_hash {
282            None => CheckStatus::New,
283            Some(cached) => {
284                if computed_hash != cached {
285                    CheckStatus::Modified
286                } else {
287                    CheckStatus::Unchanged
288                }
289            }
290        };
291        if update {
292            let modified = fs::metadata(abs_path)?.modified()?;
293            match status {
294                CheckStatus::New | CheckStatus::Modified => {
295                    cache.insert(rel_path, modified, computed_hash);
296                }
297                CheckStatus::Unchanged => {
298                    if cache.get(rel_path, modified).is_none() {
299                        cache.insert(rel_path, modified, computed_hash);
300                    }
301                }
302            }
303        }
304        Ok(status)
305    }
306
307    /// Executes the duplicate file finding process and prints results.
308    pub fn run(&self) -> anyhow::Result<()> {
309        let start_time = time::Instant::now();
310        let mut duplicates = self.find_duplicates()?;
311        let mut total_wasted_space = 0;
312        if !duplicates.is_empty() {
313            duplicates.sort_by_key(|a| a.size);
314            for dupes in &duplicates {
315                if self.is_yaml_format {
316                    dupes.write_yaml(std::io::stdout())?;
317                } else {
318                    dupes.write_human(std::io::stdout())?;
319                }
320                total_wasted_space += dupes.wasted_size();
321            }
322        }
323        self.print_duplicates_summary(&start_time, total_wasted_space)?;
324        Ok(())
325    }
326
327    fn print_duplicates_summary(
328        &self,
329        start_time: &time::Instant,
330        total_wasted_space: u64,
331    ) -> io::Result<()> {
332        let elapsed = FormattedDuration(start_time.elapsed()).to_string();
333        let num_hashed = self.num_hashed.load(Ordering::Relaxed).to_string();
334        let total_wasted_space = crate::human_readable_size(total_wasted_space);
335        let summary = [
336            ("Elapsed:", elapsed),
337            ("Hash computed:", num_hashed),
338            ("Total wasted space:", total_wasted_space),
339        ];
340        let formatter = ColumnFormatter::new(summary.iter().map(|(s, _)| *s));
341        formatter.write_values(&mut io::stderr(), &summary)
342    }
343
344    /// Finds duplicated files and returns a list of duplicate groups.
345    pub fn find_duplicates(&self) -> anyhow::Result<Vec<DuplicatedFiles>> {
346        let progress = self
347            .progress
348            .as_ref()
349            .map(|progress| progress.add_spinner())
350            .unwrap_or_else(Progress::none);
351        progress.set_message("Scanning directories...");
352
353        let (tx, rx) = mpsc::channel();
354        let mut by_hash: HashMap<blake3::Hash, DuplicatedFiles> = HashMap::new();
355        std::thread::scope(|scope| {
356            scope.spawn(|| {
357                if let Err(e) = self.find_duplicates_streaming(tx) {
358                    log::error!("Error during duplicate finding: {}", e);
359                }
360            });
361
362            while let Ok(event) = rx.recv() {
363                match event {
364                    DupEvent::StartHashing => progress.set_message("Hashing files..."),
365                    DupEvent::NumFiles(num) => progress.set_length(num as u64),
366                    DupEvent::Result(path, size, hash) => {
367                        progress.inc(1);
368                        let entry = by_hash.entry(hash).or_insert_with(|| DuplicatedFiles {
369                            paths: Vec::new(),
370                            size,
371                        });
372                        // Hash collisions shouldn't happen, but if they do, sizes shouldn't mismatch.
373                        assert_eq!(entry.size, size, "Hash collision: sizes do not match");
374                        entry.paths.push(path);
375                    }
376                    DupEvent::Error => progress.inc(1),
377                }
378            }
379        });
380        progress.finish();
381
382        let mut duplicates = Vec::new();
383        for (_, mut dupes) in by_hash {
384            if dupes.paths.len() > 1 {
385                dupes.paths.sort();
386                duplicates.push(dupes);
387            }
388        }
389        Ok(duplicates)
390    }
391
392    fn find_duplicates_streaming(&self, tx: mpsc::Sender<DupEvent>) -> anyhow::Result<()> {
393        std::thread::scope(|global_scope| {
394            let (it_rx, caches) = self.stream_file_items(global_scope)?;
395            let caches = &caches;
396            let pool = crate::build_thread_pool(self.jobs)?;
397            pool.scope(move |scope| -> anyhow::Result<()> {
398                let mut by_size: HashMap<u64, DupState> = HashMap::new();
399                let mut num_hashed = 0;
400                tx.send(DupEvent::StartHashing)?;
401                for (path, dir_index) in it_rx {
402                    let meta = fs::metadata(&path)?;
403                    let size = meta.len();
404                    if size == 0 {
405                        continue;
406                    }
407                    let modified = meta.modified()?;
408                    let cache = &caches[dir_index];
409                    match by_size.entry(size) {
410                        std::collections::hash_map::Entry::Occupied(mut occ) => match occ.get_mut()
411                        {
412                            DupState::Single(path0, modified0, dir_index0) => {
413                                // We found a second file of identical size.
414                                // Time to start hashing both the *original* matching file and the *new* one!
415                                let cache0 = &caches[*dir_index0];
416                                self.send_hash(path0, size, *modified0, cache0, &tx, scope);
417                                self.send_hash(&path, size, modified, cache, &tx, scope);
418
419                                // Modify the state to indicate we are now fully hashing this size bucket.
420                                *occ.get_mut() = DupState::Hashing;
421                                num_hashed += 2;
422                            }
423                            DupState::Hashing => {
424                                // File size bucket already hashing; just dynamically spawn the new file immediately.
425                                self.send_hash(&path, size, modified, cache, &tx, scope);
426                                num_hashed += 1;
427                            }
428                        },
429                        std::collections::hash_map::Entry::Vacant(vac) => {
430                            vac.insert(DupState::Single(path, modified, dir_index));
431                        }
432                    }
433                }
434                tx.send(DupEvent::NumFiles(num_hashed))?;
435                Ok(())
436            })?;
437            pool.install(|| caches.into_par_iter().try_for_each(|cache| cache.save()))?;
438            Ok::<(), anyhow::Error>(())
439        })?;
440        Ok(())
441    }
442
443    fn stream_file_items<'scope, 'env>(
444        &'env self,
445        scope: &'scope std::thread::Scope<'scope, 'env>,
446    ) -> anyhow::Result<(mpsc::Receiver<FileItem>, Vec<Arc<FileHashCache>>)> {
447        let (it_tx, it_rx) = mpsc::channel();
448        let mut caches = Vec::with_capacity(self.dirs.len());
449        for (dir_index, dir) in self.dirs.iter().enumerate() {
450            let mut it = FileIterator::new(dir);
451            let cache = FileHashCache::find_or_new(dir);
452            it.cache = Some(Arc::clone(&cache));
453            it.exclude = self.exclude.as_ref();
454            let it_tx = it_tx.clone();
455            scope.spawn(move || it.send_to_as(it_tx, |path| (path, dir_index)));
456            caches.push(cache);
457        }
458        Ok((it_rx, caches))
459    }
460
461    fn send_hash<'scope>(
462        &'scope self,
463        path: &Path,
464        size: u64,
465        modified: time::SystemTime,
466        cache: &Arc<FileHashCache>,
467        tx: &mpsc::Sender<DupEvent>,
468        scope: &rayon::Scope<'scope>,
469    ) {
470        let (hash, relative) = self
471            .get_hash_from_cache(path, modified, cache)
472            .expect("path should be in cache base_dir");
473        if let Some(hash) = hash {
474            let _ = tx.send(DupEvent::Result(path.to_path_buf(), size, hash));
475            return;
476        }
477
478        let path = path.to_path_buf();
479        let relative = relative.to_path_buf();
480        let tx = tx.clone();
481        let cache = Arc::clone(cache);
482        scope.spawn(move |_| {
483            if let Ok(hash) = self.compute_hash(&path) {
484                cache.insert(&relative, modified, hash);
485                let _ = tx.send(DupEvent::Result(path, size, hash));
486            } else {
487                log::error!("Failed to hash file: {:?}", path);
488                let _ = tx.send(DupEvent::Error);
489            }
490        });
491    }
492
493    /// Gets the hash of a file, using the cache if available.
494    pub fn get_hash(&self, path: &Path) -> io::Result<blake3::Hash> {
495        let cache = self.cache.as_ref().expect("cache should be initialized");
496        let meta = fs::metadata(path)?;
497        let modified = meta.modified()?;
498        let (hash, relative) = self.get_hash_from_cache(path, modified, cache)?;
499        if let Some(hash) = hash {
500            return Ok(hash);
501        }
502
503        let hash = self.compute_hash(path)?;
504        cache.insert(relative, modified, hash);
505        Ok(hash)
506    }
507
508    fn get_hash_from_cache<'a>(
509        &self,
510        path: &'a Path,
511        modified: time::SystemTime,
512        cache: &FileHashCache,
513    ) -> io::Result<(Option<blake3::Hash>, &'a Path)> {
514        let relative = crate::strip_prefix(path, cache.base_dir())
515            .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;
516        if let Some(hash) = cache.get(relative, modified) {
517            self.num_hash_looked_up.fetch_add(1, Ordering::Relaxed);
518            return Ok((Some(hash), relative));
519        }
520        Ok((None, relative))
521    }
522
523    fn compute_hash(&self, path: &Path) -> io::Result<blake3::Hash> {
524        let start_time = time::Instant::now();
525        let mut f = fs::File::open(path)?;
526        let len = f.metadata()?.len();
527        let progress = self
528            .progress
529            .as_ref()
530            .map(|progress| progress.add_file(path, len))
531            .unwrap_or_else(Progress::none);
532        let mut hasher = blake3::Hasher::new();
533        if self.buffer_size == 0 {
534            if len > 0 {
535                let mmap = unsafe { memmap2::MmapOptions::new().map(&f)? };
536                hasher.update(&mmap[..]);
537                progress.inc(len);
538            }
539        } else {
540            let mut buf = vec![0u8; self.buffer_size];
541            loop {
542                let n = f.read(&mut buf)?;
543                if n == 0 {
544                    break;
545                }
546                hasher.update(&buf[..n]);
547                progress.inc(n as u64);
548            }
549        }
550        progress.finish();
551        self.num_hashed.fetch_add(1, Ordering::Relaxed);
552        let hash = hasher.finalize();
553        log::debug!(
554            "Computed hash in {}: {:?}",
555            FormattedDuration(start_time.elapsed()),
556            path
557        );
558        Ok(hash)
559    }
560}
561
562/// A group of duplicated files and their size.
563#[derive(Clone, Debug)]
564pub struct DuplicatedFiles {
565    pub paths: Vec<PathBuf>,
566    pub size: u64,
567}
568
569impl DuplicatedFiles {
570    fn write_human(&self, mut writer: impl io::Write) -> anyhow::Result<()> {
571        writeln!(
572            writer,
573            "Identical {} files of {}:",
574            self.paths.len(),
575            crate::human_readable_size(self.size)
576        )?;
577        for path in &self.paths {
578            writeln!(writer, "  {}", path.display())?;
579        }
580        Ok(())
581    }
582
583    fn write_yaml(&self, mut writer: impl io::Write) -> anyhow::Result<()> {
584        writeln!(writer, "- paths:")?;
585        for path in &self.paths {
586            writeln!(writer, "  - {:?}", path)?;
587        }
588        writeln!(writer, "  size: {}", self.size)?;
589        Ok(())
590    }
591
592    fn wasted_size(&self) -> u64 {
593        self.size * (self.paths.len() as u64 - 1)
594    }
595}
596
597#[cfg(test)]
598mod tests {
599    use super::*;
600
601    fn default_exclude() -> globset::GlobSet {
602        let mut builder = globset::GlobSetBuilder::new();
603        builder.add(
604            globset::GlobBuilder::new(".hash_cache")
605                .case_insensitive(true)
606                .build()
607                .unwrap(),
608        );
609        builder.build().unwrap()
610    }
611
612    #[test]
613    fn find_duplicates() -> anyhow::Result<()> {
614        let dir = tempfile::tempdir()?;
615
616        let file1_path = dir.path().join("same1.txt");
617        fs::write(&file1_path, "same content")?;
618
619        let file2_path = dir.path().join("same2.txt");
620        fs::write(&file2_path, "same content")?;
621
622        let diff_path = dir.path().join("diff.txt");
623        fs::write(&diff_path, "different content")?;
624
625        let mut hasher = FileHasher::new(&[dir.path()])?;
626        hasher.buffer_size = 8192;
627        let duplicates = hasher.find_duplicates()?;
628
629        assert_eq!(hasher.num_hashed.load(Ordering::Relaxed), 2);
630        assert_eq!(hasher.num_hash_looked_up.load(Ordering::Relaxed), 0);
631
632        assert_eq!(duplicates.len(), 1);
633        let group = &duplicates[0];
634        assert_eq!(group.paths.len(), 2);
635        assert_eq!(group.size, 12); // "same content" is 12 bytes
636
637        assert!(group.paths.contains(&file1_path));
638        assert!(group.paths.contains(&file2_path));
639
640        Ok(())
641    }
642
643    #[test]
644    fn find_duplicates_merge_cache() -> anyhow::Result<()> {
645        let dir = tempfile::tempdir()?;
646        let dir_path = dir.path();
647
648        let sub_dir = dir_path.join("a").join("a");
649        fs::create_dir_all(&sub_dir)?;
650
651        let file1_path = sub_dir.join("1");
652        fs::write(&file1_path, "same content")?;
653
654        let file2_path = sub_dir.join("2");
655        fs::write(&file2_path, "same content")?;
656
657        // Create empty cache file in a/a to force it to be the cache base
658        let cache_aa_path = sub_dir.join(FileHashCache::FILE_NAME);
659        fs::File::create(&cache_aa_path)?;
660
661        // Run find_duplicates on a/a
662        let hasher_aa = FileHasher::new(&[&sub_dir])?;
663        let duplicates_aa = hasher_aa.find_duplicates()?;
664        assert_eq!(duplicates_aa.len(), 1);
665        assert!(cache_aa_path.exists());
666        assert_eq!(hasher_aa.num_hashed.load(Ordering::Relaxed), 2);
667        assert_eq!(hasher_aa.num_hash_looked_up.load(Ordering::Relaxed), 0);
668
669        // Create empty cache file in a to force it to be the cache base
670        let root_a = dir_path.join("a");
671        let cache_a_path = root_a.join(FileHashCache::FILE_NAME);
672        fs::File::create(&cache_a_path)?;
673
674        // Run find_duplicates on a
675        let hasher_a = FileHasher::new(&[&root_a])?;
676        let duplicates_a = hasher_a.find_duplicates()?;
677        assert_eq!(duplicates_a.len(), 1);
678        assert_eq!(hasher_a.num_hashed.load(Ordering::Relaxed), 0);
679        assert_eq!(hasher_a.num_hash_looked_up.load(Ordering::Relaxed), 2);
680
681        // The merged child cache should be removed.
682        assert!(cache_a_path.exists());
683        assert!(!cache_aa_path.exists());
684
685        Ok(())
686    }
687
688    #[test]
689    fn find_duplicates_with_exclude() -> anyhow::Result<()> {
690        let dir = tempfile::tempdir()?;
691
692        let file1_path = dir.path().join("same1.txt");
693        fs::write(&file1_path, "same content")?;
694
695        let file2_path = dir.path().join("same2.txt");
696        fs::write(&file2_path, "same content")?;
697
698        let exclude_path = dir.path().join("exclude.txt");
699        fs::write(&exclude_path, "same content")?;
700
701        let mut hasher = FileHasher::new(&[dir.path()])?;
702        hasher.buffer_size = 8192;
703        let mut builder = globset::GlobSetBuilder::new();
704        builder.add(
705            globset::GlobBuilder::new("exclude.txt")
706                .case_insensitive(true)
707                .build()?,
708        );
709        let filter = builder.build()?;
710        hasher.exclude = Some(filter);
711
712        let duplicates = hasher.find_duplicates()?;
713        assert_eq!(duplicates.len(), 1);
714        let group = &duplicates[0];
715        assert_eq!(group.paths.len(), 2);
716        assert!(group.paths.contains(&file1_path));
717        assert!(group.paths.contains(&file2_path));
718        assert!(!group.paths.contains(&exclude_path));
719        Ok(())
720    }
721
722    #[test]
723    fn check_mode_empty_cache() -> anyhow::Result<()> {
724        let dir = tempfile::tempdir()?;
725        let dir_path = dir.path().to_path_buf();
726        println!("{:?}", dir_path);
727        let file1_path = dir.path().join("file1.txt");
728        fs::write(&file1_path, "content 1")?;
729        let file2_path = dir.path().join("file2.txt");
730        fs::write(&file2_path, "content 2")?;
731
732        let mut hasher = FileHasher::new(&[&dir_path])?;
733        hasher.exclude = Some(default_exclude());
734        let (tx, rx) = mpsc::channel();
735        hasher.check_streaming(tx, false)?;
736        let mut results = Vec::new();
737        let mut start_seen = false;
738        let mut total_files = None;
739        let mut file_done_count = 0;
740        let mut num_error = 0;
741        while let Ok(event) = rx.recv() {
742            match event {
743                CheckEvent::StartChecking => start_seen = true,
744                CheckEvent::TotalFiles(total) => total_files = Some(total),
745                CheckEvent::Result(path, status) => results.push((path, status)),
746                CheckEvent::FileDone => file_done_count += 1,
747                CheckEvent::Error => num_error += 1,
748            }
749        }
750        assert!(start_seen);
751        assert_eq!(total_files, Some(2));
752        assert_eq!(file_done_count, 0);
753        assert_eq!(num_error, 0);
754
755        results.sort_by(|a, b| a.0.cmp(&b.0));
756        assert_eq!(results.len(), 2);
757        assert_eq!(results[0], (PathBuf::from("file1.txt"), CheckStatus::New));
758        assert_eq!(results[1], (PathBuf::from("file2.txt"), CheckStatus::New));
759
760        assert!(!dir.path().join(FileHashCache::FILE_NAME).exists());
761        Ok(())
762    }
763
764    #[test]
765    fn check_mode_with_cache() -> anyhow::Result<()> {
766        let dir = tempfile::tempdir()?;
767        let dir_path = dir.path().to_path_buf();
768        let file1_path = dir.path().join("file1.txt");
769        let file2_path = dir.path().join("file2.txt");
770        fs::write(&file1_path, "content 1")?;
771        fs::write(&file2_path, "content 2")?;
772
773        let mut hasher = FileHasher::new_with_cache(&[&dir_path])?;
774        hasher.exclude = Some(default_exclude());
775        let _hash1 = hasher.get_hash(&file1_path)?;
776        let _hash2 = hasher.get_hash(&file2_path)?;
777        hasher.save_cache()?;
778        assert!(dir.path().join(FileHashCache::FILE_NAME).exists());
779
780        let mut hasher = FileHasher::new(&[&dir_path])?;
781        hasher.exclude = Some(default_exclude());
782        let (tx, rx) = mpsc::channel();
783        hasher.check_streaming(tx, false)?;
784        let mut results = Vec::new();
785        let mut file_done_count = 0;
786        while let Ok(event) = rx.recv() {
787            match event {
788                CheckEvent::Result(path, status) => results.push((path, status)),
789                CheckEvent::FileDone => file_done_count += 1,
790                _ => {}
791            }
792        }
793        assert_eq!(results.len(), 0);
794        assert_eq!(file_done_count, 2);
795
796        fs::write(&file1_path, "content 1 modified")?;
797
798        let file2_meta_before = fs::metadata(&file2_path)?;
799        let mtime_before = file2_meta_before.modified()?;
800        std::thread::sleep(time::Duration::from_millis(10));
801        fs::write(&file2_path, "content 2")?;
802        let file2_meta_after = fs::metadata(&file2_path)?;
803        let mtime_after = file2_meta_after.modified()?;
804        assert!(mtime_after > mtime_before);
805
806        let mut hasher = FileHasher::new(&[&dir_path])?;
807        hasher.exclude = Some(default_exclude());
808        let (tx, rx) = mpsc::channel();
809        hasher.check_streaming(tx, false)?;
810        let mut results = Vec::new();
811        let mut file_done_count = 0;
812        while let Ok(event) = rx.recv() {
813            match event {
814                CheckEvent::Result(path, status) => results.push((path, status)),
815                CheckEvent::FileDone => file_done_count += 1,
816                _ => {}
817            }
818        }
819        assert_eq!(results.len(), 1);
820        assert_eq!(
821            results[0],
822            (PathBuf::from("file1.txt"), CheckStatus::Modified)
823        );
824        assert_eq!(file_done_count, 1);
825        Ok(())
826    }
827
828    #[test]
829    fn check_update_mode() -> anyhow::Result<()> {
830        let dir = tempfile::tempdir()?;
831        let dir_path = dir.path().to_path_buf();
832        let file1_path = dir.path().join("file1.txt");
833        fs::write(&file1_path, "content 1")?;
834
835        let mut hasher = FileHasher::new(&[&dir_path])?;
836        hasher.exclude = Some(default_exclude());
837        let (tx, rx) = mpsc::channel();
838        hasher.check_streaming(tx, true)?;
839        while rx.recv().is_ok() {}
840        hasher.save_cache()?;
841        assert!(dir.path().join(FileHashCache::FILE_NAME).exists());
842
843        let cache = FileHashCache::new(&dir_path);
844        let mtime1 = fs::metadata(&file1_path)?.modified()?;
845        let hash1 = cache.get(&PathBuf::from("file1.txt"), mtime1);
846        assert!(hash1.is_some());
847
848        std::thread::sleep(time::Duration::from_millis(10));
849        fs::write(&file1_path, "content 1 modified")?;
850        let mtime1_mod = fs::metadata(&file1_path)?.modified()?;
851
852        let mut hasher = FileHasher::new(&[&dir_path])?;
853        hasher.exclude = Some(default_exclude());
854        let (tx, rx) = mpsc::channel();
855        hasher.check_streaming(tx, true)?;
856        while rx.recv().is_ok() {}
857        hasher.save_cache()?;
858
859        let cache = FileHashCache::new(&dir_path);
860        let hash_mod = cache.get(&PathBuf::from("file1.txt"), mtime1_mod);
861        assert!(hash_mod.is_some());
862        assert_ne!(hash1, hash_mod);
863
864        std::thread::sleep(time::Duration::from_millis(10));
865        fs::write(&file1_path, "content 1 modified")?;
866        let mtime1_mod2 = fs::metadata(&file1_path)?.modified()?;
867        assert!(mtime1_mod2 > mtime1_mod);
868
869        assert!(
870            cache
871                .get(&PathBuf::from("file1.txt"), mtime1_mod2)
872                .is_none()
873        );
874
875        let mut hasher = FileHasher::new(&[&dir_path])?;
876        hasher.exclude = Some(default_exclude());
877        let (tx, rx) = mpsc::channel();
878        hasher.check_streaming(tx, true)?;
879        while rx.recv().is_ok() {}
880        hasher.save_cache()?;
881
882        let cache = FileHashCache::new(&dir_path);
883        assert!(
884            cache
885                .get(&PathBuf::from("file1.txt"), mtime1_mod2)
886                .is_some()
887        );
888        Ok(())
889    }
890
891    #[test]
892    fn check_cleanup_deleted_files() -> anyhow::Result<()> {
893        let dir = tempfile::tempdir()?;
894        let dir_path = dir.path().to_path_buf();
895        let file1_path = dir.path().join("file1.txt");
896        let file2_path = dir.path().join("file2.txt");
897        fs::write(&file1_path, "content 1")?;
898        fs::write(&file2_path, "content 2")?;
899        let mtime1 = fs::metadata(&file1_path)?.modified()?;
900        let mtime2 = fs::metadata(&file2_path)?.modified()?;
901
902        let mut hasher = FileHasher::new(&[&dir_path])?;
903        hasher.exclude = Some(default_exclude());
904        let (tx, rx) = mpsc::channel();
905        hasher.check_streaming(tx, true)?;
906        while rx.recv().is_ok() {}
907        hasher.save_cache()?;
908
909        // Verify both are in the cache
910        let cache = FileHashCache::new(&dir_path);
911        assert!(cache.get(&PathBuf::from("file1.txt"), mtime1).is_some());
912        assert!(cache.get(&PathBuf::from("file2.txt"), mtime2).is_some());
913
914        // Now delete file2 from disk
915        fs::remove_file(&file2_path)?;
916
917        // Run check and save again
918        let mut hasher = FileHasher::new(&[&dir_path])?;
919        hasher.exclude = Some(default_exclude());
920        let (tx, rx) = mpsc::channel();
921        hasher.check_streaming(tx, true)?;
922        while rx.recv().is_ok() {}
923        hasher.save_cache()?;
924
925        // Verify file2 is removed from cache, but file1 is still there
926        let cache = FileHashCache::new(&dir_path);
927        assert!(cache.get(&PathBuf::from("file2.txt"), mtime2).is_none());
928        assert!(cache.get(&PathBuf::from("file1.txt"), mtime1).is_some());
929        Ok(())
930    }
931
932    #[test]
933    fn find_duplicates_multiple_dirs() -> anyhow::Result<()> {
934        let tmp = tempfile::tempdir()?;
935        let dir1 = tmp.path().join("dir1");
936        let dir2 = tmp.path().join("dir2");
937        fs::create_dir(&dir1)?;
938        fs::create_dir(&dir2)?;
939        let file1_path = dir1.join("file1.txt");
940        fs::write(&file1_path, "same content")?;
941        let file2_path = dir2.join("file2.txt");
942        fs::write(&file2_path, "same content")?;
943        let hasher = FileHasher::new(&[&dir1, &dir2])?;
944        let duplicates = hasher.find_duplicates()?;
945        assert_eq!(duplicates.len(), 1);
946        let group = &duplicates[0];
947        assert_eq!(group.paths.len(), 2);
948        assert_eq!(group.size, 12);
949        assert!(group.paths.contains(&file1_path));
950        assert!(group.paths.contains(&file2_path));
951
952        Ok(())
953    }
954
955    #[test]
956    fn check_fails_with_multiple_dirs() -> anyhow::Result<()> {
957        let tmp = tempfile::tempdir()?;
958        let dir1 = tmp.path().join("dir1");
959        let dir2 = tmp.path().join("dir2");
960        fs::create_dir(&dir1)?;
961        fs::create_dir(&dir2)?;
962        let hasher = FileHasher::new(&[&dir1, &dir2])?;
963        assert!(hasher.check(false).is_err());
964        Ok(())
965    }
966}