Skip to main content

compare_dir/
file_hasher.rs

1use crate::{
2    Classification, ColumnFormatter, DirectoryComparer, FileComparer, FileComparisonResult,
3    FileHashCache, FileItem, FileIterator, OutputFormat, Progress, ProgressBuilder, ProgressValue,
4};
5use globset::GlobSet;
6use indicatif::FormattedDuration;
7use rayon::prelude::*;
8use simple_path::SimplePath;
9use std::{
10    collections::HashMap,
11    fs,
12    io::{self, Read, stdout},
13    path::{Path, PathBuf},
14    sync::{
15        Arc,
16        atomic::{self, AtomicUsize},
17        mpsc,
18    },
19    time,
20};
21
/// A scanned file paired with the index (into `FileHasher::dirs`) of the directory it came from.
type FileWithDirIndex = (FileItem, usize);
23
/// Events streamed from `find_duplicates_streaming` to the collector loop in `find_duplicates`.
#[derive(Debug, Clone)]
enum DupEvent {
    /// Sent once, before any file is dispatched for hashing.
    StartHashing,
    /// Sent after the scan loop ends; carries the accumulated progress total of all files
    /// that were queued for hashing.
    Total(ProgressValue),
    /// A file together with its hash (either looked up in a cache or freshly computed).
    Result(FileItem, blake3::Hash),
    /// Hashing a file failed; the sender logs the failure.
    Error,
}
31
/// Events streamed from `check_streaming` to the collector loop in `check`.
#[derive(Debug)]
enum CheckEvent {
    /// Sent once, after the file iterator has been started and before files are checked.
    StartChecking,
    /// Sent after every scanned file has been dispatched; carries the accumulated total.
    Total(ProgressValue),
    /// A result worth reporting (new or differing file) plus the progress to add for it.
    Result(FileComparisonResult, ProgressValue),
    /// Progress for a file whose hash matched the cache; nothing is reported for it.
    Progress(ProgressValue),
    /// Checking the given file failed; the sender logs the failure.
    Error(FileItem),
}
40
41enum DupState {
42    Single(FileItem, usize),
43    Hashing,
44}
45
/// A tool for finding duplicated files in a directory.
///
/// The same type also drives "check" mode (see `check`), which compares files on disk
/// against the entries stored in a `FileHashCache`.
pub struct FileHasher {
    /// Directories to scan; `new` rejects an empty list.
    dirs: Vec<PathBuf>,
    /// Size in bytes of the read buffer used when hashing; `0` selects memory-mapped reads.
    pub buffer_size: usize,
    /// Hash cache used by `get_hash` and the cache-maintenance methods; created on demand
    /// by `cache()` or eagerly by `new_with_cache`.
    cache: Option<Arc<FileHashCache>>,
    /// Number of hashes computed from file contents.
    num_hashed: AtomicUsize,
    /// Number of hashes served from a cache instead of being computed.
    num_hash_looked_up: AtomicUsize,
    /// Optional glob set handed to the file iterators as their `exclude` filter.
    pub exclude: Option<GlobSet>,
    /// Optional progress reporter; when `None`, `Progress::none()` is used instead.
    pub progress: Option<Arc<ProgressBuilder>>,
    /// Output format used when printing results.
    pub output_format: OutputFormat,
    /// Job count passed to `crate::build_thread_pool`.
    pub jobs: usize,
}
58
59impl FileHasher {
    /// Default value for `jobs`; mirrors `DirectoryComparer::DEFAULT_JOBS`.
    const DEFAULT_JOBS: usize = DirectoryComparer::DEFAULT_JOBS;
61
62    /// Creates a new `FileHasher` for the given directories.
63    pub fn new<P: AsRef<Path>>(dirs: &[P]) -> anyhow::Result<Self> {
64        if dirs.is_empty() {
65            anyhow::bail!("At least one directory must be specified.");
66        }
67        Ok(Self {
68            dirs: dirs.iter().map(|p| p.as_ref().to_path_buf()).collect(),
69            buffer_size: FileComparer::DEFAULT_BUFFER_SIZE,
70            cache: None,
71            num_hashed: AtomicUsize::new(0),
72            num_hash_looked_up: AtomicUsize::new(0),
73            exclude: None,
74            progress: None,
75            output_format: OutputFormat::Default,
76            jobs: Self::DEFAULT_JOBS,
77        })
78    }
79
80    pub(crate) fn new_with_cache<P: AsRef<Path>>(dirs: &[P]) -> anyhow::Result<Self> {
81        let mut hasher = Self::new(dirs)?;
82        hasher.cache = Some(hasher.new_cache()?);
83        Ok(hasher)
84    }
85
86    fn new_cache(&self) -> anyhow::Result<Arc<FileHashCache>> {
87        let common_ancestor = crate::common_ancestor(&self.dirs)
88            .ok_or_else(|| anyhow::anyhow!("No common ancestor found"))?;
89        Ok(FileHashCache::find_or_new(&common_ancestor))
90    }
91
92    /// Gets the hash cache.
93    pub(crate) fn cache(&mut self) -> anyhow::Result<Arc<FileHashCache>> {
94        if self.cache.is_none() {
95            self.cache = Some(self.new_cache()?);
96        }
97        Ok(Arc::clone(self.cache.as_ref().unwrap()))
98    }
99
100    /// Remove a cache entry if it exists.
101    pub(crate) fn remove_cache_entry(&mut self, path: &Path) -> anyhow::Result<()> {
102        let cache = self.cache()?;
103        let relative = SimplePath::strip_prefix(path, cache.base_dir())?;
104        cache.remove(relative);
105        Ok(())
106    }
107
108    /// Save the hash cache if it is dirty.
109    pub fn save_cache(&self) -> anyhow::Result<()> {
110        log::info!(
111            "Hash stats for {:?}: {} computed, {} looked up",
112            self.dirs,
113            self.num_hashed.load(atomic::Ordering::Relaxed),
114            self.num_hash_looked_up.load(atomic::Ordering::Relaxed)
115        );
116        if let Some(cache) = &self.cache {
117            cache.save()?;
118        }
119        Ok(())
120    }
121
122    /// Clears the loaded hashes in the cache.
123    pub(crate) fn clear_cache(&mut self) -> anyhow::Result<()> {
124        let cache = self.cache()?;
125        for dir in &self.dirs {
126            let relative = SimplePath::strip_prefix(dir, cache.base_dir())?;
127            cache.clear(relative);
128        }
129        Ok(())
130    }
131
    /// Executes the check/update process.
    ///
    /// Every file under the single configured directory is compared against the hash cache.
    /// New and differing files are printed to stdout as they are found, and a summary is
    /// written once the scan is finished. With `update` set, `_check_file` also writes new
    /// or changed hashes into the cache.
    ///
    /// # Errors
    ///
    /// Fails when the output format is neither `Default` nor `Symbol`, when more than one
    /// directory is configured, or when the summary cannot be written. An error returned
    /// by the worker (`check_streaming`) is only logged; it is not propagated.
    pub fn check(&self, update: bool) -> anyhow::Result<()> {
        match self.output_format {
            OutputFormat::Default | OutputFormat::Symbol => {}
            _ => anyhow::bail!("Check mode only supports default or symbol output format."),
        }
        if self.dirs.len() > 1 {
            anyhow::bail!("Check mode only supports one directory.");
        }
        let start_time = time::Instant::now();
        let mut progress = self
            .progress
            .as_ref()
            .map(|progress| progress.add_spinner())
            .unwrap_or_else(Progress::none);
        progress.use_bytes();
        progress.set_message("Scanning directory...");
        let mut num_new = 0;
        let mut num_modified = 0;
        let mut num_error = 0;
        std::thread::scope(|scope| {
            let (tx, rx) = mpsc::channel();
            // The worker owns `tx`; when it finishes, the sender is dropped and the
            // receive loop below ends.
            scope.spawn(|| {
                if let Err(e) = self.check_streaming(tx, update) {
                    log::error!("Error during check: {}", e);
                }
            });
            while let Ok(event) = rx.recv() {
                match event {
                    CheckEvent::StartChecking => {
                        progress.set_message("Checking files...");
                    }
                    CheckEvent::Total(value) => {
                        progress.set_length(value);
                        progress.set_message("");
                    }
                    CheckEvent::Result(result, value) => {
                        progress.inc(value);
                        // Print through the progress handle so the result line and the
                        // progress display do not interleave.
                        progress.suspend_for(stdout(), || {
                            result.print(self.output_format, "cached", "current")
                        });
                        // `OnlyInDir2` marks a file that has no cache entry, i.e. a new file.
                        if result.classification == Classification::OnlyInDir2 {
                            num_new += 1;
                        } else if result.is_identical_content() == Some(false) {
                            num_modified += 1;
                        }
                    }
                    CheckEvent::Progress(value) => {
                        progress.inc(value);
                    }
                    CheckEvent::Error(file) => {
                        progress.inc(ProgressValue::with_skip(file.size()));
                        num_error += 1;
                    }
                }
            }
        });
        progress.finish();
        self.print_check_summary(&start_time, num_new, num_modified, num_error)?;
        Ok(())
    }
193
194    fn print_check_summary(
195        &self,
196        start_time: &time::Instant,
197        num_new: usize,
198        num_modified: usize,
199        num_error: usize,
200    ) -> io::Result<()> {
201        let summary = [
202            ("Elapsed:", 0),
203            (
204                "Hash computed:",
205                self.num_hashed.load(atomic::Ordering::Relaxed),
206            ),
207            ("New files:", num_new),
208            ("Modified files:", num_modified),
209            ("Errors:", num_error),
210        ];
211        let formatter = ColumnFormatter::new(summary.iter().map(|(s, _)| *s));
212        let mut writer = std::io::stderr();
213        formatter.write_value(
214            &mut writer,
215            summary[0].0,
216            FormattedDuration(start_time.elapsed()),
217        )?;
218        formatter.write_values(&mut writer, &summary[1..])
219    }
220
    /// Scans the single configured directory and streams `CheckEvent`s over `tx`.
    ///
    /// A fresh cache handle is obtained through `new_cache` (the `cache` field is not used).
    /// Files produced by the iterator are handed to `check_file`, which runs each check on
    /// the thread pool. The cache is saved after all checks are done.
    ///
    /// # Panics
    ///
    /// Panics unless exactly one directory is configured.
    ///
    /// # Errors
    ///
    /// Fails when the cache or thread pool cannot be created, when the directory cannot be
    /// made relative to the cache base, when the receiver is gone, or when saving fails.
    fn check_streaming(&self, tx: mpsc::Sender<CheckEvent>, update: bool) -> anyhow::Result<()> {
        assert_eq!(self.dirs.len(), 1);
        let cache = self.new_cache()?;
        let base_dir = &self.dirs[0];
        let relative = SimplePath::strip_prefix(base_dir, cache.base_dir())?;
        // NOTE(review): presumably marks entries under `relative` for removal when they are
        // not accessed during this scan — confirm against `FileHashCache`.
        cache.set_remove_if_no_access(relative);
        // `cache` is moved into the pool closure below; keep a handle for the final save.
        let cache_clone = Arc::clone(&cache);
        std::thread::scope(|global_scope| {
            let mut it = FileIterator::new(base_dir);
            it.cache = Some(Arc::clone(&cache));
            it.exclude = self.exclude.as_ref();
            let it_rx = it.spawn_in_scope(global_scope);
            tx.send(CheckEvent::StartChecking)?;
            let pool = crate::build_thread_pool(self.jobs)?;
            pool.scope(move |scope| -> anyhow::Result<()> {
                let mut total = ProgressValue::default();
                for file in it_rx {
                    self.check_file(file, &cache, update, &mut total, &tx, scope);
                }
                // Every file has been dispatched, so the total is now final.
                tx.send(CheckEvent::Total(total))?;
                Ok(())
            })
        })?;
        cache_clone.save()?;
        Ok(())
    }
247
248    fn check_file<'scope>(
249        &'scope self,
250        file: FileItem,
251        cache: &Arc<FileHashCache>,
252        update: bool,
253        total: &mut ProgressValue,
254        tx: &mpsc::Sender<CheckEvent>,
255        scope: &rayon::Scope<'scope>,
256    ) {
257        *total += ProgressValue::with_size(file.size());
258        let tx = tx.clone();
259        let cache = Arc::clone(cache);
260        scope.spawn(move |_| {
261            if let Err(error) = self._check_file(&file, cache, update, &tx) {
262                log::error!("Failed to check file '{}': {}", file, error);
263                if tx.send(CheckEvent::Error(file)).is_err() {
264                    log::error!("Send failed");
265                }
266            }
267        });
268    }
269
270    fn _check_file(
271        &self,
272        file: &FileItem,
273        cache: Arc<FileHashCache>,
274        update: bool,
275        tx: &mpsc::Sender<CheckEvent>,
276    ) -> anyhow::Result<()> {
277        assert!(file.path().is_absolute());
278        let path_in_cache = file.relative_path(cache.base_dir());
279        match cache.get_entry(path_in_cache) {
280            Some(cached) => {
281                let mut result =
282                    FileComparisonResult::new(file.path().into(), Classification::InBoth);
283                result.update_moodified(cached.modified, file.modified());
284                if cached.size != 0 {
285                    result.update_size(cached.size, file.size());
286                }
287                if !update && cached.size != 0 && file.size() != cached.size {
288                    tx.send(CheckEvent::Result(
289                        result,
290                        ProgressValue::with_skip(file.size()),
291                    ))?;
292                    return Ok(());
293                }
294                let hash = self.compute_hash(file)?;
295                result.is_content_same = Some(hash == cached.hash);
296                if hash == cached.hash {
297                    if cached.should_update(file, update) {
298                        cache.insert(path_in_cache, file, hash);
299                    }
300                    tx.send(CheckEvent::Progress(ProgressValue::with_size(file.size())))?;
301                } else {
302                    if update {
303                        cache.insert(path_in_cache, file, hash);
304                    }
305                    tx.send(CheckEvent::Result(
306                        result,
307                        ProgressValue::with_size(file.size()),
308                    ))?;
309                }
310            }
311            None => {
312                if update {
313                    let hash = self.compute_hash(file)?;
314                    cache.insert(path_in_cache, file, hash);
315                }
316                tx.send(CheckEvent::Result(
317                    FileComparisonResult::new(file.path().into(), Classification::OnlyInDir2),
318                    ProgressValue::with_size(file.size()),
319                ))?;
320            }
321        }
322        Ok(())
323    }
324
325    /// Executes the duplicate file finding process and prints results.
326    pub fn run(&self) -> anyhow::Result<()> {
327        let start_time = time::Instant::now();
328        let mut duplicates = self.find_duplicates()?;
329        let mut total_wasted_space = 0;
330        if !duplicates.is_empty() {
331            duplicates.sort_by_key(|a| a.size);
332            total_wasted_space = self.print_duplicates_results(&duplicates)?;
333        }
334        self.print_duplicates_summary(&start_time, total_wasted_space)?;
335        Ok(())
336    }
337
338    fn print_duplicates_results(&self, duplicates: &Vec<DuplicatedFiles>) -> anyhow::Result<u64> {
339        let mut total_wasted_space = 0;
340        for dupes in duplicates {
341            dupes.print(self.output_format)?;
342            total_wasted_space += dupes.wasted_size();
343        }
344        Ok(total_wasted_space)
345    }
346
347    fn print_duplicates_summary(
348        &self,
349        start_time: &time::Instant,
350        total_wasted_space: u64,
351    ) -> io::Result<()> {
352        let elapsed = FormattedDuration(start_time.elapsed()).to_string();
353        let num_hashed = self.num_hashed.load(atomic::Ordering::Relaxed).to_string();
354        let total_wasted_space = crate::human_readable_size(total_wasted_space);
355        let summary = [
356            ("Elapsed:", elapsed),
357            ("Hash computed:", num_hashed),
358            ("Total wasted space:", total_wasted_space),
359        ];
360        let formatter = ColumnFormatter::new(summary.iter().map(|(s, _)| *s));
361        formatter.write_values(&mut io::stderr(), &summary)
362    }
363
    /// Finds duplicated files and returns a list of duplicate groups.
    ///
    /// A worker thread runs `find_duplicates_streaming` while this thread collects the
    /// resulting events, grouping files by hash. Only groups with more than one path are
    /// returned. Paths inside a group are sorted; the order of the groups themselves
    /// follows `HashMap` iteration and is therefore unspecified.
    ///
    /// An error returned by the worker is only logged; it is not propagated.
    ///
    /// # Panics
    ///
    /// Panics when two files with the same hash report different sizes.
    pub fn find_duplicates(&self) -> anyhow::Result<Vec<DuplicatedFiles>> {
        let mut progress = self
            .progress
            .as_ref()
            .map(|progress| progress.add_spinner())
            .unwrap_or_else(Progress::none);
        progress.set_message("Scanning directories...");

        let (tx, rx) = mpsc::channel();
        let mut by_hash: HashMap<blake3::Hash, DuplicatedFiles> = HashMap::new();
        std::thread::scope(|scope| {
            // The worker owns `tx`; when it finishes, the sender is dropped and the
            // receive loop below ends.
            scope.spawn(|| {
                if let Err(e) = self.find_duplicates_streaming(tx) {
                    log::error!("Error during duplicate finding: {}", e);
                }
            });

            while let Ok(event) = rx.recv() {
                match event {
                    DupEvent::StartHashing => progress.set_message("Hashing files..."),
                    DupEvent::Total(value) => progress.set_length(value),
                    DupEvent::Result(file, hash) => {
                        progress.inc(ProgressValue::with_size(file.size()));
                        let entry = by_hash.entry(hash).or_insert_with(|| DuplicatedFiles {
                            paths: Vec::new(),
                            size: file.size(),
                        });
                        // Hash collisions shouldn't happen, but if they do, sizes shouldn't mismatch.
                        assert_eq!(
                            entry.size,
                            file.size(),
                            "Hash collision: sizes do not match"
                        );
                        entry.paths.push(file.into_path_buf());
                    }
                    // Hashing failures are logged by the sender; nothing to collect here.
                    DupEvent::Error => {}
                }
            }
        });
        progress.finish();

        // Keep only hashes shared by at least two files.
        let mut duplicates = Vec::new();
        for (_, mut dupes) in by_hash {
            if dupes.paths.len() > 1 {
                dupes.paths.sort();
                duplicates.push(dupes);
            }
        }
        Ok(duplicates)
    }
415
    /// Scans all configured directories and streams `DupEvent`s over `tx`.
    ///
    /// Files are bucketed by size. A file is only hashed once a second file of the same
    /// size has been seen, and zero-length files are skipped entirely. After the scan, the
    /// per-directory caches are saved on the thread pool.
    ///
    /// # Errors
    ///
    /// Fails when the thread pool cannot be built, when the receiver has gone away, or
    /// when saving a cache fails.
    fn find_duplicates_streaming(&self, tx: mpsc::Sender<DupEvent>) -> anyhow::Result<()> {
        std::thread::scope(|global_scope| {
            let (it_rx, caches) = self.stream_file_items(global_scope)?;
            // Borrow so the `move` closure below copies a reference instead of taking the Vec.
            let caches = &caches;
            let pool = crate::build_thread_pool(self.jobs)?;
            pool.scope(move |scope| -> anyhow::Result<()> {
                let mut by_size: HashMap<u64, DupState> = HashMap::new();
                let mut total = ProgressValue::default();
                tx.send(DupEvent::StartHashing)?;
                for (file, dir_index) in it_rx {
                    let size = file.size();
                    // Empty files are never reported as duplicates.
                    if size == 0 {
                        continue;
                    }
                    let cache = &caches[dir_index];
                    match by_size.entry(size) {
                        std::collections::hash_map::Entry::Occupied(mut occ) => match occ.get_mut()
                        {
                            DupState::Single(file0, dir_index0) => {
                                // We found a second file of identical size.
                                // Time to start hashing both the *original* matching file and the *new* one!
                                let cache0 = &caches[*dir_index0];
                                self.send_hash(file0, cache0, &tx, scope);
                                self.send_hash(&file, cache, &tx, scope);
                                total += ProgressValue::with_size(file0.size());
                                total += ProgressValue::with_size(file.size());

                                // Modify the state to indicate we are now fully hashing this size bucket.
                                *occ.get_mut() = DupState::Hashing;
                            }
                            DupState::Hashing => {
                                // File size bucket already hashing; just dynamically spawn the new file immediately.
                                self.send_hash(&file, cache, &tx, scope);
                                total += ProgressValue::with_size(file.size());
                            }
                        },
                        std::collections::hash_map::Entry::Vacant(vac) => {
                            // First file of this size: remember it, but do not hash it yet.
                            vac.insert(DupState::Single(file, dir_index));
                        }
                    }
                }
                // The scan is complete, so the total is now final.
                tx.send(DupEvent::Total(total))?;
                Ok(())
            })?;
            pool.install(|| caches.into_par_iter().try_for_each(|cache| cache.save()))?;
            Ok::<(), anyhow::Error>(())
        })?;
        Ok(())
    }
465
466    fn stream_file_items<'scope, 'env>(
467        &'env self,
468        scope: &'scope std::thread::Scope<'scope, 'env>,
469    ) -> anyhow::Result<(mpsc::Receiver<FileWithDirIndex>, Vec<Arc<FileHashCache>>)> {
470        let (it_tx, it_rx) = mpsc::channel();
471        let mut caches = Vec::with_capacity(self.dirs.len());
472        for (dir_index, dir) in self.dirs.iter().enumerate() {
473            let mut it = FileIterator::new(dir);
474            let cache = FileHashCache::find_or_new(dir);
475            it.cache = Some(Arc::clone(&cache));
476            it.exclude = self.exclude.as_ref();
477            let it_tx = it_tx.clone();
478            scope.spawn(move || it.send_to_as(it_tx, |path| (path, dir_index)));
479            caches.push(cache);
480        }
481        Ok((it_rx, caches))
482    }
483
484    fn send_hash<'scope>(
485        &'scope self,
486        file: &FileItem,
487        cache: &Arc<FileHashCache>,
488        tx: &mpsc::Sender<DupEvent>,
489        scope: &rayon::Scope<'scope>,
490    ) {
491        let (hash, relative) = self
492            .get_hash_from_cache(file, cache)
493            .expect("path should be in cache base_dir");
494        if let Some(hash) = hash {
495            let _ = tx.send(DupEvent::Result(file.clone(), hash));
496            return;
497        }
498
499        let file = file.clone();
500        let relative = relative.to_path_buf();
501        let tx = tx.clone();
502        let cache = Arc::clone(cache);
503        scope.spawn(move |_| {
504            if let Ok(hash) = self.compute_hash(&file) {
505                cache.insert(&relative, &file, hash);
506                let _ = tx.send(DupEvent::Result(file, hash));
507            } else {
508                log::error!("Failed to hash file: '{}'", file);
509                let _ = tx.send(DupEvent::Error);
510            }
511        });
512    }
513
514    /// Gets the hash of a file, using the cache if available.
515    pub fn get_hash(&self, file: &FileItem) -> anyhow::Result<blake3::Hash> {
516        let cache = self.cache.as_ref().expect("cache should be initialized");
517        let (hash, relative) = self.get_hash_from_cache(file, cache)?;
518        if let Some(hash) = hash {
519            return Ok(hash);
520        }
521
522        let hash = self.compute_hash(file)?;
523        cache.insert(relative, file, hash);
524        Ok(hash)
525    }
526
527    fn get_hash_from_cache<'a>(
528        &self,
529        file: &'a FileItem,
530        cache: &FileHashCache,
531    ) -> io::Result<(Option<blake3::Hash>, &'a Path)> {
532        let relative = file.relative_path(cache.base_dir());
533        if let Some(hash) = cache.get(relative, file) {
534            self.num_hash_looked_up
535                .fetch_add(1, atomic::Ordering::Relaxed);
536            return Ok((Some(hash), relative));
537        }
538        Ok((None, relative))
539    }
540
541    fn compute_hash(&self, file: &FileItem) -> io::Result<blake3::Hash> {
542        let start_time = time::Instant::now();
543        let mut f = fs::File::open(file.path())?;
544        let mut progress = self
545            .progress
546            .as_ref()
547            .map(|progress| progress.add_file(file.path(), file.size()))
548            .unwrap_or_else(Progress::none);
549        let mut hasher = blake3::Hasher::new();
550        if self.buffer_size == 0 {
551            if file.size() > 0 {
552                let mmap = unsafe { memmap2::MmapOptions::new().map(&f)? };
553                hasher.update(&mmap[..]);
554                progress.inc(ProgressValue::with_size(file.size()));
555            }
556        } else {
557            let mut buf = vec![0u8; self.buffer_size];
558            loop {
559                let n = f.read(&mut buf)?;
560                if n == 0 {
561                    break;
562                }
563                hasher.update(&buf[..n]);
564                progress.inc(ProgressValue::with_size(n as u64));
565            }
566        }
567        progress.finish();
568        self.num_hashed.fetch_add(1, atomic::Ordering::Relaxed);
569        let hash = hasher.finalize();
570        log::debug!(
571            "Computed hash in {}: '{}'",
572            FormattedDuration(start_time.elapsed()),
573            file
574        );
575        Ok(hash)
576    }
577}
578
/// A group of duplicated files and their size.
#[derive(Clone, Debug)]
pub struct DuplicatedFiles {
    /// Paths of the files that share the same content hash.
    pub paths: Vec<PathBuf>,
    /// Size in bytes of each file in the group.
    pub size: u64,
}
585
586impl DuplicatedFiles {
587    fn wasted_size(&self) -> u64 {
588        self.size * (self.paths.len() as u64 - 1)
589    }
590
591    fn print(&self, output_format: OutputFormat) -> anyhow::Result<()> {
592        match output_format {
593            OutputFormat::Default => self.write_human(stdout())?,
594            OutputFormat::PowerShell => self.write_pwsh(stdout())?,
595            OutputFormat::Shell => self.write_shell(stdout())?,
596            OutputFormat::Yaml | OutputFormat::Symbol => self.write_yaml(stdout())?,
597        }
598        Ok(())
599    }
600
601    fn write_human(&self, mut writer: impl io::Write) -> anyhow::Result<()> {
602        writeln!(
603            writer,
604            "Identical {} files of {}:",
605            self.paths.len(),
606            crate::human_readable_size(self.size)
607        )?;
608        for path in &self.paths {
609            writeln!(writer, "  {}", path.display())?;
610        }
611        Ok(())
612    }
613
614    fn write_yaml(&self, mut writer: impl io::Write) -> anyhow::Result<()> {
615        writeln!(writer, "- paths:")?;
616        for path in &self.paths {
617            writeln!(writer, "  - {:?}", path)?;
618        }
619        writeln!(writer, "  size: {}", self.size)?;
620        Ok(())
621    }
622
    /// Writes one `cp` command per duplicate, copying the first path of the group over
    /// each of the others. Paths are escaped for single-quoted POSIX shell strings.
    fn write_shell(&self, writer: impl io::Write) -> anyhow::Result<()> {
        self.write_shell_with(writer, "cp", Self::escape_shell)
    }
626
    /// Writes one `Copy-Item -LiteralPath` command per duplicate, copying the first path
    /// of the group over each of the others. Single quotes in paths are doubled.
    fn write_pwsh(&self, writer: impl io::Write) -> anyhow::Result<()> {
        self.write_shell_with(writer, "Copy-Item -LiteralPath", Self::escape_shell_double)
    }
630
631    fn write_shell_with(
632        &self,
633        mut writer: impl io::Write,
634        cmd: &str,
635        stringify: impl Fn(&Path) -> String,
636    ) -> anyhow::Result<()> {
637        let mut iter = self.paths.iter();
638        if let Some(path0) = iter.next() {
639            let path0 = stringify(path0);
640            for path in iter {
641                writeln!(writer, "{cmd} '{path0}' '{}'", stringify(path))?;
642            }
643        }
644        Ok(())
645    }
646
647    fn escape_shell(path: &Path) -> String {
648        path.to_string_lossy().replace('\'', "\'\\'\'")
649    }
650
651    fn escape_shell_double(path: &Path) -> String {
652        path.to_string_lossy().replace('\'', "\'\'")
653    }
654}
655
656#[cfg(test)]
657mod tests {
658    use super::*;
659    use std::cmp::Ordering;
660
661    fn default_exclude() -> globset::GlobSet {
662        let mut builder = globset::GlobSetBuilder::new();
663        builder.add(
664            globset::GlobBuilder::new(".hash_cache")
665                .case_insensitive(true)
666                .build()
667                .unwrap(),
668        );
669        builder.build().unwrap()
670    }
671
672    #[test]
673    fn find_duplicates() -> anyhow::Result<()> {
674        let dir = tempfile::tempdir()?;
675
676        let file1_path = dir.path().join("same1.txt");
677        fs::write(&file1_path, "same content")?;
678
679        let file2_path = dir.path().join("same2.txt");
680        fs::write(&file2_path, "same content")?;
681
682        let diff_path = dir.path().join("diff.txt");
683        fs::write(&diff_path, "different content")?;
684
685        let mut hasher = FileHasher::new(&[dir.path()])?;
686        hasher.buffer_size = 8192;
687        let duplicates = hasher.find_duplicates()?;
688
689        assert_eq!(hasher.num_hashed.load(atomic::Ordering::Relaxed), 2);
690        assert_eq!(hasher.num_hash_looked_up.load(atomic::Ordering::Relaxed), 0);
691
692        assert_eq!(duplicates.len(), 1);
693        let group = &duplicates[0];
694        assert_eq!(group.paths.len(), 2);
695        assert_eq!(group.size, 12); // "same content" is 12 bytes
696
697        assert!(group.paths.contains(&file1_path));
698        assert!(group.paths.contains(&file2_path));
699
700        Ok(())
701    }
702
    /// Hashes cached for a nested directory are reused when a parent directory becomes the
    /// cache base, and the nested cache file is removed afterwards.
    #[test]
    fn find_duplicates_merge_cache() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;
        let dir_path = dir.path();

        let sub_dir = dir_path.join("a").join("a");
        fs::create_dir_all(&sub_dir)?;

        let file1_path = sub_dir.join("1");
        fs::write(&file1_path, "same content")?;

        let file2_path = sub_dir.join("2");
        fs::write(&file2_path, "same content")?;

        // Create empty cache file in a/a to force it to be the cache base
        let cache_aa_path = sub_dir.join(FileHashCache::FILE_NAME);
        fs::File::create(&cache_aa_path)?;

        // Run find_duplicates on a/a
        let hasher_aa = FileHasher::new(&[&sub_dir])?;
        let duplicates_aa = hasher_aa.find_duplicates()?;
        assert_eq!(duplicates_aa.len(), 1);
        assert!(cache_aa_path.exists());
        // First run: both files are hashed from their contents.
        assert_eq!(hasher_aa.num_hashed.load(atomic::Ordering::Relaxed), 2);
        assert_eq!(
            hasher_aa.num_hash_looked_up.load(atomic::Ordering::Relaxed),
            0
        );

        // Create empty cache file in a to force it to be the cache base
        let root_a = dir_path.join("a");
        let cache_a_path = root_a.join(FileHashCache::FILE_NAME);
        fs::File::create(&cache_a_path)?;

        // Run find_duplicates on a
        let hasher_a = FileHasher::new(&[&root_a])?;
        let duplicates_a = hasher_a.find_duplicates()?;
        assert_eq!(duplicates_a.len(), 1);
        // Second run: both hashes come from the cache, so nothing is recomputed.
        assert_eq!(hasher_a.num_hashed.load(atomic::Ordering::Relaxed), 0);
        assert_eq!(
            hasher_a.num_hash_looked_up.load(atomic::Ordering::Relaxed),
            2
        );

        // The merged child cache should be removed.
        assert!(cache_a_path.exists());
        assert!(!cache_aa_path.exists());

        Ok(())
    }
753
754    #[test]
755    fn find_duplicates_with_exclude() -> anyhow::Result<()> {
756        let dir = tempfile::tempdir()?;
757
758        let file1_path = dir.path().join("same1.txt");
759        fs::write(&file1_path, "same content")?;
760
761        let file2_path = dir.path().join("same2.txt");
762        fs::write(&file2_path, "same content")?;
763
764        let exclude_path = dir.path().join("exclude.txt");
765        fs::write(&exclude_path, "same content")?;
766
767        let mut hasher = FileHasher::new(&[dir.path()])?;
768        hasher.buffer_size = 8192;
769        let mut builder = globset::GlobSetBuilder::new();
770        builder.add(
771            globset::GlobBuilder::new("exclude.txt")
772                .case_insensitive(true)
773                .build()?,
774        );
775        let filter = builder.build()?;
776        hasher.exclude = Some(filter);
777
778        let duplicates = hasher.find_duplicates()?;
779        assert_eq!(duplicates.len(), 1);
780        let group = &duplicates[0];
781        assert_eq!(group.paths.len(), 2);
782        assert!(group.paths.contains(&file1_path));
783        assert!(group.paths.contains(&file2_path));
784        assert!(!group.paths.contains(&exclude_path));
785        Ok(())
786    }
787
    /// Test helper that drains a `CheckEvent` channel and tallies what it saw.
    #[derive(Default)]
    struct CheckCollector {
        /// Whether `CheckEvent::StartChecking` was received.
        start_seen: bool,
        /// File count from the last `CheckEvent::Total`, if one was received.
        total_files: Option<u64>,
        /// Results from `CheckEvent::Result`, with paths made relative to the base directory.
        results: Vec<FileComparisonResult>,
        /// Sum of the file counts carried by `CheckEvent::Progress` events.
        file_done_count: u64,
        /// Number of `CheckEvent::Error` events received.
        num_error: usize,
    }
796
797    impl CheckCollector {
798        fn collect(rx: mpsc::Receiver<CheckEvent>, base_dir: &Path) -> Self {
799            let mut collector = Self::default();
800            collector._collect(rx, base_dir);
801            collector
802        }
803
804        fn _collect(&mut self, rx: mpsc::Receiver<CheckEvent>, base_dir: &Path) {
805            while let Ok(event) = rx.recv() {
806                match event {
807                    CheckEvent::StartChecking => self.start_seen = true,
808                    CheckEvent::Total(total) => self.total_files = Some(total.num_files),
809                    CheckEvent::Result(mut result, _size) => {
810                        result.relative_path = result
811                            .relative_path
812                            .strip_prefix(base_dir)
813                            .unwrap()
814                            .to_path_buf();
815                        self.results.push(result);
816                    }
817                    CheckEvent::Progress(progress_val) => {
818                        self.file_done_count += progress_val.num_files;
819                    }
820                    CheckEvent::Error(_) => {
821                        self.num_error += 1;
822                    }
823                }
824            }
825        }
826    }
827
828    #[test]
829    fn check_mode_empty_cache() -> anyhow::Result<()> {
830        let dir = tempfile::tempdir()?;
831        let dir_path = dir.path().to_path_buf();
832        println!("{:?}", dir_path);
833        let file1_path = dir.path().join("file1.txt");
834        fs::write(&file1_path, "content 1")?;
835        let file2_path = dir.path().join("file2.txt");
836        fs::write(&file2_path, "content 2")?;
837
838        let mut hasher = FileHasher::new(&[&dir_path])?;
839        hasher.exclude = Some(default_exclude());
840        let (tx, rx) = mpsc::channel();
841        hasher.check_streaming(tx, false)?;
842        let collector = CheckCollector::collect(rx, &dir_path);
843        assert!(collector.start_seen);
844        assert_eq!(collector.total_files, Some(2));
845        assert_eq!(collector.file_done_count, 0);
846        assert_eq!(collector.num_error, 0);
847
848        let mut results = collector.results;
849        results.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
850        assert_eq!(results.len(), 2);
851        assert_eq!(results[0].relative_path, Path::new("file1.txt"));
852        assert_eq!(results[0].classification, Classification::OnlyInDir2);
853        assert_eq!(results[1].relative_path, Path::new("file2.txt"));
854        assert_eq!(results[1].classification, Classification::OnlyInDir2);
855
856        assert!(!dir.path().join(FileHashCache::FILE_NAME).exists());
857        Ok(())
858    }
859
860    #[test]
861    fn check_mode_with_cache() -> anyhow::Result<()> {
862        let dir = tempfile::tempdir()?;
863        let dir_path = dir.path().to_path_buf();
864        let file1_path = dir.path().join("file1.txt");
865        let file2_path = dir.path().join("file2.txt");
866        fs::write(&file1_path, "content 1")?;
867        fs::write(&file2_path, "content 2")?;
868        let file1 = FileItem::try_from(file1_path.as_path())?;
869        let file2 = FileItem::try_from(file2_path.as_path())?;
870
871        let mut hasher = FileHasher::new_with_cache(&[&dir_path])?;
872        hasher.exclude = Some(default_exclude());
873        let _hash1 = hasher.get_hash(&file1)?;
874        let _hash2 = hasher.get_hash(&file2)?;
875        hasher.save_cache()?;
876        assert!(dir.path().join(FileHashCache::FILE_NAME).exists());
877
878        let mut hasher = FileHasher::new(&[&dir_path])?;
879        hasher.exclude = Some(default_exclude());
880        let (tx, rx) = mpsc::channel();
881        hasher.check_streaming(tx, false)?;
882        let collector = CheckCollector::collect(rx, &dir_path);
883        assert_eq!(collector.results.len(), 0);
884        assert_eq!(collector.file_done_count, 2);
885
886        fs::write(&file1_path, "content 1 modified")?;
887
888        let file2_meta_before = fs::metadata(&file2_path)?;
889        let mtime_before = file2_meta_before.modified()?;
890        std::thread::sleep(time::Duration::from_millis(10));
891        fs::write(&file2_path, "content 2")?;
892        let file2_meta_after = fs::metadata(&file2_path)?;
893        let mtime_after = file2_meta_after.modified()?;
894        assert!(mtime_after > mtime_before);
895
896        let mut hasher = FileHasher::new(&[&dir_path])?;
897        hasher.exclude = Some(default_exclude());
898        let (tx, rx) = mpsc::channel();
899        hasher.check_streaming(tx, false)?;
900        let collector = CheckCollector::collect(rx, &dir_path);
901        assert_eq!(collector.results.len(), 1);
902        let results = collector.results;
903        assert_eq!(results[0].relative_path, Path::new("file1.txt"));
904        assert_eq!(results[0].modified_time_comparison, Some(Ordering::Less));
905        assert_eq!(results[0].size_comparison, Some(Ordering::Less));
906        assert_eq!(results[0].is_content_same, None);
907        assert_eq!(collector.file_done_count, 1);
908        Ok(())
909    }
910
911    #[test]
912    fn check_update_mode() -> anyhow::Result<()> {
913        let dir = tempfile::tempdir()?;
914        let dir_path = dir.path().to_path_buf();
915        let file1_path = dir.path().join("file1.txt");
916        fs::write(&file1_path, "content 1")?;
917
918        let mut hasher = FileHasher::new(&[&dir_path])?;
919        hasher.exclude = Some(default_exclude());
920        let (tx, rx) = mpsc::channel();
921        hasher.check_streaming(tx, true)?;
922        let _ = CheckCollector::collect(rx, &dir_path);
923        hasher.save_cache()?;
924        assert!(dir.path().join(FileHashCache::FILE_NAME).exists());
925
926        let cache = FileHashCache::new(&dir_path);
927        let file1 = FileItem::try_from(file1_path.as_path())?;
928        let hash1 = cache.get(&PathBuf::from("file1.txt"), &file1);
929        assert!(hash1.is_some());
930
931        std::thread::sleep(time::Duration::from_millis(10));
932        fs::write(&file1_path, "content 1 modified")?;
933        let file1_mod = FileItem::try_from(file1_path.as_path())?;
934
935        let mut hasher = FileHasher::new(&[&dir_path])?;
936        hasher.exclude = Some(default_exclude());
937        let (tx, rx) = mpsc::channel();
938        hasher.check_streaming(tx, true)?;
939        let _ = CheckCollector::collect(rx, &dir_path);
940        hasher.save_cache()?;
941
942        let cache = FileHashCache::new(&dir_path);
943        let hash_mod = cache.get(&PathBuf::from("file1.txt"), &file1_mod);
944        assert!(hash_mod.is_some());
945        assert_ne!(hash1, hash_mod);
946
947        std::thread::sleep(time::Duration::from_millis(10));
948        fs::write(&file1_path, "content 1 modified")?;
949        let file1_mod2 = FileItem::try_from(file1_path.as_path())?;
950        assert!(file1_mod2.modified() > file1_mod.modified());
951
952        assert!(
953            cache
954                .get(&PathBuf::from("file1.txt"), &file1_mod2)
955                .is_none()
956        );
957
958        let mut hasher = FileHasher::new(&[&dir_path])?;
959        hasher.exclude = Some(default_exclude());
960        let (tx, rx) = mpsc::channel();
961        hasher.check_streaming(tx, true)?;
962        let _ = CheckCollector::collect(rx, &dir_path);
963        hasher.save_cache()?;
964
965        let cache = FileHashCache::new(&dir_path);
966        assert!(
967            cache
968                .get(&PathBuf::from("file1.txt"), &file1_mod2)
969                .is_some()
970        );
971        Ok(())
972    }
973
974    #[test]
975    fn check_cleanup_deleted_files() -> anyhow::Result<()> {
976        let dir = tempfile::tempdir()?;
977        let dir_path = dir.path().to_path_buf();
978        let file1_path = dir.path().join("file1.txt");
979        let file2_path = dir.path().join("file2.txt");
980        fs::write(&file1_path, "content 1")?;
981        fs::write(&file2_path, "content 2")?;
982        let file1 = FileItem::try_from(file1_path.as_path())?;
983        let file2 = FileItem::try_from(file2_path.as_path())?;
984
985        let mut hasher = FileHasher::new(&[&dir_path])?;
986        hasher.exclude = Some(default_exclude());
987        let (tx, rx) = mpsc::channel();
988        hasher.check_streaming(tx, true)?;
989        let _ = CheckCollector::collect(rx, &dir_path);
990        hasher.save_cache()?;
991
992        // Verify both are in the cache
993        let cache = FileHashCache::new(&dir_path);
994        assert!(cache.get(&PathBuf::from("file1.txt"), &file1).is_some());
995        assert!(cache.get(&PathBuf::from("file2.txt"), &file2).is_some());
996
997        // Now delete file2 from disk
998        fs::remove_file(&file2_path)?;
999
1000        // Run check and save again
1001        let mut hasher = FileHasher::new(&[&dir_path])?;
1002        hasher.exclude = Some(default_exclude());
1003        let (tx, rx) = mpsc::channel();
1004        hasher.check_streaming(tx, true)?;
1005        let _ = CheckCollector::collect(rx, &dir_path);
1006        hasher.save_cache()?;
1007
1008        // Verify file2 is removed from cache, but file1 is still there
1009        let cache = FileHashCache::new(&dir_path);
1010        assert!(cache.get(&PathBuf::from("file2.txt"), &file2).is_none());
1011        assert!(cache.get(&PathBuf::from("file1.txt"), &file1).is_some());
1012        Ok(())
1013    }
1014
1015    #[test]
1016    fn find_duplicates_multiple_dirs() -> anyhow::Result<()> {
1017        let tmp = tempfile::tempdir()?;
1018        let dir1 = tmp.path().join("dir1");
1019        let dir2 = tmp.path().join("dir2");
1020        fs::create_dir(&dir1)?;
1021        fs::create_dir(&dir2)?;
1022        let file1_path = dir1.join("file1.txt");
1023        fs::write(&file1_path, "same content")?;
1024        let file2_path = dir2.join("file2.txt");
1025        fs::write(&file2_path, "same content")?;
1026        let hasher = FileHasher::new(&[&dir1, &dir2])?;
1027        let duplicates = hasher.find_duplicates()?;
1028        assert_eq!(duplicates.len(), 1);
1029        let group = &duplicates[0];
1030        assert_eq!(group.paths.len(), 2);
1031        assert_eq!(group.size, 12);
1032        assert!(group.paths.contains(&file1_path));
1033        assert!(group.paths.contains(&file2_path));
1034
1035        Ok(())
1036    }
1037
1038    #[test]
1039    fn check_fails_with_multiple_dirs() -> anyhow::Result<()> {
1040        let tmp = tempfile::tempdir()?;
1041        let dir1 = tmp.path().join("dir1");
1042        let dir2 = tmp.path().join("dir2");
1043        fs::create_dir(&dir1)?;
1044        fs::create_dir(&dir2)?;
1045        let hasher = FileHasher::new(&[&dir1, &dir2])?;
1046        assert!(hasher.check(false).is_err());
1047        Ok(())
1048    }
1049
1050    #[test]
1051    fn escape_shell() {
1052        let escape_shell = |p: &str| DuplicatedFiles::escape_shell(Path::new(p));
1053        assert_eq!(escape_shell(""), "");
1054        assert_eq!(escape_shell("abc"), "abc");
1055        assert_eq!(escape_shell("a'b"), "a'\\''b");
1056        assert_eq!(escape_shell("a'b'"), "a'\\''b'\\''");
1057
1058        let escape_shell_double = |p: &str| DuplicatedFiles::escape_shell_double(Path::new(p));
1059        assert_eq!(escape_shell_double(""), "");
1060        assert_eq!(escape_shell_double("abc"), "abc");
1061        assert_eq!(escape_shell_double("a'b"), "a''b");
1062        assert_eq!(escape_shell_double("a'b'"), "a''b''");
1063    }
1064
1065    #[test]
1066    fn write_dups_shell_empty() -> anyhow::Result<()> {
1067        let dup_empty = DuplicatedFiles {
1068            paths: vec![],
1069            size: 100,
1070        };
1071        let mut buf = Vec::new();
1072        dup_empty.write_shell(&mut buf)?;
1073        assert_eq!(String::from_utf8(buf)?, "");
1074        Ok(())
1075    }
1076
1077    #[test]
1078    fn write_dups_shell_one() -> anyhow::Result<()> {
1079        let dup_one = DuplicatedFiles {
1080            paths: vec![PathBuf::from("a.txt")],
1081            size: 100,
1082        };
1083        let mut buf = Vec::new();
1084        dup_one.write_shell(&mut buf)?;
1085        assert_eq!(String::from_utf8(buf)?, "");
1086        Ok(())
1087    }
1088
1089    #[test]
1090    fn write_dups_shell_two() -> anyhow::Result<()> {
1091        let dup_multiple = DuplicatedFiles {
1092            paths: vec![PathBuf::from("a.txt"), PathBuf::from("b.txt")],
1093            size: 100,
1094        };
1095        let mut buf = Vec::new();
1096        dup_multiple.write_shell(&mut buf)?;
1097        assert_eq!(String::from_utf8(buf)?, "cp 'a.txt' 'b.txt'\n");
1098        Ok(())
1099    }
1100
1101    #[test]
1102    fn write_dups_shell_three() -> anyhow::Result<()> {
1103        let dup_multiple = DuplicatedFiles {
1104            paths: vec![
1105                PathBuf::from("a.txt"),
1106                PathBuf::from("b.txt"),
1107                PathBuf::from("c.txt"),
1108            ],
1109            size: 100,
1110        };
1111        let mut buf = Vec::new();
1112        dup_multiple.write_shell(&mut buf)?;
1113        assert_eq!(
1114            String::from_utf8(buf)?,
1115            "cp 'a.txt' 'b.txt'\ncp 'a.txt' 'c.txt'\n"
1116        );
1117        Ok(())
1118    }
1119
1120    #[test]
1121    fn write_dups_shell_quotes() -> anyhow::Result<()> {
1122        let dup_quotes = DuplicatedFiles {
1123            paths: vec![PathBuf::from("a'b.txt"), PathBuf::from("c'd.txt")],
1124            size: 100,
1125        };
1126        let mut buf = Vec::new();
1127        dup_quotes.write_shell(&mut buf)?;
1128        assert_eq!(String::from_utf8(buf)?, "cp 'a'\\''b.txt' 'c'\\''d.txt'\n");
1129
1130        let mut buf = Vec::new();
1131        dup_quotes.write_pwsh(&mut buf)?;
1132        assert_eq!(
1133            String::from_utf8(buf)?,
1134            "Copy-Item -LiteralPath 'a''b.txt' 'c''d.txt'\n"
1135        );
1136        Ok(())
1137    }
1138}