Skip to main content

compare_dir/
file_hasher.rs

1use crate::{
2    Classification, ColumnFormatter, DirectoryComparer, FileComparer, FileComparisonResult,
3    FileHashCache, FileItem, FileIterator, OutputFormat, Progress, ProgressBuilder, ProgressValue,
4};
5use globset::GlobSet;
6use indicatif::FormattedDuration;
7use rayon::prelude::*;
8use simple_path::SimplePath;
9use std::{
10    collections::HashMap,
11    fs,
12    io::{self, Read, stdout},
13    path::{Path, PathBuf},
14    sync::{
15        Arc,
16        atomic::{self, AtomicUsize},
17        mpsc,
18    },
19    time,
20};
21
/// A scanned file paired with the index (into `FileHasher::dirs`) of the directory it was found under.
type FileWithDirIndex = (FileItem, usize);
23
/// Events sent from `find_duplicates_streaming` to the consumer loop in
/// `FileHasher::find_duplicates`.
#[derive(Debug, Clone)]
enum DupEvent {
    /// Sent once, before the scanned files start being grouped and hashed.
    StartHashing,
    /// Accumulated progress total of every file queued for hashing; sent after the scan loop ends.
    Total(ProgressValue),
    /// A file together with its hash (looked up from the cache or freshly computed).
    Result(FileItem, blake3::Hash),
    /// Hashing a file failed; the sender logs the failure before emitting this.
    Error,
}
31
/// Events sent from `check_streaming` to the consumer loop in `FileHasher::check`.
#[derive(Debug)]
enum CheckEvent {
    /// Sent once, before files start being checked.
    StartChecking,
    /// Accumulated size of every file handed to the checker; sent after the scan loop ends.
    Total(ProgressValue),
    /// A reportable outcome for one file (not in the cache, or differing from it) together with
    /// the progress increment for that file.
    Result(FileComparisonResult, ProgressValue),
    /// Progress increment for a file whose hash matched the cached hash (nothing to report).
    Progress(ProgressValue),
    /// Checking the given file failed.
    Error(FileItem),
}
40
/// Per-file-size state used while grouping files in `find_duplicates_streaming`.
enum DupState {
    /// Exactly one file of this size has been seen so far; it is kept (with its directory index)
    /// unhashed until a second file of the same size shows up.
    Single(FileItem, usize),
    /// At least two files of this size have been seen; every further file of this size is
    /// sent for hashing right away.
    Hashing,
}
45
/// A tool for finding duplicated files in a directory.
pub struct FileHasher {
    /// Directories to scan; `new` rejects an empty list.
    dirs: Vec<PathBuf>,
    /// Size in bytes of the read buffer used when hashing; `0` selects memory-mapping instead.
    pub buffer_size: usize,
    /// Hash cache, created lazily by `cache()` or eagerly by `new_with_cache`.
    cache: Option<Arc<FileHashCache>>,
    /// Number of hashes computed from file contents.
    num_hashed: AtomicUsize,
    /// Number of hashes served from the cache.
    num_hash_looked_up: AtomicUsize,
    /// Optional glob set handed to the file iterators as their exclude filter.
    pub exclude: Option<GlobSet>,
    /// Optional progress reporter factory; when `None`, `Progress::none()` is used.
    pub progress: Option<Arc<ProgressBuilder>>,
    /// Output format used when printing results.
    pub output_format: OutputFormat,
    /// Job count passed to `crate::build_thread_pool`.
    pub jobs: usize,
}
58
59impl FileHasher {
60    const DEFAULT_JOBS: usize = DirectoryComparer::DEFAULT_JOBS;
61
62    /// Creates a new `FileHasher` for the given directories.
63    pub fn new<P: AsRef<Path>>(dirs: &[P]) -> anyhow::Result<Self> {
64        if dirs.is_empty() {
65            anyhow::bail!("At least one directory must be specified.");
66        }
67        Ok(Self {
68            dirs: dirs.iter().map(|p| p.as_ref().to_path_buf()).collect(),
69            buffer_size: FileComparer::DEFAULT_BUFFER_SIZE,
70            cache: None,
71            num_hashed: AtomicUsize::new(0),
72            num_hash_looked_up: AtomicUsize::new(0),
73            exclude: None,
74            progress: None,
75            output_format: OutputFormat::Default,
76            jobs: Self::DEFAULT_JOBS,
77        })
78    }
79
80    pub(crate) fn new_with_cache<P: AsRef<Path>>(dirs: &[P]) -> anyhow::Result<Self> {
81        let mut hasher = Self::new(dirs)?;
82        hasher.cache = Some(hasher.new_cache()?);
83        Ok(hasher)
84    }
85
86    fn new_cache(&self) -> anyhow::Result<Arc<FileHashCache>> {
87        let common_ancestor = crate::common_ancestor(&self.dirs)
88            .ok_or_else(|| anyhow::anyhow!("No common ancestor found"))?;
89        Ok(FileHashCache::find_or_new(&common_ancestor))
90    }
91
92    /// Gets the hash cache.
93    pub(crate) fn cache(&mut self) -> anyhow::Result<Arc<FileHashCache>> {
94        if self.cache.is_none() {
95            self.cache = Some(self.new_cache()?);
96        }
97        Ok(Arc::clone(self.cache.as_ref().unwrap()))
98    }
99
100    /// Remove a cache entry if it exists.
101    pub(crate) fn remove_cache_entry(&mut self, path: &Path) -> anyhow::Result<()> {
102        let cache = self.cache()?;
103        let relative = SimplePath::strip_prefix(path, cache.base_dir())?;
104        cache.remove(relative);
105        Ok(())
106    }
107
108    /// Save the hash cache if it is dirty.
109    pub fn save_cache(&self) -> anyhow::Result<()> {
110        log::info!(
111            "Hash stats for {:?}: {} computed, {} looked up",
112            self.dirs,
113            self.num_hashed.load(atomic::Ordering::Relaxed),
114            self.num_hash_looked_up.load(atomic::Ordering::Relaxed)
115        );
116        if let Some(cache) = &self.cache {
117            cache.save()?;
118        }
119        Ok(())
120    }
121
122    /// Clears the loaded hashes in the cache.
123    pub(crate) fn clear_cache(&mut self) -> anyhow::Result<()> {
124        let cache = self.cache()?;
125        for dir in &self.dirs {
126            let relative = SimplePath::strip_prefix(dir, cache.base_dir())?;
127            cache.clear(relative);
128        }
129        Ok(())
130    }
131
132    /// Executes the check/update process.
133    pub fn check(&self, update: bool) -> anyhow::Result<()> {
134        match self.output_format {
135            OutputFormat::Default | OutputFormat::Symbol => {}
136            _ => anyhow::bail!("Check mode only supports default or symbol output format."),
137        }
138        if self.dirs.len() > 1 {
139            anyhow::bail!("Check mode only supports one directory.");
140        }
141        let start_time = time::Instant::now();
142        let mut progress = self
143            .progress
144            .as_ref()
145            .map(|progress| progress.add_spinner())
146            .unwrap_or_else(Progress::none);
147        progress.use_bytes();
148        progress.set_message("Scanning directory...");
149        let mut num_new = 0;
150        let mut num_modified = 0;
151        let mut num_error = 0;
152        std::thread::scope(|scope| {
153            let (tx, rx) = mpsc::channel();
154            scope.spawn(|| {
155                if let Err(e) = self.check_streaming(tx, update) {
156                    log::error!("Error during check: {}", e);
157                }
158            });
159            while let Ok(event) = rx.recv() {
160                match event {
161                    CheckEvent::StartChecking => {
162                        progress.set_message("Checking files...");
163                    }
164                    CheckEvent::Total(value) => {
165                        progress.set_length(value);
166                        progress.set_message("");
167                    }
168                    CheckEvent::Result(result, value) => {
169                        progress.inc(value);
170                        match self.output_format {
171                            OutputFormat::Symbol => progress.suspend_for(stdout(), || {
172                                println!(
173                                    "{} {}",
174                                    result.to_symbol_string(),
175                                    result.relative_path.display()
176                                );
177                            }),
178                            OutputFormat::Default => progress.suspend_for(stdout(), || {
179                                println!(
180                                    "{}: {}",
181                                    result.relative_path.display(),
182                                    result.to_string("cached", "current")
183                                );
184                            }),
185                            _ => unreachable!(),
186                        }
187                        if result.classification == Classification::OnlyInDir2 {
188                            num_new += 1;
189                        } else if result.is_identical_content() == Some(false) {
190                            num_modified += 1;
191                        }
192                    }
193                    CheckEvent::Progress(value) => {
194                        progress.inc(value);
195                    }
196                    CheckEvent::Error(file) => {
197                        progress.inc(ProgressValue::with_skip(file.size()));
198                        num_error += 1;
199                    }
200                }
201            }
202        });
203        progress.finish();
204        self.print_check_summary(&start_time, num_new, num_modified, num_error)?;
205        Ok(())
206    }
207
208    fn print_check_summary(
209        &self,
210        start_time: &time::Instant,
211        num_new: usize,
212        num_modified: usize,
213        num_error: usize,
214    ) -> io::Result<()> {
215        let summary = [
216            ("Elapsed:", 0),
217            (
218                "Hash computed:",
219                self.num_hashed.load(atomic::Ordering::Relaxed),
220            ),
221            ("New files:", num_new),
222            ("Modified files:", num_modified),
223            ("Errors:", num_error),
224        ];
225        let formatter = ColumnFormatter::new(summary.iter().map(|(s, _)| *s));
226        let mut writer = std::io::stderr();
227        formatter.write_value(
228            &mut writer,
229            summary[0].0,
230            FormattedDuration(start_time.elapsed()),
231        )?;
232        formatter.write_values(&mut writer, &summary[1..])
233    }
234
235    fn check_streaming(&self, tx: mpsc::Sender<CheckEvent>, update: bool) -> anyhow::Result<()> {
236        assert_eq!(self.dirs.len(), 1);
237        let cache = self.new_cache()?;
238        let base_dir = &self.dirs[0];
239        let relative = SimplePath::strip_prefix(base_dir, cache.base_dir())?;
240        cache.set_remove_if_no_access(relative);
241        let cache_clone = Arc::clone(&cache);
242        std::thread::scope(|global_scope| {
243            let mut it = FileIterator::new(base_dir);
244            it.cache = Some(Arc::clone(&cache));
245            it.exclude = self.exclude.as_ref();
246            let it_rx = it.spawn_in_scope(global_scope);
247            tx.send(CheckEvent::StartChecking)?;
248            let pool = crate::build_thread_pool(self.jobs)?;
249            pool.scope(move |scope| -> anyhow::Result<()> {
250                let mut total = ProgressValue::default();
251                for file in it_rx {
252                    self.check_file(file, &cache, update, &mut total, &tx, scope);
253                }
254                tx.send(CheckEvent::Total(total))?;
255                Ok(())
256            })
257        })?;
258        cache_clone.save()?;
259        Ok(())
260    }
261
262    fn check_file<'scope>(
263        &'scope self,
264        file: FileItem,
265        cache: &Arc<FileHashCache>,
266        update: bool,
267        total: &mut ProgressValue,
268        tx: &mpsc::Sender<CheckEvent>,
269        scope: &rayon::Scope<'scope>,
270    ) {
271        *total += ProgressValue::with_size(file.size());
272        let tx = tx.clone();
273        let cache = Arc::clone(cache);
274        scope.spawn(move |_| {
275            if let Err(error) = self._check_file(&file, cache, update, &tx) {
276                log::error!("Failed to check file '{}': {}", file, error);
277                if tx.send(CheckEvent::Error(file)).is_err() {
278                    log::error!("Send failed");
279                }
280            }
281        });
282    }
283
284    fn _check_file(
285        &self,
286        file: &FileItem,
287        cache: Arc<FileHashCache>,
288        update: bool,
289        tx: &mpsc::Sender<CheckEvent>,
290    ) -> anyhow::Result<()> {
291        assert!(file.path().is_absolute());
292        let path_in_cache = file.relative_path(cache.base_dir());
293        match cache.get_entry(path_in_cache) {
294            Some(cached) => {
295                let mut result =
296                    FileComparisonResult::new(file.path().into(), Classification::InBoth);
297                result.update_moodified(cached.modified, file.modified());
298                if cached.size != 0 {
299                    result.update_size(cached.size, file.size());
300                }
301                if !update && cached.size != 0 && file.size() != cached.size {
302                    tx.send(CheckEvent::Result(
303                        result,
304                        ProgressValue::with_skip(file.size()),
305                    ))?;
306                    return Ok(());
307                }
308                let hash = self.compute_hash(file)?;
309                result.is_content_same = Some(hash == cached.hash);
310                if hash == cached.hash {
311                    if cached.should_update(file, update) {
312                        cache.insert(path_in_cache, file, hash);
313                    }
314                    tx.send(CheckEvent::Progress(ProgressValue::with_size(file.size())))?;
315                } else {
316                    if update {
317                        cache.insert(path_in_cache, file, hash);
318                    }
319                    tx.send(CheckEvent::Result(
320                        result,
321                        ProgressValue::with_size(file.size()),
322                    ))?;
323                }
324            }
325            None => {
326                if update {
327                    let hash = self.compute_hash(file)?;
328                    cache.insert(path_in_cache, file, hash);
329                }
330                tx.send(CheckEvent::Result(
331                    FileComparisonResult::new(file.path().into(), Classification::OnlyInDir2),
332                    ProgressValue::with_size(file.size()),
333                ))?;
334            }
335        }
336        Ok(())
337    }
338
339    /// Executes the duplicate file finding process and prints results.
340    pub fn run(&self) -> anyhow::Result<()> {
341        let start_time = time::Instant::now();
342        let mut duplicates = self.find_duplicates()?;
343        let mut total_wasted_space = 0;
344        if !duplicates.is_empty() {
345            duplicates.sort_by_key(|a| a.size);
346            total_wasted_space = self.print_duplicates_results(&duplicates)?;
347        }
348        self.print_duplicates_summary(&start_time, total_wasted_space)?;
349        Ok(())
350    }
351
352    fn print_duplicates_results(&self, duplicates: &Vec<DuplicatedFiles>) -> anyhow::Result<u64> {
353        let mut total_wasted_space = 0;
354        for dupes in duplicates {
355            dupes.print(self.output_format)?;
356            total_wasted_space += dupes.wasted_size();
357        }
358        Ok(total_wasted_space)
359    }
360
361    fn print_duplicates_summary(
362        &self,
363        start_time: &time::Instant,
364        total_wasted_space: u64,
365    ) -> io::Result<()> {
366        let elapsed = FormattedDuration(start_time.elapsed()).to_string();
367        let num_hashed = self.num_hashed.load(atomic::Ordering::Relaxed).to_string();
368        let total_wasted_space = crate::human_readable_size(total_wasted_space);
369        let summary = [
370            ("Elapsed:", elapsed),
371            ("Hash computed:", num_hashed),
372            ("Total wasted space:", total_wasted_space),
373        ];
374        let formatter = ColumnFormatter::new(summary.iter().map(|(s, _)| *s));
375        formatter.write_values(&mut io::stderr(), &summary)
376    }
377
378    /// Finds duplicated files and returns a list of duplicate groups.
379    pub fn find_duplicates(&self) -> anyhow::Result<Vec<DuplicatedFiles>> {
380        let mut progress = self
381            .progress
382            .as_ref()
383            .map(|progress| progress.add_spinner())
384            .unwrap_or_else(Progress::none);
385        progress.set_message("Scanning directories...");
386
387        let (tx, rx) = mpsc::channel();
388        let mut by_hash: HashMap<blake3::Hash, DuplicatedFiles> = HashMap::new();
389        std::thread::scope(|scope| {
390            scope.spawn(|| {
391                if let Err(e) = self.find_duplicates_streaming(tx) {
392                    log::error!("Error during duplicate finding: {}", e);
393                }
394            });
395
396            while let Ok(event) = rx.recv() {
397                match event {
398                    DupEvent::StartHashing => progress.set_message("Hashing files..."),
399                    DupEvent::Total(value) => progress.set_length(value),
400                    DupEvent::Result(file, hash) => {
401                        progress.inc(ProgressValue::with_size(file.size()));
402                        let entry = by_hash.entry(hash).or_insert_with(|| DuplicatedFiles {
403                            paths: Vec::new(),
404                            size: file.size(),
405                        });
406                        // Hash collisions shouldn't happen, but if they do, sizes shouldn't mismatch.
407                        assert_eq!(
408                            entry.size,
409                            file.size(),
410                            "Hash collision: sizes do not match"
411                        );
412                        entry.paths.push(file.into_path_buf());
413                    }
414                    DupEvent::Error => {}
415                }
416            }
417        });
418        progress.finish();
419
420        let mut duplicates = Vec::new();
421        for (_, mut dupes) in by_hash {
422            if dupes.paths.len() > 1 {
423                dupes.paths.sort();
424                duplicates.push(dupes);
425            }
426        }
427        Ok(duplicates)
428    }
429
430    fn find_duplicates_streaming(&self, tx: mpsc::Sender<DupEvent>) -> anyhow::Result<()> {
431        std::thread::scope(|global_scope| {
432            let (it_rx, caches) = self.stream_file_items(global_scope)?;
433            let caches = &caches;
434            let pool = crate::build_thread_pool(self.jobs)?;
435            pool.scope(move |scope| -> anyhow::Result<()> {
436                let mut by_size: HashMap<u64, DupState> = HashMap::new();
437                let mut total = ProgressValue::default();
438                tx.send(DupEvent::StartHashing)?;
439                for (file, dir_index) in it_rx {
440                    let size = file.size();
441                    if size == 0 {
442                        continue;
443                    }
444                    let cache = &caches[dir_index];
445                    match by_size.entry(size) {
446                        std::collections::hash_map::Entry::Occupied(mut occ) => match occ.get_mut()
447                        {
448                            DupState::Single(file0, dir_index0) => {
449                                // We found a second file of identical size.
450                                // Time to start hashing both the *original* matching file and the *new* one!
451                                let cache0 = &caches[*dir_index0];
452                                self.send_hash(file0, cache0, &tx, scope);
453                                self.send_hash(&file, cache, &tx, scope);
454                                total += ProgressValue::with_size(file0.size());
455                                total += ProgressValue::with_size(file.size());
456
457                                // Modify the state to indicate we are now fully hashing this size bucket.
458                                *occ.get_mut() = DupState::Hashing;
459                            }
460                            DupState::Hashing => {
461                                // File size bucket already hashing; just dynamically spawn the new file immediately.
462                                self.send_hash(&file, cache, &tx, scope);
463                                total += ProgressValue::with_size(file.size());
464                            }
465                        },
466                        std::collections::hash_map::Entry::Vacant(vac) => {
467                            vac.insert(DupState::Single(file, dir_index));
468                        }
469                    }
470                }
471                tx.send(DupEvent::Total(total))?;
472                Ok(())
473            })?;
474            pool.install(|| caches.into_par_iter().try_for_each(|cache| cache.save()))?;
475            Ok::<(), anyhow::Error>(())
476        })?;
477        Ok(())
478    }
479
480    fn stream_file_items<'scope, 'env>(
481        &'env self,
482        scope: &'scope std::thread::Scope<'scope, 'env>,
483    ) -> anyhow::Result<(mpsc::Receiver<FileWithDirIndex>, Vec<Arc<FileHashCache>>)> {
484        let (it_tx, it_rx) = mpsc::channel();
485        let mut caches = Vec::with_capacity(self.dirs.len());
486        for (dir_index, dir) in self.dirs.iter().enumerate() {
487            let mut it = FileIterator::new(dir);
488            let cache = FileHashCache::find_or_new(dir);
489            it.cache = Some(Arc::clone(&cache));
490            it.exclude = self.exclude.as_ref();
491            let it_tx = it_tx.clone();
492            scope.spawn(move || it.send_to_as(it_tx, |path| (path, dir_index)));
493            caches.push(cache);
494        }
495        Ok((it_rx, caches))
496    }
497
498    fn send_hash<'scope>(
499        &'scope self,
500        file: &FileItem,
501        cache: &Arc<FileHashCache>,
502        tx: &mpsc::Sender<DupEvent>,
503        scope: &rayon::Scope<'scope>,
504    ) {
505        let (hash, relative) = self
506            .get_hash_from_cache(file, cache)
507            .expect("path should be in cache base_dir");
508        if let Some(hash) = hash {
509            let _ = tx.send(DupEvent::Result(file.clone(), hash));
510            return;
511        }
512
513        let file = file.clone();
514        let relative = relative.to_path_buf();
515        let tx = tx.clone();
516        let cache = Arc::clone(cache);
517        scope.spawn(move |_| {
518            if let Ok(hash) = self.compute_hash(&file) {
519                cache.insert(&relative, &file, hash);
520                let _ = tx.send(DupEvent::Result(file, hash));
521            } else {
522                log::error!("Failed to hash file: '{}'", file);
523                let _ = tx.send(DupEvent::Error);
524            }
525        });
526    }
527
528    /// Gets the hash of a file, using the cache if available.
529    pub fn get_hash(&self, file: &FileItem) -> anyhow::Result<blake3::Hash> {
530        let cache = self.cache.as_ref().expect("cache should be initialized");
531        let (hash, relative) = self.get_hash_from_cache(file, cache)?;
532        if let Some(hash) = hash {
533            return Ok(hash);
534        }
535
536        let hash = self.compute_hash(file)?;
537        cache.insert(relative, file, hash);
538        Ok(hash)
539    }
540
541    fn get_hash_from_cache<'a>(
542        &self,
543        file: &'a FileItem,
544        cache: &FileHashCache,
545    ) -> io::Result<(Option<blake3::Hash>, &'a Path)> {
546        let relative = file.relative_path(cache.base_dir());
547        if let Some(hash) = cache.get(relative, file) {
548            self.num_hash_looked_up
549                .fetch_add(1, atomic::Ordering::Relaxed);
550            return Ok((Some(hash), relative));
551        }
552        Ok((None, relative))
553    }
554
555    fn compute_hash(&self, file: &FileItem) -> io::Result<blake3::Hash> {
556        let start_time = time::Instant::now();
557        let mut f = fs::File::open(file.path())?;
558        let mut progress = self
559            .progress
560            .as_ref()
561            .map(|progress| progress.add_file(file.path(), file.size()))
562            .unwrap_or_else(Progress::none);
563        let mut hasher = blake3::Hasher::new();
564        if self.buffer_size == 0 {
565            if file.size() > 0 {
566                let mmap = unsafe { memmap2::MmapOptions::new().map(&f)? };
567                hasher.update(&mmap[..]);
568                progress.inc(ProgressValue::with_size(file.size()));
569            }
570        } else {
571            let mut buf = vec![0u8; self.buffer_size];
572            loop {
573                let n = f.read(&mut buf)?;
574                if n == 0 {
575                    break;
576                }
577                hasher.update(&buf[..n]);
578                progress.inc(ProgressValue::with_size(n as u64));
579            }
580        }
581        progress.finish();
582        self.num_hashed.fetch_add(1, atomic::Ordering::Relaxed);
583        let hash = hasher.finalize();
584        log::debug!(
585            "Computed hash in {}: '{}'",
586            FormattedDuration(start_time.elapsed()),
587            file
588        );
589        Ok(hash)
590    }
591}
592
/// A group of duplicated files and their size.
#[derive(Clone, Debug)]
pub struct DuplicatedFiles {
    /// Paths of all files in the group.
    pub paths: Vec<PathBuf>,
    /// Size in bytes of each file in the group.
    pub size: u64,
}
599
600impl DuplicatedFiles {
601    fn wasted_size(&self) -> u64 {
602        self.size * (self.paths.len() as u64 - 1)
603    }
604
605    fn print(&self, output_format: OutputFormat) -> anyhow::Result<()> {
606        match output_format {
607            OutputFormat::Default => self.write_human(stdout())?,
608            OutputFormat::PowerShell => self.write_pwsh(stdout())?,
609            OutputFormat::Shell => self.write_shell(stdout())?,
610            OutputFormat::Yaml | OutputFormat::Symbol => self.write_yaml(stdout())?,
611        }
612        Ok(())
613    }
614
615    fn write_human(&self, mut writer: impl io::Write) -> anyhow::Result<()> {
616        writeln!(
617            writer,
618            "Identical {} files of {}:",
619            self.paths.len(),
620            crate::human_readable_size(self.size)
621        )?;
622        for path in &self.paths {
623            writeln!(writer, "  {}", path.display())?;
624        }
625        Ok(())
626    }
627
628    fn write_yaml(&self, mut writer: impl io::Write) -> anyhow::Result<()> {
629        writeln!(writer, "- paths:")?;
630        for path in &self.paths {
631            writeln!(writer, "  - {:?}", path)?;
632        }
633        writeln!(writer, "  size: {}", self.size)?;
634        Ok(())
635    }
636
637    fn write_shell(&self, writer: impl io::Write) -> anyhow::Result<()> {
638        self.write_shell_with(writer, "cp", Self::escape_shell)
639    }
640
641    fn write_pwsh(&self, writer: impl io::Write) -> anyhow::Result<()> {
642        self.write_shell_with(writer, "Copy-Item -LiteralPath", Self::escape_shell_double)
643    }
644
645    fn write_shell_with(
646        &self,
647        mut writer: impl io::Write,
648        cmd: &str,
649        stringify: impl Fn(&Path) -> String,
650    ) -> anyhow::Result<()> {
651        let mut iter = self.paths.iter();
652        if let Some(path0) = iter.next() {
653            let path0 = stringify(path0);
654            for path in iter {
655                writeln!(writer, "{cmd} '{path0}' '{}'", stringify(path))?;
656            }
657        }
658        Ok(())
659    }
660
661    fn escape_shell(path: &Path) -> String {
662        path.to_string_lossy().replace('\'', "\'\\'\'")
663    }
664
665    fn escape_shell_double(path: &Path) -> String {
666        path.to_string_lossy().replace('\'', "\'\'")
667    }
668}
669
670#[cfg(test)]
671mod tests {
672    use super::*;
673    use std::cmp::Ordering;
674
675    fn default_exclude() -> globset::GlobSet {
676        let mut builder = globset::GlobSetBuilder::new();
677        builder.add(
678            globset::GlobBuilder::new(".hash_cache")
679                .case_insensitive(true)
680                .build()
681                .unwrap(),
682        );
683        builder.build().unwrap()
684    }
685
686    #[test]
687    fn find_duplicates() -> anyhow::Result<()> {
688        let dir = tempfile::tempdir()?;
689
690        let file1_path = dir.path().join("same1.txt");
691        fs::write(&file1_path, "same content")?;
692
693        let file2_path = dir.path().join("same2.txt");
694        fs::write(&file2_path, "same content")?;
695
696        let diff_path = dir.path().join("diff.txt");
697        fs::write(&diff_path, "different content")?;
698
699        let mut hasher = FileHasher::new(&[dir.path()])?;
700        hasher.buffer_size = 8192;
701        let duplicates = hasher.find_duplicates()?;
702
703        assert_eq!(hasher.num_hashed.load(atomic::Ordering::Relaxed), 2);
704        assert_eq!(hasher.num_hash_looked_up.load(atomic::Ordering::Relaxed), 0);
705
706        assert_eq!(duplicates.len(), 1);
707        let group = &duplicates[0];
708        assert_eq!(group.paths.len(), 2);
709        assert_eq!(group.size, 12); // "same content" is 12 bytes
710
711        assert!(group.paths.contains(&file1_path));
712        assert!(group.paths.contains(&file2_path));
713
714        Ok(())
715    }
716
717    #[test]
718    fn find_duplicates_merge_cache() -> anyhow::Result<()> {
719        let dir = tempfile::tempdir()?;
720        let dir_path = dir.path();
721
722        let sub_dir = dir_path.join("a").join("a");
723        fs::create_dir_all(&sub_dir)?;
724
725        let file1_path = sub_dir.join("1");
726        fs::write(&file1_path, "same content")?;
727
728        let file2_path = sub_dir.join("2");
729        fs::write(&file2_path, "same content")?;
730
731        // Create empty cache file in a/a to force it to be the cache base
732        let cache_aa_path = sub_dir.join(FileHashCache::FILE_NAME);
733        fs::File::create(&cache_aa_path)?;
734
735        // Run find_duplicates on a/a
736        let hasher_aa = FileHasher::new(&[&sub_dir])?;
737        let duplicates_aa = hasher_aa.find_duplicates()?;
738        assert_eq!(duplicates_aa.len(), 1);
739        assert!(cache_aa_path.exists());
740        assert_eq!(hasher_aa.num_hashed.load(atomic::Ordering::Relaxed), 2);
741        assert_eq!(
742            hasher_aa.num_hash_looked_up.load(atomic::Ordering::Relaxed),
743            0
744        );
745
746        // Create empty cache file in a to force it to be the cache base
747        let root_a = dir_path.join("a");
748        let cache_a_path = root_a.join(FileHashCache::FILE_NAME);
749        fs::File::create(&cache_a_path)?;
750
751        // Run find_duplicates on a
752        let hasher_a = FileHasher::new(&[&root_a])?;
753        let duplicates_a = hasher_a.find_duplicates()?;
754        assert_eq!(duplicates_a.len(), 1);
755        assert_eq!(hasher_a.num_hashed.load(atomic::Ordering::Relaxed), 0);
756        assert_eq!(
757            hasher_a.num_hash_looked_up.load(atomic::Ordering::Relaxed),
758            2
759        );
760
761        // The merged child cache should be removed.
762        assert!(cache_a_path.exists());
763        assert!(!cache_aa_path.exists());
764
765        Ok(())
766    }
767
768    #[test]
769    fn find_duplicates_with_exclude() -> anyhow::Result<()> {
770        let dir = tempfile::tempdir()?;
771
772        let file1_path = dir.path().join("same1.txt");
773        fs::write(&file1_path, "same content")?;
774
775        let file2_path = dir.path().join("same2.txt");
776        fs::write(&file2_path, "same content")?;
777
778        let exclude_path = dir.path().join("exclude.txt");
779        fs::write(&exclude_path, "same content")?;
780
781        let mut hasher = FileHasher::new(&[dir.path()])?;
782        hasher.buffer_size = 8192;
783        let mut builder = globset::GlobSetBuilder::new();
784        builder.add(
785            globset::GlobBuilder::new("exclude.txt")
786                .case_insensitive(true)
787                .build()?,
788        );
789        let filter = builder.build()?;
790        hasher.exclude = Some(filter);
791
792        let duplicates = hasher.find_duplicates()?;
793        assert_eq!(duplicates.len(), 1);
794        let group = &duplicates[0];
795        assert_eq!(group.paths.len(), 2);
796        assert!(group.paths.contains(&file1_path));
797        assert!(group.paths.contains(&file2_path));
798        assert!(!group.paths.contains(&exclude_path));
799        Ok(())
800    }
801
    /// Test helper that drains a `CheckEvent` channel and records what it saw.
    #[derive(Default)]
    struct CheckCollector {
        /// Whether a `StartChecking` event was received.
        start_seen: bool,
        /// File count taken from the `Total` event, if one was received.
        total_files: Option<u64>,
        /// Every `Result` event's payload, with its path made relative to the base directory.
        results: Vec<FileComparisonResult>,
        /// Sum of the file counts carried by `Progress` events.
        file_done_count: u64,
        /// Number of `Error` events received.
        num_error: usize,
    }
810
811    impl CheckCollector {
812        fn collect(rx: mpsc::Receiver<CheckEvent>, base_dir: &Path) -> Self {
813            let mut collector = Self::default();
814            collector._collect(rx, base_dir);
815            collector
816        }
817
818        fn _collect(&mut self, rx: mpsc::Receiver<CheckEvent>, base_dir: &Path) {
819            while let Ok(event) = rx.recv() {
820                match event {
821                    CheckEvent::StartChecking => self.start_seen = true,
822                    CheckEvent::Total(total) => self.total_files = Some(total.num_files),
823                    CheckEvent::Result(mut result, _size) => {
824                        result.relative_path = result
825                            .relative_path
826                            .strip_prefix(base_dir)
827                            .unwrap()
828                            .to_path_buf();
829                        self.results.push(result);
830                    }
831                    CheckEvent::Progress(progress_val) => {
832                        self.file_done_count += progress_val.num_files;
833                    }
834                    CheckEvent::Error(_) => {
835                        self.num_error += 1;
836                    }
837                }
838            }
839        }
840    }
841
/// Checks a freshly created directory that contains two files and no cache.
///
/// After `check_streaming(tx, false)`, both files must be reported as
/// `Classification::OnlyInDir2`, the accumulated `Progress` file count must
/// stay at 0, no errors may be reported, and no cache file may exist in the
/// directory afterwards.
#[test]
fn check_mode_empty_cache() -> anyhow::Result<()> {
    let dir = tempfile::tempdir()?;
    let dir_path = dir.path().to_path_buf();
    let file1_path = dir.path().join("file1.txt");
    fs::write(&file1_path, "content 1")?;
    let file2_path = dir.path().join("file2.txt");
    fs::write(&file2_path, "content 2")?;

    let mut hasher = FileHasher::new(&[&dir_path])?;
    hasher.exclude = Some(default_exclude());
    let (tx, rx) = mpsc::channel();
    hasher.check_streaming(tx, false)?;
    let collector = CheckCollector::collect(rx, &dir_path);
    assert!(collector.start_seen);
    assert_eq!(collector.total_files, Some(2));
    assert_eq!(collector.file_done_count, 0);
    assert_eq!(collector.num_error, 0);

    // Sort so the assertions do not depend on the order events arrived in.
    let mut results = collector.results;
    results.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
    assert_eq!(results.len(), 2);
    assert_eq!(results[0].relative_path, Path::new("file1.txt"));
    assert_eq!(results[0].classification, Classification::OnlyInDir2);
    assert_eq!(results[1].relative_path, Path::new("file2.txt"));
    assert_eq!(results[1].classification, Classification::OnlyInDir2);

    assert!(!dir.path().join(FileHashCache::FILE_NAME).exists());
    Ok(())
}
873
/// Exercises `check_streaming(tx, false)` against a cache saved beforehand.
///
/// Right after the cache is saved, the check yields no results and a
/// progress count of 2. Then `file1.txt` gets different content while
/// `file2.txt` is rewritten with the same content (only its mtime moves
/// forward); the next check must yield exactly one result, for `file1.txt`,
/// and a progress count of 1.
#[test]
fn check_mode_with_cache() -> anyhow::Result<()> {
    let tmp = tempfile::tempdir()?;
    let root = tmp.path().to_path_buf();
    let path_a = tmp.path().join("file1.txt");
    let path_b = tmp.path().join("file2.txt");
    fs::write(&path_a, "content 1")?;
    fs::write(&path_b, "content 2")?;
    let item_a = FileItem::try_from(path_a.as_path())?;
    let item_b = FileItem::try_from(path_b.as_path())?;

    // Phase 1: hash both files through a cache-backed hasher and persist it.
    let mut seeder = FileHasher::new_with_cache(&[&root])?;
    seeder.exclude = Some(default_exclude());
    let _hash_a = seeder.get_hash(&item_a)?;
    let _hash_b = seeder.get_hash(&item_b)?;
    seeder.save_cache()?;
    assert!(tmp.path().join(FileHashCache::FILE_NAME).exists());

    // Phase 2: nothing changed on disk, so no results are expected.
    let mut clean_check = FileHasher::new(&[&root])?;
    clean_check.exclude = Some(default_exclude());
    let (tx, rx) = mpsc::channel();
    clean_check.check_streaming(tx, false)?;
    let clean = CheckCollector::collect(rx, &root);
    assert_eq!(clean.results.len(), 0);
    assert_eq!(clean.file_done_count, 2);

    // Phase 3: change file1's content; touch file2 without changing content.
    fs::write(&path_a, "content 1 modified")?;

    let mtime_before = fs::metadata(&path_b)?.modified()?;
    std::thread::sleep(time::Duration::from_millis(10));
    fs::write(&path_b, "content 2")?;
    let mtime_after = fs::metadata(&path_b)?.modified()?;
    assert!(mtime_after > mtime_before);

    let mut dirty_check = FileHasher::new(&[&root])?;
    dirty_check.exclude = Some(default_exclude());
    let (tx, rx) = mpsc::channel();
    dirty_check.check_streaming(tx, false)?;
    let dirty = CheckCollector::collect(rx, &root);
    assert_eq!(dirty.results.len(), 1);

    let changed = &dirty.results[0];
    assert_eq!(changed.relative_path, Path::new("file1.txt"));
    assert_eq!(changed.modified_time_comparison, Some(Ordering::Less));
    assert_eq!(changed.size_comparison, Some(Ordering::Less));
    assert_eq!(changed.is_content_same, None);
    assert_eq!(dirty.file_done_count, 1);
    Ok(())
}
924
/// Runs `check_streaming(tx, true)` plus `save_cache` three times and
/// inspects a freshly loaded `FileHashCache` after each pass.
///
/// Pass 1 must store an entry for `file1.txt`. Pass 2, after the content
/// changed, must store a different entry. Before pass 3 the file is rewritten
/// with the same content but a newer mtime: the previously loaded cache has no
/// entry for that newer file state, and the cache saved by pass 3 does.
#[test]
fn check_update_mode() -> anyhow::Result<()> {
    let tmp = tempfile::tempdir()?;
    let root = tmp.path().to_path_buf();
    let target = tmp.path().join("file1.txt");
    // Key under which the file is looked up in the cache.
    let rel = PathBuf::from("file1.txt");
    fs::write(&target, "content 1")?;

    // Pass 1: initial content.
    let mut pass1 = FileHasher::new(&[&root])?;
    pass1.exclude = Some(default_exclude());
    let (tx, rx) = mpsc::channel();
    pass1.check_streaming(tx, true)?;
    let _ = CheckCollector::collect(rx, &root);
    pass1.save_cache()?;
    assert!(tmp.path().join(FileHashCache::FILE_NAME).exists());

    let cache_after_pass1 = FileHashCache::new(&root);
    let initial = FileItem::try_from(target.as_path())?;
    let initial_hash = cache_after_pass1.get(&rel, &initial);
    assert!(initial_hash.is_some());

    // Pass 2: content changed.
    std::thread::sleep(time::Duration::from_millis(10));
    fs::write(&target, "content 1 modified")?;
    let modified = FileItem::try_from(target.as_path())?;

    let mut pass2 = FileHasher::new(&[&root])?;
    pass2.exclude = Some(default_exclude());
    let (tx, rx) = mpsc::channel();
    pass2.check_streaming(tx, true)?;
    let _ = CheckCollector::collect(rx, &root);
    pass2.save_cache()?;

    let cache_after_pass2 = FileHashCache::new(&root);
    let modified_hash = cache_after_pass2.get(&rel, &modified);
    assert!(modified_hash.is_some());
    assert_ne!(initial_hash, modified_hash);

    // Pass 3: same content written again, only the mtime moves forward.
    std::thread::sleep(time::Duration::from_millis(10));
    fs::write(&target, "content 1 modified")?;
    let touched = FileItem::try_from(target.as_path())?;
    assert!(touched.modified() > modified.modified());

    // The cache loaded after pass 2 has no entry for the touched file state.
    assert!(cache_after_pass2.get(&rel, &touched).is_none());

    let mut pass3 = FileHasher::new(&[&root])?;
    pass3.exclude = Some(default_exclude());
    let (tx, rx) = mpsc::channel();
    pass3.check_streaming(tx, true)?;
    let _ = CheckCollector::collect(rx, &root);
    pass3.save_cache()?;

    let cache_after_pass3 = FileHashCache::new(&root);
    assert!(cache_after_pass3.get(&rel, &touched).is_some());
    Ok(())
}
987
/// After a file is deleted from disk, the next `check_streaming(tx, true)`
/// plus `save_cache` must leave no cache entry for it while keeping the entry
/// of the file that still exists.
#[test]
fn check_cleanup_deleted_files() -> anyhow::Result<()> {
    let tmp = tempfile::tempdir()?;
    let root = tmp.path().to_path_buf();
    let kept_path = tmp.path().join("file1.txt");
    let doomed_path = tmp.path().join("file2.txt");
    fs::write(&kept_path, "content 1")?;
    fs::write(&doomed_path, "content 2")?;
    let kept = FileItem::try_from(kept_path.as_path())?;
    let doomed = FileItem::try_from(doomed_path.as_path())?;

    // Cache lookup keys.
    let kept_key = PathBuf::from("file1.txt");
    let doomed_key = PathBuf::from("file2.txt");

    let mut first_run = FileHasher::new(&[&root])?;
    first_run.exclude = Some(default_exclude());
    let (tx, rx) = mpsc::channel();
    first_run.check_streaming(tx, true)?;
    let _ = CheckCollector::collect(rx, &root);
    first_run.save_cache()?;

    // Both files are cached after the first run.
    let before = FileHashCache::new(&root);
    assert!(before.get(&kept_key, &kept).is_some());
    assert!(before.get(&doomed_key, &doomed).is_some());

    // Delete one file, then check and save again.
    fs::remove_file(&doomed_path)?;

    let mut second_run = FileHasher::new(&[&root])?;
    second_run.exclude = Some(default_exclude());
    let (tx, rx) = mpsc::channel();
    second_run.check_streaming(tx, true)?;
    let _ = CheckCollector::collect(rx, &root);
    second_run.save_cache()?;

    // The deleted file's entry is gone; the surviving file's entry remains.
    let after = FileHashCache::new(&root);
    assert!(after.get(&doomed_key, &doomed).is_none());
    assert!(after.get(&kept_key, &kept).is_some());
    Ok(())
}
1028
/// Two files with identical content living in two different directories
/// passed to one `FileHasher` must be reported as a single duplicate group.
#[test]
fn find_duplicates_multiple_dirs() -> anyhow::Result<()> {
    let tmp = tempfile::tempdir()?;
    let left_dir = tmp.path().join("dir1");
    let right_dir = tmp.path().join("dir2");
    fs::create_dir(&left_dir)?;
    fs::create_dir(&right_dir)?;

    let left_file = left_dir.join("file1.txt");
    fs::write(&left_file, "same content")?;
    let right_file = right_dir.join("file2.txt");
    fs::write(&right_file, "same content")?;

    let hasher = FileHasher::new(&[&left_dir, &right_dir])?;
    let duplicates = hasher.find_duplicates()?;
    assert_eq!(duplicates.len(), 1);

    let only_group = &duplicates[0];
    assert_eq!(only_group.paths.len(), 2);
    // "same content" is 12 bytes long.
    assert_eq!(only_group.size, 12);
    assert!(only_group.paths.contains(&left_file));
    assert!(only_group.paths.contains(&right_file));

    Ok(())
}
1051
/// `check` must return an error when the hasher was built from two directories.
#[test]
fn check_fails_with_multiple_dirs() -> anyhow::Result<()> {
    let tmp = tempfile::tempdir()?;
    let first = tmp.path().join("dir1");
    let second = tmp.path().join("dir2");
    for dir in [&first, &second] {
        fs::create_dir(dir)?;
    }

    let hasher = FileHasher::new(&[&first, &second])?;
    let outcome = hasher.check(false);
    assert!(outcome.is_err());
    Ok(())
}
1063
/// Pins the quoting rules of `DuplicatedFiles::escape_shell` and
/// `DuplicatedFiles::escape_shell_double` for empty, plain, and
/// single-quote-containing paths.
#[test]
fn escape_shell() {
    // (input, expected) pairs: each `'` becomes `'\''`.
    let single_cases = [
        ("", ""),
        ("abc", "abc"),
        ("a'b", "a'\\''b"),
        ("a'b'", "a'\\''b'\\''"),
    ];
    for (input, expected) in single_cases {
        assert_eq!(DuplicatedFiles::escape_shell(Path::new(input)), expected);
    }

    // (input, expected) pairs: each `'` becomes `''`.
    let double_cases = [
        ("", ""),
        ("abc", "abc"),
        ("a'b", "a''b"),
        ("a'b'", "a''b''"),
    ];
    for (input, expected) in double_cases {
        assert_eq!(
            DuplicatedFiles::escape_shell_double(Path::new(input)),
            expected
        );
    }
}
1078
/// A group without any path writes nothing to the shell script.
#[test]
fn write_dups_shell_empty() -> anyhow::Result<()> {
    let group = DuplicatedFiles {
        paths: Vec::new(),
        size: 100,
    };

    let mut out = Vec::new();
    group.write_shell(&mut out)?;

    let script = String::from_utf8(out)?;
    assert_eq!(script, "");
    Ok(())
}
1090
/// A group with a single path writes nothing to the shell script.
#[test]
fn write_dups_shell_one() -> anyhow::Result<()> {
    let group = DuplicatedFiles {
        paths: vec![PathBuf::from("a.txt")],
        size: 100,
    };

    let mut out = Vec::new();
    group.write_shell(&mut out)?;

    let script = String::from_utf8(out)?;
    assert_eq!(script, "");
    Ok(())
}
1102
/// A group with two paths writes one `cp` line from the first path to the second.
#[test]
fn write_dups_shell_two() -> anyhow::Result<()> {
    let paths = vec![PathBuf::from("a.txt"), PathBuf::from("b.txt")];
    let group = DuplicatedFiles { paths, size: 100 };

    let mut out = Vec::new();
    group.write_shell(&mut out)?;

    let script = String::from_utf8(out)?;
    assert_eq!(script, "cp 'a.txt' 'b.txt'\n");
    Ok(())
}
1114
/// A group with three paths writes one `cp` line per additional path, each
/// copying from the first path.
#[test]
fn write_dups_shell_three() -> anyhow::Result<()> {
    let paths = vec![
        PathBuf::from("a.txt"),
        PathBuf::from("b.txt"),
        PathBuf::from("c.txt"),
    ];
    let group = DuplicatedFiles { paths, size: 100 };

    let mut out = Vec::new();
    group.write_shell(&mut out)?;

    let script = String::from_utf8(out)?;
    assert_eq!(script, "cp 'a.txt' 'b.txt'\ncp 'a.txt' 'c.txt'\n");
    Ok(())
}
1133
/// Paths containing single quotes are escaped in both script flavors:
/// `'\''` in the `write_shell` output and `''` in the `write_pwsh` output.
#[test]
fn write_dups_shell_quotes() -> anyhow::Result<()> {
    let paths = vec![PathBuf::from("a'b.txt"), PathBuf::from("c'd.txt")];
    let group = DuplicatedFiles { paths, size: 100 };

    let mut shell_out = Vec::new();
    group.write_shell(&mut shell_out)?;
    let shell_script = String::from_utf8(shell_out)?;
    assert_eq!(shell_script, "cp 'a'\\''b.txt' 'c'\\''d.txt'\n");

    let mut pwsh_out = Vec::new();
    group.write_pwsh(&mut pwsh_out)?;
    let pwsh_script = String::from_utf8(pwsh_out)?;
    assert_eq!(
        pwsh_script,
        "Copy-Item -LiteralPath 'a''b.txt' 'c''d.txt'\n"
    );
    Ok(())
}
1152}