Skip to main content

compare_dir/
file_hasher.rs

1use crate::{
2    ColumnFormatter, DirectoryComparer, FileComparer, FileHashCache, FileItem, FileIterator,
3    OutputFormat, Progress, ProgressBuilder,
4};
5use globset::GlobSet;
6use indicatif::FormattedDuration;
7use rayon::prelude::*;
8use simple_path::SimplePath;
9use std::{
10    collections::HashMap,
11    fs,
12    io::{self, Read, stdout},
13    path::{Path, PathBuf},
14    sync::{
15        Arc,
16        atomic::{AtomicUsize, Ordering},
17        mpsc,
18    },
19    time,
20};
21
/// A scanned file paired with the index (into `FileHasher::dirs`) of the
/// directory whose scan produced it.
type FileWithDirIndex = (FileItem, usize);
23
/// Events sent from `find_duplicates_streaming` to the consumer loop in
/// `find_duplicates`.
#[derive(Debug, Clone)]
enum DupEvent {
    /// Sent once, before the scanned files start being consumed.
    StartHashing,
    /// Number of files that were submitted for hashing; sent after the scan
    /// has been fully consumed.
    NumFiles(usize),
    /// A file together with its hash (either looked up in the cache or freshly computed).
    Result(FileItem, blake3::Hash),
    /// Hashing a file failed; the details are logged by the sender.
    Error,
}
31
/// Result of comparing a file's freshly computed hash with the cached one
/// (see `check_file`).
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
enum CheckStatus {
    /// The cache holds a hash for the path and it equals the computed hash.
    Unchanged,
    /// The cache holds no hash for the path.
    New,
    /// The cache holds a hash for the path and it differs from the computed hash.
    Modified,
}
38
/// Events sent from `check_streaming` to the consumer loop in `check`.
#[derive(Debug, PartialEq)]
enum CheckEvent {
    /// Sent once, after the directory scan has been started.
    StartChecking,
    /// Total number of files found by the scan; sent after every file has
    /// been queued, so it may arrive after some per-file events.
    TotalFiles(usize),
    /// A file that is `New` or `Modified`, with its path relative to the checked directory.
    Result(PathBuf, CheckStatus),
    /// A file was checked and found `Unchanged`.
    FileDone,
    /// Checking a file failed; the details are logged by the sender.
    Error,
}
47
/// Per-file-size bookkeeping used by `find_duplicates_streaming`.
enum DupState {
    /// Exactly one file of this size has been seen so far; it is kept (with
    /// its directory index) unhashed until a second file of the same size appears.
    Single(FileItem, usize),
    /// At least two files of this size have been seen; every file of this
    /// size is now sent for hashing as soon as it is encountered.
    Hashing,
}
52
/// A tool for finding duplicated files in a directory.
///
/// Also provides a check mode (`check`) that compares file hashes against
/// the hash cache.
pub struct FileHasher {
    /// Directories to scan; `new` guarantees at least one entry.
    dirs: Vec<PathBuf>,
    /// Read buffer size in bytes used by `compute_hash`; `0` selects
    /// memory-mapped hashing instead of buffered reads.
    pub buffer_size: usize,
    /// Shared hash cache, created eagerly by `new_with_cache` or lazily by `cache()`.
    cache: Option<Arc<FileHashCache>>,
    /// Number of hashes computed from file contents.
    num_hashed: AtomicUsize,
    /// Number of hashes served from the cache.
    num_hash_looked_up: AtomicUsize,
    /// Optional glob set handed to the file iterators as their exclude filter.
    pub exclude: Option<GlobSet>,
    /// Optional progress reporter factory; when `None`, a no-op progress is used.
    pub progress: Option<Arc<ProgressBuilder>>,
    /// Format used when printing results.
    pub output_format: OutputFormat,
    /// Job count passed to `crate::build_thread_pool`.
    pub jobs: usize,
}
65
66impl FileHasher {
67    const DEFAULT_JOBS: usize = DirectoryComparer::DEFAULT_JOBS;
68
69    /// Creates a new `FileHasher` for the given directories.
70    pub fn new<P: AsRef<Path>>(dirs: &[P]) -> anyhow::Result<Self> {
71        if dirs.is_empty() {
72            anyhow::bail!("At least one directory must be specified.");
73        }
74        Ok(Self {
75            dirs: dirs.iter().map(|p| p.as_ref().to_path_buf()).collect(),
76            buffer_size: FileComparer::DEFAULT_BUFFER_SIZE,
77            cache: None,
78            num_hashed: AtomicUsize::new(0),
79            num_hash_looked_up: AtomicUsize::new(0),
80            exclude: None,
81            progress: None,
82            output_format: OutputFormat::Default,
83            jobs: Self::DEFAULT_JOBS,
84        })
85    }
86
87    pub(crate) fn new_with_cache<P: AsRef<Path>>(dirs: &[P]) -> anyhow::Result<Self> {
88        let mut hasher = Self::new(dirs)?;
89        hasher.cache = Some(hasher.new_cache()?);
90        Ok(hasher)
91    }
92
93    fn new_cache(&self) -> anyhow::Result<Arc<FileHashCache>> {
94        let common_ancestor = crate::common_ancestor(&self.dirs)
95            .ok_or_else(|| anyhow::anyhow!("No common ancestor found"))?;
96        Ok(FileHashCache::find_or_new(&common_ancestor))
97    }
98
99    /// Gets the hash cache.
100    pub(crate) fn cache(&mut self) -> anyhow::Result<Arc<FileHashCache>> {
101        if self.cache.is_none() {
102            self.cache = Some(self.new_cache()?);
103        }
104        Ok(Arc::clone(self.cache.as_ref().unwrap()))
105    }
106
107    /// Remove a cache entry if it exists.
108    pub(crate) fn remove_cache_entry(&mut self, path: &Path) -> anyhow::Result<()> {
109        let cache = self.cache()?;
110        let relative = SimplePath::strip_prefix(path, cache.base_dir())?;
111        cache.remove(relative);
112        Ok(())
113    }
114
115    /// Save the hash cache if it is dirty.
116    pub fn save_cache(&self) -> anyhow::Result<()> {
117        log::info!(
118            "Hash stats for {:?}: {} computed, {} looked up",
119            self.dirs,
120            self.num_hashed.load(Ordering::Relaxed),
121            self.num_hash_looked_up.load(Ordering::Relaxed)
122        );
123        if let Some(cache) = &self.cache {
124            cache.save()?;
125        }
126        Ok(())
127    }
128
129    /// Clears the loaded hashes in the cache.
130    pub(crate) fn clear_cache(&mut self) -> anyhow::Result<()> {
131        let cache = self.cache()?;
132        for dir in &self.dirs {
133            let relative = SimplePath::strip_prefix(dir, cache.base_dir())?;
134            cache.clear(relative);
135        }
136        Ok(())
137    }
138
139    /// Executes the check/update process.
140    pub fn check(&self, update: bool) -> anyhow::Result<()> {
141        match self.output_format {
142            OutputFormat::Default | OutputFormat::Symbol => {}
143            _ => anyhow::bail!("Check mode only supports default or symbol output format."),
144        }
145        if self.dirs.len() > 1 {
146            anyhow::bail!("Check mode only supports one directory.");
147        }
148        let start_time = time::Instant::now();
149        let progress = self
150            .progress
151            .as_ref()
152            .map(|progress| progress.add_spinner())
153            .unwrap_or_else(Progress::none);
154        progress.set_message("Scanning directory...");
155        let mut num_new = 0;
156        let mut num_modified = 0;
157        let mut num_error = 0;
158        std::thread::scope(|scope| {
159            let (tx, rx) = mpsc::channel();
160            scope.spawn(|| {
161                if let Err(e) = self.check_streaming(tx, update) {
162                    log::error!("Error during check: {}", e);
163                }
164            });
165            while let Ok(event) = rx.recv() {
166                match event {
167                    CheckEvent::StartChecking => {
168                        progress.set_message("Checking files...");
169                    }
170                    CheckEvent::TotalFiles(total) => {
171                        progress.set_length(total as u64);
172                        progress.set_message("");
173                    }
174                    CheckEvent::Result(path, status) => {
175                        let symbol = match status {
176                            CheckStatus::New => {
177                                num_new += 1;
178                                '+'
179                            }
180                            CheckStatus::Modified => {
181                                num_modified += 1;
182                                '!'
183                            }
184                            CheckStatus::Unchanged => unreachable!(),
185                        };
186                        progress.inc(1);
187                        progress.suspend_for(stdout(), || {
188                            println!("{} {}", symbol, path.display());
189                        });
190                    }
191                    CheckEvent::FileDone => {
192                        progress.inc(1);
193                    }
194                    CheckEvent::Error => {
195                        progress.inc(1);
196                        num_error += 1;
197                    }
198                }
199            }
200        });
201        progress.finish();
202        self.print_check_summary(&start_time, num_new, num_modified, num_error)?;
203        Ok(())
204    }
205
206    fn print_check_summary(
207        &self,
208        start_time: &time::Instant,
209        num_new: usize,
210        num_modified: usize,
211        num_error: usize,
212    ) -> io::Result<()> {
213        let summary = [
214            ("Elapsed:", 0),
215            ("Hash computed:", self.num_hashed.load(Ordering::Relaxed)),
216            ("New files:", num_new),
217            ("Modified files:", num_modified),
218            ("Errors:", num_error),
219        ];
220        let formatter = ColumnFormatter::new(summary.iter().map(|(s, _)| *s));
221        let mut writer = std::io::stderr();
222        formatter.write_value(
223            &mut writer,
224            summary[0].0,
225            FormattedDuration(start_time.elapsed()),
226        )?;
227        formatter.write_values(&mut writer, &summary[1..])
228    }
229
    /// Scans the single configured directory, checks every file against the
    /// hash cache on a thread pool, and reports progress and results through `tx`.
    ///
    /// Event order: `StartChecking` first, then one `Result`/`FileDone`/`Error`
    /// per file as the checks complete, with `TotalFiles` sent once all files
    /// have been queued (so it can arrive between per-file events).
    /// The cache is saved after all checks have finished.
    ///
    /// # Panics
    ///
    /// Panics if `self.dirs` does not contain exactly one directory.
    fn check_streaming(&self, tx: mpsc::Sender<CheckEvent>, update: bool) -> anyhow::Result<()> {
        assert_eq!(self.dirs.len(), 1);
        // A fresh cache handle is created here; `self.cache` is not consulted.
        let cache = self.new_cache()?;
        let base_dir = &self.dirs[0];
        let relative = SimplePath::strip_prefix(base_dir, cache.base_dir())?;
        // NOTE(review): presumably marks entries under `relative` for removal
        // when they are not accessed during this run — confirm in `FileHashCache`.
        cache.set_remove_if_no_access(relative);
        // `cache` is moved into the pool closure below; keep a handle for the final save.
        let cache_clone = Arc::clone(&cache);
        std::thread::scope(|global_scope| {
            let mut it = FileIterator::new(base_dir);
            it.cache = Some(Arc::clone(&cache));
            it.exclude = self.exclude.as_ref();
            let it_rx = it.spawn_in_scope(global_scope);
            tx.send(CheckEvent::StartChecking)?;
            let pool = crate::build_thread_pool(self.jobs)?;
            pool.scope(move |scope| -> anyhow::Result<()> {
                let mut total_files = 0;
                for file in it_rx {
                    total_files += 1;
                    // Each task gets its own sender and cache handle.
                    let tx = tx.clone();
                    let cache = Arc::clone(&cache);
                    scope.spawn(move |_| {
                        let status = self.check_file(&file, &cache, update);
                        let event = match status {
                            Ok(CheckStatus::New) | Ok(CheckStatus::Modified) => {
                                // Reported paths are relative to the checked directory.
                                let rel_path = file.relative_path(base_dir);
                                CheckEvent::Result(rel_path.into(), status.unwrap())
                            }
                            Ok(CheckStatus::Unchanged) => CheckEvent::FileDone,
                            Err(e) => {
                                log::error!("Failed to check file '{}': {}", file, e);
                                CheckEvent::Error
                            }
                        };
                        if tx.send(event).is_err() {
                            log::error!("Send failed");
                        }
                    });
                }
                // The scan is exhausted, so the total is now known.
                tx.send(CheckEvent::TotalFiles(total_files))?;
                Ok(())
            })
        })?;
        cache_clone.save()?;
        Ok(())
    }
275
276    fn check_file(
277        &self,
278        file: &FileItem,
279        cache: &FileHashCache,
280        update: bool,
281    ) -> anyhow::Result<CheckStatus> {
282        assert!(file.path().is_absolute());
283        let computed_hash = self.compute_hash(file)?;
284        let rel_path = file.relative_path(cache.base_dir());
285        let cached_hash = cache.get_by_path(rel_path);
286        let status = match cached_hash {
287            None => CheckStatus::New,
288            Some(cached) => {
289                if computed_hash != cached {
290                    CheckStatus::Modified
291                } else {
292                    CheckStatus::Unchanged
293                }
294            }
295        };
296        if update {
297            let modified = file.modified();
298            match status {
299                CheckStatus::New | CheckStatus::Modified => {
300                    cache.insert(rel_path, modified, computed_hash);
301                }
302                CheckStatus::Unchanged => {
303                    if cache.get(rel_path, modified).is_none() {
304                        cache.insert(rel_path, modified, computed_hash);
305                    }
306                }
307            }
308        }
309        Ok(status)
310    }
311
312    /// Executes the duplicate file finding process and prints results.
313    pub fn run(&self) -> anyhow::Result<()> {
314        let start_time = time::Instant::now();
315        let mut duplicates = self.find_duplicates()?;
316        let mut total_wasted_space = 0;
317        if !duplicates.is_empty() {
318            duplicates.sort_by_key(|a| a.size);
319            total_wasted_space = self.print_duplicates_results(&duplicates)?;
320        }
321        self.print_duplicates_summary(&start_time, total_wasted_space)?;
322        Ok(())
323    }
324
325    fn print_duplicates_results(&self, duplicates: &Vec<DuplicatedFiles>) -> anyhow::Result<u64> {
326        let mut total_wasted_space = 0;
327        for dupes in duplicates {
328            dupes.print(self.output_format)?;
329            total_wasted_space += dupes.wasted_size();
330        }
331        Ok(total_wasted_space)
332    }
333
334    fn print_duplicates_summary(
335        &self,
336        start_time: &time::Instant,
337        total_wasted_space: u64,
338    ) -> io::Result<()> {
339        let elapsed = FormattedDuration(start_time.elapsed()).to_string();
340        let num_hashed = self.num_hashed.load(Ordering::Relaxed).to_string();
341        let total_wasted_space = crate::human_readable_size(total_wasted_space);
342        let summary = [
343            ("Elapsed:", elapsed),
344            ("Hash computed:", num_hashed),
345            ("Total wasted space:", total_wasted_space),
346        ];
347        let formatter = ColumnFormatter::new(summary.iter().map(|(s, _)| *s));
348        formatter.write_values(&mut io::stderr(), &summary)
349    }
350
351    /// Finds duplicated files and returns a list of duplicate groups.
352    pub fn find_duplicates(&self) -> anyhow::Result<Vec<DuplicatedFiles>> {
353        let progress = self
354            .progress
355            .as_ref()
356            .map(|progress| progress.add_spinner())
357            .unwrap_or_else(Progress::none);
358        progress.set_message("Scanning directories...");
359
360        let (tx, rx) = mpsc::channel();
361        let mut by_hash: HashMap<blake3::Hash, DuplicatedFiles> = HashMap::new();
362        std::thread::scope(|scope| {
363            scope.spawn(|| {
364                if let Err(e) = self.find_duplicates_streaming(tx) {
365                    log::error!("Error during duplicate finding: {}", e);
366                }
367            });
368
369            while let Ok(event) = rx.recv() {
370                match event {
371                    DupEvent::StartHashing => progress.set_message("Hashing files..."),
372                    DupEvent::NumFiles(num) => progress.set_length(num as u64),
373                    DupEvent::Result(file, hash) => {
374                        progress.inc(1);
375                        let entry = by_hash.entry(hash).or_insert_with(|| DuplicatedFiles {
376                            paths: Vec::new(),
377                            size: file.size(),
378                        });
379                        // Hash collisions shouldn't happen, but if they do, sizes shouldn't mismatch.
380                        assert_eq!(
381                            entry.size,
382                            file.size(),
383                            "Hash collision: sizes do not match"
384                        );
385                        entry.paths.push(file.into_path_buf());
386                    }
387                    DupEvent::Error => progress.inc(1),
388                }
389            }
390        });
391        progress.finish();
392
393        let mut duplicates = Vec::new();
394        for (_, mut dupes) in by_hash {
395            if dupes.paths.len() > 1 {
396                dupes.paths.sort();
397                duplicates.push(dupes);
398            }
399        }
400        Ok(duplicates)
401    }
402
    /// Scans all configured directories and sends a `DupEvent::Result` for
    /// every file that shares its size with at least one other file.
    ///
    /// Files are bucketed by size as they arrive from the scanners. The
    /// first file of a size is only remembered; once a second file of that
    /// size shows up, both are hashed, and every later file of that size is
    /// hashed immediately. Empty files are skipped. After the scan is
    /// exhausted, `DupEvent::NumFiles` carries the number of files submitted
    /// for hashing, and all per-directory caches are saved once hashing has
    /// completed.
    fn find_duplicates_streaming(&self, tx: mpsc::Sender<DupEvent>) -> anyhow::Result<()> {
        std::thread::scope(|global_scope| {
            let (it_rx, caches) = self.stream_file_items(global_scope)?;
            // Reborrow so the `move` closure below captures a reference, not the `Vec`.
            let caches = &caches;
            let pool = crate::build_thread_pool(self.jobs)?;
            pool.scope(move |scope| -> anyhow::Result<()> {
                let mut by_size: HashMap<u64, DupState> = HashMap::new();
                // Counts files submitted for hashing (cached or not); this is
                // a local and distinct from the `self.num_hashed` counter.
                let mut num_hashed = 0;
                tx.send(DupEvent::StartHashing)?;
                for (file, dir_index) in it_rx {
                    let size = file.size();
                    if size == 0 {
                        continue;
                    }
                    // Each directory has its own cache, indexed like `self.dirs`.
                    let cache = &caches[dir_index];
                    match by_size.entry(size) {
                        std::collections::hash_map::Entry::Occupied(mut occ) => match occ.get_mut()
                        {
                            DupState::Single(file0, dir_index0) => {
                                // We found a second file of identical size.
                                // Time to start hashing both the *original* matching file and the *new* one!
                                let cache0 = &caches[*dir_index0];
                                self.send_hash(file0, cache0, &tx, scope);
                                self.send_hash(&file, cache, &tx, scope);

                                // Modify the state to indicate we are now fully hashing this size bucket.
                                *occ.get_mut() = DupState::Hashing;
                                num_hashed += 2;
                            }
                            DupState::Hashing => {
                                // File size bucket already hashing; just dynamically spawn the new file immediately.
                                self.send_hash(&file, cache, &tx, scope);
                                num_hashed += 1;
                            }
                        },
                        std::collections::hash_map::Entry::Vacant(vac) => {
                            // First file of this size: nothing to compare against yet.
                            vac.insert(DupState::Single(file, dir_index));
                        }
                    }
                }
                tx.send(DupEvent::NumFiles(num_hashed))?;
                Ok(())
            })?;
            // The pool scope has returned, so all hashing tasks are done; save the caches in parallel.
            pool.install(|| caches.into_par_iter().try_for_each(|cache| cache.save()))?;
            Ok::<(), anyhow::Error>(())
        })?;
        Ok(())
    }
451
    /// Spawns one scanner thread per configured directory inside `scope` and
    /// returns the receiving end of the channel they all feed, together with
    /// the per-directory hash caches (indexed like `self.dirs`).
    ///
    /// Every item received is tagged with the index of the directory it was
    /// found in. The function itself has no failing path; it always returns `Ok`.
    fn stream_file_items<'scope, 'env>(
        &'env self,
        scope: &'scope std::thread::Scope<'scope, 'env>,
    ) -> anyhow::Result<(mpsc::Receiver<FileWithDirIndex>, Vec<Arc<FileHashCache>>)> {
        let (it_tx, it_rx) = mpsc::channel();
        let mut caches = Vec::with_capacity(self.dirs.len());
        for (dir_index, dir) in self.dirs.iter().enumerate() {
            let mut it = FileIterator::new(dir);
            // Each directory gets its own cache, located from that directory.
            let cache = FileHashCache::find_or_new(dir);
            it.cache = Some(Arc::clone(&cache));
            it.exclude = self.exclude.as_ref();
            // Each scanner owns a sender clone; the original `it_tx` is dropped
            // when this function returns, so the receiver ends when all scanners finish.
            let it_tx = it_tx.clone();
            scope.spawn(move || it.send_to_as(it_tx, |path| (path, dir_index)));
            caches.push(cache);
        }
        Ok((it_rx, caches))
    }
469
    /// Reports the hash of `file` through `tx`.
    ///
    /// A cached hash is sent immediately from the calling thread. Otherwise
    /// a task is spawned on `scope` that computes the hash, stores it in
    /// `cache`, and sends the result; a hashing failure is logged and
    /// reported as `DupEvent::Error`. Send failures are ignored.
    ///
    /// # Panics
    ///
    /// Panics if the cache lookup returns an error.
    fn send_hash<'scope>(
        &'scope self,
        file: &FileItem,
        cache: &Arc<FileHashCache>,
        tx: &mpsc::Sender<DupEvent>,
        scope: &rayon::Scope<'scope>,
    ) {
        let (hash, relative) = self
            .get_hash_from_cache(file, cache)
            .expect("path should be in cache base_dir");
        if let Some(hash) = hash {
            // Cache hit: no need to go through the thread pool.
            let _ = tx.send(DupEvent::Result(file.clone(), hash));
            return;
        }

        // The spawned task may outlive the borrows passed in, so it gets owned copies.
        let file = file.clone();
        let relative = relative.to_path_buf();
        let tx = tx.clone();
        let cache = Arc::clone(cache);
        scope.spawn(move |_| {
            if let Ok(hash) = self.compute_hash(&file) {
                cache.insert(&relative, file.modified(), hash);
                let _ = tx.send(DupEvent::Result(file, hash));
            } else {
                log::error!("Failed to hash file: '{}'", file);
                let _ = tx.send(DupEvent::Error);
            }
        });
    }
499
500    /// Gets the hash of a file, using the cache if available.
501    pub fn get_hash(&self, file: &FileItem) -> anyhow::Result<blake3::Hash> {
502        let cache = self.cache.as_ref().expect("cache should be initialized");
503        let (hash, relative) = self.get_hash_from_cache(file, cache)?;
504        if let Some(hash) = hash {
505            return Ok(hash);
506        }
507
508        let hash = self.compute_hash(file)?;
509        cache.insert(relative, file.modified(), hash);
510        Ok(hash)
511    }
512
513    fn get_hash_from_cache<'a>(
514        &self,
515        file: &'a FileItem,
516        cache: &FileHashCache,
517    ) -> io::Result<(Option<blake3::Hash>, &'a Path)> {
518        let relative = file.relative_path(cache.base_dir());
519        if let Some(hash) = cache.get(relative, file.modified()) {
520            self.num_hash_looked_up.fetch_add(1, Ordering::Relaxed);
521            return Ok((Some(hash), relative));
522        }
523        Ok((None, relative))
524    }
525
526    fn compute_hash(&self, file: &FileItem) -> io::Result<blake3::Hash> {
527        let start_time = time::Instant::now();
528        let mut f = fs::File::open(file.path())?;
529        let progress = self
530            .progress
531            .as_ref()
532            .map(|progress| progress.add_file(file.path(), file.size()))
533            .unwrap_or_else(Progress::none);
534        let mut hasher = blake3::Hasher::new();
535        if self.buffer_size == 0 {
536            if file.size() > 0 {
537                let mmap = unsafe { memmap2::MmapOptions::new().map(&f)? };
538                hasher.update(&mmap[..]);
539                progress.inc(file.size());
540            }
541        } else {
542            let mut buf = vec![0u8; self.buffer_size];
543            loop {
544                let n = f.read(&mut buf)?;
545                if n == 0 {
546                    break;
547                }
548                hasher.update(&buf[..n]);
549                progress.inc(n as u64);
550            }
551        }
552        progress.finish();
553        self.num_hashed.fetch_add(1, Ordering::Relaxed);
554        let hash = hasher.finalize();
555        log::debug!(
556            "Computed hash in {}: '{}'",
557            FormattedDuration(start_time.elapsed()),
558            file
559        );
560        Ok(hash)
561    }
562}
563
/// A group of duplicated files and their size.
#[derive(Clone, Debug)]
pub struct DuplicatedFiles {
    /// Paths of the files in the group.
    pub paths: Vec<PathBuf>,
    /// Size in bytes of each file in the group.
    pub size: u64,
}
570
571impl DuplicatedFiles {
572    fn wasted_size(&self) -> u64 {
573        self.size * (self.paths.len() as u64 - 1)
574    }
575
576    fn print(&self, output_format: OutputFormat) -> anyhow::Result<()> {
577        match output_format {
578            OutputFormat::Default => self.write_human(stdout())?,
579            OutputFormat::PowerShell => self.write_pwsh(stdout())?,
580            OutputFormat::Shell => self.write_shell(stdout())?,
581            OutputFormat::Yaml | OutputFormat::Symbol => self.write_yaml(stdout())?,
582        }
583        Ok(())
584    }
585
586    fn write_human(&self, mut writer: impl io::Write) -> anyhow::Result<()> {
587        writeln!(
588            writer,
589            "Identical {} files of {}:",
590            self.paths.len(),
591            crate::human_readable_size(self.size)
592        )?;
593        for path in &self.paths {
594            writeln!(writer, "  {}", path.display())?;
595        }
596        Ok(())
597    }
598
599    fn write_yaml(&self, mut writer: impl io::Write) -> anyhow::Result<()> {
600        writeln!(writer, "- paths:")?;
601        for path in &self.paths {
602            writeln!(writer, "  - {:?}", path)?;
603        }
604        writeln!(writer, "  size: {}", self.size)?;
605        Ok(())
606    }
607
608    fn write_shell(&self, writer: impl io::Write) -> anyhow::Result<()> {
609        self.write_shell_with(writer, "cp", Self::escape_shell)
610    }
611
612    fn write_pwsh(&self, writer: impl io::Write) -> anyhow::Result<()> {
613        self.write_shell_with(writer, "Copy-Item -LiteralPath", Self::escape_shell_double)
614    }
615
616    fn write_shell_with(
617        &self,
618        mut writer: impl io::Write,
619        cmd: &str,
620        stringify: impl Fn(&Path) -> String,
621    ) -> anyhow::Result<()> {
622        let mut iter = self.paths.iter();
623        if let Some(path0) = iter.next() {
624            let path0 = stringify(path0);
625            for path in iter {
626                writeln!(writer, "{cmd} '{path0}' '{}'", stringify(path))?;
627            }
628        }
629        Ok(())
630    }
631
632    fn escape_shell(path: &Path) -> String {
633        path.to_string_lossy().replace('\'', "\'\\'\'")
634    }
635
636    fn escape_shell_double(path: &Path) -> String {
637        path.to_string_lossy().replace('\'', "\'\'")
638    }
639}
640
641#[cfg(test)]
642mod tests {
643    use super::*;
644
645    fn default_exclude() -> globset::GlobSet {
646        let mut builder = globset::GlobSetBuilder::new();
647        builder.add(
648            globset::GlobBuilder::new(".hash_cache")
649                .case_insensitive(true)
650                .build()
651                .unwrap(),
652        );
653        builder.build().unwrap()
654    }
655
656    #[test]
657    fn find_duplicates() -> anyhow::Result<()> {
658        let dir = tempfile::tempdir()?;
659
660        let file1_path = dir.path().join("same1.txt");
661        fs::write(&file1_path, "same content")?;
662
663        let file2_path = dir.path().join("same2.txt");
664        fs::write(&file2_path, "same content")?;
665
666        let diff_path = dir.path().join("diff.txt");
667        fs::write(&diff_path, "different content")?;
668
669        let mut hasher = FileHasher::new(&[dir.path()])?;
670        hasher.buffer_size = 8192;
671        let duplicates = hasher.find_duplicates()?;
672
673        assert_eq!(hasher.num_hashed.load(Ordering::Relaxed), 2);
674        assert_eq!(hasher.num_hash_looked_up.load(Ordering::Relaxed), 0);
675
676        assert_eq!(duplicates.len(), 1);
677        let group = &duplicates[0];
678        assert_eq!(group.paths.len(), 2);
679        assert_eq!(group.size, 12); // "same content" is 12 bytes
680
681        assert!(group.paths.contains(&file1_path));
682        assert!(group.paths.contains(&file2_path));
683
684        Ok(())
685    }
686
    /// Hashes computed with a cache rooted at `a/a` are reused when a later
    /// run uses a cache rooted at the parent `a`, and the child cache file
    /// is removed afterwards.
    #[test]
    fn find_duplicates_merge_cache() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;
        let dir_path = dir.path();

        let sub_dir = dir_path.join("a").join("a");
        fs::create_dir_all(&sub_dir)?;

        let file1_path = sub_dir.join("1");
        fs::write(&file1_path, "same content")?;

        let file2_path = sub_dir.join("2");
        fs::write(&file2_path, "same content")?;

        // Create empty cache file in a/a to force it to be the cache base
        let cache_aa_path = sub_dir.join(FileHashCache::FILE_NAME);
        fs::File::create(&cache_aa_path)?;

        // Run find_duplicates on a/a
        let hasher_aa = FileHasher::new(&[&sub_dir])?;
        let duplicates_aa = hasher_aa.find_duplicates()?;
        assert_eq!(duplicates_aa.len(), 1);
        assert!(cache_aa_path.exists());
        // First run: both files are hashed from their contents.
        assert_eq!(hasher_aa.num_hashed.load(Ordering::Relaxed), 2);
        assert_eq!(hasher_aa.num_hash_looked_up.load(Ordering::Relaxed), 0);

        // Create empty cache file in a to force it to be the cache base
        let root_a = dir_path.join("a");
        let cache_a_path = root_a.join(FileHashCache::FILE_NAME);
        fs::File::create(&cache_a_path)?;

        // Run find_duplicates on a
        let hasher_a = FileHasher::new(&[&root_a])?;
        let duplicates_a = hasher_a.find_duplicates()?;
        assert_eq!(duplicates_a.len(), 1);
        // Second run: both hashes are served from the cache, nothing is recomputed.
        assert_eq!(hasher_a.num_hashed.load(Ordering::Relaxed), 0);
        assert_eq!(hasher_a.num_hash_looked_up.load(Ordering::Relaxed), 2);

        // The merged child cache should be removed.
        assert!(cache_a_path.exists());
        assert!(!cache_aa_path.exists());

        Ok(())
    }
731
732    #[test]
733    fn find_duplicates_with_exclude() -> anyhow::Result<()> {
734        let dir = tempfile::tempdir()?;
735
736        let file1_path = dir.path().join("same1.txt");
737        fs::write(&file1_path, "same content")?;
738
739        let file2_path = dir.path().join("same2.txt");
740        fs::write(&file2_path, "same content")?;
741
742        let exclude_path = dir.path().join("exclude.txt");
743        fs::write(&exclude_path, "same content")?;
744
745        let mut hasher = FileHasher::new(&[dir.path()])?;
746        hasher.buffer_size = 8192;
747        let mut builder = globset::GlobSetBuilder::new();
748        builder.add(
749            globset::GlobBuilder::new("exclude.txt")
750                .case_insensitive(true)
751                .build()?,
752        );
753        let filter = builder.build()?;
754        hasher.exclude = Some(filter);
755
756        let duplicates = hasher.find_duplicates()?;
757        assert_eq!(duplicates.len(), 1);
758        let group = &duplicates[0];
759        assert_eq!(group.paths.len(), 2);
760        assert!(group.paths.contains(&file1_path));
761        assert!(group.paths.contains(&file2_path));
762        assert!(!group.paths.contains(&exclude_path));
763        Ok(())
764    }
765
766    #[test]
767    fn check_mode_empty_cache() -> anyhow::Result<()> {
768        let dir = tempfile::tempdir()?;
769        let dir_path = dir.path().to_path_buf();
770        println!("{:?}", dir_path);
771        let file1_path = dir.path().join("file1.txt");
772        fs::write(&file1_path, "content 1")?;
773        let file2_path = dir.path().join("file2.txt");
774        fs::write(&file2_path, "content 2")?;
775
776        let mut hasher = FileHasher::new(&[&dir_path])?;
777        hasher.exclude = Some(default_exclude());
778        let (tx, rx) = mpsc::channel();
779        hasher.check_streaming(tx, false)?;
780        let mut results = Vec::new();
781        let mut start_seen = false;
782        let mut total_files = None;
783        let mut file_done_count = 0;
784        let mut num_error = 0;
785        while let Ok(event) = rx.recv() {
786            match event {
787                CheckEvent::StartChecking => start_seen = true,
788                CheckEvent::TotalFiles(total) => total_files = Some(total),
789                CheckEvent::Result(path, status) => results.push((path, status)),
790                CheckEvent::FileDone => file_done_count += 1,
791                CheckEvent::Error => num_error += 1,
792            }
793        }
794        assert!(start_seen);
795        assert_eq!(total_files, Some(2));
796        assert_eq!(file_done_count, 0);
797        assert_eq!(num_error, 0);
798
799        results.sort_by(|a, b| a.0.cmp(&b.0));
800        assert_eq!(results.len(), 2);
801        assert_eq!(results[0], (PathBuf::from("file1.txt"), CheckStatus::New));
802        assert_eq!(results[1], (PathBuf::from("file2.txt"), CheckStatus::New));
803
804        assert!(!dir.path().join(FileHashCache::FILE_NAME).exists());
805        Ok(())
806    }
807
808    #[test]
809    fn check_mode_with_cache() -> anyhow::Result<()> {
810        let dir = tempfile::tempdir()?;
811        let dir_path = dir.path().to_path_buf();
812        let file1_path = dir.path().join("file1.txt");
813        let file2_path = dir.path().join("file2.txt");
814        fs::write(&file1_path, "content 1")?;
815        fs::write(&file2_path, "content 2")?;
816        let file1 = FileItem::try_from(file1_path.as_path())?;
817        let file2 = FileItem::try_from(file2_path.as_path())?;
818
819        let mut hasher = FileHasher::new_with_cache(&[&dir_path])?;
820        hasher.exclude = Some(default_exclude());
821        let _hash1 = hasher.get_hash(&file1)?;
822        let _hash2 = hasher.get_hash(&file2)?;
823        hasher.save_cache()?;
824        assert!(dir.path().join(FileHashCache::FILE_NAME).exists());
825
826        let mut hasher = FileHasher::new(&[&dir_path])?;
827        hasher.exclude = Some(default_exclude());
828        let (tx, rx) = mpsc::channel();
829        hasher.check_streaming(tx, false)?;
830        let mut results = Vec::new();
831        let mut file_done_count = 0;
832        while let Ok(event) = rx.recv() {
833            match event {
834                CheckEvent::Result(path, status) => results.push((path, status)),
835                CheckEvent::FileDone => file_done_count += 1,
836                _ => {}
837            }
838        }
839        assert_eq!(results.len(), 0);
840        assert_eq!(file_done_count, 2);
841
842        fs::write(&file1_path, "content 1 modified")?;
843
844        let file2_meta_before = fs::metadata(&file2_path)?;
845        let mtime_before = file2_meta_before.modified()?;
846        std::thread::sleep(time::Duration::from_millis(10));
847        fs::write(&file2_path, "content 2")?;
848        let file2_meta_after = fs::metadata(&file2_path)?;
849        let mtime_after = file2_meta_after.modified()?;
850        assert!(mtime_after > mtime_before);
851
852        let mut hasher = FileHasher::new(&[&dir_path])?;
853        hasher.exclude = Some(default_exclude());
854        let (tx, rx) = mpsc::channel();
855        hasher.check_streaming(tx, false)?;
856        let mut results = Vec::new();
857        let mut file_done_count = 0;
858        while let Ok(event) = rx.recv() {
859            match event {
860                CheckEvent::Result(path, status) => results.push((path, status)),
861                CheckEvent::FileDone => file_done_count += 1,
862                _ => {}
863            }
864        }
865        assert_eq!(results.len(), 1);
866        assert_eq!(
867            results[0],
868            (PathBuf::from("file1.txt"), CheckStatus::Modified)
869        );
870        assert_eq!(file_done_count, 1);
871        Ok(())
872    }
873
874    #[test]
875    fn check_update_mode() -> anyhow::Result<()> {
876        let dir = tempfile::tempdir()?;
877        let dir_path = dir.path().to_path_buf();
878        let file1_path = dir.path().join("file1.txt");
879        fs::write(&file1_path, "content 1")?;
880
881        let mut hasher = FileHasher::new(&[&dir_path])?;
882        hasher.exclude = Some(default_exclude());
883        let (tx, rx) = mpsc::channel();
884        hasher.check_streaming(tx, true)?;
885        while rx.recv().is_ok() {}
886        hasher.save_cache()?;
887        assert!(dir.path().join(FileHashCache::FILE_NAME).exists());
888
889        let cache = FileHashCache::new(&dir_path);
890        let mtime1 = fs::metadata(&file1_path)?.modified()?;
891        let hash1 = cache.get(&PathBuf::from("file1.txt"), mtime1);
892        assert!(hash1.is_some());
893
894        std::thread::sleep(time::Duration::from_millis(10));
895        fs::write(&file1_path, "content 1 modified")?;
896        let mtime1_mod = fs::metadata(&file1_path)?.modified()?;
897
898        let mut hasher = FileHasher::new(&[&dir_path])?;
899        hasher.exclude = Some(default_exclude());
900        let (tx, rx) = mpsc::channel();
901        hasher.check_streaming(tx, true)?;
902        while rx.recv().is_ok() {}
903        hasher.save_cache()?;
904
905        let cache = FileHashCache::new(&dir_path);
906        let hash_mod = cache.get(&PathBuf::from("file1.txt"), mtime1_mod);
907        assert!(hash_mod.is_some());
908        assert_ne!(hash1, hash_mod);
909
910        std::thread::sleep(time::Duration::from_millis(10));
911        fs::write(&file1_path, "content 1 modified")?;
912        let mtime1_mod2 = fs::metadata(&file1_path)?.modified()?;
913        assert!(mtime1_mod2 > mtime1_mod);
914
915        assert!(
916            cache
917                .get(&PathBuf::from("file1.txt"), mtime1_mod2)
918                .is_none()
919        );
920
921        let mut hasher = FileHasher::new(&[&dir_path])?;
922        hasher.exclude = Some(default_exclude());
923        let (tx, rx) = mpsc::channel();
924        hasher.check_streaming(tx, true)?;
925        while rx.recv().is_ok() {}
926        hasher.save_cache()?;
927
928        let cache = FileHashCache::new(&dir_path);
929        assert!(
930            cache
931                .get(&PathBuf::from("file1.txt"), mtime1_mod2)
932                .is_some()
933        );
934        Ok(())
935    }
936
937    #[test]
938    fn check_cleanup_deleted_files() -> anyhow::Result<()> {
939        let dir = tempfile::tempdir()?;
940        let dir_path = dir.path().to_path_buf();
941        let file1_path = dir.path().join("file1.txt");
942        let file2_path = dir.path().join("file2.txt");
943        fs::write(&file1_path, "content 1")?;
944        fs::write(&file2_path, "content 2")?;
945        let mtime1 = fs::metadata(&file1_path)?.modified()?;
946        let mtime2 = fs::metadata(&file2_path)?.modified()?;
947
948        let mut hasher = FileHasher::new(&[&dir_path])?;
949        hasher.exclude = Some(default_exclude());
950        let (tx, rx) = mpsc::channel();
951        hasher.check_streaming(tx, true)?;
952        while rx.recv().is_ok() {}
953        hasher.save_cache()?;
954
955        // Verify both are in the cache
956        let cache = FileHashCache::new(&dir_path);
957        assert!(cache.get(&PathBuf::from("file1.txt"), mtime1).is_some());
958        assert!(cache.get(&PathBuf::from("file2.txt"), mtime2).is_some());
959
960        // Now delete file2 from disk
961        fs::remove_file(&file2_path)?;
962
963        // Run check and save again
964        let mut hasher = FileHasher::new(&[&dir_path])?;
965        hasher.exclude = Some(default_exclude());
966        let (tx, rx) = mpsc::channel();
967        hasher.check_streaming(tx, true)?;
968        while rx.recv().is_ok() {}
969        hasher.save_cache()?;
970
971        // Verify file2 is removed from cache, but file1 is still there
972        let cache = FileHashCache::new(&dir_path);
973        assert!(cache.get(&PathBuf::from("file2.txt"), mtime2).is_none());
974        assert!(cache.get(&PathBuf::from("file1.txt"), mtime1).is_some());
975        Ok(())
976    }
977
978    #[test]
979    fn find_duplicates_multiple_dirs() -> anyhow::Result<()> {
980        let tmp = tempfile::tempdir()?;
981        let dir1 = tmp.path().join("dir1");
982        let dir2 = tmp.path().join("dir2");
983        fs::create_dir(&dir1)?;
984        fs::create_dir(&dir2)?;
985        let file1_path = dir1.join("file1.txt");
986        fs::write(&file1_path, "same content")?;
987        let file2_path = dir2.join("file2.txt");
988        fs::write(&file2_path, "same content")?;
989        let hasher = FileHasher::new(&[&dir1, &dir2])?;
990        let duplicates = hasher.find_duplicates()?;
991        assert_eq!(duplicates.len(), 1);
992        let group = &duplicates[0];
993        assert_eq!(group.paths.len(), 2);
994        assert_eq!(group.size, 12);
995        assert!(group.paths.contains(&file1_path));
996        assert!(group.paths.contains(&file2_path));
997
998        Ok(())
999    }
1000
1001    #[test]
1002    fn check_fails_with_multiple_dirs() -> anyhow::Result<()> {
1003        let tmp = tempfile::tempdir()?;
1004        let dir1 = tmp.path().join("dir1");
1005        let dir2 = tmp.path().join("dir2");
1006        fs::create_dir(&dir1)?;
1007        fs::create_dir(&dir2)?;
1008        let hasher = FileHasher::new(&[&dir1, &dir2])?;
1009        assert!(hasher.check(false).is_err());
1010        Ok(())
1011    }
1012
1013    #[test]
1014    fn escape_shell() {
1015        let escape_shell = |p: &str| DuplicatedFiles::escape_shell(Path::new(p));
1016        assert_eq!(escape_shell(""), "");
1017        assert_eq!(escape_shell("abc"), "abc");
1018        assert_eq!(escape_shell("a'b"), "a'\\''b");
1019        assert_eq!(escape_shell("a'b'"), "a'\\''b'\\''");
1020
1021        let escape_shell_double = |p: &str| DuplicatedFiles::escape_shell_double(Path::new(p));
1022        assert_eq!(escape_shell_double(""), "");
1023        assert_eq!(escape_shell_double("abc"), "abc");
1024        assert_eq!(escape_shell_double("a'b"), "a''b");
1025        assert_eq!(escape_shell_double("a'b'"), "a''b''");
1026    }
1027
1028    #[test]
1029    fn write_dups_shell_empty() -> anyhow::Result<()> {
1030        let dup_empty = DuplicatedFiles {
1031            paths: vec![],
1032            size: 100,
1033        };
1034        let mut buf = Vec::new();
1035        dup_empty.write_shell(&mut buf)?;
1036        assert_eq!(String::from_utf8(buf)?, "");
1037        Ok(())
1038    }
1039
1040    #[test]
1041    fn write_dups_shell_one() -> anyhow::Result<()> {
1042        let dup_one = DuplicatedFiles {
1043            paths: vec![PathBuf::from("a.txt")],
1044            size: 100,
1045        };
1046        let mut buf = Vec::new();
1047        dup_one.write_shell(&mut buf)?;
1048        assert_eq!(String::from_utf8(buf)?, "");
1049        Ok(())
1050    }
1051
1052    #[test]
1053    fn write_dups_shell_two() -> anyhow::Result<()> {
1054        let dup_multiple = DuplicatedFiles {
1055            paths: vec![PathBuf::from("a.txt"), PathBuf::from("b.txt")],
1056            size: 100,
1057        };
1058        let mut buf = Vec::new();
1059        dup_multiple.write_shell(&mut buf)?;
1060        assert_eq!(String::from_utf8(buf)?, "cp 'a.txt' 'b.txt'\n");
1061        Ok(())
1062    }
1063
1064    #[test]
1065    fn write_dups_shell_three() -> anyhow::Result<()> {
1066        let dup_multiple = DuplicatedFiles {
1067            paths: vec![
1068                PathBuf::from("a.txt"),
1069                PathBuf::from("b.txt"),
1070                PathBuf::from("c.txt"),
1071            ],
1072            size: 100,
1073        };
1074        let mut buf = Vec::new();
1075        dup_multiple.write_shell(&mut buf)?;
1076        assert_eq!(
1077            String::from_utf8(buf)?,
1078            "cp 'a.txt' 'b.txt'\ncp 'a.txt' 'c.txt'\n"
1079        );
1080        Ok(())
1081    }
1082
1083    #[test]
1084    fn write_dups_shell_quotes() -> anyhow::Result<()> {
1085        let dup_quotes = DuplicatedFiles {
1086            paths: vec![PathBuf::from("a'b.txt"), PathBuf::from("c'd.txt")],
1087            size: 100,
1088        };
1089        let mut buf = Vec::new();
1090        dup_quotes.write_shell(&mut buf)?;
1091        assert_eq!(String::from_utf8(buf)?, "cp 'a'\\''b.txt' 'c'\\''d.txt'\n");
1092
1093        let mut buf = Vec::new();
1094        dup_quotes.write_pwsh(&mut buf)?;
1095        assert_eq!(
1096            String::from_utf8(buf)?,
1097            "Copy-Item -LiteralPath 'a''b.txt' 'c''d.txt'\n"
1098        );
1099        Ok(())
1100    }
1101}