Skip to main content

compare_dir/
file_hasher.rs

1use crate::{
2    ColumnFormatter, DirectoryComparer, FileComparer, FileHashCache, FileItem, FileIterator,
3    OutputFormat, Progress, ProgressBuilder, ProgressValue,
4};
5use globset::GlobSet;
6use indicatif::FormattedDuration;
7use rayon::prelude::*;
8use simple_path::SimplePath;
9use std::{
10    collections::HashMap,
11    fs,
12    io::{self, Read, stdout},
13    path::{Path, PathBuf},
14    sync::{
15        Arc,
16        atomic::{AtomicUsize, Ordering},
17        mpsc,
18    },
19    time,
20};
21
22type FileWithDirIndex = (FileItem, usize);
23
24#[derive(Debug, Clone)]
25enum DupEvent {
26    StartHashing,
27    Total(ProgressValue),
28    Result(FileItem, blake3::Hash),
29    Error,
30}
31
/// Outcome reported by check mode for a file that differs from the cache.
///
/// The variant order is significant because `PartialOrd`/`Ord` are derived.
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
enum CheckStatus {
    /// The file has no entry in the hash cache.
    New,
    /// The file has a cache entry, but its size or hash no longer matches.
    Modified,
}
37
38#[derive(Debug)]
39enum CheckEvent {
40    StartChecking,
41    Total(ProgressValue),
42    Result(FileItem, CheckStatus, ProgressValue),
43    Progress(ProgressValue),
44    Error(FileItem),
45}
46
47enum DupState {
48    Single(FileItem, usize),
49    Hashing,
50}
51
52/// A tool for finding duplicated files in a directory.
53pub struct FileHasher {
54    dirs: Vec<PathBuf>,
55    pub buffer_size: usize,
56    cache: Option<Arc<FileHashCache>>,
57    num_hashed: AtomicUsize,
58    num_hash_looked_up: AtomicUsize,
59    pub exclude: Option<GlobSet>,
60    pub progress: Option<Arc<ProgressBuilder>>,
61    pub output_format: OutputFormat,
62    pub jobs: usize,
63}
64
65impl FileHasher {
66    const DEFAULT_JOBS: usize = DirectoryComparer::DEFAULT_JOBS;
67
68    /// Creates a new `FileHasher` for the given directories.
69    pub fn new<P: AsRef<Path>>(dirs: &[P]) -> anyhow::Result<Self> {
70        if dirs.is_empty() {
71            anyhow::bail!("At least one directory must be specified.");
72        }
73        Ok(Self {
74            dirs: dirs.iter().map(|p| p.as_ref().to_path_buf()).collect(),
75            buffer_size: FileComparer::DEFAULT_BUFFER_SIZE,
76            cache: None,
77            num_hashed: AtomicUsize::new(0),
78            num_hash_looked_up: AtomicUsize::new(0),
79            exclude: None,
80            progress: None,
81            output_format: OutputFormat::Default,
82            jobs: Self::DEFAULT_JOBS,
83        })
84    }
85
86    pub(crate) fn new_with_cache<P: AsRef<Path>>(dirs: &[P]) -> anyhow::Result<Self> {
87        let mut hasher = Self::new(dirs)?;
88        hasher.cache = Some(hasher.new_cache()?);
89        Ok(hasher)
90    }
91
92    fn new_cache(&self) -> anyhow::Result<Arc<FileHashCache>> {
93        let common_ancestor = crate::common_ancestor(&self.dirs)
94            .ok_or_else(|| anyhow::anyhow!("No common ancestor found"))?;
95        Ok(FileHashCache::find_or_new(&common_ancestor))
96    }
97
98    /// Gets the hash cache.
99    pub(crate) fn cache(&mut self) -> anyhow::Result<Arc<FileHashCache>> {
100        if self.cache.is_none() {
101            self.cache = Some(self.new_cache()?);
102        }
103        Ok(Arc::clone(self.cache.as_ref().unwrap()))
104    }
105
106    /// Remove a cache entry if it exists.
107    pub(crate) fn remove_cache_entry(&mut self, path: &Path) -> anyhow::Result<()> {
108        let cache = self.cache()?;
109        let relative = SimplePath::strip_prefix(path, cache.base_dir())?;
110        cache.remove(relative);
111        Ok(())
112    }
113
114    /// Save the hash cache if it is dirty.
115    pub fn save_cache(&self) -> anyhow::Result<()> {
116        log::info!(
117            "Hash stats for {:?}: {} computed, {} looked up",
118            self.dirs,
119            self.num_hashed.load(Ordering::Relaxed),
120            self.num_hash_looked_up.load(Ordering::Relaxed)
121        );
122        if let Some(cache) = &self.cache {
123            cache.save()?;
124        }
125        Ok(())
126    }
127
128    /// Clears the loaded hashes in the cache.
129    pub(crate) fn clear_cache(&mut self) -> anyhow::Result<()> {
130        let cache = self.cache()?;
131        for dir in &self.dirs {
132            let relative = SimplePath::strip_prefix(dir, cache.base_dir())?;
133            cache.clear(relative);
134        }
135        Ok(())
136    }
137
138    /// Executes the check/update process.
139    pub fn check(&self, update: bool) -> anyhow::Result<()> {
140        match self.output_format {
141            OutputFormat::Default | OutputFormat::Symbol => {}
142            _ => anyhow::bail!("Check mode only supports default or symbol output format."),
143        }
144        if self.dirs.len() > 1 {
145            anyhow::bail!("Check mode only supports one directory.");
146        }
147        let start_time = time::Instant::now();
148        let mut progress = self
149            .progress
150            .as_ref()
151            .map(|progress| progress.add_spinner())
152            .unwrap_or_else(Progress::none);
153        progress.use_bytes();
154        progress.set_message("Scanning directory...");
155        let mut num_new = 0;
156        let mut num_modified = 0;
157        let mut num_error = 0;
158        std::thread::scope(|scope| {
159            let (tx, rx) = mpsc::channel();
160            scope.spawn(|| {
161                if let Err(e) = self.check_streaming(tx, update) {
162                    log::error!("Error during check: {}", e);
163                }
164            });
165            while let Ok(event) = rx.recv() {
166                match event {
167                    CheckEvent::StartChecking => {
168                        progress.set_message("Checking files...");
169                    }
170                    CheckEvent::Total(value) => {
171                        progress.set_length(value);
172                        progress.set_message("");
173                    }
174                    CheckEvent::Result(file, status, value) => {
175                        let symbol = match status {
176                            CheckStatus::New => {
177                                num_new += 1;
178                                '+'
179                            }
180                            CheckStatus::Modified => {
181                                num_modified += 1;
182                                '!'
183                            }
184                        };
185                        progress.inc(value);
186                        progress.suspend_for(stdout(), || {
187                            let base_dir = &self.dirs[0];
188                            let rel_path = file.relative_path(base_dir);
189                            println!("{} {}", symbol, rel_path.display());
190                        });
191                    }
192                    CheckEvent::Progress(value) => {
193                        progress.inc(value);
194                    }
195                    CheckEvent::Error(file) => {
196                        progress.inc(ProgressValue::with_skip(file.size()));
197                        num_error += 1;
198                    }
199                }
200            }
201        });
202        progress.finish();
203        self.print_check_summary(&start_time, num_new, num_modified, num_error)?;
204        Ok(())
205    }
206
207    fn print_check_summary(
208        &self,
209        start_time: &time::Instant,
210        num_new: usize,
211        num_modified: usize,
212        num_error: usize,
213    ) -> io::Result<()> {
214        let summary = [
215            ("Elapsed:", 0),
216            ("Hash computed:", self.num_hashed.load(Ordering::Relaxed)),
217            ("New files:", num_new),
218            ("Modified files:", num_modified),
219            ("Errors:", num_error),
220        ];
221        let formatter = ColumnFormatter::new(summary.iter().map(|(s, _)| *s));
222        let mut writer = std::io::stderr();
223        formatter.write_value(
224            &mut writer,
225            summary[0].0,
226            FormattedDuration(start_time.elapsed()),
227        )?;
228        formatter.write_values(&mut writer, &summary[1..])
229    }
230
231    fn check_streaming(&self, tx: mpsc::Sender<CheckEvent>, update: bool) -> anyhow::Result<()> {
232        assert_eq!(self.dirs.len(), 1);
233        let cache = self.new_cache()?;
234        let base_dir = &self.dirs[0];
235        let relative = SimplePath::strip_prefix(base_dir, cache.base_dir())?;
236        cache.set_remove_if_no_access(relative);
237        let cache_clone = Arc::clone(&cache);
238        std::thread::scope(|global_scope| {
239            let mut it = FileIterator::new(base_dir);
240            it.cache = Some(Arc::clone(&cache));
241            it.exclude = self.exclude.as_ref();
242            let it_rx = it.spawn_in_scope(global_scope);
243            tx.send(CheckEvent::StartChecking)?;
244            let pool = crate::build_thread_pool(self.jobs)?;
245            pool.scope(move |scope| -> anyhow::Result<()> {
246                let mut total = ProgressValue::default();
247                for file in it_rx {
248                    self.check_file(file, &cache, update, &mut total, &tx, scope);
249                }
250                tx.send(CheckEvent::Total(total))?;
251                Ok(())
252            })
253        })?;
254        cache_clone.save()?;
255        Ok(())
256    }
257
258    fn check_file<'scope>(
259        &'scope self,
260        file: FileItem,
261        cache: &Arc<FileHashCache>,
262        update: bool,
263        total: &mut ProgressValue,
264        tx: &mpsc::Sender<CheckEvent>,
265        scope: &rayon::Scope<'scope>,
266    ) {
267        *total += ProgressValue::with_size(file.size());
268        let tx = tx.clone();
269        let cache = Arc::clone(cache);
270        scope.spawn(move |_| {
271            if let Err(error) = self._check_file(&file, cache, update, &tx) {
272                log::error!("Failed to check file '{}': {}", file, error);
273                if tx.send(CheckEvent::Error(file)).is_err() {
274                    log::error!("Send failed");
275                }
276            }
277        });
278    }
279
280    fn _check_file(
281        &self,
282        file: &FileItem,
283        cache: Arc<FileHashCache>,
284        update: bool,
285        tx: &mpsc::Sender<CheckEvent>,
286    ) -> anyhow::Result<()> {
287        assert!(file.path().is_absolute());
288        let path_in_cache = file.relative_path(cache.base_dir());
289        match cache.get_entry(path_in_cache) {
290            Some(cached) => {
291                if !update && cached.size != 0 && file.size() != cached.size {
292                    tx.send(CheckEvent::Result(
293                        file.clone(),
294                        CheckStatus::Modified,
295                        ProgressValue::with_skip(file.size()),
296                    ))?;
297                    return Ok(());
298                }
299                let hash = self.compute_hash(file)?;
300                if hash == cached.hash {
301                    if cached.should_update(file, update) {
302                        cache.insert(path_in_cache, file, hash);
303                    }
304                    tx.send(CheckEvent::Progress(ProgressValue::with_size(file.size())))?;
305                } else {
306                    if update {
307                        cache.insert(path_in_cache, file, hash);
308                    }
309                    tx.send(CheckEvent::Result(
310                        file.clone(),
311                        CheckStatus::Modified,
312                        ProgressValue::with_size(file.size()),
313                    ))?;
314                }
315            }
316            None => {
317                if update {
318                    let hash = self.compute_hash(file)?;
319                    cache.insert(path_in_cache, file, hash);
320                }
321                tx.send(CheckEvent::Result(
322                    file.clone(),
323                    CheckStatus::New,
324                    ProgressValue::with_size(file.size()),
325                ))?;
326            }
327        }
328        Ok(())
329    }
330
331    /// Executes the duplicate file finding process and prints results.
332    pub fn run(&self) -> anyhow::Result<()> {
333        let start_time = time::Instant::now();
334        let mut duplicates = self.find_duplicates()?;
335        let mut total_wasted_space = 0;
336        if !duplicates.is_empty() {
337            duplicates.sort_by_key(|a| a.size);
338            total_wasted_space = self.print_duplicates_results(&duplicates)?;
339        }
340        self.print_duplicates_summary(&start_time, total_wasted_space)?;
341        Ok(())
342    }
343
344    fn print_duplicates_results(&self, duplicates: &Vec<DuplicatedFiles>) -> anyhow::Result<u64> {
345        let mut total_wasted_space = 0;
346        for dupes in duplicates {
347            dupes.print(self.output_format)?;
348            total_wasted_space += dupes.wasted_size();
349        }
350        Ok(total_wasted_space)
351    }
352
353    fn print_duplicates_summary(
354        &self,
355        start_time: &time::Instant,
356        total_wasted_space: u64,
357    ) -> io::Result<()> {
358        let elapsed = FormattedDuration(start_time.elapsed()).to_string();
359        let num_hashed = self.num_hashed.load(Ordering::Relaxed).to_string();
360        let total_wasted_space = crate::human_readable_size(total_wasted_space);
361        let summary = [
362            ("Elapsed:", elapsed),
363            ("Hash computed:", num_hashed),
364            ("Total wasted space:", total_wasted_space),
365        ];
366        let formatter = ColumnFormatter::new(summary.iter().map(|(s, _)| *s));
367        formatter.write_values(&mut io::stderr(), &summary)
368    }
369
370    /// Finds duplicated files and returns a list of duplicate groups.
371    pub fn find_duplicates(&self) -> anyhow::Result<Vec<DuplicatedFiles>> {
372        let mut progress = self
373            .progress
374            .as_ref()
375            .map(|progress| progress.add_spinner())
376            .unwrap_or_else(Progress::none);
377        progress.set_message("Scanning directories...");
378
379        let (tx, rx) = mpsc::channel();
380        let mut by_hash: HashMap<blake3::Hash, DuplicatedFiles> = HashMap::new();
381        std::thread::scope(|scope| {
382            scope.spawn(|| {
383                if let Err(e) = self.find_duplicates_streaming(tx) {
384                    log::error!("Error during duplicate finding: {}", e);
385                }
386            });
387
388            while let Ok(event) = rx.recv() {
389                match event {
390                    DupEvent::StartHashing => progress.set_message("Hashing files..."),
391                    DupEvent::Total(value) => progress.set_length(value),
392                    DupEvent::Result(file, hash) => {
393                        progress.inc(ProgressValue::with_size(file.size()));
394                        let entry = by_hash.entry(hash).or_insert_with(|| DuplicatedFiles {
395                            paths: Vec::new(),
396                            size: file.size(),
397                        });
398                        // Hash collisions shouldn't happen, but if they do, sizes shouldn't mismatch.
399                        assert_eq!(
400                            entry.size,
401                            file.size(),
402                            "Hash collision: sizes do not match"
403                        );
404                        entry.paths.push(file.into_path_buf());
405                    }
406                    DupEvent::Error => {}
407                }
408            }
409        });
410        progress.finish();
411
412        let mut duplicates = Vec::new();
413        for (_, mut dupes) in by_hash {
414            if dupes.paths.len() > 1 {
415                dupes.paths.sort();
416                duplicates.push(dupes);
417            }
418        }
419        Ok(duplicates)
420    }
421
422    fn find_duplicates_streaming(&self, tx: mpsc::Sender<DupEvent>) -> anyhow::Result<()> {
423        std::thread::scope(|global_scope| {
424            let (it_rx, caches) = self.stream_file_items(global_scope)?;
425            let caches = &caches;
426            let pool = crate::build_thread_pool(self.jobs)?;
427            pool.scope(move |scope| -> anyhow::Result<()> {
428                let mut by_size: HashMap<u64, DupState> = HashMap::new();
429                let mut total = ProgressValue::default();
430                tx.send(DupEvent::StartHashing)?;
431                for (file, dir_index) in it_rx {
432                    let size = file.size();
433                    if size == 0 {
434                        continue;
435                    }
436                    let cache = &caches[dir_index];
437                    match by_size.entry(size) {
438                        std::collections::hash_map::Entry::Occupied(mut occ) => match occ.get_mut()
439                        {
440                            DupState::Single(file0, dir_index0) => {
441                                // We found a second file of identical size.
442                                // Time to start hashing both the *original* matching file and the *new* one!
443                                let cache0 = &caches[*dir_index0];
444                                self.send_hash(file0, cache0, &tx, scope);
445                                self.send_hash(&file, cache, &tx, scope);
446                                total += ProgressValue::with_size(file0.size());
447                                total += ProgressValue::with_size(file.size());
448
449                                // Modify the state to indicate we are now fully hashing this size bucket.
450                                *occ.get_mut() = DupState::Hashing;
451                            }
452                            DupState::Hashing => {
453                                // File size bucket already hashing; just dynamically spawn the new file immediately.
454                                self.send_hash(&file, cache, &tx, scope);
455                                total += ProgressValue::with_size(file.size());
456                            }
457                        },
458                        std::collections::hash_map::Entry::Vacant(vac) => {
459                            vac.insert(DupState::Single(file, dir_index));
460                        }
461                    }
462                }
463                tx.send(DupEvent::Total(total))?;
464                Ok(())
465            })?;
466            pool.install(|| caches.into_par_iter().try_for_each(|cache| cache.save()))?;
467            Ok::<(), anyhow::Error>(())
468        })?;
469        Ok(())
470    }
471
472    fn stream_file_items<'scope, 'env>(
473        &'env self,
474        scope: &'scope std::thread::Scope<'scope, 'env>,
475    ) -> anyhow::Result<(mpsc::Receiver<FileWithDirIndex>, Vec<Arc<FileHashCache>>)> {
476        let (it_tx, it_rx) = mpsc::channel();
477        let mut caches = Vec::with_capacity(self.dirs.len());
478        for (dir_index, dir) in self.dirs.iter().enumerate() {
479            let mut it = FileIterator::new(dir);
480            let cache = FileHashCache::find_or_new(dir);
481            it.cache = Some(Arc::clone(&cache));
482            it.exclude = self.exclude.as_ref();
483            let it_tx = it_tx.clone();
484            scope.spawn(move || it.send_to_as(it_tx, |path| (path, dir_index)));
485            caches.push(cache);
486        }
487        Ok((it_rx, caches))
488    }
489
490    fn send_hash<'scope>(
491        &'scope self,
492        file: &FileItem,
493        cache: &Arc<FileHashCache>,
494        tx: &mpsc::Sender<DupEvent>,
495        scope: &rayon::Scope<'scope>,
496    ) {
497        let (hash, relative) = self
498            .get_hash_from_cache(file, cache)
499            .expect("path should be in cache base_dir");
500        if let Some(hash) = hash {
501            let _ = tx.send(DupEvent::Result(file.clone(), hash));
502            return;
503        }
504
505        let file = file.clone();
506        let relative = relative.to_path_buf();
507        let tx = tx.clone();
508        let cache = Arc::clone(cache);
509        scope.spawn(move |_| {
510            if let Ok(hash) = self.compute_hash(&file) {
511                cache.insert(&relative, &file, hash);
512                let _ = tx.send(DupEvent::Result(file, hash));
513            } else {
514                log::error!("Failed to hash file: '{}'", file);
515                let _ = tx.send(DupEvent::Error);
516            }
517        });
518    }
519
520    /// Gets the hash of a file, using the cache if available.
521    pub fn get_hash(&self, file: &FileItem) -> anyhow::Result<blake3::Hash> {
522        let cache = self.cache.as_ref().expect("cache should be initialized");
523        let (hash, relative) = self.get_hash_from_cache(file, cache)?;
524        if let Some(hash) = hash {
525            return Ok(hash);
526        }
527
528        let hash = self.compute_hash(file)?;
529        cache.insert(relative, file, hash);
530        Ok(hash)
531    }
532
533    fn get_hash_from_cache<'a>(
534        &self,
535        file: &'a FileItem,
536        cache: &FileHashCache,
537    ) -> io::Result<(Option<blake3::Hash>, &'a Path)> {
538        let relative = file.relative_path(cache.base_dir());
539        if let Some(hash) = cache.get(relative, file) {
540            self.num_hash_looked_up.fetch_add(1, Ordering::Relaxed);
541            return Ok((Some(hash), relative));
542        }
543        Ok((None, relative))
544    }
545
546    fn compute_hash(&self, file: &FileItem) -> io::Result<blake3::Hash> {
547        let start_time = time::Instant::now();
548        let mut f = fs::File::open(file.path())?;
549        let mut progress = self
550            .progress
551            .as_ref()
552            .map(|progress| progress.add_file(file.path(), file.size()))
553            .unwrap_or_else(Progress::none);
554        let mut hasher = blake3::Hasher::new();
555        if self.buffer_size == 0 {
556            if file.size() > 0 {
557                let mmap = unsafe { memmap2::MmapOptions::new().map(&f)? };
558                hasher.update(&mmap[..]);
559                progress.inc(ProgressValue::with_size(file.size()));
560            }
561        } else {
562            let mut buf = vec![0u8; self.buffer_size];
563            loop {
564                let n = f.read(&mut buf)?;
565                if n == 0 {
566                    break;
567                }
568                hasher.update(&buf[..n]);
569                progress.inc(ProgressValue::with_size(n as u64));
570            }
571        }
572        progress.finish();
573        self.num_hashed.fetch_add(1, Ordering::Relaxed);
574        let hash = hasher.finalize();
575        log::debug!(
576            "Computed hash in {}: '{}'",
577            FormattedDuration(start_time.elapsed()),
578            file
579        );
580        Ok(hash)
581    }
582}
583
/// A group of duplicated files and their size.
#[derive(Debug, Clone)]
pub struct DuplicatedFiles {
    /// Paths of all files in the group.
    pub paths: Vec<PathBuf>,
    /// Size in bytes shared by every file in the group.
    pub size: u64,
}
590
591impl DuplicatedFiles {
592    fn wasted_size(&self) -> u64 {
593        self.size * (self.paths.len() as u64 - 1)
594    }
595
596    fn print(&self, output_format: OutputFormat) -> anyhow::Result<()> {
597        match output_format {
598            OutputFormat::Default => self.write_human(stdout())?,
599            OutputFormat::PowerShell => self.write_pwsh(stdout())?,
600            OutputFormat::Shell => self.write_shell(stdout())?,
601            OutputFormat::Yaml | OutputFormat::Symbol => self.write_yaml(stdout())?,
602        }
603        Ok(())
604    }
605
606    fn write_human(&self, mut writer: impl io::Write) -> anyhow::Result<()> {
607        writeln!(
608            writer,
609            "Identical {} files of {}:",
610            self.paths.len(),
611            crate::human_readable_size(self.size)
612        )?;
613        for path in &self.paths {
614            writeln!(writer, "  {}", path.display())?;
615        }
616        Ok(())
617    }
618
619    fn write_yaml(&self, mut writer: impl io::Write) -> anyhow::Result<()> {
620        writeln!(writer, "- paths:")?;
621        for path in &self.paths {
622            writeln!(writer, "  - {:?}", path)?;
623        }
624        writeln!(writer, "  size: {}", self.size)?;
625        Ok(())
626    }
627
628    fn write_shell(&self, writer: impl io::Write) -> anyhow::Result<()> {
629        self.write_shell_with(writer, "cp", Self::escape_shell)
630    }
631
632    fn write_pwsh(&self, writer: impl io::Write) -> anyhow::Result<()> {
633        self.write_shell_with(writer, "Copy-Item -LiteralPath", Self::escape_shell_double)
634    }
635
636    fn write_shell_with(
637        &self,
638        mut writer: impl io::Write,
639        cmd: &str,
640        stringify: impl Fn(&Path) -> String,
641    ) -> anyhow::Result<()> {
642        let mut iter = self.paths.iter();
643        if let Some(path0) = iter.next() {
644            let path0 = stringify(path0);
645            for path in iter {
646                writeln!(writer, "{cmd} '{path0}' '{}'", stringify(path))?;
647            }
648        }
649        Ok(())
650    }
651
/// Escapes a path for use inside a single-quoted POSIX shell string: every
/// `'` becomes `'\''` (close the quote, emit an escaped quote, reopen).
/// Invalid UTF-8 in the path is converted lossily.
fn escape_shell(path: &Path) -> String {
    let text = path.to_string_lossy();
    text.replace('\'', r"'\''")
}
655
/// Escapes a path for use inside a single-quoted string by doubling every
/// `'`. Invalid UTF-8 in the path is converted lossily.
fn escape_shell_double(path: &Path) -> String {
    let text = path.to_string_lossy();
    text.replace('\'', "''")
}
659}
660
661#[cfg(test)]
662mod tests {
663    use super::*;
664
665    fn default_exclude() -> globset::GlobSet {
666        let mut builder = globset::GlobSetBuilder::new();
667        builder.add(
668            globset::GlobBuilder::new(".hash_cache")
669                .case_insensitive(true)
670                .build()
671                .unwrap(),
672        );
673        builder.build().unwrap()
674    }
675
/// Two files with identical content form one group; a file with different
/// content is left out, and only the two candidates are hashed.
#[test]
fn find_duplicates() -> anyhow::Result<()> {
    let dir = tempfile::tempdir()?;
    let root = dir.path();

    let file1_path = root.join("same1.txt");
    let file2_path = root.join("same2.txt");
    let diff_path = root.join("diff.txt");
    fs::write(&file1_path, "same content")?;
    fs::write(&file2_path, "same content")?;
    fs::write(&diff_path, "different content")?;

    let mut hasher = FileHasher::new(&[root])?;
    hasher.buffer_size = 8192;
    let duplicates = hasher.find_duplicates()?;

    assert_eq!(hasher.num_hashed.load(Ordering::Relaxed), 2);
    assert_eq!(hasher.num_hash_looked_up.load(Ordering::Relaxed), 0);

    assert_eq!(duplicates.len(), 1);
    let group = &duplicates[0];
    assert_eq!(group.paths.len(), 2);
    assert_eq!(group.size, 12); // "same content" is 12 bytes
    for expected in [&file1_path, &file2_path] {
        assert!(group.paths.contains(expected));
    }

    Ok(())
}
706
/// Hashes stored in a cache at `a/a` are reused when a later run is rooted
/// at `a`, and the child cache file is gone afterwards.
#[test]
fn find_duplicates_merge_cache() -> anyhow::Result<()> {
    let dir = tempfile::tempdir()?;
    let root_a = dir.path().join("a");
    let sub_dir = root_a.join("a");
    fs::create_dir_all(&sub_dir)?;

    fs::write(sub_dir.join("1"), "same content")?;
    fs::write(sub_dir.join("2"), "same content")?;

    // Create empty cache file in a/a to force it to be the cache base
    let cache_aa_path = sub_dir.join(FileHashCache::FILE_NAME);
    fs::File::create(&cache_aa_path)?;

    // First run, rooted at a/a: both files have to be hashed.
    let hasher_aa = FileHasher::new(&[&sub_dir])?;
    let duplicates_aa = hasher_aa.find_duplicates()?;
    assert_eq!(duplicates_aa.len(), 1);
    assert!(cache_aa_path.exists());
    assert_eq!(hasher_aa.num_hashed.load(Ordering::Relaxed), 2);
    assert_eq!(hasher_aa.num_hash_looked_up.load(Ordering::Relaxed), 0);

    // Create empty cache file in a to force it to be the cache base
    let cache_a_path = root_a.join(FileHashCache::FILE_NAME);
    fs::File::create(&cache_a_path)?;

    // Second run, rooted at a: both hashes come from the cache.
    let hasher_a = FileHasher::new(&[&root_a])?;
    let duplicates_a = hasher_a.find_duplicates()?;
    assert_eq!(duplicates_a.len(), 1);
    assert_eq!(hasher_a.num_hashed.load(Ordering::Relaxed), 0);
    assert_eq!(hasher_a.num_hash_looked_up.load(Ordering::Relaxed), 2);

    // The merged child cache should be removed.
    assert!(cache_a_path.exists());
    assert!(!cache_aa_path.exists());

    Ok(())
}
751
/// A file matched by the exclude filter is left out of the duplicate group
/// even though its content is identical.
#[test]
fn find_duplicates_with_exclude() -> anyhow::Result<()> {
    let dir = tempfile::tempdir()?;
    let root = dir.path();

    let file1_path = root.join("same1.txt");
    let file2_path = root.join("same2.txt");
    let exclude_path = root.join("exclude.txt");
    fs::write(&file1_path, "same content")?;
    fs::write(&file2_path, "same content")?;
    fs::write(&exclude_path, "same content")?;

    let exclude_glob = globset::GlobBuilder::new("exclude.txt")
        .case_insensitive(true)
        .build()?;
    let mut builder = globset::GlobSetBuilder::new();
    builder.add(exclude_glob);

    let mut hasher = FileHasher::new(&[root])?;
    hasher.buffer_size = 8192;
    hasher.exclude = Some(builder.build()?);

    let duplicates = hasher.find_duplicates()?;
    assert_eq!(duplicates.len(), 1);
    let group = &duplicates[0];
    assert_eq!(group.paths.len(), 2);
    assert!(group.paths.contains(&file1_path));
    assert!(group.paths.contains(&file2_path));
    assert!(!group.paths.contains(&exclude_path));
    Ok(())
}
785
786    #[derive(Default)]
787    struct CheckCollector {
788        start_seen: bool,
789        total_files: Option<u64>,
790        results: Vec<(PathBuf, CheckStatus)>,
791        file_done_count: u64,
792        num_error: usize,
793    }
794
795    impl CheckCollector {
796        fn collect(rx: mpsc::Receiver<CheckEvent>, base_dir: &Path) -> Self {
797            let mut collector = Self::default();
798            collector._collect(rx, base_dir);
799            collector
800        }
801
802        fn _collect(&mut self, rx: mpsc::Receiver<CheckEvent>, base_dir: &Path) {
803            while let Ok(event) = rx.recv() {
804                match event {
805                    CheckEvent::StartChecking => self.start_seen = true,
806                    CheckEvent::Total(total) => self.total_files = Some(total.num_files),
807                    CheckEvent::Result(file, status, _size) => {
808                        let stripped = file.path().strip_prefix(base_dir).unwrap().to_path_buf();
809                        self.results.push((stripped, status));
810                    }
811                    CheckEvent::Progress(progress_val) => {
812                        self.file_done_count += progress_val.num_files;
813                    }
814                    CheckEvent::Error(_) => {
815                        self.num_error += 1;
816                    }
817                }
818            }
819        }
820    }
821
/// Without an existing cache and without `update`, every file is reported as
/// new, nothing counts as unchanged, and no cache file is written.
#[test]
fn check_mode_empty_cache() -> anyhow::Result<()> {
    let dir = tempfile::tempdir()?;
    let dir_path = dir.path().to_path_buf();
    let file1_path = dir.path().join("file1.txt");
    fs::write(&file1_path, "content 1")?;
    let file2_path = dir.path().join("file2.txt");
    fs::write(&file2_path, "content 2")?;

    let mut hasher = FileHasher::new(&[&dir_path])?;
    hasher.exclude = Some(default_exclude());
    // `check_streaming` runs to completion first; the events wait in the
    // channel until the collector drains them.
    let (tx, rx) = mpsc::channel();
    hasher.check_streaming(tx, false)?;
    let collector = CheckCollector::collect(rx, &dir_path);
    assert!(collector.start_seen);
    assert_eq!(collector.total_files, Some(2));
    assert_eq!(collector.file_done_count, 0);
    assert_eq!(collector.num_error, 0);

    // Results arrive from worker threads in no particular order.
    let mut results = collector.results;
    results.sort_by(|a, b| a.0.cmp(&b.0));
    assert_eq!(results.len(), 2);
    assert_eq!(results[0], (PathBuf::from("file1.txt"), CheckStatus::New));
    assert_eq!(results[1], (PathBuf::from("file2.txt"), CheckStatus::New));

    assert!(!dir.path().join(FileHashCache::FILE_NAME).exists());
    Ok(())
}
851
852    #[test]
853    fn check_mode_with_cache() -> anyhow::Result<()> {
854        let dir = tempfile::tempdir()?;
855        let dir_path = dir.path().to_path_buf();
856        let file1_path = dir.path().join("file1.txt");
857        let file2_path = dir.path().join("file2.txt");
858        fs::write(&file1_path, "content 1")?;
859        fs::write(&file2_path, "content 2")?;
860        let file1 = FileItem::try_from(file1_path.as_path())?;
861        let file2 = FileItem::try_from(file2_path.as_path())?;
862
863        let mut hasher = FileHasher::new_with_cache(&[&dir_path])?;
864        hasher.exclude = Some(default_exclude());
865        let _hash1 = hasher.get_hash(&file1)?;
866        let _hash2 = hasher.get_hash(&file2)?;
867        hasher.save_cache()?;
868        assert!(dir.path().join(FileHashCache::FILE_NAME).exists());
869
870        let mut hasher = FileHasher::new(&[&dir_path])?;
871        hasher.exclude = Some(default_exclude());
872        let (tx, rx) = mpsc::channel();
873        hasher.check_streaming(tx, false)?;
874        let collector = CheckCollector::collect(rx, &dir_path);
875        assert_eq!(collector.results.len(), 0);
876        assert_eq!(collector.file_done_count, 2);
877
878        fs::write(&file1_path, "content 1 modified")?;
879
880        let file2_meta_before = fs::metadata(&file2_path)?;
881        let mtime_before = file2_meta_before.modified()?;
882        std::thread::sleep(time::Duration::from_millis(10));
883        fs::write(&file2_path, "content 2")?;
884        let file2_meta_after = fs::metadata(&file2_path)?;
885        let mtime_after = file2_meta_after.modified()?;
886        assert!(mtime_after > mtime_before);
887
888        let mut hasher = FileHasher::new(&[&dir_path])?;
889        hasher.exclude = Some(default_exclude());
890        let (tx, rx) = mpsc::channel();
891        hasher.check_streaming(tx, false)?;
892        let collector = CheckCollector::collect(rx, &dir_path);
893        assert_eq!(collector.results.len(), 1);
894        let results = collector.results;
895        assert_eq!(
896            results[0],
897            (PathBuf::from("file1.txt"), CheckStatus::Modified)
898        );
899        assert_eq!(collector.file_done_count, 1);
900        Ok(())
901    }
902
903    #[test]
904    fn check_update_mode() -> anyhow::Result<()> {
905        let dir = tempfile::tempdir()?;
906        let dir_path = dir.path().to_path_buf();
907        let file1_path = dir.path().join("file1.txt");
908        fs::write(&file1_path, "content 1")?;
909
910        let mut hasher = FileHasher::new(&[&dir_path])?;
911        hasher.exclude = Some(default_exclude());
912        let (tx, rx) = mpsc::channel();
913        hasher.check_streaming(tx, true)?;
914        let _ = CheckCollector::collect(rx, &dir_path);
915        hasher.save_cache()?;
916        assert!(dir.path().join(FileHashCache::FILE_NAME).exists());
917
918        let cache = FileHashCache::new(&dir_path);
919        let file1 = FileItem::try_from(file1_path.as_path())?;
920        let hash1 = cache.get(&PathBuf::from("file1.txt"), &file1);
921        assert!(hash1.is_some());
922
923        std::thread::sleep(time::Duration::from_millis(10));
924        fs::write(&file1_path, "content 1 modified")?;
925        let file1_mod = FileItem::try_from(file1_path.as_path())?;
926
927        let mut hasher = FileHasher::new(&[&dir_path])?;
928        hasher.exclude = Some(default_exclude());
929        let (tx, rx) = mpsc::channel();
930        hasher.check_streaming(tx, true)?;
931        let _ = CheckCollector::collect(rx, &dir_path);
932        hasher.save_cache()?;
933
934        let cache = FileHashCache::new(&dir_path);
935        let hash_mod = cache.get(&PathBuf::from("file1.txt"), &file1_mod);
936        assert!(hash_mod.is_some());
937        assert_ne!(hash1, hash_mod);
938
939        std::thread::sleep(time::Duration::from_millis(10));
940        fs::write(&file1_path, "content 1 modified")?;
941        let file1_mod2 = FileItem::try_from(file1_path.as_path())?;
942        assert!(file1_mod2.modified() > file1_mod.modified());
943
944        assert!(
945            cache
946                .get(&PathBuf::from("file1.txt"), &file1_mod2)
947                .is_none()
948        );
949
950        let mut hasher = FileHasher::new(&[&dir_path])?;
951        hasher.exclude = Some(default_exclude());
952        let (tx, rx) = mpsc::channel();
953        hasher.check_streaming(tx, true)?;
954        let _ = CheckCollector::collect(rx, &dir_path);
955        hasher.save_cache()?;
956
957        let cache = FileHashCache::new(&dir_path);
958        assert!(
959            cache
960                .get(&PathBuf::from("file1.txt"), &file1_mod2)
961                .is_some()
962        );
963        Ok(())
964    }
965
966    #[test]
967    fn check_cleanup_deleted_files() -> anyhow::Result<()> {
968        let dir = tempfile::tempdir()?;
969        let dir_path = dir.path().to_path_buf();
970        let file1_path = dir.path().join("file1.txt");
971        let file2_path = dir.path().join("file2.txt");
972        fs::write(&file1_path, "content 1")?;
973        fs::write(&file2_path, "content 2")?;
974        let file1 = FileItem::try_from(file1_path.as_path())?;
975        let file2 = FileItem::try_from(file2_path.as_path())?;
976
977        let mut hasher = FileHasher::new(&[&dir_path])?;
978        hasher.exclude = Some(default_exclude());
979        let (tx, rx) = mpsc::channel();
980        hasher.check_streaming(tx, true)?;
981        let _ = CheckCollector::collect(rx, &dir_path);
982        hasher.save_cache()?;
983
984        // Verify both are in the cache
985        let cache = FileHashCache::new(&dir_path);
986        assert!(cache.get(&PathBuf::from("file1.txt"), &file1).is_some());
987        assert!(cache.get(&PathBuf::from("file2.txt"), &file2).is_some());
988
989        // Now delete file2 from disk
990        fs::remove_file(&file2_path)?;
991
992        // Run check and save again
993        let mut hasher = FileHasher::new(&[&dir_path])?;
994        hasher.exclude = Some(default_exclude());
995        let (tx, rx) = mpsc::channel();
996        hasher.check_streaming(tx, true)?;
997        let _ = CheckCollector::collect(rx, &dir_path);
998        hasher.save_cache()?;
999
1000        // Verify file2 is removed from cache, but file1 is still there
1001        let cache = FileHashCache::new(&dir_path);
1002        assert!(cache.get(&PathBuf::from("file2.txt"), &file2).is_none());
1003        assert!(cache.get(&PathBuf::from("file1.txt"), &file1).is_some());
1004        Ok(())
1005    }
1006
1007    #[test]
1008    fn find_duplicates_multiple_dirs() -> anyhow::Result<()> {
1009        let tmp = tempfile::tempdir()?;
1010        let dir1 = tmp.path().join("dir1");
1011        let dir2 = tmp.path().join("dir2");
1012        fs::create_dir(&dir1)?;
1013        fs::create_dir(&dir2)?;
1014        let file1_path = dir1.join("file1.txt");
1015        fs::write(&file1_path, "same content")?;
1016        let file2_path = dir2.join("file2.txt");
1017        fs::write(&file2_path, "same content")?;
1018        let hasher = FileHasher::new(&[&dir1, &dir2])?;
1019        let duplicates = hasher.find_duplicates()?;
1020        assert_eq!(duplicates.len(), 1);
1021        let group = &duplicates[0];
1022        assert_eq!(group.paths.len(), 2);
1023        assert_eq!(group.size, 12);
1024        assert!(group.paths.contains(&file1_path));
1025        assert!(group.paths.contains(&file2_path));
1026
1027        Ok(())
1028    }
1029
1030    #[test]
1031    fn check_fails_with_multiple_dirs() -> anyhow::Result<()> {
1032        let tmp = tempfile::tempdir()?;
1033        let dir1 = tmp.path().join("dir1");
1034        let dir2 = tmp.path().join("dir2");
1035        fs::create_dir(&dir1)?;
1036        fs::create_dir(&dir2)?;
1037        let hasher = FileHasher::new(&[&dir1, &dir2])?;
1038        assert!(hasher.check(false).is_err());
1039        Ok(())
1040    }
1041
1042    #[test]
1043    fn escape_shell() {
1044        let escape_shell = |p: &str| DuplicatedFiles::escape_shell(Path::new(p));
1045        assert_eq!(escape_shell(""), "");
1046        assert_eq!(escape_shell("abc"), "abc");
1047        assert_eq!(escape_shell("a'b"), "a'\\''b");
1048        assert_eq!(escape_shell("a'b'"), "a'\\''b'\\''");
1049
1050        let escape_shell_double = |p: &str| DuplicatedFiles::escape_shell_double(Path::new(p));
1051        assert_eq!(escape_shell_double(""), "");
1052        assert_eq!(escape_shell_double("abc"), "abc");
1053        assert_eq!(escape_shell_double("a'b"), "a''b");
1054        assert_eq!(escape_shell_double("a'b'"), "a''b''");
1055    }
1056
1057    #[test]
1058    fn write_dups_shell_empty() -> anyhow::Result<()> {
1059        let dup_empty = DuplicatedFiles {
1060            paths: vec![],
1061            size: 100,
1062        };
1063        let mut buf = Vec::new();
1064        dup_empty.write_shell(&mut buf)?;
1065        assert_eq!(String::from_utf8(buf)?, "");
1066        Ok(())
1067    }
1068
1069    #[test]
1070    fn write_dups_shell_one() -> anyhow::Result<()> {
1071        let dup_one = DuplicatedFiles {
1072            paths: vec![PathBuf::from("a.txt")],
1073            size: 100,
1074        };
1075        let mut buf = Vec::new();
1076        dup_one.write_shell(&mut buf)?;
1077        assert_eq!(String::from_utf8(buf)?, "");
1078        Ok(())
1079    }
1080
1081    #[test]
1082    fn write_dups_shell_two() -> anyhow::Result<()> {
1083        let dup_multiple = DuplicatedFiles {
1084            paths: vec![PathBuf::from("a.txt"), PathBuf::from("b.txt")],
1085            size: 100,
1086        };
1087        let mut buf = Vec::new();
1088        dup_multiple.write_shell(&mut buf)?;
1089        assert_eq!(String::from_utf8(buf)?, "cp 'a.txt' 'b.txt'\n");
1090        Ok(())
1091    }
1092
1093    #[test]
1094    fn write_dups_shell_three() -> anyhow::Result<()> {
1095        let dup_multiple = DuplicatedFiles {
1096            paths: vec![
1097                PathBuf::from("a.txt"),
1098                PathBuf::from("b.txt"),
1099                PathBuf::from("c.txt"),
1100            ],
1101            size: 100,
1102        };
1103        let mut buf = Vec::new();
1104        dup_multiple.write_shell(&mut buf)?;
1105        assert_eq!(
1106            String::from_utf8(buf)?,
1107            "cp 'a.txt' 'b.txt'\ncp 'a.txt' 'c.txt'\n"
1108        );
1109        Ok(())
1110    }
1111
1112    #[test]
1113    fn write_dups_shell_quotes() -> anyhow::Result<()> {
1114        let dup_quotes = DuplicatedFiles {
1115            paths: vec![PathBuf::from("a'b.txt"), PathBuf::from("c'd.txt")],
1116            size: 100,
1117        };
1118        let mut buf = Vec::new();
1119        dup_quotes.write_shell(&mut buf)?;
1120        assert_eq!(String::from_utf8(buf)?, "cp 'a'\\''b.txt' 'c'\\''d.txt'\n");
1121
1122        let mut buf = Vec::new();
1123        dup_quotes.write_pwsh(&mut buf)?;
1124        assert_eq!(
1125            String::from_utf8(buf)?,
1126            "Copy-Item -LiteralPath 'a''b.txt' 'c''d.txt'\n"
1127        );
1128        Ok(())
1129    }
1130}