1use crate::{
2 Classification, ColumnFormatter, DirectoryComparer, FileComparer, FileComparisonResult,
3 FileHashCache, FileItem, FileIterator, OutputFormat, Progress, ProgressBuilder, ProgressValue,
4};
5use globset::GlobSet;
6use indicatif::FormattedDuration;
7use rayon::prelude::*;
8use simple_path::SimplePath;
9use std::{
10 collections::HashMap,
11 fs,
12 io::{self, Read, stdout},
13 path::{Path, PathBuf},
14 sync::{
15 Arc,
16 atomic::{self, AtomicUsize},
17 mpsc,
18 },
19 time,
20};
21
22type FileWithDirIndex = (FileItem, usize);
23
24#[derive(Debug, Clone)]
25enum DupEvent {
26 StartHashing,
27 Total(ProgressValue),
28 Result(FileItem, blake3::Hash),
29 Error,
30}
31
32#[derive(Debug)]
33enum CheckEvent {
34 StartChecking,
35 Total(ProgressValue),
36 Result(FileComparisonResult, ProgressValue),
37 Progress(ProgressValue),
38 Error(FileItem),
39}
40
41enum DupState {
42 Single(FileItem, usize),
43 Hashing,
44}
45
46pub struct FileHasher {
48 dirs: Vec<PathBuf>,
49 pub buffer_size: usize,
50 cache: Option<Arc<FileHashCache>>,
51 num_hashed: AtomicUsize,
52 num_hash_looked_up: AtomicUsize,
53 pub exclude: Option<GlobSet>,
54 pub progress: Option<Arc<ProgressBuilder>>,
55 pub output_format: OutputFormat,
56 pub jobs: usize,
57}
58
59impl FileHasher {
60 const DEFAULT_JOBS: usize = DirectoryComparer::DEFAULT_JOBS;
61
62 pub fn new<P: AsRef<Path>>(dirs: &[P]) -> anyhow::Result<Self> {
64 if dirs.is_empty() {
65 anyhow::bail!("At least one directory must be specified.");
66 }
67 Ok(Self {
68 dirs: dirs.iter().map(|p| p.as_ref().to_path_buf()).collect(),
69 buffer_size: FileComparer::DEFAULT_BUFFER_SIZE,
70 cache: None,
71 num_hashed: AtomicUsize::new(0),
72 num_hash_looked_up: AtomicUsize::new(0),
73 exclude: None,
74 progress: None,
75 output_format: OutputFormat::Default,
76 jobs: Self::DEFAULT_JOBS,
77 })
78 }
79
80 pub(crate) fn new_with_cache<P: AsRef<Path>>(dirs: &[P]) -> anyhow::Result<Self> {
81 let mut hasher = Self::new(dirs)?;
82 hasher.cache = Some(hasher.new_cache()?);
83 Ok(hasher)
84 }
85
86 fn new_cache(&self) -> anyhow::Result<Arc<FileHashCache>> {
87 let common_ancestor = crate::common_ancestor(&self.dirs)
88 .ok_or_else(|| anyhow::anyhow!("No common ancestor found"))?;
89 Ok(FileHashCache::find_or_new(&common_ancestor))
90 }
91
92 pub(crate) fn cache(&mut self) -> anyhow::Result<Arc<FileHashCache>> {
94 if self.cache.is_none() {
95 self.cache = Some(self.new_cache()?);
96 }
97 Ok(Arc::clone(self.cache.as_ref().unwrap()))
98 }
99
100 pub(crate) fn remove_cache_entry(&mut self, path: &Path) -> anyhow::Result<()> {
102 let cache = self.cache()?;
103 let relative = SimplePath::strip_prefix(path, cache.base_dir())?;
104 cache.remove(relative);
105 Ok(())
106 }
107
108 pub fn save_cache(&self) -> anyhow::Result<()> {
110 log::info!(
111 "Hash stats for {:?}: {} computed, {} looked up",
112 self.dirs,
113 self.num_hashed.load(atomic::Ordering::Relaxed),
114 self.num_hash_looked_up.load(atomic::Ordering::Relaxed)
115 );
116 if let Some(cache) = &self.cache {
117 cache.save()?;
118 }
119 Ok(())
120 }
121
122 pub(crate) fn clear_cache(&mut self) -> anyhow::Result<()> {
124 let cache = self.cache()?;
125 for dir in &self.dirs {
126 let relative = SimplePath::strip_prefix(dir, cache.base_dir())?;
127 cache.clear(relative);
128 }
129 Ok(())
130 }
131
132 pub fn check(&self, update: bool) -> anyhow::Result<()> {
134 match self.output_format {
135 OutputFormat::Default | OutputFormat::Symbol => {}
136 _ => anyhow::bail!("Check mode only supports default or symbol output format."),
137 }
138 if self.dirs.len() > 1 {
139 anyhow::bail!("Check mode only supports one directory.");
140 }
141 let start_time = time::Instant::now();
142 let mut progress = self
143 .progress
144 .as_ref()
145 .map(|progress| progress.add_spinner())
146 .unwrap_or_else(Progress::none);
147 progress.use_bytes();
148 progress.set_message("Scanning directory...");
149 let mut num_new = 0;
150 let mut num_modified = 0;
151 let mut num_error = 0;
152 std::thread::scope(|scope| {
153 let (tx, rx) = mpsc::channel();
154 scope.spawn(|| {
155 if let Err(e) = self.check_streaming(tx, update) {
156 log::error!("Error during check: {}", e);
157 }
158 });
159 while let Ok(event) = rx.recv() {
160 match event {
161 CheckEvent::StartChecking => {
162 progress.set_message("Checking files...");
163 }
164 CheckEvent::Total(value) => {
165 progress.set_length(value);
166 progress.set_message("");
167 }
168 CheckEvent::Result(result, value) => {
169 progress.inc(value);
170 progress.suspend_for(stdout(), || {
171 result.print(self.output_format, "cached", "current")
172 });
173 if result.classification == Classification::OnlyInDir2 {
174 num_new += 1;
175 } else if result.is_identical_content() == Some(false) {
176 num_modified += 1;
177 }
178 }
179 CheckEvent::Progress(value) => {
180 progress.inc(value);
181 }
182 CheckEvent::Error(file) => {
183 progress.inc(ProgressValue::with_skip(file.size()));
184 num_error += 1;
185 }
186 }
187 }
188 });
189 progress.finish();
190 self.print_check_summary(&start_time, num_new, num_modified, num_error)?;
191 Ok(())
192 }
193
194 fn print_check_summary(
195 &self,
196 start_time: &time::Instant,
197 num_new: usize,
198 num_modified: usize,
199 num_error: usize,
200 ) -> io::Result<()> {
201 let summary = [
202 ("Elapsed:", 0),
203 (
204 "Hash computed:",
205 self.num_hashed.load(atomic::Ordering::Relaxed),
206 ),
207 ("New files:", num_new),
208 ("Modified files:", num_modified),
209 ("Errors:", num_error),
210 ];
211 let formatter = ColumnFormatter::new(summary.iter().map(|(s, _)| *s));
212 let mut writer = std::io::stderr();
213 formatter.write_value(
214 &mut writer,
215 summary[0].0,
216 FormattedDuration(start_time.elapsed()),
217 )?;
218 formatter.write_values(&mut writer, &summary[1..])
219 }
220
221 fn check_streaming(&self, tx: mpsc::Sender<CheckEvent>, update: bool) -> anyhow::Result<()> {
222 assert_eq!(self.dirs.len(), 1);
223 let cache = self.new_cache()?;
224 let base_dir = &self.dirs[0];
225 let relative = SimplePath::strip_prefix(base_dir, cache.base_dir())?;
226 cache.set_remove_if_no_access(relative);
227 let cache_clone = Arc::clone(&cache);
228 std::thread::scope(|global_scope| {
229 let mut it = FileIterator::new(base_dir);
230 it.cache = Some(Arc::clone(&cache));
231 it.exclude = self.exclude.as_ref();
232 let it_rx = it.spawn_in_scope(global_scope);
233 tx.send(CheckEvent::StartChecking)?;
234 let pool = crate::build_thread_pool(self.jobs)?;
235 pool.scope(move |scope| -> anyhow::Result<()> {
236 let mut total = ProgressValue::default();
237 for file in it_rx {
238 self.check_file(file, &cache, update, &mut total, &tx, scope);
239 }
240 tx.send(CheckEvent::Total(total))?;
241 Ok(())
242 })
243 })?;
244 cache_clone.save()?;
245 Ok(())
246 }
247
248 fn check_file<'scope>(
249 &'scope self,
250 file: FileItem,
251 cache: &Arc<FileHashCache>,
252 update: bool,
253 total: &mut ProgressValue,
254 tx: &mpsc::Sender<CheckEvent>,
255 scope: &rayon::Scope<'scope>,
256 ) {
257 *total += ProgressValue::with_size(file.size());
258 let tx = tx.clone();
259 let cache = Arc::clone(cache);
260 scope.spawn(move |_| {
261 if let Err(error) = self._check_file(&file, cache, update, &tx) {
262 log::error!("Failed to check file '{}': {}", file, error);
263 if tx.send(CheckEvent::Error(file)).is_err() {
264 log::error!("Send failed");
265 }
266 }
267 });
268 }
269
270 fn _check_file(
271 &self,
272 file: &FileItem,
273 cache: Arc<FileHashCache>,
274 update: bool,
275 tx: &mpsc::Sender<CheckEvent>,
276 ) -> anyhow::Result<()> {
277 assert!(file.path().is_absolute());
278 let path_in_cache = file.relative_path(cache.base_dir());
279 match cache.get_entry(path_in_cache) {
280 Some(cached) => {
281 let mut result =
282 FileComparisonResult::new(file.path().into(), Classification::InBoth);
283 result.update_moodified(cached.modified, file.modified());
284 if cached.size != 0 {
285 result.update_size(cached.size, file.size());
286 }
287 if !update && cached.size != 0 && file.size() != cached.size {
288 tx.send(CheckEvent::Result(
289 result,
290 ProgressValue::with_skip(file.size()),
291 ))?;
292 return Ok(());
293 }
294 let hash = self.compute_hash(file)?;
295 result.is_content_same = Some(hash == cached.hash);
296 if hash == cached.hash {
297 if cached.should_update(file, update) {
298 cache.insert(path_in_cache, file, hash);
299 }
300 tx.send(CheckEvent::Progress(ProgressValue::with_size(file.size())))?;
301 } else {
302 if update {
303 cache.insert(path_in_cache, file, hash);
304 }
305 tx.send(CheckEvent::Result(
306 result,
307 ProgressValue::with_size(file.size()),
308 ))?;
309 }
310 }
311 None => {
312 if update {
313 let hash = self.compute_hash(file)?;
314 cache.insert(path_in_cache, file, hash);
315 }
316 tx.send(CheckEvent::Result(
317 FileComparisonResult::new(file.path().into(), Classification::OnlyInDir2),
318 ProgressValue::with_size(file.size()),
319 ))?;
320 }
321 }
322 Ok(())
323 }
324
325 pub fn run(&self) -> anyhow::Result<()> {
327 let start_time = time::Instant::now();
328 let mut duplicates = self.find_duplicates()?;
329 let mut total_wasted_space = 0;
330 if !duplicates.is_empty() {
331 duplicates.sort_by_key(|a| a.size);
332 total_wasted_space = self.print_duplicates_results(&duplicates)?;
333 }
334 self.print_duplicates_summary(&start_time, total_wasted_space)?;
335 Ok(())
336 }
337
338 fn print_duplicates_results(&self, duplicates: &Vec<DuplicatedFiles>) -> anyhow::Result<u64> {
339 let mut total_wasted_space = 0;
340 for dupes in duplicates {
341 dupes.print(self.output_format)?;
342 total_wasted_space += dupes.wasted_size();
343 }
344 Ok(total_wasted_space)
345 }
346
347 fn print_duplicates_summary(
348 &self,
349 start_time: &time::Instant,
350 total_wasted_space: u64,
351 ) -> io::Result<()> {
352 let elapsed = FormattedDuration(start_time.elapsed()).to_string();
353 let num_hashed = self.num_hashed.load(atomic::Ordering::Relaxed).to_string();
354 let total_wasted_space = crate::human_readable_size(total_wasted_space);
355 let summary = [
356 ("Elapsed:", elapsed),
357 ("Hash computed:", num_hashed),
358 ("Total wasted space:", total_wasted_space),
359 ];
360 let formatter = ColumnFormatter::new(summary.iter().map(|(s, _)| *s));
361 formatter.write_values(&mut io::stderr(), &summary)
362 }
363
364 pub fn find_duplicates(&self) -> anyhow::Result<Vec<DuplicatedFiles>> {
366 let mut progress = self
367 .progress
368 .as_ref()
369 .map(|progress| progress.add_spinner())
370 .unwrap_or_else(Progress::none);
371 progress.set_message("Scanning directories...");
372
373 let (tx, rx) = mpsc::channel();
374 let mut by_hash: HashMap<blake3::Hash, DuplicatedFiles> = HashMap::new();
375 std::thread::scope(|scope| {
376 scope.spawn(|| {
377 if let Err(e) = self.find_duplicates_streaming(tx) {
378 log::error!("Error during duplicate finding: {}", e);
379 }
380 });
381
382 while let Ok(event) = rx.recv() {
383 match event {
384 DupEvent::StartHashing => progress.set_message("Hashing files..."),
385 DupEvent::Total(value) => progress.set_length(value),
386 DupEvent::Result(file, hash) => {
387 progress.inc(ProgressValue::with_size(file.size()));
388 let entry = by_hash.entry(hash).or_insert_with(|| DuplicatedFiles {
389 paths: Vec::new(),
390 size: file.size(),
391 });
392 assert_eq!(
394 entry.size,
395 file.size(),
396 "Hash collision: sizes do not match"
397 );
398 entry.paths.push(file.into_path_buf());
399 }
400 DupEvent::Error => {}
401 }
402 }
403 });
404 progress.finish();
405
406 let mut duplicates = Vec::new();
407 for (_, mut dupes) in by_hash {
408 if dupes.paths.len() > 1 {
409 dupes.paths.sort();
410 duplicates.push(dupes);
411 }
412 }
413 Ok(duplicates)
414 }
415
416 fn find_duplicates_streaming(&self, tx: mpsc::Sender<DupEvent>) -> anyhow::Result<()> {
417 std::thread::scope(|global_scope| {
418 let (it_rx, caches) = self.stream_file_items(global_scope)?;
419 let caches = &caches;
420 let pool = crate::build_thread_pool(self.jobs)?;
421 pool.scope(move |scope| -> anyhow::Result<()> {
422 let mut by_size: HashMap<u64, DupState> = HashMap::new();
423 let mut total = ProgressValue::default();
424 tx.send(DupEvent::StartHashing)?;
425 for (file, dir_index) in it_rx {
426 let size = file.size();
427 if size == 0 {
428 continue;
429 }
430 let cache = &caches[dir_index];
431 match by_size.entry(size) {
432 std::collections::hash_map::Entry::Occupied(mut occ) => match occ.get_mut()
433 {
434 DupState::Single(file0, dir_index0) => {
435 let cache0 = &caches[*dir_index0];
438 self.send_hash(file0, cache0, &tx, scope);
439 self.send_hash(&file, cache, &tx, scope);
440 total += ProgressValue::with_size(file0.size());
441 total += ProgressValue::with_size(file.size());
442
443 *occ.get_mut() = DupState::Hashing;
445 }
446 DupState::Hashing => {
447 self.send_hash(&file, cache, &tx, scope);
449 total += ProgressValue::with_size(file.size());
450 }
451 },
452 std::collections::hash_map::Entry::Vacant(vac) => {
453 vac.insert(DupState::Single(file, dir_index));
454 }
455 }
456 }
457 tx.send(DupEvent::Total(total))?;
458 Ok(())
459 })?;
460 pool.install(|| caches.into_par_iter().try_for_each(|cache| cache.save()))?;
461 Ok::<(), anyhow::Error>(())
462 })?;
463 Ok(())
464 }
465
466 fn stream_file_items<'scope, 'env>(
467 &'env self,
468 scope: &'scope std::thread::Scope<'scope, 'env>,
469 ) -> anyhow::Result<(mpsc::Receiver<FileWithDirIndex>, Vec<Arc<FileHashCache>>)> {
470 let (it_tx, it_rx) = mpsc::channel();
471 let mut caches = Vec::with_capacity(self.dirs.len());
472 for (dir_index, dir) in self.dirs.iter().enumerate() {
473 let mut it = FileIterator::new(dir);
474 let cache = FileHashCache::find_or_new(dir);
475 it.cache = Some(Arc::clone(&cache));
476 it.exclude = self.exclude.as_ref();
477 let it_tx = it_tx.clone();
478 scope.spawn(move || it.send_to_as(it_tx, |path| (path, dir_index)));
479 caches.push(cache);
480 }
481 Ok((it_rx, caches))
482 }
483
484 fn send_hash<'scope>(
485 &'scope self,
486 file: &FileItem,
487 cache: &Arc<FileHashCache>,
488 tx: &mpsc::Sender<DupEvent>,
489 scope: &rayon::Scope<'scope>,
490 ) {
491 let (hash, relative) = self
492 .get_hash_from_cache(file, cache)
493 .expect("path should be in cache base_dir");
494 if let Some(hash) = hash {
495 let _ = tx.send(DupEvent::Result(file.clone(), hash));
496 return;
497 }
498
499 let file = file.clone();
500 let relative = relative.to_path_buf();
501 let tx = tx.clone();
502 let cache = Arc::clone(cache);
503 scope.spawn(move |_| {
504 if let Ok(hash) = self.compute_hash(&file) {
505 cache.insert(&relative, &file, hash);
506 let _ = tx.send(DupEvent::Result(file, hash));
507 } else {
508 log::error!("Failed to hash file: '{}'", file);
509 let _ = tx.send(DupEvent::Error);
510 }
511 });
512 }
513
514 pub fn get_hash(&self, file: &FileItem) -> anyhow::Result<blake3::Hash> {
516 let cache = self.cache.as_ref().expect("cache should be initialized");
517 let (hash, relative) = self.get_hash_from_cache(file, cache)?;
518 if let Some(hash) = hash {
519 return Ok(hash);
520 }
521
522 let hash = self.compute_hash(file)?;
523 cache.insert(relative, file, hash);
524 Ok(hash)
525 }
526
527 fn get_hash_from_cache<'a>(
528 &self,
529 file: &'a FileItem,
530 cache: &FileHashCache,
531 ) -> io::Result<(Option<blake3::Hash>, &'a Path)> {
532 let relative = file.relative_path(cache.base_dir());
533 if let Some(hash) = cache.get(relative, file) {
534 self.num_hash_looked_up
535 .fetch_add(1, atomic::Ordering::Relaxed);
536 return Ok((Some(hash), relative));
537 }
538 Ok((None, relative))
539 }
540
541 fn compute_hash(&self, file: &FileItem) -> io::Result<blake3::Hash> {
542 let start_time = time::Instant::now();
543 let mut f = fs::File::open(file.path())?;
544 let mut progress = self
545 .progress
546 .as_ref()
547 .map(|progress| progress.add_file(file.path(), file.size()))
548 .unwrap_or_else(Progress::none);
549 let mut hasher = blake3::Hasher::new();
550 if self.buffer_size == 0 {
551 if file.size() > 0 {
552 let mmap = unsafe { memmap2::MmapOptions::new().map(&f)? };
553 hasher.update(&mmap[..]);
554 progress.inc(ProgressValue::with_size(file.size()));
555 }
556 } else {
557 let mut buf = vec![0u8; self.buffer_size];
558 loop {
559 let n = f.read(&mut buf)?;
560 if n == 0 {
561 break;
562 }
563 hasher.update(&buf[..n]);
564 progress.inc(ProgressValue::with_size(n as u64));
565 }
566 }
567 progress.finish();
568 self.num_hashed.fetch_add(1, atomic::Ordering::Relaxed);
569 let hash = hasher.finalize();
570 log::trace!(
571 "Computed hash in {}: '{}'",
572 FormattedDuration(start_time.elapsed()),
573 file
574 );
575 Ok(hash)
576 }
577}
578
/// A group of files that share the same content hash.
#[derive(Clone, Debug)]
pub struct DuplicatedFiles {
    /// Paths of all files in the group.
    pub paths: Vec<PathBuf>,
    /// Size in bytes of each file in the group.
    pub size: u64,
}
585
586impl DuplicatedFiles {
587 fn wasted_size(&self) -> u64 {
588 self.size * (self.paths.len() as u64 - 1)
589 }
590
591 fn print(&self, output_format: OutputFormat) -> anyhow::Result<()> {
592 match output_format {
593 OutputFormat::Default => self.write_human(stdout())?,
594 OutputFormat::PowerShell => self.write_pwsh(stdout())?,
595 OutputFormat::Shell => self.write_shell(stdout())?,
596 OutputFormat::Yaml | OutputFormat::Symbol => self.write_yaml(stdout())?,
597 }
598 Ok(())
599 }
600
601 fn write_human(&self, mut writer: impl io::Write) -> anyhow::Result<()> {
602 writeln!(
603 writer,
604 "Identical {} files of {}:",
605 self.paths.len(),
606 crate::human_readable_size(self.size)
607 )?;
608 for path in &self.paths {
609 writeln!(writer, " {}", path.display())?;
610 }
611 Ok(())
612 }
613
614 fn write_yaml(&self, mut writer: impl io::Write) -> anyhow::Result<()> {
615 writeln!(writer, "- paths:")?;
616 for path in &self.paths {
617 writeln!(writer, " - {:?}", path)?;
618 }
619 writeln!(writer, " size: {}", self.size)?;
620 Ok(())
621 }
622
623 fn write_shell(&self, writer: impl io::Write) -> anyhow::Result<()> {
624 self.write_shell_with(writer, "cp", Self::escape_shell)
625 }
626
627 fn write_pwsh(&self, writer: impl io::Write) -> anyhow::Result<()> {
628 self.write_shell_with(writer, "Copy-Item -LiteralPath", Self::escape_shell_double)
629 }
630
631 fn write_shell_with(
632 &self,
633 mut writer: impl io::Write,
634 cmd: &str,
635 stringify: impl Fn(&Path) -> String,
636 ) -> anyhow::Result<()> {
637 let mut iter = self.paths.iter();
638 if let Some(path0) = iter.next() {
639 let path0 = stringify(path0);
640 for path in iter {
641 writeln!(writer, "{cmd} '{path0}' '{}'", stringify(path))?;
642 }
643 }
644 Ok(())
645 }
646
/// Escapes a path for use inside a single-quoted POSIX shell string: every `'` becomes
/// `'\''` (close the quote, emit an escaped quote, reopen the quote).
fn escape_shell(path: &Path) -> String {
    let text = path.to_string_lossy();
    let pieces: Vec<&str> = text.split('\'').collect();
    pieces.join(r"'\''")
}
650
/// Escapes a path for use inside a single-quoted string by doubling every `'`.
fn escape_shell_double(path: &Path) -> String {
    let text = path.to_string_lossy();
    text.replace('\'', "''")
}
654}
655
656#[cfg(test)]
657mod tests {
658 use super::*;
659 use std::cmp::Ordering;
660
661 fn default_exclude() -> globset::GlobSet {
662 let mut builder = globset::GlobSetBuilder::new();
663 builder.add(
664 globset::GlobBuilder::new(".hash_cache")
665 .case_insensitive(true)
666 .build()
667 .unwrap(),
668 );
669 builder.build().unwrap()
670 }
671
/// Two identical files and one different file yield exactly one group of two paths.
#[test]
fn find_duplicates() -> anyhow::Result<()> {
    let dir = tempfile::tempdir()?;
    let root = dir.path();

    let same1 = root.join("same1.txt");
    let same2 = root.join("same2.txt");
    let different = root.join("diff.txt");
    fs::write(&same1, "same content")?;
    fs::write(&same2, "same content")?;
    fs::write(&different, "different content")?;

    let mut hasher = FileHasher::new(&[root])?;
    hasher.buffer_size = 8192;
    let duplicates = hasher.find_duplicates()?;

    // Only the two same-sized files get hashed, and nothing can come from a cache yet.
    let computed = hasher.num_hashed.load(atomic::Ordering::Relaxed);
    let looked_up = hasher.num_hash_looked_up.load(atomic::Ordering::Relaxed);
    assert_eq!(computed, 2);
    assert_eq!(looked_up, 0);

    assert_eq!(duplicates.len(), 1);
    let group = &duplicates[0];
    assert_eq!(group.paths.len(), 2);
    assert_eq!(group.size, 12);
    for expected in [&same1, &same2] {
        assert!(group.paths.contains(expected));
    }

    Ok(())
}
702
/// A cache file in a sub-directory is absorbed once a cache file exists in its parent:
/// the second run looks both hashes up and the sub-directory cache file is gone afterwards.
#[test]
fn find_duplicates_merge_cache() -> anyhow::Result<()> {
    let counters = |hasher: &FileHasher| {
        (
            hasher.num_hashed.load(atomic::Ordering::Relaxed),
            hasher.num_hash_looked_up.load(atomic::Ordering::Relaxed),
        )
    };

    let dir = tempfile::tempdir()?;
    let dir_path = dir.path();

    let sub_dir = dir_path.join("a").join("a");
    fs::create_dir_all(&sub_dir)?;
    fs::write(sub_dir.join("1"), "same content")?;
    fs::write(sub_dir.join("2"), "same content")?;

    let cache_aa_path = sub_dir.join(FileHashCache::FILE_NAME);
    fs::File::create(&cache_aa_path)?;

    let hasher_aa = FileHasher::new(&[&sub_dir])?;
    let duplicates_aa = hasher_aa.find_duplicates()?;
    assert_eq!(duplicates_aa.len(), 1);
    assert!(cache_aa_path.exists());
    assert_eq!(counters(&hasher_aa), (2, 0));

    let root_a = dir_path.join("a");
    let cache_a_path = root_a.join(FileHashCache::FILE_NAME);
    fs::File::create(&cache_a_path)?;

    let hasher_a = FileHasher::new(&[&root_a])?;
    let duplicates_a = hasher_a.find_duplicates()?;
    assert_eq!(duplicates_a.len(), 1);
    assert_eq!(counters(&hasher_a), (0, 2));

    assert!(cache_a_path.exists());
    assert!(!cache_aa_path.exists());

    Ok(())
}
753
/// A file matching the exclude glob is left out of the duplicate group.
#[test]
fn find_duplicates_with_exclude() -> anyhow::Result<()> {
    let dir = tempfile::tempdir()?;
    let root = dir.path();

    let kept1 = root.join("same1.txt");
    let kept2 = root.join("same2.txt");
    let excluded = root.join("exclude.txt");
    fs::write(&kept1, "same content")?;
    fs::write(&kept2, "same content")?;
    fs::write(&excluded, "same content")?;

    let mut hasher = FileHasher::new(&[root])?;
    hasher.buffer_size = 8192;
    let exclude_glob = globset::GlobBuilder::new("exclude.txt")
        .case_insensitive(true)
        .build()?;
    let mut builder = globset::GlobSetBuilder::new();
    builder.add(exclude_glob);
    hasher.exclude = Some(builder.build()?);

    let duplicates = hasher.find_duplicates()?;
    assert_eq!(duplicates.len(), 1);
    let paths = &duplicates[0].paths;
    assert_eq!(paths.len(), 2);
    assert!(paths.contains(&kept1));
    assert!(paths.contains(&kept2));
    assert!(!paths.contains(&excluded));
    Ok(())
}
787
788 #[derive(Default)]
789 struct CheckCollector {
790 start_seen: bool,
791 total_files: Option<u64>,
792 results: Vec<FileComparisonResult>,
793 file_done_count: u64,
794 num_error: usize,
795 }
796
797 impl CheckCollector {
798 fn collect(rx: mpsc::Receiver<CheckEvent>, base_dir: &Path) -> Self {
799 let mut collector = Self::default();
800 collector._collect(rx, base_dir);
801 collector
802 }
803
804 fn _collect(&mut self, rx: mpsc::Receiver<CheckEvent>, base_dir: &Path) {
805 while let Ok(event) = rx.recv() {
806 match event {
807 CheckEvent::StartChecking => self.start_seen = true,
808 CheckEvent::Total(total) => self.total_files = Some(total.num_files),
809 CheckEvent::Result(mut result, _size) => {
810 result.relative_path = result
811 .relative_path
812 .strip_prefix(base_dir)
813 .unwrap()
814 .to_path_buf();
815 self.results.push(result);
816 }
817 CheckEvent::Progress(progress_val) => {
818 self.file_done_count += progress_val.num_files;
819 }
820 CheckEvent::Error(_) => {
821 self.num_error += 1;
822 }
823 }
824 }
825 }
826 }
827
/// Without an existing cache every file is reported as new and no cache file is written.
#[test]
fn check_mode_empty_cache() -> anyhow::Result<()> {
    let dir = tempfile::tempdir()?;
    let dir_path = dir.path().to_path_buf();
    println!("{:?}", dir_path);
    fs::write(dir_path.join("file1.txt"), "content 1")?;
    fs::write(dir_path.join("file2.txt"), "content 2")?;

    let mut hasher = FileHasher::new(&[&dir_path])?;
    hasher.exclude = Some(default_exclude());
    let (tx, rx) = mpsc::channel();
    hasher.check_streaming(tx, false)?;

    let CheckCollector {
        start_seen,
        total_files,
        mut results,
        file_done_count,
        num_error,
    } = CheckCollector::collect(rx, &dir_path);
    assert!(start_seen);
    assert_eq!(total_files, Some(2));
    assert_eq!(file_done_count, 0);
    assert_eq!(num_error, 0);

    // Results arrive in arbitrary order; sort them before comparing.
    results.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
    assert_eq!(results.len(), 2);
    for (result, expected_name) in results.iter().zip(["file1.txt", "file2.txt"]) {
        assert_eq!(result.relative_path, Path::new(expected_name));
        assert_eq!(result.classification, Classification::OnlyInDir2);
    }

    assert!(!dir_path.join(FileHashCache::FILE_NAME).exists());
    Ok(())
}
859
/// With a populated cache, unchanged files report nothing; a file whose size changed is
/// reported without being hashed, and a rewritten-but-identical file only counts as done.
#[test]
fn check_mode_with_cache() -> anyhow::Result<()> {
    let dir = tempfile::tempdir()?;
    let dir_path = dir.path().to_path_buf();
    let file1_path = dir_path.join("file1.txt");
    let file2_path = dir_path.join("file2.txt");
    fs::write(&file1_path, "content 1")?;
    fs::write(&file2_path, "content 2")?;
    let file1 = FileItem::try_from(file1_path.as_path())?;
    let file2 = FileItem::try_from(file2_path.as_path())?;

    // Populate the on-disk cache with both hashes.
    let mut seeding_hasher = FileHasher::new_with_cache(&[&dir_path])?;
    seeding_hasher.exclude = Some(default_exclude());
    let _hash1 = seeding_hasher.get_hash(&file1)?;
    let _hash2 = seeding_hasher.get_hash(&file2)?;
    seeding_hasher.save_cache()?;
    assert!(dir_path.join(FileHashCache::FILE_NAME).exists());

    // Runs a non-updating check with a fresh hasher and collects its events.
    let run_check = || -> anyhow::Result<CheckCollector> {
        let mut hasher = FileHasher::new(&[&dir_path])?;
        hasher.exclude = Some(default_exclude());
        let (tx, rx) = mpsc::channel();
        hasher.check_streaming(tx, false)?;
        Ok(CheckCollector::collect(rx, &dir_path))
    };

    let unchanged = run_check()?;
    assert_eq!(unchanged.results.len(), 0);
    assert_eq!(unchanged.file_done_count, 2);

    fs::write(&file1_path, "content 1 modified")?;

    // Rewrite file2 with identical content so that only its mtime moves forward.
    let mtime_before = fs::metadata(&file2_path)?.modified()?;
    std::thread::sleep(time::Duration::from_millis(10));
    fs::write(&file2_path, "content 2")?;
    let mtime_after = fs::metadata(&file2_path)?.modified()?;
    assert!(mtime_after > mtime_before);

    let changed = run_check()?;
    assert_eq!(changed.results.len(), 1);
    let reported = &changed.results[0];
    assert_eq!(reported.relative_path, Path::new("file1.txt"));
    assert_eq!(reported.modified_time_comparison, Some(Ordering::Less));
    assert_eq!(reported.size_comparison, Some(Ordering::Less));
    assert_eq!(reported.is_content_same, None);
    assert_eq!(changed.file_done_count, 1);
    Ok(())
}
910
/// In update mode the cache follows the file: new content replaces the stored hash, and a
/// newer mtime with unchanged content refreshes the entry.
#[test]
fn check_update_mode() -> anyhow::Result<()> {
    let dir = tempfile::tempdir()?;
    let dir_path = dir.path().to_path_buf();
    let file1_path = dir_path.join("file1.txt");
    let name_in_cache = Path::new("file1.txt");
    fs::write(&file1_path, "content 1")?;

    // Runs an updating check with a fresh hasher, discarding the streamed events.
    let run_update = || -> anyhow::Result<()> {
        let mut hasher = FileHasher::new(&[&dir_path])?;
        hasher.exclude = Some(default_exclude());
        let (tx, rx) = mpsc::channel();
        hasher.check_streaming(tx, true)?;
        let _ = CheckCollector::collect(rx, &dir_path);
        hasher.save_cache()?;
        Ok(())
    };

    run_update()?;
    assert!(dir_path.join(FileHashCache::FILE_NAME).exists());

    let initial_cache = FileHashCache::new(&dir_path);
    let file1 = FileItem::try_from(file1_path.as_path())?;
    let hash1 = initial_cache.get(name_in_cache, &file1);
    assert!(hash1.is_some());

    std::thread::sleep(time::Duration::from_millis(10));
    fs::write(&file1_path, "content 1 modified")?;
    let file1_mod = FileItem::try_from(file1_path.as_path())?;

    run_update()?;

    let modified_cache = FileHashCache::new(&dir_path);
    let hash_mod = modified_cache.get(name_in_cache, &file1_mod);
    assert!(hash_mod.is_some());
    assert_ne!(hash1, hash_mod);

    // Same content again, but with a newer modification time.
    std::thread::sleep(time::Duration::from_millis(10));
    fs::write(&file1_path, "content 1 modified")?;
    let file1_mod2 = FileItem::try_from(file1_path.as_path())?;
    assert!(file1_mod2.modified() > file1_mod.modified());

    // The previously loaded cache does not answer for the newer file state.
    assert!(modified_cache.get(name_in_cache, &file1_mod2).is_none());

    run_update()?;

    let refreshed_cache = FileHashCache::new(&dir_path);
    assert!(refreshed_cache.get(name_in_cache, &file1_mod2).is_some());
    Ok(())
}
973
/// An updating check drops the cache entry of a file that has been deleted.
#[test]
fn check_cleanup_deleted_files() -> anyhow::Result<()> {
    let dir = tempfile::tempdir()?;
    let dir_path = dir.path().to_path_buf();
    let file1_path = dir_path.join("file1.txt");
    let file2_path = dir_path.join("file2.txt");
    fs::write(&file1_path, "content 1")?;
    fs::write(&file2_path, "content 2")?;
    let file1 = FileItem::try_from(file1_path.as_path())?;
    let file2 = FileItem::try_from(file2_path.as_path())?;
    let (name1, name2) = (Path::new("file1.txt"), Path::new("file2.txt"));

    // Runs an updating check with a fresh hasher, discarding the streamed events.
    let run_update = || -> anyhow::Result<()> {
        let mut hasher = FileHasher::new(&[&dir_path])?;
        hasher.exclude = Some(default_exclude());
        let (tx, rx) = mpsc::channel();
        hasher.check_streaming(tx, true)?;
        let _ = CheckCollector::collect(rx, &dir_path);
        hasher.save_cache()?;
        Ok(())
    };

    run_update()?;

    let cache_before = FileHashCache::new(&dir_path);
    assert!(cache_before.get(name1, &file1).is_some());
    assert!(cache_before.get(name2, &file2).is_some());

    fs::remove_file(&file2_path)?;

    run_update()?;

    let cache_after = FileHashCache::new(&dir_path);
    assert!(cache_after.get(name2, &file2).is_none());
    assert!(cache_after.get(name1, &file1).is_some());
    Ok(())
}
1014
/// Identical files living in two different directories are grouped together.
#[test]
fn find_duplicates_multiple_dirs() -> anyhow::Result<()> {
    let tmp = tempfile::tempdir()?;
    let dir1 = tmp.path().join("dir1");
    let dir2 = tmp.path().join("dir2");
    for dir in [&dir1, &dir2] {
        fs::create_dir(dir)?;
    }
    let file1_path = dir1.join("file1.txt");
    fs::write(&file1_path, "same content")?;
    let file2_path = dir2.join("file2.txt");
    fs::write(&file2_path, "same content")?;

    let duplicates = FileHasher::new(&[&dir1, &dir2])?.find_duplicates()?;
    assert_eq!(duplicates.len(), 1);
    let DuplicatedFiles { paths, size } = &duplicates[0];
    assert_eq!(paths.len(), 2);
    assert_eq!(*size, 12);
    assert!(paths.contains(&file1_path));
    assert!(paths.contains(&file2_path));

    Ok(())
}
1037
/// Check mode rejects a hasher configured with more than one directory.
#[test]
fn check_fails_with_multiple_dirs() -> anyhow::Result<()> {
    let tmp = tempfile::tempdir()?;
    let dir1 = tmp.path().join("dir1");
    let dir2 = tmp.path().join("dir2");
    for dir in [&dir1, &dir2] {
        fs::create_dir(dir)?;
    }
    let hasher = FileHasher::new(&[&dir1, &dir2])?;
    let outcome = hasher.check(false);
    assert!(outcome.is_err());
    Ok(())
}
1049
/// Both escaping helpers leave plain text alone and rewrite every single quote.
#[test]
fn escape_shell() {
    let posix_cases = [
        ("", ""),
        ("abc", "abc"),
        ("a'b", "a'\\''b"),
        ("a'b'", "a'\\''b'\\''"),
    ];
    for (input, expected) in posix_cases {
        assert_eq!(DuplicatedFiles::escape_shell(Path::new(input)), expected);
    }

    let doubled_cases = [
        ("", ""),
        ("abc", "abc"),
        ("a'b", "a''b"),
        ("a'b'", "a''b''"),
    ];
    for (input, expected) in doubled_cases {
        assert_eq!(
            DuplicatedFiles::escape_shell_double(Path::new(input)),
            expected
        );
    }
}
1064
/// A group without paths produces no shell output.
#[test]
fn write_dups_shell_empty() -> anyhow::Result<()> {
    let group = DuplicatedFiles {
        paths: Vec::new(),
        size: 100,
    };
    let mut output = Vec::new();
    group.write_shell(&mut output)?;
    let script = String::from_utf8(output)?;
    assert_eq!(script, "");
    Ok(())
}
1076
/// A group with a single path has nothing to copy, so no shell output is produced.
#[test]
fn write_dups_shell_one() -> anyhow::Result<()> {
    let group = DuplicatedFiles {
        paths: vec![PathBuf::from("a.txt")],
        size: 100,
    };
    let mut output = Vec::new();
    group.write_shell(&mut output)?;
    let script = String::from_utf8(output)?;
    assert_eq!(script, "");
    Ok(())
}
1088
/// Two paths produce one `cp` line from the first to the second.
#[test]
fn write_dups_shell_two() -> anyhow::Result<()> {
    let paths = ["a.txt", "b.txt"].into_iter().map(PathBuf::from).collect();
    let group = DuplicatedFiles { paths, size: 100 };
    let mut output = Vec::new();
    group.write_shell(&mut output)?;
    let script = String::from_utf8(output)?;
    assert_eq!(script, "cp 'a.txt' 'b.txt'\n");
    Ok(())
}
1100
/// Three paths produce one `cp` line per extra path, always copying from the first.
#[test]
fn write_dups_shell_three() -> anyhow::Result<()> {
    let paths = ["a.txt", "b.txt", "c.txt"]
        .into_iter()
        .map(PathBuf::from)
        .collect();
    let group = DuplicatedFiles { paths, size: 100 };
    let mut output = Vec::new();
    group.write_shell(&mut output)?;
    let script = String::from_utf8(output)?;
    assert_eq!(script, "cp 'a.txt' 'b.txt'\ncp 'a.txt' 'c.txt'\n");
    Ok(())
}
1119
/// Single quotes in paths are escaped per target shell: `'\''` for POSIX, `''` for PowerShell.
#[test]
fn write_dups_shell_quotes() -> anyhow::Result<()> {
    let group = DuplicatedFiles {
        paths: vec![PathBuf::from("a'b.txt"), PathBuf::from("c'd.txt")],
        size: 100,
    };

    let mut posix_output = Vec::new();
    group.write_shell(&mut posix_output)?;
    assert_eq!(
        String::from_utf8(posix_output)?,
        "cp 'a'\\''b.txt' 'c'\\''d.txt'\n"
    );

    let mut pwsh_output = Vec::new();
    group.write_pwsh(&mut pwsh_output)?;
    assert_eq!(
        String::from_utf8(pwsh_output)?,
        "Copy-Item -LiteralPath 'a''b.txt' 'c''d.txt'\n"
    );
    Ok(())
}
1138}