1use crate::{
2 Classification, ColumnFormatter, DirectoryComparer, FileComparer, FileComparisonResult,
3 FileHashCache, FileItem, FileIterator, OutputFormat, Progress, ProgressBuilder, ProgressValue,
4};
5use globset::GlobSet;
6use indicatif::FormattedDuration;
7use rayon::prelude::*;
8use simple_path::SimplePath;
9use std::{
10 collections::HashMap,
11 fs,
12 io::{self, Read, stdout},
13 path::{Path, PathBuf},
14 sync::{
15 Arc,
16 atomic::{self, AtomicUsize},
17 mpsc,
18 },
19 time,
20};
21
/// A scanned file paired with the index, into `FileHasher::dirs`, of the directory it was found in.
type FileWithDirIndex = (FileItem, usize);
23
/// Events streamed from the duplicate-finding worker to the thread that collects the results.
#[derive(Debug, Clone)]
enum DupEvent {
    /// Sent once, before any file is dispatched for hashing.
    StartHashing,
    /// Accumulated progress total of every file that was dispatched for hashing.
    Total(ProgressValue),
    /// A file together with its content hash (taken from the cache or freshly computed).
    Result(FileItem, blake3::Hash),
    /// Hashing a file failed; the sender logs the failure itself.
    Error,
}
31
/// Events streamed from the check worker to the thread that reports the results.
#[derive(Debug)]
enum CheckEvent {
    /// Sent once, after the directory walk has been started.
    StartChecking,
    /// Accumulated progress total of every file that was queued for checking.
    Total(ProgressValue),
    /// A reportable outcome (new file, or a file that differs from its cache entry)
    /// together with the progress to record for it.
    Result(FileComparisonResult, ProgressValue),
    /// Progress for a file whose hash matched its cache entry; nothing is reported.
    Progress(ProgressValue),
    /// Checking the given file failed.
    Error(FileItem),
}
40
/// Per-size bookkeeping used while grouping files by size before hashing them.
enum DupState {
    /// Exactly one file of this size has been seen so far; it is kept (with its directory
    /// index) so it can be hashed later if a second file of the same size shows up.
    Single(FileItem, usize),
    /// At least two files of this size have been seen; every further file is hashed right away.
    Hashing,
}
45
/// Hashes files below one or more directories, either to find duplicated content
/// or to check files against a persisted hash cache.
pub struct FileHasher {
    /// Directories to scan.
    dirs: Vec<PathBuf>,
    /// Read buffer size in bytes; `0` makes `compute_hash` memory-map the file instead.
    pub buffer_size: usize,
    /// Hash cache shared by `get_hash`; created lazily by `cache()` or by `new_with_cache`.
    cache: Option<Arc<FileHashCache>>,
    /// Number of hashes that were actually computed.
    num_hashed: AtomicUsize,
    /// Number of hashes that were served from a cache.
    num_hash_looked_up: AtomicUsize,
    /// Optional glob set handed to the file iterators as their exclude filter.
    pub exclude: Option<GlobSet>,
    /// Optional progress reporting; when `None` a no-op progress is used.
    pub progress: Option<Arc<ProgressBuilder>>,
    /// How results are printed.
    pub output_format: OutputFormat,
    /// Value passed to `crate::build_thread_pool` when creating the hashing pool.
    pub jobs: usize,
}
58
59impl FileHasher {
/// Initial value of `jobs`; mirrors `DirectoryComparer::DEFAULT_JOBS`.
const DEFAULT_JOBS: usize = DirectoryComparer::DEFAULT_JOBS;
61
62 pub fn new<P: AsRef<Path>>(dirs: &[P]) -> anyhow::Result<Self> {
64 if dirs.is_empty() {
65 anyhow::bail!("At least one directory must be specified.");
66 }
67 Ok(Self {
68 dirs: dirs.iter().map(|p| p.as_ref().to_path_buf()).collect(),
69 buffer_size: FileComparer::DEFAULT_BUFFER_SIZE,
70 cache: None,
71 num_hashed: AtomicUsize::new(0),
72 num_hash_looked_up: AtomicUsize::new(0),
73 exclude: None,
74 progress: None,
75 output_format: OutputFormat::Default,
76 jobs: Self::DEFAULT_JOBS,
77 })
78 }
79
80 pub(crate) fn new_with_cache<P: AsRef<Path>>(dirs: &[P]) -> anyhow::Result<Self> {
81 let mut hasher = Self::new(dirs)?;
82 hasher.cache = Some(hasher.new_cache()?);
83 Ok(hasher)
84 }
85
86 fn new_cache(&self) -> anyhow::Result<Arc<FileHashCache>> {
87 let common_ancestor = crate::common_ancestor(&self.dirs)
88 .ok_or_else(|| anyhow::anyhow!("No common ancestor found"))?;
89 Ok(FileHashCache::find_or_new(&common_ancestor))
90 }
91
92 pub(crate) fn cache(&mut self) -> anyhow::Result<Arc<FileHashCache>> {
94 if self.cache.is_none() {
95 self.cache = Some(self.new_cache()?);
96 }
97 Ok(Arc::clone(self.cache.as_ref().unwrap()))
98 }
99
100 pub(crate) fn remove_cache_entry(&mut self, path: &Path) -> anyhow::Result<()> {
102 let cache = self.cache()?;
103 let relative = SimplePath::strip_prefix(path, cache.base_dir())?;
104 cache.remove(relative);
105 Ok(())
106 }
107
108 pub fn save_cache(&self) -> anyhow::Result<()> {
110 log::info!(
111 "Hash stats for {:?}: {} computed, {} looked up",
112 self.dirs,
113 self.num_hashed.load(atomic::Ordering::Relaxed),
114 self.num_hash_looked_up.load(atomic::Ordering::Relaxed)
115 );
116 if let Some(cache) = &self.cache {
117 cache.save()?;
118 }
119 Ok(())
120 }
121
122 pub(crate) fn clear_cache(&mut self) -> anyhow::Result<()> {
124 let cache = self.cache()?;
125 for dir in &self.dirs {
126 let relative = SimplePath::strip_prefix(dir, cache.base_dir())?;
127 cache.clear(relative);
128 }
129 Ok(())
130 }
131
/// Checks the files of the single configured directory against the hash cache,
/// printing one line per new or differing file to stdout and a summary to stderr.
///
/// `update` is forwarded to `check_streaming`.
///
/// # Errors
/// Fails when the output format is neither `Default` nor `Symbol`, when more than
/// one directory is configured, or when the summary cannot be written. Errors
/// raised by the background check itself are only logged.
pub fn check(&self, update: bool) -> anyhow::Result<()> {
    match self.output_format {
        OutputFormat::Default | OutputFormat::Symbol => {}
        _ => anyhow::bail!("Check mode only supports default or symbol output format."),
    }
    if self.dirs.len() > 1 {
        anyhow::bail!("Check mode only supports one directory.");
    }
    let start_time = time::Instant::now();
    let mut progress = self
        .progress
        .as_ref()
        .map(|progress| progress.add_spinner())
        .unwrap_or_else(Progress::none);
    progress.use_bytes();
    progress.set_message("Scanning directory...");
    let mut num_new = 0;
    let mut num_modified = 0;
    let mut num_error = 0;
    std::thread::scope(|scope| {
        let (tx, rx) = mpsc::channel();
        // The worker owns the only sender; the receive loop below ends once it is dropped.
        scope.spawn(|| {
            if let Err(e) = self.check_streaming(tx, update) {
                log::error!("Error during check: {}", e);
            }
        });
        while let Ok(event) = rx.recv() {
            match event {
                CheckEvent::StartChecking => {
                    progress.set_message("Checking files...");
                }
                CheckEvent::Total(value) => {
                    progress.set_length(value);
                    progress.set_message("");
                }
                CheckEvent::Result(result, value) => {
                    progress.inc(value);
                    match self.output_format {
                        OutputFormat::Symbol => progress.suspend_for(stdout(), || {
                            println!(
                                "{} {}",
                                result.to_symbol_string(),
                                result.relative_path.display()
                            );
                        }),
                        OutputFormat::Default => progress.suspend_for(stdout(), || {
                            println!(
                                "{}: {}",
                                result.relative_path.display(),
                                result.to_string("cached", "current")
                            );
                        }),
                        // Other formats were rejected at the top of this function.
                        _ => unreachable!(),
                    }
                    if result.classification == Classification::OnlyInDir2 {
                        num_new += 1;
                    } else if result.is_identical_content() == Some(false) {
                        num_modified += 1;
                    }
                }
                CheckEvent::Progress(value) => {
                    progress.inc(value);
                }
                CheckEvent::Error(file) => {
                    // A failed file still advances the progress, as skipped.
                    progress.inc(ProgressValue::with_skip(file.size()));
                    num_error += 1;
                }
            }
        }
    });
    progress.finish();
    self.print_check_summary(&start_time, num_new, num_modified, num_error)?;
    Ok(())
}
207
208 fn print_check_summary(
209 &self,
210 start_time: &time::Instant,
211 num_new: usize,
212 num_modified: usize,
213 num_error: usize,
214 ) -> io::Result<()> {
215 let summary = [
216 ("Elapsed:", 0),
217 (
218 "Hash computed:",
219 self.num_hashed.load(atomic::Ordering::Relaxed),
220 ),
221 ("New files:", num_new),
222 ("Modified files:", num_modified),
223 ("Errors:", num_error),
224 ];
225 let formatter = ColumnFormatter::new(summary.iter().map(|(s, _)| *s));
226 let mut writer = std::io::stderr();
227 formatter.write_value(
228 &mut writer,
229 summary[0].0,
230 FormattedDuration(start_time.elapsed()),
231 )?;
232 formatter.write_values(&mut writer, &summary[1..])
233 }
234
/// Walks the single configured directory, checks every file against the hash cache on a
/// thread pool, streams the outcomes through `tx`, and finally saves the cache.
///
/// `tx` is moved into the pool scope, so it is dropped before this function returns.
///
/// # Panics
/// Panics unless exactly one directory is configured.
///
/// # Errors
/// Fails when the cache or thread pool cannot be set up, when the receiver is gone,
/// or when saving the cache fails.
fn check_streaming(&self, tx: mpsc::Sender<CheckEvent>, update: bool) -> anyhow::Result<()> {
    assert_eq!(self.dirs.len(), 1);
    let cache = self.new_cache()?;
    let base_dir = &self.dirs[0];
    let relative = SimplePath::strip_prefix(base_dir, cache.base_dir())?;
    // NOTE(review): presumably marks entries below `relative` for removal unless they are
    // accessed during this run; confirm against `FileHashCache`.
    cache.set_remove_if_no_access(relative);
    // `cache` itself is moved into the pool scope below; keep a handle for the final save.
    let cache_clone = Arc::clone(&cache);
    std::thread::scope(|global_scope| {
        let mut it = FileIterator::new(base_dir);
        it.cache = Some(Arc::clone(&cache));
        it.exclude = self.exclude.as_ref();
        let it_rx = it.spawn_in_scope(global_scope);
        tx.send(CheckEvent::StartChecking)?;
        let pool = crate::build_thread_pool(self.jobs)?;
        pool.scope(move |scope| -> anyhow::Result<()> {
            let mut total = ProgressValue::default();
            for file in it_rx {
                self.check_file(file, &cache, update, &mut total, &tx, scope);
            }
            // The total is only known once the directory walk has finished.
            tx.send(CheckEvent::Total(total))?;
            Ok(())
        })
    })?;
    cache_clone.save()?;
    Ok(())
}
261
262 fn check_file<'scope>(
263 &'scope self,
264 file: FileItem,
265 cache: &Arc<FileHashCache>,
266 update: bool,
267 total: &mut ProgressValue,
268 tx: &mpsc::Sender<CheckEvent>,
269 scope: &rayon::Scope<'scope>,
270 ) {
271 *total += ProgressValue::with_size(file.size());
272 let tx = tx.clone();
273 let cache = Arc::clone(cache);
274 scope.spawn(move |_| {
275 if let Err(error) = self._check_file(&file, cache, update, &tx) {
276 log::error!("Failed to check file '{}': {}", file, error);
277 if tx.send(CheckEvent::Error(file)).is_err() {
278 log::error!("Send failed");
279 }
280 }
281 });
282 }
283
/// Checks a single file against its cache entry and sends the outcome through `tx`.
///
/// * No cache entry: the file is reported as `OnlyInDir2`; with `update` its hash is
///   computed and stored first.
/// * Cache entry present: modification time and (non-zero) size are compared, then the
///   content hash. A matching hash only produces a `Progress` event; a mismatch produces
///   a `Result` and, with `update`, refreshes the entry.
///
/// # Panics
/// Panics when the file's path is not absolute.
///
/// # Errors
/// Fails when hashing the file fails or the receiver is gone.
fn _check_file(
    &self,
    file: &FileItem,
    cache: Arc<FileHashCache>,
    update: bool,
    tx: &mpsc::Sender<CheckEvent>,
) -> anyhow::Result<()> {
    assert!(file.path().is_absolute());
    let path_in_cache = file.relative_path(cache.base_dir());
    match cache.get_entry(path_in_cache) {
        Some(cached) => {
            let mut result =
                FileComparisonResult::new(file.path().into(), Classification::InBoth);
            result.update_moodified(cached.modified, file.modified());
            // NOTE(review): a cached size of 0 is skipped here and below; presumably it
            // means "size not recorded" — confirm.
            if cached.size != 0 {
                result.update_size(cached.size, file.size());
            }
            // Without `update`, a differing size is reported right away; the file is
            // not hashed and its progress is recorded as skipped.
            if !update && cached.size != 0 && file.size() != cached.size {
                tx.send(CheckEvent::Result(
                    result,
                    ProgressValue::with_skip(file.size()),
                ))?;
                return Ok(());
            }
            let hash = self.compute_hash(file)?;
            result.is_content_same = Some(hash == cached.hash);
            if hash == cached.hash {
                // Same content: nothing to report, but the entry may still need a refresh.
                if cached.should_update(file, update) {
                    cache.insert(path_in_cache, file, hash);
                }
                tx.send(CheckEvent::Progress(ProgressValue::with_size(file.size())))?;
            } else {
                if update {
                    cache.insert(path_in_cache, file, hash);
                }
                tx.send(CheckEvent::Result(
                    result,
                    ProgressValue::with_size(file.size()),
                ))?;
            }
        }
        None => {
            // New file: it is only hashed when the cache is being updated.
            if update {
                let hash = self.compute_hash(file)?;
                cache.insert(path_in_cache, file, hash);
            }
            tx.send(CheckEvent::Result(
                FileComparisonResult::new(file.path().into(), Classification::OnlyInDir2),
                ProgressValue::with_size(file.size()),
            ))?;
        }
    }
    Ok(())
}
338
339 pub fn run(&self) -> anyhow::Result<()> {
341 let start_time = time::Instant::now();
342 let mut duplicates = self.find_duplicates()?;
343 let mut total_wasted_space = 0;
344 if !duplicates.is_empty() {
345 duplicates.sort_by_key(|a| a.size);
346 total_wasted_space = self.print_duplicates_results(&duplicates)?;
347 }
348 self.print_duplicates_summary(&start_time, total_wasted_space)?;
349 Ok(())
350 }
351
352 fn print_duplicates_results(&self, duplicates: &Vec<DuplicatedFiles>) -> anyhow::Result<u64> {
353 let mut total_wasted_space = 0;
354 for dupes in duplicates {
355 dupes.print(self.output_format)?;
356 total_wasted_space += dupes.wasted_size();
357 }
358 Ok(total_wasted_space)
359 }
360
361 fn print_duplicates_summary(
362 &self,
363 start_time: &time::Instant,
364 total_wasted_space: u64,
365 ) -> io::Result<()> {
366 let elapsed = FormattedDuration(start_time.elapsed()).to_string();
367 let num_hashed = self.num_hashed.load(atomic::Ordering::Relaxed).to_string();
368 let total_wasted_space = crate::human_readable_size(total_wasted_space);
369 let summary = [
370 ("Elapsed:", elapsed),
371 ("Hash computed:", num_hashed),
372 ("Total wasted space:", total_wasted_space),
373 ];
374 let formatter = ColumnFormatter::new(summary.iter().map(|(s, _)| *s));
375 formatter.write_values(&mut io::stderr(), &summary)
376 }
377
378 pub fn find_duplicates(&self) -> anyhow::Result<Vec<DuplicatedFiles>> {
380 let mut progress = self
381 .progress
382 .as_ref()
383 .map(|progress| progress.add_spinner())
384 .unwrap_or_else(Progress::none);
385 progress.set_message("Scanning directories...");
386
387 let (tx, rx) = mpsc::channel();
388 let mut by_hash: HashMap<blake3::Hash, DuplicatedFiles> = HashMap::new();
389 std::thread::scope(|scope| {
390 scope.spawn(|| {
391 if let Err(e) = self.find_duplicates_streaming(tx) {
392 log::error!("Error during duplicate finding: {}", e);
393 }
394 });
395
396 while let Ok(event) = rx.recv() {
397 match event {
398 DupEvent::StartHashing => progress.set_message("Hashing files..."),
399 DupEvent::Total(value) => progress.set_length(value),
400 DupEvent::Result(file, hash) => {
401 progress.inc(ProgressValue::with_size(file.size()));
402 let entry = by_hash.entry(hash).or_insert_with(|| DuplicatedFiles {
403 paths: Vec::new(),
404 size: file.size(),
405 });
406 assert_eq!(
408 entry.size,
409 file.size(),
410 "Hash collision: sizes do not match"
411 );
412 entry.paths.push(file.into_path_buf());
413 }
414 DupEvent::Error => {}
415 }
416 }
417 });
418 progress.finish();
419
420 let mut duplicates = Vec::new();
421 for (_, mut dupes) in by_hash {
422 if dupes.paths.len() > 1 {
423 dupes.paths.sort();
424 duplicates.push(dupes);
425 }
426 }
427 Ok(duplicates)
428 }
429
/// Streams files from all directories, groups them by size, and dispatches hashing for
/// every file whose size is shared with at least one other file. Results are sent through
/// `tx`; the per-directory caches are saved at the end.
///
/// Empty files are ignored. The first file of any given size is held back and only
/// hashed once a second file of that size is seen.
///
/// # Errors
/// Fails when the thread pool cannot be built, the receiver is gone, or saving a cache fails.
fn find_duplicates_streaming(&self, tx: mpsc::Sender<DupEvent>) -> anyhow::Result<()> {
    std::thread::scope(|global_scope| {
        let (it_rx, caches) = self.stream_file_items(global_scope)?;
        // Borrow so the `move` closure below captures a reference, leaving `caches`
        // available for the save afterwards.
        let caches = &caches;
        let pool = crate::build_thread_pool(self.jobs)?;
        pool.scope(move |scope| -> anyhow::Result<()> {
            let mut by_size: HashMap<u64, DupState> = HashMap::new();
            let mut total = ProgressValue::default();
            tx.send(DupEvent::StartHashing)?;
            for (file, dir_index) in it_rx {
                let size = file.size();
                if size == 0 {
                    continue;
                }
                let cache = &caches[dir_index];
                match by_size.entry(size) {
                    std::collections::hash_map::Entry::Occupied(mut occ) => match occ.get_mut()
                    {
                        DupState::Single(file0, dir_index0) => {
                            // Second file of this size: hash both the held-back file
                            // and the current one.
                            let cache0 = &caches[*dir_index0];
                            self.send_hash(file0, cache0, &tx, scope);
                            self.send_hash(&file, cache, &tx, scope);
                            total += ProgressValue::with_size(file0.size());
                            total += ProgressValue::with_size(file.size());

                            *occ.get_mut() = DupState::Hashing;
                        }
                        DupState::Hashing => {
                            self.send_hash(&file, cache, &tx, scope);
                            total += ProgressValue::with_size(file.size());
                        }
                    },
                    std::collections::hash_map::Entry::Vacant(vac) => {
                        // First file of this size: remember it, do not hash yet.
                        vac.insert(DupState::Single(file, dir_index));
                    }
                }
            }
            tx.send(DupEvent::Total(total))?;
            Ok(())
        })?;
        pool.install(|| caches.into_par_iter().try_for_each(|cache| cache.save()))?;
        Ok::<(), anyhow::Error>(())
    })?;
    Ok(())
}
479
/// Spawns one scoped thread per configured directory that walks the directory and sends
/// each file, tagged with the directory's index, into a shared channel.
///
/// Returns the receiving end of that channel together with one hash cache per directory,
/// in the same order as `self.dirs`.
fn stream_file_items<'scope, 'env>(
    &'env self,
    scope: &'scope std::thread::Scope<'scope, 'env>,
) -> anyhow::Result<(mpsc::Receiver<FileWithDirIndex>, Vec<Arc<FileHashCache>>)> {
    let (it_tx, it_rx) = mpsc::channel();
    let mut caches = Vec::with_capacity(self.dirs.len());
    for (dir_index, dir) in self.dirs.iter().enumerate() {
        let mut it = FileIterator::new(dir);
        let cache = FileHashCache::find_or_new(dir);
        it.cache = Some(Arc::clone(&cache));
        it.exclude = self.exclude.as_ref();
        // Each walker thread gets its own sender; the original is dropped on return.
        let it_tx = it_tx.clone();
        scope.spawn(move || it.send_to_as(it_tx, |path| (path, dir_index)));
        caches.push(cache);
    }
    Ok((it_rx, caches))
}
497
498 fn send_hash<'scope>(
499 &'scope self,
500 file: &FileItem,
501 cache: &Arc<FileHashCache>,
502 tx: &mpsc::Sender<DupEvent>,
503 scope: &rayon::Scope<'scope>,
504 ) {
505 let (hash, relative) = self
506 .get_hash_from_cache(file, cache)
507 .expect("path should be in cache base_dir");
508 if let Some(hash) = hash {
509 let _ = tx.send(DupEvent::Result(file.clone(), hash));
510 return;
511 }
512
513 let file = file.clone();
514 let relative = relative.to_path_buf();
515 let tx = tx.clone();
516 let cache = Arc::clone(cache);
517 scope.spawn(move |_| {
518 if let Ok(hash) = self.compute_hash(&file) {
519 cache.insert(&relative, &file, hash);
520 let _ = tx.send(DupEvent::Result(file, hash));
521 } else {
522 log::error!("Failed to hash file: '{}'", file);
523 let _ = tx.send(DupEvent::Error);
524 }
525 });
526 }
527
528 pub fn get_hash(&self, file: &FileItem) -> anyhow::Result<blake3::Hash> {
530 let cache = self.cache.as_ref().expect("cache should be initialized");
531 let (hash, relative) = self.get_hash_from_cache(file, cache)?;
532 if let Some(hash) = hash {
533 return Ok(hash);
534 }
535
536 let hash = self.compute_hash(file)?;
537 cache.insert(relative, file, hash);
538 Ok(hash)
539 }
540
541 fn get_hash_from_cache<'a>(
542 &self,
543 file: &'a FileItem,
544 cache: &FileHashCache,
545 ) -> io::Result<(Option<blake3::Hash>, &'a Path)> {
546 let relative = file.relative_path(cache.base_dir());
547 if let Some(hash) = cache.get(relative, file) {
548 self.num_hash_looked_up
549 .fetch_add(1, atomic::Ordering::Relaxed);
550 return Ok((Some(hash), relative));
551 }
552 Ok((None, relative))
553 }
554
/// Computes the BLAKE3 hash of the file's content and counts it in `num_hashed`.
///
/// With `buffer_size == 0` the file is memory-mapped and hashed in one go; otherwise it
/// is read in chunks of `buffer_size` bytes. Progress is reported per chunk.
///
/// # Errors
/// Fails when the file cannot be opened, mapped, or read.
fn compute_hash(&self, file: &FileItem) -> io::Result<blake3::Hash> {
    let start_time = time::Instant::now();
    let mut f = fs::File::open(file.path())?;
    let mut progress = self
        .progress
        .as_ref()
        .map(|progress| progress.add_file(file.path(), file.size()))
        .unwrap_or_else(Progress::none);
    let mut hasher = blake3::Hasher::new();
    if self.buffer_size == 0 {
        // Files reported as empty are not mapped; the hasher is finalized without input.
        if file.size() > 0 {
            // SAFETY: NOTE(review) — no invariant is stated in the code. A file mapping is
            // only sound while the underlying file is not truncated or modified by
            // someone else; confirm this is an accepted risk here.
            let mmap = unsafe { memmap2::MmapOptions::new().map(&f)? };
            hasher.update(&mmap[..]);
            progress.inc(ProgressValue::with_size(file.size()));
        }
    } else {
        let mut buf = vec![0u8; self.buffer_size];
        loop {
            let n = f.read(&mut buf)?;
            // A zero-length read signals end of file.
            if n == 0 {
                break;
            }
            hasher.update(&buf[..n]);
            progress.inc(ProgressValue::with_size(n as u64));
        }
    }
    progress.finish();
    self.num_hashed.fetch_add(1, atomic::Ordering::Relaxed);
    let hash = hasher.finalize();
    log::debug!(
        "Computed hash in {}: '{}'",
        FormattedDuration(start_time.elapsed()),
        file
    );
    Ok(hash)
}
591}
592
/// A group of files that share the same content hash.
#[derive(Clone, Debug)]
pub struct DuplicatedFiles {
    /// Paths of the files in the group.
    pub paths: Vec<PathBuf>,
    /// Size in bytes of each file in the group.
    pub size: u64,
}
599
600impl DuplicatedFiles {
601 fn wasted_size(&self) -> u64 {
602 self.size * (self.paths.len() as u64 - 1)
603 }
604
605 fn print(&self, output_format: OutputFormat) -> anyhow::Result<()> {
606 match output_format {
607 OutputFormat::Default => self.write_human(stdout())?,
608 OutputFormat::PowerShell => self.write_pwsh(stdout())?,
609 OutputFormat::Shell => self.write_shell(stdout())?,
610 OutputFormat::Yaml | OutputFormat::Symbol => self.write_yaml(stdout())?,
611 }
612 Ok(())
613 }
614
615 fn write_human(&self, mut writer: impl io::Write) -> anyhow::Result<()> {
616 writeln!(
617 writer,
618 "Identical {} files of {}:",
619 self.paths.len(),
620 crate::human_readable_size(self.size)
621 )?;
622 for path in &self.paths {
623 writeln!(writer, " {}", path.display())?;
624 }
625 Ok(())
626 }
627
628 fn write_yaml(&self, mut writer: impl io::Write) -> anyhow::Result<()> {
629 writeln!(writer, "- paths:")?;
630 for path in &self.paths {
631 writeln!(writer, " - {:?}", path)?;
632 }
633 writeln!(writer, " size: {}", self.size)?;
634 Ok(())
635 }
636
/// Writes one `cp` command per extra path, with the first path as the source and the
/// paths escaped by `escape_shell`.
fn write_shell(&self, writer: impl io::Write) -> anyhow::Result<()> {
    self.write_shell_with(writer, "cp", Self::escape_shell)
}
640
/// Writes one `Copy-Item -LiteralPath` command per extra path, with the first path as the
/// source and the paths escaped by `escape_shell_double`.
fn write_pwsh(&self, writer: impl io::Write) -> anyhow::Result<()> {
    self.write_shell_with(writer, "Copy-Item -LiteralPath", Self::escape_shell_double)
}
644
645 fn write_shell_with(
646 &self,
647 mut writer: impl io::Write,
648 cmd: &str,
649 stringify: impl Fn(&Path) -> String,
650 ) -> anyhow::Result<()> {
651 let mut iter = self.paths.iter();
652 if let Some(path0) = iter.next() {
653 let path0 = stringify(path0);
654 for path in iter {
655 writeln!(writer, "{cmd} '{path0}' '{}'", stringify(path))?;
656 }
657 }
658 Ok(())
659 }
660
/// Escapes `path` for use inside a single-quoted POSIX shell string: every `'` becomes
/// `'\''` (close the quote, emit an escaped quote, reopen the quote).
///
/// Non-UTF-8 parts of the path are converted lossily.
fn escape_shell(path: &Path) -> String {
    let text = path.to_string_lossy();
    text.replace('\'', r"'\''")
}
664
/// Escapes `path` for use inside a single-quoted string where a quote is escaped by
/// doubling it: every `'` becomes `''`.
///
/// Non-UTF-8 parts of the path are converted lossily.
fn escape_shell_double(path: &Path) -> String {
    let text = path.to_string_lossy();
    text.replace('\'', "''")
}
668}
669
670#[cfg(test)]
671mod tests {
672 use super::*;
673 use std::cmp::Ordering;
674
675 fn default_exclude() -> globset::GlobSet {
676 let mut builder = globset::GlobSetBuilder::new();
677 builder.add(
678 globset::GlobBuilder::new(".hash_cache")
679 .case_insensitive(true)
680 .build()
681 .unwrap(),
682 );
683 builder.build().unwrap()
684 }
685
// Two files with identical content form one group; the file with different content is
// not part of it.
#[test]
fn find_duplicates() -> anyhow::Result<()> {
    let dir = tempfile::tempdir()?;

    let file1_path = dir.path().join("same1.txt");
    fs::write(&file1_path, "same content")?;

    let file2_path = dir.path().join("same2.txt");
    fs::write(&file2_path, "same content")?;

    let diff_path = dir.path().join("diff.txt");
    fs::write(&diff_path, "different content")?;

    let mut hasher = FileHasher::new(&[dir.path()])?;
    hasher.buffer_size = 8192;
    let duplicates = hasher.find_duplicates()?;

    // Only the two files sharing a size are hashed, and nothing is served from a cache.
    assert_eq!(hasher.num_hashed.load(atomic::Ordering::Relaxed), 2);
    assert_eq!(hasher.num_hash_looked_up.load(atomic::Ordering::Relaxed), 0);

    assert_eq!(duplicates.len(), 1);
    let group = &duplicates[0];
    assert_eq!(group.paths.len(), 2);
    // "same content" is 12 bytes long.
    assert_eq!(group.size, 12);
    assert!(group.paths.contains(&file1_path));
    assert!(group.paths.contains(&file2_path));

    Ok(())
}
716
// Hashes cached by a run over a sub-directory are reused by a later run over its parent,
// and the sub-directory's cache file is gone afterwards.
#[test]
fn find_duplicates_merge_cache() -> anyhow::Result<()> {
    let dir = tempfile::tempdir()?;
    let dir_path = dir.path();

    let sub_dir = dir_path.join("a").join("a");
    fs::create_dir_all(&sub_dir)?;

    let file1_path = sub_dir.join("1");
    fs::write(&file1_path, "same content")?;

    let file2_path = sub_dir.join("2");
    fs::write(&file2_path, "same content")?;

    // Start with an empty cache file inside the sub-directory.
    let cache_aa_path = sub_dir.join(FileHashCache::FILE_NAME);
    fs::File::create(&cache_aa_path)?;

    // First run, over the sub-directory: both hashes have to be computed.
    let hasher_aa = FileHasher::new(&[&sub_dir])?;
    let duplicates_aa = hasher_aa.find_duplicates()?;
    assert_eq!(duplicates_aa.len(), 1);
    assert!(cache_aa_path.exists());
    assert_eq!(hasher_aa.num_hashed.load(atomic::Ordering::Relaxed), 2);
    assert_eq!(
        hasher_aa.num_hash_looked_up.load(atomic::Ordering::Relaxed),
        0
    );

    // Now place an empty cache file in the parent directory.
    let root_a = dir_path.join("a");
    let cache_a_path = root_a.join(FileHashCache::FILE_NAME);
    fs::File::create(&cache_a_path)?;

    // Second run, over the parent: both hashes are looked up instead of recomputed.
    let hasher_a = FileHasher::new(&[&root_a])?;
    let duplicates_a = hasher_a.find_duplicates()?;
    assert_eq!(duplicates_a.len(), 1);
    assert_eq!(hasher_a.num_hashed.load(atomic::Ordering::Relaxed), 0);
    assert_eq!(
        hasher_a.num_hash_looked_up.load(atomic::Ordering::Relaxed),
        2
    );

    // Only the parent's cache file remains.
    assert!(cache_a_path.exists());
    assert!(!cache_aa_path.exists());

    Ok(())
}
767
// A file matched by the exclude glob is left out of the duplicate group even though its
// content is identical.
#[test]
fn find_duplicates_with_exclude() -> anyhow::Result<()> {
    let dir = tempfile::tempdir()?;

    let file1_path = dir.path().join("same1.txt");
    fs::write(&file1_path, "same content")?;

    let file2_path = dir.path().join("same2.txt");
    fs::write(&file2_path, "same content")?;

    let exclude_path = dir.path().join("exclude.txt");
    fs::write(&exclude_path, "same content")?;

    let mut hasher = FileHasher::new(&[dir.path()])?;
    hasher.buffer_size = 8192;
    let mut builder = globset::GlobSetBuilder::new();
    builder.add(
        globset::GlobBuilder::new("exclude.txt")
            .case_insensitive(true)
            .build()?,
    );
    let filter = builder.build()?;
    hasher.exclude = Some(filter);

    let duplicates = hasher.find_duplicates()?;
    assert_eq!(duplicates.len(), 1);
    let group = &duplicates[0];
    assert_eq!(group.paths.len(), 2);
    assert!(group.paths.contains(&file1_path));
    assert!(group.paths.contains(&file2_path));
    assert!(!group.paths.contains(&exclude_path));
    Ok(())
}
801
/// Test helper that accumulates everything received over a `CheckEvent` channel.
#[derive(Default)]
struct CheckCollector {
    /// Whether `CheckEvent::StartChecking` was received.
    start_seen: bool,
    /// `num_files` of the received `CheckEvent::Total`, if any.
    total_files: Option<u64>,
    /// Every received `CheckEvent::Result`, with its path made relative to the base directory.
    results: Vec<FileComparisonResult>,
    /// Sum of `num_files` over all received `CheckEvent::Progress` events.
    file_done_count: u64,
    /// Number of received `CheckEvent::Error` events.
    num_error: usize,
}
810
impl CheckCollector {
    /// Drains `rx` until all senders are gone and returns what was collected.
    fn collect(rx: mpsc::Receiver<CheckEvent>, base_dir: &Path) -> Self {
        let mut collector = Self::default();
        collector._collect(rx, base_dir);
        collector
    }

    /// Drains `rx` into `self`.
    ///
    /// Panics when a result's path does not start with `base_dir`.
    fn _collect(&mut self, rx: mpsc::Receiver<CheckEvent>, base_dir: &Path) {
        while let Ok(event) = rx.recv() {
            match event {
                CheckEvent::StartChecking => self.start_seen = true,
                CheckEvent::Total(total) => self.total_files = Some(total.num_files),
                CheckEvent::Result(mut result, _size) => {
                    // Make the path relative so assertions do not depend on the temp dir.
                    result.relative_path = result
                        .relative_path
                        .strip_prefix(base_dir)
                        .unwrap()
                        .to_path_buf();
                    self.results.push(result);
                }
                CheckEvent::Progress(progress_val) => {
                    self.file_done_count += progress_val.num_files;
                }
                CheckEvent::Error(_) => {
                    self.num_error += 1;
                }
            }
        }
    }
}
841
// Without a cache, and without `update`, every file is reported as new and no cache file
// is written.
#[test]
fn check_mode_empty_cache() -> anyhow::Result<()> {
    let dir = tempfile::tempdir()?;
    let dir_path = dir.path().to_path_buf();
    let file1_path = dir.path().join("file1.txt");
    fs::write(&file1_path, "content 1")?;
    let file2_path = dir.path().join("file2.txt");
    fs::write(&file2_path, "content 2")?;

    let mut hasher = FileHasher::new(&[&dir_path])?;
    hasher.exclude = Some(default_exclude());
    let (tx, rx) = mpsc::channel();
    hasher.check_streaming(tx, false)?;
    let collector = CheckCollector::collect(rx, &dir_path);
    assert!(collector.start_seen);
    assert_eq!(collector.total_files, Some(2));
    assert_eq!(collector.file_done_count, 0);
    assert_eq!(collector.num_error, 0);

    // Results arrive in no particular order; sort them before asserting.
    let mut results = collector.results;
    results.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
    assert_eq!(results.len(), 2);
    assert_eq!(results[0].relative_path, Path::new("file1.txt"));
    assert_eq!(results[0].classification, Classification::OnlyInDir2);
    assert_eq!(results[1].relative_path, Path::new("file2.txt"));
    assert_eq!(results[1].classification, Classification::OnlyInDir2);

    assert!(!dir.path().join(FileHashCache::FILE_NAME).exists());
    Ok(())
}
873
// With a populated cache, unchanged files produce no result; a file whose size changed is
// reported without being hashed, and a file rewritten with the same content is not reported.
#[test]
fn check_mode_with_cache() -> anyhow::Result<()> {
    let dir = tempfile::tempdir()?;
    let dir_path = dir.path().to_path_buf();
    let file1_path = dir.path().join("file1.txt");
    let file2_path = dir.path().join("file2.txt");
    fs::write(&file1_path, "content 1")?;
    fs::write(&file2_path, "content 2")?;
    let file1 = FileItem::try_from(file1_path.as_path())?;
    let file2 = FileItem::try_from(file2_path.as_path())?;

    // Populate the cache and write it to disk.
    let mut hasher = FileHasher::new_with_cache(&[&dir_path])?;
    hasher.exclude = Some(default_exclude());
    let _hash1 = hasher.get_hash(&file1)?;
    let _hash2 = hasher.get_hash(&file2)?;
    hasher.save_cache()?;
    assert!(dir.path().join(FileHashCache::FILE_NAME).exists());

    // Nothing changed: both files only produce progress events.
    let mut hasher = FileHasher::new(&[&dir_path])?;
    hasher.exclude = Some(default_exclude());
    let (tx, rx) = mpsc::channel();
    hasher.check_streaming(tx, false)?;
    let collector = CheckCollector::collect(rx, &dir_path);
    assert_eq!(collector.results.len(), 0);
    assert_eq!(collector.file_done_count, 2);

    // Change the content (and size) of file1.
    fs::write(&file1_path, "content 1 modified")?;

    // Rewrite file2 with identical content so that only its modification time changes.
    let file2_meta_before = fs::metadata(&file2_path)?;
    let mtime_before = file2_meta_before.modified()?;
    std::thread::sleep(time::Duration::from_millis(10));
    fs::write(&file2_path, "content 2")?;
    let file2_meta_after = fs::metadata(&file2_path)?;
    let mtime_after = file2_meta_after.modified()?;
    assert!(mtime_after > mtime_before);

    let mut hasher = FileHasher::new(&[&dir_path])?;
    hasher.exclude = Some(default_exclude());
    let (tx, rx) = mpsc::channel();
    hasher.check_streaming(tx, false)?;
    let collector = CheckCollector::collect(rx, &dir_path);
    assert_eq!(collector.results.len(), 1);
    let results = collector.results;
    assert_eq!(results[0].relative_path, Path::new("file1.txt"));
    assert_eq!(results[0].modified_time_comparison, Some(Ordering::Less));
    assert_eq!(results[0].size_comparison, Some(Ordering::Less));
    // The size mismatch is reported before any hashing takes place.
    assert_eq!(results[0].is_content_same, None);
    assert_eq!(collector.file_done_count, 1);
    Ok(())
}
924
// In update mode the cache is created, refreshed when the content changes, and refreshed
// when only the modification time changes.
#[test]
fn check_update_mode() -> anyhow::Result<()> {
    let dir = tempfile::tempdir()?;
    let dir_path = dir.path().to_path_buf();
    let file1_path = dir.path().join("file1.txt");
    fs::write(&file1_path, "content 1")?;

    // First update run creates the cache file with an entry for file1.
    let mut hasher = FileHasher::new(&[&dir_path])?;
    hasher.exclude = Some(default_exclude());
    let (tx, rx) = mpsc::channel();
    hasher.check_streaming(tx, true)?;
    let _ = CheckCollector::collect(rx, &dir_path);
    hasher.save_cache()?;
    assert!(dir.path().join(FileHashCache::FILE_NAME).exists());

    let cache = FileHashCache::new(&dir_path);
    let file1 = FileItem::try_from(file1_path.as_path())?;
    let hash1 = cache.get(&PathBuf::from("file1.txt"), &file1);
    assert!(hash1.is_some());

    // Change the content; the next update run must store the new hash.
    std::thread::sleep(time::Duration::from_millis(10));
    fs::write(&file1_path, "content 1 modified")?;
    let file1_mod = FileItem::try_from(file1_path.as_path())?;

    let mut hasher = FileHasher::new(&[&dir_path])?;
    hasher.exclude = Some(default_exclude());
    let (tx, rx) = mpsc::channel();
    hasher.check_streaming(tx, true)?;
    let _ = CheckCollector::collect(rx, &dir_path);
    hasher.save_cache()?;

    let cache = FileHashCache::new(&dir_path);
    let hash_mod = cache.get(&PathBuf::from("file1.txt"), &file1_mod);
    assert!(hash_mod.is_some());
    assert_ne!(hash1, hash_mod);

    // Rewrite the same content: only the modification time moves forward.
    std::thread::sleep(time::Duration::from_millis(10));
    fs::write(&file1_path, "content 1 modified")?;
    let file1_mod2 = FileItem::try_from(file1_path.as_path())?;
    assert!(file1_mod2.modified() > file1_mod.modified());

    // The previously loaded cache does not return a hash for the newer file state.
    assert!(
        cache
            .get(&PathBuf::from("file1.txt"), &file1_mod2)
            .is_none()
    );

    let mut hasher = FileHasher::new(&[&dir_path])?;
    hasher.exclude = Some(default_exclude());
    let (tx, rx) = mpsc::channel();
    hasher.check_streaming(tx, true)?;
    let _ = CheckCollector::collect(rx, &dir_path);
    hasher.save_cache()?;

    // After another update run the entry matches the newer file state again.
    let cache = FileHashCache::new(&dir_path);
    assert!(
        cache
            .get(&PathBuf::from("file1.txt"), &file1_mod2)
            .is_some()
    );
    Ok(())
}
987
// An update run drops the cache entry of a file that no longer exists and keeps the
// entries of files that are still present.
#[test]
fn check_cleanup_deleted_files() -> anyhow::Result<()> {
    let dir = tempfile::tempdir()?;
    let dir_path = dir.path().to_path_buf();
    let file1_path = dir.path().join("file1.txt");
    let file2_path = dir.path().join("file2.txt");
    fs::write(&file1_path, "content 1")?;
    fs::write(&file2_path, "content 2")?;
    let file1 = FileItem::try_from(file1_path.as_path())?;
    let file2 = FileItem::try_from(file2_path.as_path())?;

    // Populate the cache with both files.
    let mut hasher = FileHasher::new(&[&dir_path])?;
    hasher.exclude = Some(default_exclude());
    let (tx, rx) = mpsc::channel();
    hasher.check_streaming(tx, true)?;
    let _ = CheckCollector::collect(rx, &dir_path);
    hasher.save_cache()?;

    let cache = FileHashCache::new(&dir_path);
    assert!(cache.get(&PathBuf::from("file1.txt"), &file1).is_some());
    assert!(cache.get(&PathBuf::from("file2.txt"), &file2).is_some());

    fs::remove_file(&file2_path)?;

    // Run again after the deletion.
    let mut hasher = FileHasher::new(&[&dir_path])?;
    hasher.exclude = Some(default_exclude());
    let (tx, rx) = mpsc::channel();
    hasher.check_streaming(tx, true)?;
    let _ = CheckCollector::collect(rx, &dir_path);
    hasher.save_cache()?;

    let cache = FileHashCache::new(&dir_path);
    assert!(cache.get(&PathBuf::from("file2.txt"), &file2).is_none());
    assert!(cache.get(&PathBuf::from("file1.txt"), &file1).is_some());
    Ok(())
}
1028
// Identical files located in two different directories are reported as one group.
#[test]
fn find_duplicates_multiple_dirs() -> anyhow::Result<()> {
    let tmp = tempfile::tempdir()?;
    let dir1 = tmp.path().join("dir1");
    let dir2 = tmp.path().join("dir2");
    fs::create_dir(&dir1)?;
    fs::create_dir(&dir2)?;
    let file1_path = dir1.join("file1.txt");
    fs::write(&file1_path, "same content")?;
    let file2_path = dir2.join("file2.txt");
    fs::write(&file2_path, "same content")?;
    let hasher = FileHasher::new(&[&dir1, &dir2])?;
    let duplicates = hasher.find_duplicates()?;
    assert_eq!(duplicates.len(), 1);
    let group = &duplicates[0];
    assert_eq!(group.paths.len(), 2);
    // "same content" is 12 bytes long.
    assert_eq!(group.size, 12);
    assert!(group.paths.contains(&file1_path));
    assert!(group.paths.contains(&file2_path));

    Ok(())
}
1051
// Check mode rejects a hasher that was configured with more than one directory.
#[test]
fn check_fails_with_multiple_dirs() -> anyhow::Result<()> {
    let tmp = tempfile::tempdir()?;
    let dir1 = tmp.path().join("dir1");
    let dir2 = tmp.path().join("dir2");
    fs::create_dir(&dir1)?;
    fs::create_dir(&dir2)?;
    let hasher = FileHasher::new(&[&dir1, &dir2])?;
    assert!(hasher.check(false).is_err());
    Ok(())
}
1063
// Both escaping helpers leave quote-free text untouched and rewrite every single quote.
#[test]
fn escape_shell() {
    let posix = |text: &str| DuplicatedFiles::escape_shell(Path::new(text));
    let posix_cases = [
        ("", ""),
        ("abc", "abc"),
        ("a'b", "a'\\''b"),
        ("a'b'", "a'\\''b'\\''"),
    ];
    for (input, expected) in posix_cases {
        assert_eq!(posix(input), expected);
    }

    let doubled = |text: &str| DuplicatedFiles::escape_shell_double(Path::new(text));
    let doubled_cases = [
        ("", ""),
        ("abc", "abc"),
        ("a'b", "a''b"),
        ("a'b'", "a''b''"),
    ];
    for (input, expected) in doubled_cases {
        assert_eq!(doubled(input), expected);
    }
}
1078
// A group without any path produces no shell commands.
#[test]
fn write_dups_shell_empty() -> anyhow::Result<()> {
    let group = DuplicatedFiles {
        paths: Vec::new(),
        size: 100,
    };
    let mut output = Vec::new();
    group.write_shell(&mut output)?;
    let script = String::from_utf8(output)?;
    assert_eq!(script, "");
    Ok(())
}
1090
// A group with a single path has nothing to copy to, so no command is produced.
#[test]
fn write_dups_shell_one() -> anyhow::Result<()> {
    let group = DuplicatedFiles {
        paths: vec![PathBuf::from("a.txt")],
        size: 100,
    };
    let mut output = Vec::new();
    group.write_shell(&mut output)?;
    let script = String::from_utf8(output)?;
    assert_eq!(script, "");
    Ok(())
}
1102
// Two paths produce exactly one `cp` command from the first path to the second.
#[test]
fn write_dups_shell_two() -> anyhow::Result<()> {
    let group = DuplicatedFiles {
        paths: vec![PathBuf::from("a.txt"), PathBuf::from("b.txt")],
        size: 100,
    };
    let mut output = Vec::new();
    group.write_shell(&mut output)?;
    let script = String::from_utf8(output)?;
    assert_eq!(script, "cp 'a.txt' 'b.txt'\n");
    Ok(())
}
1114
// With three paths, the first one is the source of every `cp` command.
#[test]
fn write_dups_shell_three() -> anyhow::Result<()> {
    let paths = ["a.txt", "b.txt", "c.txt"]
        .into_iter()
        .map(PathBuf::from)
        .collect();
    let group = DuplicatedFiles { paths, size: 100 };
    let mut output = Vec::new();
    group.write_shell(&mut output)?;
    let script = String::from_utf8(output)?;
    assert_eq!(script, "cp 'a.txt' 'b.txt'\ncp 'a.txt' 'c.txt'\n");
    Ok(())
}
1133
// Single quotes inside paths are escaped according to the target shell.
#[test]
fn write_dups_shell_quotes() -> anyhow::Result<()> {
    let group = DuplicatedFiles {
        paths: vec![PathBuf::from("a'b.txt"), PathBuf::from("c'd.txt")],
        size: 100,
    };

    let mut posix_output = Vec::new();
    group.write_shell(&mut posix_output)?;
    let posix_script = String::from_utf8(posix_output)?;
    assert_eq!(posix_script, "cp 'a'\\''b.txt' 'c'\\''d.txt'\n");

    let mut pwsh_output = Vec::new();
    group.write_pwsh(&mut pwsh_output)?;
    let pwsh_script = String::from_utf8(pwsh_output)?;
    assert_eq!(
        pwsh_script,
        "Copy-Item -LiteralPath 'a''b.txt' 'c''d.txt'\n"
    );
    Ok(())
}
1152}