1use crate::{
2 ColumnFormatter, DirectoryComparer, FileComparer, FileHashCache, FileItem, FileIterator,
3 OutputFormat, Progress, ProgressBuilder, ProgressValue,
4};
5use globset::GlobSet;
6use indicatif::FormattedDuration;
7use rayon::prelude::*;
8use simple_path::SimplePath;
9use std::{
10 collections::HashMap,
11 fs,
12 io::{self, Read, stdout},
13 path::{Path, PathBuf},
14 sync::{
15 Arc,
16 atomic::{AtomicUsize, Ordering},
17 mpsc,
18 },
19 time,
20};
21
/// A scanned file paired with the index (into `FileHasher::dirs`) of the
/// directory whose iterator produced it.
type FileWithDirIndex = (FileItem, usize);
23
/// Events streamed from `find_duplicates_streaming` to the consumer loop in
/// `find_duplicates`.
#[derive(Debug, Clone)]
enum DupEvent {
    /// Sent once, before any file is dispatched for hashing.
    StartHashing,
    /// Sent after the file listing is exhausted; carries the accumulated
    /// progress total of all files that were queued for hashing.
    Total(ProgressValue),
    /// A file together with its hash (taken from the cache or freshly computed).
    Result(FileItem, blake3::Hash),
    /// Hashing a file failed; the sender logs the failure itself.
    Error,
}
31
/// Outcome reported by check mode for a file that does not match the cache.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
enum CheckStatus {
    /// The file has no entry in the hash cache.
    New,
    /// The file's size or hash differs from its cached entry.
    Modified,
}
37
/// Events streamed from `check_streaming` to its consumer (`check`, or the
/// test collector).
#[derive(Debug)]
enum CheckEvent {
    /// Sent once, before files are dispatched to the worker pool.
    StartChecking,
    /// Sent after the file listing is exhausted; carries the accumulated total.
    Total(ProgressValue),
    /// A new or modified file, plus the amount by which progress should advance.
    Result(FileItem, CheckStatus, ProgressValue),
    /// A file matched its cached hash; only progress needs to advance.
    Progress(ProgressValue),
    /// Checking the given file failed; the worker logs the error itself.
    Error(FileItem),
}
46
/// Per-file-size bookkeeping used by `find_duplicates_streaming`.
enum DupState {
    /// Exactly one file of this size has been seen so far; it is parked here
    /// (with its directory index) and has not been hashed yet.
    Single(FileItem, usize),
    /// At least two files of this size were seen; every file of this size is
    /// now sent for hashing as soon as it arrives.
    Hashing,
}
51
/// Hashes files under one or more directories to find duplicates or to check
/// files against a persisted hash cache.
pub struct FileHasher {
    /// Directories to scan; `new` guarantees there is at least one.
    dirs: Vec<PathBuf>,
    /// Size in bytes of the read buffer used for hashing. `0` makes
    /// `compute_hash` memory-map the file instead of reading it in chunks.
    pub buffer_size: usize,
    /// Hash cache used by `get_hash`; created by `new_with_cache` or lazily by `cache()`.
    cache: Option<Arc<FileHashCache>>,
    /// Number of hashes actually computed (successful `compute_hash` calls).
    num_hashed: AtomicUsize,
    /// Number of hashes served from a cache instead of being computed.
    num_hash_looked_up: AtomicUsize,
    /// Optional glob set handed to the file iterators as their exclude filter.
    pub exclude: Option<GlobSet>,
    /// Optional progress builder; when `None`, `Progress::none()` is used.
    pub progress: Option<Arc<ProgressBuilder>>,
    /// Format used when printing results.
    pub output_format: OutputFormat,
    /// Job count passed to `crate::build_thread_pool`.
    pub jobs: usize,
}
64
65impl FileHasher {
    /// Default value for `jobs`; mirrors `DirectoryComparer::DEFAULT_JOBS`.
    const DEFAULT_JOBS: usize = DirectoryComparer::DEFAULT_JOBS;
67
68 pub fn new<P: AsRef<Path>>(dirs: &[P]) -> anyhow::Result<Self> {
70 if dirs.is_empty() {
71 anyhow::bail!("At least one directory must be specified.");
72 }
73 Ok(Self {
74 dirs: dirs.iter().map(|p| p.as_ref().to_path_buf()).collect(),
75 buffer_size: FileComparer::DEFAULT_BUFFER_SIZE,
76 cache: None,
77 num_hashed: AtomicUsize::new(0),
78 num_hash_looked_up: AtomicUsize::new(0),
79 exclude: None,
80 progress: None,
81 output_format: OutputFormat::Default,
82 jobs: Self::DEFAULT_JOBS,
83 })
84 }
85
86 pub(crate) fn new_with_cache<P: AsRef<Path>>(dirs: &[P]) -> anyhow::Result<Self> {
87 let mut hasher = Self::new(dirs)?;
88 hasher.cache = Some(hasher.new_cache()?);
89 Ok(hasher)
90 }
91
92 fn new_cache(&self) -> anyhow::Result<Arc<FileHashCache>> {
93 let common_ancestor = crate::common_ancestor(&self.dirs)
94 .ok_or_else(|| anyhow::anyhow!("No common ancestor found"))?;
95 Ok(FileHashCache::find_or_new(&common_ancestor))
96 }
97
98 pub(crate) fn cache(&mut self) -> anyhow::Result<Arc<FileHashCache>> {
100 if self.cache.is_none() {
101 self.cache = Some(self.new_cache()?);
102 }
103 Ok(Arc::clone(self.cache.as_ref().unwrap()))
104 }
105
106 pub(crate) fn remove_cache_entry(&mut self, path: &Path) -> anyhow::Result<()> {
108 let cache = self.cache()?;
109 let relative = SimplePath::strip_prefix(path, cache.base_dir())?;
110 cache.remove(relative);
111 Ok(())
112 }
113
114 pub fn save_cache(&self) -> anyhow::Result<()> {
116 log::info!(
117 "Hash stats for {:?}: {} computed, {} looked up",
118 self.dirs,
119 self.num_hashed.load(Ordering::Relaxed),
120 self.num_hash_looked_up.load(Ordering::Relaxed)
121 );
122 if let Some(cache) = &self.cache {
123 cache.save()?;
124 }
125 Ok(())
126 }
127
128 pub(crate) fn clear_cache(&mut self) -> anyhow::Result<()> {
130 let cache = self.cache()?;
131 for dir in &self.dirs {
132 let relative = SimplePath::strip_prefix(dir, cache.base_dir())?;
133 cache.clear(relative);
134 }
135 Ok(())
136 }
137
138 pub fn check(&self, update: bool) -> anyhow::Result<()> {
140 match self.output_format {
141 OutputFormat::Default | OutputFormat::Symbol => {}
142 _ => anyhow::bail!("Check mode only supports default or symbol output format."),
143 }
144 if self.dirs.len() > 1 {
145 anyhow::bail!("Check mode only supports one directory.");
146 }
147 let start_time = time::Instant::now();
148 let mut progress = self
149 .progress
150 .as_ref()
151 .map(|progress| progress.add_spinner())
152 .unwrap_or_else(Progress::none);
153 progress.use_bytes();
154 progress.set_message("Scanning directory...");
155 let mut num_new = 0;
156 let mut num_modified = 0;
157 let mut num_error = 0;
158 std::thread::scope(|scope| {
159 let (tx, rx) = mpsc::channel();
160 scope.spawn(|| {
161 if let Err(e) = self.check_streaming(tx, update) {
162 log::error!("Error during check: {}", e);
163 }
164 });
165 while let Ok(event) = rx.recv() {
166 match event {
167 CheckEvent::StartChecking => {
168 progress.set_message("Checking files...");
169 }
170 CheckEvent::Total(value) => {
171 progress.set_length(value);
172 progress.set_message("");
173 }
174 CheckEvent::Result(file, status, value) => {
175 let symbol = match status {
176 CheckStatus::New => {
177 num_new += 1;
178 '+'
179 }
180 CheckStatus::Modified => {
181 num_modified += 1;
182 '!'
183 }
184 };
185 progress.inc(value);
186 progress.suspend_for(stdout(), || {
187 let base_dir = &self.dirs[0];
188 let rel_path = file.relative_path(base_dir);
189 println!("{} {}", symbol, rel_path.display());
190 });
191 }
192 CheckEvent::Progress(value) => {
193 progress.inc(value);
194 }
195 CheckEvent::Error(file) => {
196 progress.inc(ProgressValue::with_skip(file.size()));
197 num_error += 1;
198 }
199 }
200 }
201 });
202 progress.finish();
203 self.print_check_summary(&start_time, num_new, num_modified, num_error)?;
204 Ok(())
205 }
206
207 fn print_check_summary(
208 &self,
209 start_time: &time::Instant,
210 num_new: usize,
211 num_modified: usize,
212 num_error: usize,
213 ) -> io::Result<()> {
214 let summary = [
215 ("Elapsed:", 0),
216 ("Hash computed:", self.num_hashed.load(Ordering::Relaxed)),
217 ("New files:", num_new),
218 ("Modified files:", num_modified),
219 ("Errors:", num_error),
220 ];
221 let formatter = ColumnFormatter::new(summary.iter().map(|(s, _)| *s));
222 let mut writer = std::io::stderr();
223 formatter.write_value(
224 &mut writer,
225 summary[0].0,
226 FormattedDuration(start_time.elapsed()),
227 )?;
228 formatter.write_values(&mut writer, &summary[1..])
229 }
230
    /// Producer side of check mode: walks the single configured directory,
    /// checks every file on a worker pool and reports through `tx`.
    ///
    /// Emits `StartChecking`, then per-file events from the workers, then
    /// `Total` once the listing is exhausted. Uses a cache obtained from
    /// `new_cache` (not `self.cache`) and saves it after all workers finish.
    ///
    /// # Panics
    /// Panics if the hasher is not configured with exactly one directory.
    ///
    /// # Errors
    /// Fails if the cache or thread pool cannot be created, a send on `tx`
    /// fails, or the cache cannot be saved.
    fn check_streaming(&self, tx: mpsc::Sender<CheckEvent>, update: bool) -> anyhow::Result<()> {
        assert_eq!(self.dirs.len(), 1);
        let cache = self.new_cache()?;
        let base_dir = &self.dirs[0];
        let relative = SimplePath::strip_prefix(base_dir, cache.base_dir())?;
        // NOTE(review): presumably marks entries under `relative` for removal
        // unless they are accessed during this scan — confirm in FileHashCache.
        cache.set_remove_if_no_access(relative);
        // `cache` is moved into the pool closure below; keep a handle for saving.
        let cache_clone = Arc::clone(&cache);
        std::thread::scope(|global_scope| {
            let mut it = FileIterator::new(base_dir);
            it.cache = Some(Arc::clone(&cache));
            it.exclude = self.exclude.as_ref();
            let it_rx = it.spawn_in_scope(global_scope);
            tx.send(CheckEvent::StartChecking)?;
            let pool = crate::build_thread_pool(self.jobs)?;
            pool.scope(move |scope| -> anyhow::Result<()> {
                let mut total = ProgressValue::default();
                // Each file is queued on the pool; `total` grows as files are listed.
                for file in it_rx {
                    self.check_file(file, &cache, update, &mut total, &tx, scope);
                }
                tx.send(CheckEvent::Total(total))?;
                Ok(())
            })
        })?;
        cache_clone.save()?;
        Ok(())
    }
257
258 fn check_file<'scope>(
259 &'scope self,
260 file: FileItem,
261 cache: &Arc<FileHashCache>,
262 update: bool,
263 total: &mut ProgressValue,
264 tx: &mpsc::Sender<CheckEvent>,
265 scope: &rayon::Scope<'scope>,
266 ) {
267 *total += ProgressValue::with_size(file.size());
268 let tx = tx.clone();
269 let cache = Arc::clone(cache);
270 scope.spawn(move |_| {
271 if let Err(error) = self._check_file(&file, cache, update, &tx) {
272 log::error!("Failed to check file '{}': {}", file, error);
273 if tx.send(CheckEvent::Error(file)).is_err() {
274 log::error!("Send failed");
275 }
276 }
277 });
278 }
279
280 fn _check_file(
281 &self,
282 file: &FileItem,
283 cache: Arc<FileHashCache>,
284 update: bool,
285 tx: &mpsc::Sender<CheckEvent>,
286 ) -> anyhow::Result<()> {
287 assert!(file.path().is_absolute());
288 let path_in_cache = file.relative_path(cache.base_dir());
289 match cache.get_entry(path_in_cache) {
290 Some(cached) => {
291 if !update && cached.size != 0 && file.size() != cached.size {
292 tx.send(CheckEvent::Result(
293 file.clone(),
294 CheckStatus::Modified,
295 ProgressValue::with_skip(file.size()),
296 ))?;
297 return Ok(());
298 }
299 let hash = self.compute_hash(file)?;
300 if hash == cached.hash {
301 if cached.should_update(file, update) {
302 cache.insert(path_in_cache, file, hash);
303 }
304 tx.send(CheckEvent::Progress(ProgressValue::with_size(file.size())))?;
305 } else {
306 if update {
307 cache.insert(path_in_cache, file, hash);
308 }
309 tx.send(CheckEvent::Result(
310 file.clone(),
311 CheckStatus::Modified,
312 ProgressValue::with_size(file.size()),
313 ))?;
314 }
315 }
316 None => {
317 if update {
318 let hash = self.compute_hash(file)?;
319 cache.insert(path_in_cache, file, hash);
320 }
321 tx.send(CheckEvent::Result(
322 file.clone(),
323 CheckStatus::New,
324 ProgressValue::with_size(file.size()),
325 ))?;
326 }
327 }
328 Ok(())
329 }
330
331 pub fn run(&self) -> anyhow::Result<()> {
333 let start_time = time::Instant::now();
334 let mut duplicates = self.find_duplicates()?;
335 let mut total_wasted_space = 0;
336 if !duplicates.is_empty() {
337 duplicates.sort_by_key(|a| a.size);
338 total_wasted_space = self.print_duplicates_results(&duplicates)?;
339 }
340 self.print_duplicates_summary(&start_time, total_wasted_space)?;
341 Ok(())
342 }
343
344 fn print_duplicates_results(&self, duplicates: &Vec<DuplicatedFiles>) -> anyhow::Result<u64> {
345 let mut total_wasted_space = 0;
346 for dupes in duplicates {
347 dupes.print(self.output_format)?;
348 total_wasted_space += dupes.wasted_size();
349 }
350 Ok(total_wasted_space)
351 }
352
353 fn print_duplicates_summary(
354 &self,
355 start_time: &time::Instant,
356 total_wasted_space: u64,
357 ) -> io::Result<()> {
358 let elapsed = FormattedDuration(start_time.elapsed()).to_string();
359 let num_hashed = self.num_hashed.load(Ordering::Relaxed).to_string();
360 let total_wasted_space = crate::human_readable_size(total_wasted_space);
361 let summary = [
362 ("Elapsed:", elapsed),
363 ("Hash computed:", num_hashed),
364 ("Total wasted space:", total_wasted_space),
365 ];
366 let formatter = ColumnFormatter::new(summary.iter().map(|(s, _)| *s));
367 formatter.write_values(&mut io::stderr(), &summary)
368 }
369
370 pub fn find_duplicates(&self) -> anyhow::Result<Vec<DuplicatedFiles>> {
372 let mut progress = self
373 .progress
374 .as_ref()
375 .map(|progress| progress.add_spinner())
376 .unwrap_or_else(Progress::none);
377 progress.set_message("Scanning directories...");
378
379 let (tx, rx) = mpsc::channel();
380 let mut by_hash: HashMap<blake3::Hash, DuplicatedFiles> = HashMap::new();
381 std::thread::scope(|scope| {
382 scope.spawn(|| {
383 if let Err(e) = self.find_duplicates_streaming(tx) {
384 log::error!("Error during duplicate finding: {}", e);
385 }
386 });
387
388 while let Ok(event) = rx.recv() {
389 match event {
390 DupEvent::StartHashing => progress.set_message("Hashing files..."),
391 DupEvent::Total(value) => progress.set_length(value),
392 DupEvent::Result(file, hash) => {
393 progress.inc(ProgressValue::with_size(file.size()));
394 let entry = by_hash.entry(hash).or_insert_with(|| DuplicatedFiles {
395 paths: Vec::new(),
396 size: file.size(),
397 });
398 assert_eq!(
400 entry.size,
401 file.size(),
402 "Hash collision: sizes do not match"
403 );
404 entry.paths.push(file.into_path_buf());
405 }
406 DupEvent::Error => {}
407 }
408 }
409 });
410 progress.finish();
411
412 let mut duplicates = Vec::new();
413 for (_, mut dupes) in by_hash {
414 if dupes.paths.len() > 1 {
415 dupes.paths.sort();
416 duplicates.push(dupes);
417 }
418 }
419 Ok(duplicates)
420 }
421
    /// Producer side of the duplicate search: lists files from all directories,
    /// groups them by size and sends files whose size is shared with another
    /// file for hashing, reporting through `tx`.
    ///
    /// Emits `StartHashing`, then `Result`/`Error` events from `send_hash`, then
    /// `Total` once the listing is exhausted. The per-directory caches are
    /// saved on the pool after all hashing work has completed.
    ///
    /// # Errors
    /// Fails if the thread pool cannot be built, a `StartHashing`/`Total` send
    /// fails, or a cache cannot be saved.
    fn find_duplicates_streaming(&self, tx: mpsc::Sender<DupEvent>) -> anyhow::Result<()> {
        std::thread::scope(|global_scope| {
            let (it_rx, caches) = self.stream_file_items(global_scope)?;
            // Borrow so the `move` closure below captures a reference, leaving
            // `caches` usable for saving afterwards.
            let caches = &caches;
            let pool = crate::build_thread_pool(self.jobs)?;
            pool.scope(move |scope| -> anyhow::Result<()> {
                let mut by_size: HashMap<u64, DupState> = HashMap::new();
                let mut total = ProgressValue::default();
                tx.send(DupEvent::StartHashing)?;
                for (file, dir_index) in it_rx {
                    let size = file.size();
                    // Empty files are never reported as duplicates.
                    if size == 0 {
                        continue;
                    }
                    let cache = &caches[dir_index];
                    match by_size.entry(size) {
                        std::collections::hash_map::Entry::Occupied(mut occ) => match occ.get_mut()
                        {
                            // Second file of this size: hash the parked first
                            // file together with the new one.
                            DupState::Single(file0, dir_index0) => {
                                let cache0 = &caches[*dir_index0];
                                self.send_hash(file0, cache0, &tx, scope);
                                self.send_hash(&file, cache, &tx, scope);
                                total += ProgressValue::with_size(file0.size());
                                total += ProgressValue::with_size(file.size());

                                *occ.get_mut() = DupState::Hashing;
                            }
                            // Third or later file of this size: hash it right away.
                            DupState::Hashing => {
                                self.send_hash(&file, cache, &tx, scope);
                                total += ProgressValue::with_size(file.size());
                            }
                        },
                        // First file of this size: park it without hashing.
                        std::collections::hash_map::Entry::Vacant(vac) => {
                            vac.insert(DupState::Single(file, dir_index));
                        }
                    }
                }
                tx.send(DupEvent::Total(total))?;
                Ok(())
            })?;
            pool.install(|| caches.into_par_iter().try_for_each(|cache| cache.save()))?;
            Ok::<(), anyhow::Error>(())
        })?;
        Ok(())
    }
471
    /// Spawns one file-iterator thread per configured directory inside `scope`.
    ///
    /// Returns the receiving end of a channel carrying `(file, dir_index)` pairs
    /// from all directories, plus one cache per directory where
    /// `caches[dir_index]` belongs to `self.dirs[dir_index]`.
    ///
    /// The channel closes once every iterator thread has dropped its sender.
    /// No fallible call is visible in this body; the `Result` return type is
    /// kept for the caller's `?`.
    fn stream_file_items<'scope, 'env>(
        &'env self,
        scope: &'scope std::thread::Scope<'scope, 'env>,
    ) -> anyhow::Result<(mpsc::Receiver<FileWithDirIndex>, Vec<Arc<FileHashCache>>)> {
        let (it_tx, it_rx) = mpsc::channel();
        let mut caches = Vec::with_capacity(self.dirs.len());
        for (dir_index, dir) in self.dirs.iter().enumerate() {
            let mut it = FileIterator::new(dir);
            // Each directory gets its own cache, found or created for that directory.
            let cache = FileHashCache::find_or_new(dir);
            it.cache = Some(Arc::clone(&cache));
            it.exclude = self.exclude.as_ref();
            let it_tx = it_tx.clone();
            scope.spawn(move || it.send_to_as(it_tx, |path| (path, dir_index)));
            caches.push(cache);
        }
        Ok((it_rx, caches))
    }
489
490 fn send_hash<'scope>(
491 &'scope self,
492 file: &FileItem,
493 cache: &Arc<FileHashCache>,
494 tx: &mpsc::Sender<DupEvent>,
495 scope: &rayon::Scope<'scope>,
496 ) {
497 let (hash, relative) = self
498 .get_hash_from_cache(file, cache)
499 .expect("path should be in cache base_dir");
500 if let Some(hash) = hash {
501 let _ = tx.send(DupEvent::Result(file.clone(), hash));
502 return;
503 }
504
505 let file = file.clone();
506 let relative = relative.to_path_buf();
507 let tx = tx.clone();
508 let cache = Arc::clone(cache);
509 scope.spawn(move |_| {
510 if let Ok(hash) = self.compute_hash(&file) {
511 cache.insert(&relative, &file, hash);
512 let _ = tx.send(DupEvent::Result(file, hash));
513 } else {
514 log::error!("Failed to hash file: '{}'", file);
515 let _ = tx.send(DupEvent::Error);
516 }
517 });
518 }
519
520 pub fn get_hash(&self, file: &FileItem) -> anyhow::Result<blake3::Hash> {
522 let cache = self.cache.as_ref().expect("cache should be initialized");
523 let (hash, relative) = self.get_hash_from_cache(file, cache)?;
524 if let Some(hash) = hash {
525 return Ok(hash);
526 }
527
528 let hash = self.compute_hash(file)?;
529 cache.insert(relative, file, hash);
530 Ok(hash)
531 }
532
533 fn get_hash_from_cache<'a>(
534 &self,
535 file: &'a FileItem,
536 cache: &FileHashCache,
537 ) -> io::Result<(Option<blake3::Hash>, &'a Path)> {
538 let relative = file.relative_path(cache.base_dir());
539 if let Some(hash) = cache.get(relative, file) {
540 self.num_hash_looked_up.fetch_add(1, Ordering::Relaxed);
541 return Ok((Some(hash), relative));
542 }
543 Ok((None, relative))
544 }
545
546 fn compute_hash(&self, file: &FileItem) -> io::Result<blake3::Hash> {
547 let start_time = time::Instant::now();
548 let mut f = fs::File::open(file.path())?;
549 let mut progress = self
550 .progress
551 .as_ref()
552 .map(|progress| progress.add_file(file.path(), file.size()))
553 .unwrap_or_else(Progress::none);
554 let mut hasher = blake3::Hasher::new();
555 if self.buffer_size == 0 {
556 if file.size() > 0 {
557 let mmap = unsafe { memmap2::MmapOptions::new().map(&f)? };
558 hasher.update(&mmap[..]);
559 progress.inc(ProgressValue::with_size(file.size()));
560 }
561 } else {
562 let mut buf = vec![0u8; self.buffer_size];
563 loop {
564 let n = f.read(&mut buf)?;
565 if n == 0 {
566 break;
567 }
568 hasher.update(&buf[..n]);
569 progress.inc(ProgressValue::with_size(n as u64));
570 }
571 }
572 progress.finish();
573 self.num_hashed.fetch_add(1, Ordering::Relaxed);
574 let hash = hasher.finalize();
575 log::debug!(
576 "Computed hash in {}: '{}'",
577 FormattedDuration(start_time.elapsed()),
578 file
579 );
580 Ok(hash)
581 }
582}
583
584#[derive(Clone, Debug)]
586pub struct DuplicatedFiles {
587 pub paths: Vec<PathBuf>,
588 pub size: u64,
589}
590
591impl DuplicatedFiles {
592 fn wasted_size(&self) -> u64 {
593 self.size * (self.paths.len() as u64 - 1)
594 }
595
596 fn print(&self, output_format: OutputFormat) -> anyhow::Result<()> {
597 match output_format {
598 OutputFormat::Default => self.write_human(stdout())?,
599 OutputFormat::PowerShell => self.write_pwsh(stdout())?,
600 OutputFormat::Shell => self.write_shell(stdout())?,
601 OutputFormat::Yaml | OutputFormat::Symbol => self.write_yaml(stdout())?,
602 }
603 Ok(())
604 }
605
606 fn write_human(&self, mut writer: impl io::Write) -> anyhow::Result<()> {
607 writeln!(
608 writer,
609 "Identical {} files of {}:",
610 self.paths.len(),
611 crate::human_readable_size(self.size)
612 )?;
613 for path in &self.paths {
614 writeln!(writer, " {}", path.display())?;
615 }
616 Ok(())
617 }
618
619 fn write_yaml(&self, mut writer: impl io::Write) -> anyhow::Result<()> {
620 writeln!(writer, "- paths:")?;
621 for path in &self.paths {
622 writeln!(writer, " - {:?}", path)?;
623 }
624 writeln!(writer, " size: {}", self.size)?;
625 Ok(())
626 }
627
628 fn write_shell(&self, writer: impl io::Write) -> anyhow::Result<()> {
629 self.write_shell_with(writer, "cp", Self::escape_shell)
630 }
631
632 fn write_pwsh(&self, writer: impl io::Write) -> anyhow::Result<()> {
633 self.write_shell_with(writer, "Copy-Item -LiteralPath", Self::escape_shell_double)
634 }
635
636 fn write_shell_with(
637 &self,
638 mut writer: impl io::Write,
639 cmd: &str,
640 stringify: impl Fn(&Path) -> String,
641 ) -> anyhow::Result<()> {
642 let mut iter = self.paths.iter();
643 if let Some(path0) = iter.next() {
644 let path0 = stringify(path0);
645 for path in iter {
646 writeln!(writer, "{cmd} '{path0}' '{}'", stringify(path))?;
647 }
648 }
649 Ok(())
650 }
651
652 fn escape_shell(path: &Path) -> String {
653 path.to_string_lossy().replace('\'', "\'\\'\'")
654 }
655
656 fn escape_shell_double(path: &Path) -> String {
657 path.to_string_lossy().replace('\'', "\'\'")
658 }
659}
660
661#[cfg(test)]
662mod tests {
663 use super::*;
664
665 fn default_exclude() -> globset::GlobSet {
666 let mut builder = globset::GlobSetBuilder::new();
667 builder.add(
668 globset::GlobBuilder::new(".hash_cache")
669 .case_insensitive(true)
670 .build()
671 .unwrap(),
672 );
673 builder.build().unwrap()
674 }
675
676 #[test]
677 fn find_duplicates() -> anyhow::Result<()> {
678 let dir = tempfile::tempdir()?;
679
680 let file1_path = dir.path().join("same1.txt");
681 fs::write(&file1_path, "same content")?;
682
683 let file2_path = dir.path().join("same2.txt");
684 fs::write(&file2_path, "same content")?;
685
686 let diff_path = dir.path().join("diff.txt");
687 fs::write(&diff_path, "different content")?;
688
689 let mut hasher = FileHasher::new(&[dir.path()])?;
690 hasher.buffer_size = 8192;
691 let duplicates = hasher.find_duplicates()?;
692
693 assert_eq!(hasher.num_hashed.load(Ordering::Relaxed), 2);
694 assert_eq!(hasher.num_hash_looked_up.load(Ordering::Relaxed), 0);
695
696 assert_eq!(duplicates.len(), 1);
697 let group = &duplicates[0];
698 assert_eq!(group.paths.len(), 2);
699 assert_eq!(group.size, 12); assert!(group.paths.contains(&file1_path));
702 assert!(group.paths.contains(&file2_path));
703
704 Ok(())
705 }
706
    /// Runs the duplicate search first on `a/a` and then on its parent `a`,
    /// each with a cache file present, and checks that the second run is
    /// served entirely from cached hashes and that the nested cache file is
    /// gone afterwards.
    #[test]
    fn find_duplicates_merge_cache() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;
        let dir_path = dir.path();

        let sub_dir = dir_path.join("a").join("a");
        fs::create_dir_all(&sub_dir)?;

        let file1_path = sub_dir.join("1");
        fs::write(&file1_path, "same content")?;

        let file2_path = sub_dir.join("2");
        fs::write(&file2_path, "same content")?;

        // Place an empty cache file directly in the nested directory.
        let cache_aa_path = sub_dir.join(FileHashCache::FILE_NAME);
        fs::File::create(&cache_aa_path)?;

        // First run: nothing is cached yet, so both files are hashed.
        let hasher_aa = FileHasher::new(&[&sub_dir])?;
        let duplicates_aa = hasher_aa.find_duplicates()?;
        assert_eq!(duplicates_aa.len(), 1);
        assert!(cache_aa_path.exists());
        assert_eq!(hasher_aa.num_hashed.load(Ordering::Relaxed), 2);
        assert_eq!(hasher_aa.num_hash_looked_up.load(Ordering::Relaxed), 0);

        // Now place an empty cache file in the parent directory.
        let root_a = dir_path.join("a");
        let cache_a_path = root_a.join(FileHashCache::FILE_NAME);
        fs::File::create(&cache_a_path)?;

        // Second run from the parent: both hashes come from the cache.
        let hasher_a = FileHasher::new(&[&root_a])?;
        let duplicates_a = hasher_a.find_duplicates()?;
        assert_eq!(duplicates_a.len(), 1);
        assert_eq!(hasher_a.num_hashed.load(Ordering::Relaxed), 0);
        assert_eq!(hasher_a.num_hash_looked_up.load(Ordering::Relaxed), 2);

        // Only the parent's cache file remains.
        assert!(cache_a_path.exists());
        assert!(!cache_aa_path.exists());

        Ok(())
    }
751
752 #[test]
753 fn find_duplicates_with_exclude() -> anyhow::Result<()> {
754 let dir = tempfile::tempdir()?;
755
756 let file1_path = dir.path().join("same1.txt");
757 fs::write(&file1_path, "same content")?;
758
759 let file2_path = dir.path().join("same2.txt");
760 fs::write(&file2_path, "same content")?;
761
762 let exclude_path = dir.path().join("exclude.txt");
763 fs::write(&exclude_path, "same content")?;
764
765 let mut hasher = FileHasher::new(&[dir.path()])?;
766 hasher.buffer_size = 8192;
767 let mut builder = globset::GlobSetBuilder::new();
768 builder.add(
769 globset::GlobBuilder::new("exclude.txt")
770 .case_insensitive(true)
771 .build()?,
772 );
773 let filter = builder.build()?;
774 hasher.exclude = Some(filter);
775
776 let duplicates = hasher.find_duplicates()?;
777 assert_eq!(duplicates.len(), 1);
778 let group = &duplicates[0];
779 assert_eq!(group.paths.len(), 2);
780 assert!(group.paths.contains(&file1_path));
781 assert!(group.paths.contains(&file2_path));
782 assert!(!group.paths.contains(&exclude_path));
783 Ok(())
784 }
785
786 #[derive(Default)]
787 struct CheckCollector {
788 start_seen: bool,
789 total_files: Option<u64>,
790 results: Vec<(PathBuf, CheckStatus)>,
791 file_done_count: u64,
792 num_error: usize,
793 }
794
795 impl CheckCollector {
796 fn collect(rx: mpsc::Receiver<CheckEvent>, base_dir: &Path) -> Self {
797 let mut collector = Self::default();
798 collector._collect(rx, base_dir);
799 collector
800 }
801
802 fn _collect(&mut self, rx: mpsc::Receiver<CheckEvent>, base_dir: &Path) {
803 while let Ok(event) = rx.recv() {
804 match event {
805 CheckEvent::StartChecking => self.start_seen = true,
806 CheckEvent::Total(total) => self.total_files = Some(total.num_files),
807 CheckEvent::Result(file, status, _size) => {
808 let stripped = file.path().strip_prefix(base_dir).unwrap().to_path_buf();
809 self.results.push((stripped, status));
810 }
811 CheckEvent::Progress(progress_val) => {
812 self.file_done_count += progress_val.num_files;
813 }
814 CheckEvent::Error(_) => {
815 self.num_error += 1;
816 }
817 }
818 }
819 }
820 }
821
    /// Without an existing cache and without `update`, every file is reported
    /// as `New` and no cache file is written.
    #[test]
    fn check_mode_empty_cache() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;
        let dir_path = dir.path().to_path_buf();
        println!("{:?}", dir_path);
        let file1_path = dir.path().join("file1.txt");
        fs::write(&file1_path, "content 1")?;
        let file2_path = dir.path().join("file2.txt");
        fs::write(&file2_path, "content 2")?;

        let mut hasher = FileHasher::new(&[&dir_path])?;
        hasher.exclude = Some(default_exclude());
        let (tx, rx) = mpsc::channel();
        // `check_streaming` consumes `tx`, so the channel is closed (and fully
        // buffered) by the time the collector drains it.
        hasher.check_streaming(tx, false)?;
        let collector = CheckCollector::collect(rx, &dir_path);
        assert!(collector.start_seen);
        assert_eq!(collector.total_files, Some(2));
        assert_eq!(collector.file_done_count, 0);
        assert_eq!(collector.num_error, 0);

        // Worker completion order is not deterministic; sort before comparing.
        let mut results = collector.results;
        results.sort_by(|a, b| a.0.cmp(&b.0));
        assert_eq!(results.len(), 2);
        assert_eq!(results[0], (PathBuf::from("file1.txt"), CheckStatus::New));
        assert_eq!(results[1], (PathBuf::from("file2.txt"), CheckStatus::New));

        assert!(!dir.path().join(FileHashCache::FILE_NAME).exists());
        Ok(())
    }
851
    /// With a populated cache, unchanged files only produce progress, a file
    /// with changed content is reported as `Modified`, and a file rewritten
    /// with identical content (newer mtime) is not reported.
    #[test]
    fn check_mode_with_cache() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;
        let dir_path = dir.path().to_path_buf();
        let file1_path = dir.path().join("file1.txt");
        let file2_path = dir.path().join("file2.txt");
        fs::write(&file1_path, "content 1")?;
        fs::write(&file2_path, "content 2")?;
        let file1 = FileItem::try_from(file1_path.as_path())?;
        let file2 = FileItem::try_from(file2_path.as_path())?;

        // Populate and persist the cache with both files' hashes.
        let mut hasher = FileHasher::new_with_cache(&[&dir_path])?;
        hasher.exclude = Some(default_exclude());
        let _hash1 = hasher.get_hash(&file1)?;
        let _hash2 = hasher.get_hash(&file2)?;
        hasher.save_cache()?;
        assert!(dir.path().join(FileHashCache::FILE_NAME).exists());

        // Nothing changed: no results, both files counted as done.
        let mut hasher = FileHasher::new(&[&dir_path])?;
        hasher.exclude = Some(default_exclude());
        let (tx, rx) = mpsc::channel();
        hasher.check_streaming(tx, false)?;
        let collector = CheckCollector::collect(rx, &dir_path);
        assert_eq!(collector.results.len(), 0);
        assert_eq!(collector.file_done_count, 2);

        // Change the content (and size) of file1.
        fs::write(&file1_path, "content 1 modified")?;

        // Rewrite file2 with the same content so only its mtime changes.
        let file2_meta_before = fs::metadata(&file2_path)?;
        let mtime_before = file2_meta_before.modified()?;
        std::thread::sleep(time::Duration::from_millis(10));
        fs::write(&file2_path, "content 2")?;
        let file2_meta_after = fs::metadata(&file2_path)?;
        let mtime_after = file2_meta_after.modified()?;
        assert!(mtime_after > mtime_before);

        // Only file1 is reported; file2 still matches its cached hash.
        let mut hasher = FileHasher::new(&[&dir_path])?;
        hasher.exclude = Some(default_exclude());
        let (tx, rx) = mpsc::channel();
        hasher.check_streaming(tx, false)?;
        let collector = CheckCollector::collect(rx, &dir_path);
        assert_eq!(collector.results.len(), 1);
        let results = collector.results;
        assert_eq!(
            results[0],
            (PathBuf::from("file1.txt"), CheckStatus::Modified)
        );
        assert_eq!(collector.file_done_count, 1);
        Ok(())
    }
902
    /// With `update` set, checking creates the cache, replaces the hash of a
    /// file whose content changed, and refreshes the entry of a file whose
    /// content is unchanged but whose mtime moved forward.
    #[test]
    fn check_update_mode() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;
        let dir_path = dir.path().to_path_buf();
        let file1_path = dir.path().join("file1.txt");
        fs::write(&file1_path, "content 1")?;

        // First update run: the cache file is created and holds file1.
        let mut hasher = FileHasher::new(&[&dir_path])?;
        hasher.exclude = Some(default_exclude());
        let (tx, rx) = mpsc::channel();
        hasher.check_streaming(tx, true)?;
        let _ = CheckCollector::collect(rx, &dir_path);
        hasher.save_cache()?;
        assert!(dir.path().join(FileHashCache::FILE_NAME).exists());

        let cache = FileHashCache::new(&dir_path);
        let file1 = FileItem::try_from(file1_path.as_path())?;
        let hash1 = cache.get(&PathBuf::from("file1.txt"), &file1);
        assert!(hash1.is_some());

        // Change the content; the sleep keeps the new mtime distinct.
        std::thread::sleep(time::Duration::from_millis(10));
        fs::write(&file1_path, "content 1 modified")?;
        let file1_mod = FileItem::try_from(file1_path.as_path())?;

        // Second update run: the cached hash is replaced.
        let mut hasher = FileHasher::new(&[&dir_path])?;
        hasher.exclude = Some(default_exclude());
        let (tx, rx) = mpsc::channel();
        hasher.check_streaming(tx, true)?;
        let _ = CheckCollector::collect(rx, &dir_path);
        hasher.save_cache()?;

        let cache = FileHashCache::new(&dir_path);
        let hash_mod = cache.get(&PathBuf::from("file1.txt"), &file1_mod);
        assert!(hash_mod.is_some());
        assert_ne!(hash1, hash_mod);

        // Rewrite the same content: only the mtime moves forward.
        std::thread::sleep(time::Duration::from_millis(10));
        fs::write(&file1_path, "content 1 modified")?;
        let file1_mod2 = FileItem::try_from(file1_path.as_path())?;
        assert!(file1_mod2.modified() > file1_mod.modified());

        // The cache loaded before the rewrite has no hit for the newer file state.
        assert!(
            cache
                .get(&PathBuf::from("file1.txt"), &file1_mod2)
                .is_none()
        );

        // Third update run: the entry is refreshed for the new file state.
        let mut hasher = FileHasher::new(&[&dir_path])?;
        hasher.exclude = Some(default_exclude());
        let (tx, rx) = mpsc::channel();
        hasher.check_streaming(tx, true)?;
        let _ = CheckCollector::collect(rx, &dir_path);
        hasher.save_cache()?;

        let cache = FileHashCache::new(&dir_path);
        assert!(
            cache
                .get(&PathBuf::from("file1.txt"), &file1_mod2)
                .is_some()
        );
        Ok(())
    }
965
    /// After a file is deleted, the next update run drops its cache entry
    /// while keeping the entry of the file that still exists.
    #[test]
    fn check_cleanup_deleted_files() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;
        let dir_path = dir.path().to_path_buf();
        let file1_path = dir.path().join("file1.txt");
        let file2_path = dir.path().join("file2.txt");
        fs::write(&file1_path, "content 1")?;
        fs::write(&file2_path, "content 2")?;
        let file1 = FileItem::try_from(file1_path.as_path())?;
        let file2 = FileItem::try_from(file2_path.as_path())?;

        // First update run populates the cache with both files.
        let mut hasher = FileHasher::new(&[&dir_path])?;
        hasher.exclude = Some(default_exclude());
        let (tx, rx) = mpsc::channel();
        hasher.check_streaming(tx, true)?;
        let _ = CheckCollector::collect(rx, &dir_path);
        hasher.save_cache()?;

        let cache = FileHashCache::new(&dir_path);
        assert!(cache.get(&PathBuf::from("file1.txt"), &file1).is_some());
        assert!(cache.get(&PathBuf::from("file2.txt"), &file2).is_some());

        fs::remove_file(&file2_path)?;

        // Second update run no longer visits file2.
        let mut hasher = FileHasher::new(&[&dir_path])?;
        hasher.exclude = Some(default_exclude());
        let (tx, rx) = mpsc::channel();
        hasher.check_streaming(tx, true)?;
        let _ = CheckCollector::collect(rx, &dir_path);
        hasher.save_cache()?;

        // The stale entry is gone; the surviving file's entry is kept.
        let cache = FileHashCache::new(&dir_path);
        assert!(cache.get(&PathBuf::from("file2.txt"), &file2).is_none());
        assert!(cache.get(&PathBuf::from("file1.txt"), &file1).is_some());
        Ok(())
    }
1006
1007 #[test]
1008 fn find_duplicates_multiple_dirs() -> anyhow::Result<()> {
1009 let tmp = tempfile::tempdir()?;
1010 let dir1 = tmp.path().join("dir1");
1011 let dir2 = tmp.path().join("dir2");
1012 fs::create_dir(&dir1)?;
1013 fs::create_dir(&dir2)?;
1014 let file1_path = dir1.join("file1.txt");
1015 fs::write(&file1_path, "same content")?;
1016 let file2_path = dir2.join("file2.txt");
1017 fs::write(&file2_path, "same content")?;
1018 let hasher = FileHasher::new(&[&dir1, &dir2])?;
1019 let duplicates = hasher.find_duplicates()?;
1020 assert_eq!(duplicates.len(), 1);
1021 let group = &duplicates[0];
1022 assert_eq!(group.paths.len(), 2);
1023 assert_eq!(group.size, 12);
1024 assert!(group.paths.contains(&file1_path));
1025 assert!(group.paths.contains(&file2_path));
1026
1027 Ok(())
1028 }
1029
1030 #[test]
1031 fn check_fails_with_multiple_dirs() -> anyhow::Result<()> {
1032 let tmp = tempfile::tempdir()?;
1033 let dir1 = tmp.path().join("dir1");
1034 let dir2 = tmp.path().join("dir2");
1035 fs::create_dir(&dir1)?;
1036 fs::create_dir(&dir2)?;
1037 let hasher = FileHasher::new(&[&dir1, &dir2])?;
1038 assert!(hasher.check(false).is_err());
1039 Ok(())
1040 }
1041
1042 #[test]
1043 fn escape_shell() {
1044 let escape_shell = |p: &str| DuplicatedFiles::escape_shell(Path::new(p));
1045 assert_eq!(escape_shell(""), "");
1046 assert_eq!(escape_shell("abc"), "abc");
1047 assert_eq!(escape_shell("a'b"), "a'\\''b");
1048 assert_eq!(escape_shell("a'b'"), "a'\\''b'\\''");
1049
1050 let escape_shell_double = |p: &str| DuplicatedFiles::escape_shell_double(Path::new(p));
1051 assert_eq!(escape_shell_double(""), "");
1052 assert_eq!(escape_shell_double("abc"), "abc");
1053 assert_eq!(escape_shell_double("a'b"), "a''b");
1054 assert_eq!(escape_shell_double("a'b'"), "a''b''");
1055 }
1056
1057 #[test]
1058 fn write_dups_shell_empty() -> anyhow::Result<()> {
1059 let dup_empty = DuplicatedFiles {
1060 paths: vec![],
1061 size: 100,
1062 };
1063 let mut buf = Vec::new();
1064 dup_empty.write_shell(&mut buf)?;
1065 assert_eq!(String::from_utf8(buf)?, "");
1066 Ok(())
1067 }
1068
1069 #[test]
1070 fn write_dups_shell_one() -> anyhow::Result<()> {
1071 let dup_one = DuplicatedFiles {
1072 paths: vec![PathBuf::from("a.txt")],
1073 size: 100,
1074 };
1075 let mut buf = Vec::new();
1076 dup_one.write_shell(&mut buf)?;
1077 assert_eq!(String::from_utf8(buf)?, "");
1078 Ok(())
1079 }
1080
1081 #[test]
1082 fn write_dups_shell_two() -> anyhow::Result<()> {
1083 let dup_multiple = DuplicatedFiles {
1084 paths: vec![PathBuf::from("a.txt"), PathBuf::from("b.txt")],
1085 size: 100,
1086 };
1087 let mut buf = Vec::new();
1088 dup_multiple.write_shell(&mut buf)?;
1089 assert_eq!(String::from_utf8(buf)?, "cp 'a.txt' 'b.txt'\n");
1090 Ok(())
1091 }
1092
1093 #[test]
1094 fn write_dups_shell_three() -> anyhow::Result<()> {
1095 let dup_multiple = DuplicatedFiles {
1096 paths: vec![
1097 PathBuf::from("a.txt"),
1098 PathBuf::from("b.txt"),
1099 PathBuf::from("c.txt"),
1100 ],
1101 size: 100,
1102 };
1103 let mut buf = Vec::new();
1104 dup_multiple.write_shell(&mut buf)?;
1105 assert_eq!(
1106 String::from_utf8(buf)?,
1107 "cp 'a.txt' 'b.txt'\ncp 'a.txt' 'c.txt'\n"
1108 );
1109 Ok(())
1110 }
1111
1112 #[test]
1113 fn write_dups_shell_quotes() -> anyhow::Result<()> {
1114 let dup_quotes = DuplicatedFiles {
1115 paths: vec![PathBuf::from("a'b.txt"), PathBuf::from("c'd.txt")],
1116 size: 100,
1117 };
1118 let mut buf = Vec::new();
1119 dup_quotes.write_shell(&mut buf)?;
1120 assert_eq!(String::from_utf8(buf)?, "cp 'a'\\''b.txt' 'c'\\''d.txt'\n");
1121
1122 let mut buf = Vec::new();
1123 dup_quotes.write_pwsh(&mut buf)?;
1124 assert_eq!(
1125 String::from_utf8(buf)?,
1126 "Copy-Item -LiteralPath 'a''b.txt' 'c''d.txt'\n"
1127 );
1128 Ok(())
1129 }
1130}