1use crate::{
2 ColumnFormatter, DirectoryComparer, FileComparer, FileHashCache, FileItem, FileIterator,
3 OutputFormat, Progress, ProgressBuilder,
4};
5use globset::GlobSet;
6use indicatif::FormattedDuration;
7use rayon::prelude::*;
8use simple_path::SimplePath;
9use std::{
10 collections::HashMap,
11 fs,
12 io::{self, Read, stdout},
13 path::{Path, PathBuf},
14 sync::{
15 Arc,
16 atomic::{AtomicUsize, Ordering},
17 mpsc,
18 },
19 time,
20};
21
/// A scanned file paired with the index, into `FileHasher::dirs`, of the directory it was
/// found under.
type FileWithDirIndex = (FileItem, usize);
23
/// Events sent by the duplicate-finding worker to the thread that collects results.
#[derive(Debug, Clone)]
enum DupEvent {
    /// Sent once, before any file is dispatched for hashing.
    StartHashing,
    /// Number of files that were dispatched for hashing.
    NumFiles(usize),
    /// A file and its hash, either taken from the cache or freshly computed.
    Result(FileItem, blake3::Hash),
    /// Hashing one file failed.
    Error,
}
31
/// Outcome of comparing a file's freshly computed hash against the cached one.
///
/// The derived ordering follows the declaration order of the variants.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
enum CheckStatus {
    /// The computed hash equals the cached hash.
    Unchanged,
    /// The cache has no hash for the file's path.
    New,
    /// The computed hash differs from the cached hash.
    Modified,
}
38
/// Events sent by the check worker to the thread that reports progress.
#[derive(Debug, PartialEq)]
enum CheckEvent {
    /// Sent once, after the directory scan has been started and before files are checked.
    StartChecking,
    /// Total number of files that were dispatched for checking.
    TotalFiles(usize),
    /// A new or modified file: its path relative to the checked directory, and its status.
    Result(PathBuf, CheckStatus),
    /// A file was checked and found unchanged.
    FileDone,
    /// Checking one file failed.
    Error,
}
47
/// Per-file-size bookkeeping used while looking for duplicate candidates.
enum DupState {
    /// Exactly one file of this size has been seen so far; it has not been hashed yet.
    /// Holds the file and the index of the directory it came from.
    Single(FileItem, usize),
    /// At least two files of this size have been seen; every one of them is sent for hashing.
    Hashing,
}
52
/// Hashes files under one or more directories to find duplicates or to check files against
/// a stored hash cache.
pub struct FileHasher {
    /// Directories to scan; never empty when constructed through `new`.
    dirs: Vec<PathBuf>,
    /// Read buffer size in bytes used when hashing; `0` selects memory-mapped reads.
    pub buffer_size: usize,
    /// Hash cache used by `get_hash`; created on demand by `cache()` or `new_with_cache`.
    cache: Option<Arc<FileHashCache>>,
    /// Number of hashes computed from file contents.
    num_hashed: AtomicUsize,
    /// Number of hashes served from a cache.
    num_hash_looked_up: AtomicUsize,
    /// Optional glob set handed to every `FileIterator` as its `exclude` filter.
    pub exclude: Option<GlobSet>,
    /// Optional progress reporter; when `None`, `Progress::none()` is used instead.
    pub progress: Option<Arc<ProgressBuilder>>,
    /// Output format used when printing duplicates and validated by `check`.
    pub output_format: OutputFormat,
    /// Value passed to `crate::build_thread_pool` for the hashing worker pool.
    pub jobs: usize,
}
65
66impl FileHasher {
    /// Initial value of `jobs`; the same constant as `DirectoryComparer::DEFAULT_JOBS`.
    const DEFAULT_JOBS: usize = DirectoryComparer::DEFAULT_JOBS;
68
69 pub fn new<P: AsRef<Path>>(dirs: &[P]) -> anyhow::Result<Self> {
71 if dirs.is_empty() {
72 anyhow::bail!("At least one directory must be specified.");
73 }
74 Ok(Self {
75 dirs: dirs.iter().map(|p| p.as_ref().to_path_buf()).collect(),
76 buffer_size: FileComparer::DEFAULT_BUFFER_SIZE,
77 cache: None,
78 num_hashed: AtomicUsize::new(0),
79 num_hash_looked_up: AtomicUsize::new(0),
80 exclude: None,
81 progress: None,
82 output_format: OutputFormat::Default,
83 jobs: Self::DEFAULT_JOBS,
84 })
85 }
86
87 pub(crate) fn new_with_cache<P: AsRef<Path>>(dirs: &[P]) -> anyhow::Result<Self> {
88 let mut hasher = Self::new(dirs)?;
89 hasher.cache = Some(hasher.new_cache()?);
90 Ok(hasher)
91 }
92
93 fn new_cache(&self) -> anyhow::Result<Arc<FileHashCache>> {
94 let common_ancestor = crate::common_ancestor(&self.dirs)
95 .ok_or_else(|| anyhow::anyhow!("No common ancestor found"))?;
96 Ok(FileHashCache::find_or_new(&common_ancestor))
97 }
98
99 pub(crate) fn cache(&mut self) -> anyhow::Result<Arc<FileHashCache>> {
101 if self.cache.is_none() {
102 self.cache = Some(self.new_cache()?);
103 }
104 Ok(Arc::clone(self.cache.as_ref().unwrap()))
105 }
106
107 pub(crate) fn remove_cache_entry(&mut self, path: &Path) -> anyhow::Result<()> {
109 let cache = self.cache()?;
110 let relative = SimplePath::strip_prefix(path, cache.base_dir())?;
111 cache.remove(relative);
112 Ok(())
113 }
114
115 pub fn save_cache(&self) -> anyhow::Result<()> {
117 log::info!(
118 "Hash stats for {:?}: {} computed, {} looked up",
119 self.dirs,
120 self.num_hashed.load(Ordering::Relaxed),
121 self.num_hash_looked_up.load(Ordering::Relaxed)
122 );
123 if let Some(cache) = &self.cache {
124 cache.save()?;
125 }
126 Ok(())
127 }
128
129 pub(crate) fn clear_cache(&mut self) -> anyhow::Result<()> {
131 let cache = self.cache()?;
132 for dir in &self.dirs {
133 let relative = SimplePath::strip_prefix(dir, cache.base_dir())?;
134 cache.clear(relative);
135 }
136 Ok(())
137 }
138
139 pub fn check(&self, update: bool) -> anyhow::Result<()> {
141 match self.output_format {
142 OutputFormat::Default | OutputFormat::Symbol => {}
143 _ => anyhow::bail!("Check mode only supports default or symbol output format."),
144 }
145 if self.dirs.len() > 1 {
146 anyhow::bail!("Check mode only supports one directory.");
147 }
148 let start_time = time::Instant::now();
149 let progress = self
150 .progress
151 .as_ref()
152 .map(|progress| progress.add_spinner())
153 .unwrap_or_else(Progress::none);
154 progress.set_message("Scanning directory...");
155 let mut num_new = 0;
156 let mut num_modified = 0;
157 let mut num_error = 0;
158 std::thread::scope(|scope| {
159 let (tx, rx) = mpsc::channel();
160 scope.spawn(|| {
161 if let Err(e) = self.check_streaming(tx, update) {
162 log::error!("Error during check: {}", e);
163 }
164 });
165 while let Ok(event) = rx.recv() {
166 match event {
167 CheckEvent::StartChecking => {
168 progress.set_message("Checking files...");
169 }
170 CheckEvent::TotalFiles(total) => {
171 progress.set_length(total as u64);
172 progress.set_message("");
173 }
174 CheckEvent::Result(path, status) => {
175 let symbol = match status {
176 CheckStatus::New => {
177 num_new += 1;
178 '+'
179 }
180 CheckStatus::Modified => {
181 num_modified += 1;
182 '!'
183 }
184 CheckStatus::Unchanged => unreachable!(),
185 };
186 progress.inc(1);
187 progress.suspend_for(stdout(), || {
188 println!("{} {}", symbol, path.display());
189 });
190 }
191 CheckEvent::FileDone => {
192 progress.inc(1);
193 }
194 CheckEvent::Error => {
195 progress.inc(1);
196 num_error += 1;
197 }
198 }
199 }
200 });
201 progress.finish();
202 self.print_check_summary(&start_time, num_new, num_modified, num_error)?;
203 Ok(())
204 }
205
206 fn print_check_summary(
207 &self,
208 start_time: &time::Instant,
209 num_new: usize,
210 num_modified: usize,
211 num_error: usize,
212 ) -> io::Result<()> {
213 let summary = [
214 ("Elapsed:", 0),
215 ("Hash computed:", self.num_hashed.load(Ordering::Relaxed)),
216 ("New files:", num_new),
217 ("Modified files:", num_modified),
218 ("Errors:", num_error),
219 ];
220 let formatter = ColumnFormatter::new(summary.iter().map(|(s, _)| *s));
221 let mut writer = std::io::stderr();
222 formatter.write_value(
223 &mut writer,
224 summary[0].0,
225 FormattedDuration(start_time.elapsed()),
226 )?;
227 formatter.write_values(&mut writer, &summary[1..])
228 }
229
230 fn check_streaming(&self, tx: mpsc::Sender<CheckEvent>, update: bool) -> anyhow::Result<()> {
231 assert_eq!(self.dirs.len(), 1);
232 let cache = self.new_cache()?;
233 let base_dir = &self.dirs[0];
234 let relative = SimplePath::strip_prefix(base_dir, cache.base_dir())?;
235 cache.set_remove_if_no_access(relative);
236 let cache_clone = Arc::clone(&cache);
237 std::thread::scope(|global_scope| {
238 let mut it = FileIterator::new(base_dir);
239 it.cache = Some(Arc::clone(&cache));
240 it.exclude = self.exclude.as_ref();
241 let it_rx = it.spawn_in_scope(global_scope);
242 tx.send(CheckEvent::StartChecking)?;
243 let pool = crate::build_thread_pool(self.jobs)?;
244 pool.scope(move |scope| -> anyhow::Result<()> {
245 let mut total_files = 0;
246 for file in it_rx {
247 total_files += 1;
248 let tx = tx.clone();
249 let cache = Arc::clone(&cache);
250 scope.spawn(move |_| {
251 let status = self.check_file(&file, &cache, update);
252 let event = match status {
253 Ok(CheckStatus::New) | Ok(CheckStatus::Modified) => {
254 let rel_path = file.relative_path(base_dir);
255 CheckEvent::Result(rel_path.into(), status.unwrap())
256 }
257 Ok(CheckStatus::Unchanged) => CheckEvent::FileDone,
258 Err(e) => {
259 log::error!("Failed to check file '{}': {}", file, e);
260 CheckEvent::Error
261 }
262 };
263 if tx.send(event).is_err() {
264 log::error!("Send failed");
265 }
266 });
267 }
268 tx.send(CheckEvent::TotalFiles(total_files))?;
269 Ok(())
270 })
271 })?;
272 cache_clone.save()?;
273 Ok(())
274 }
275
276 fn check_file(
277 &self,
278 file: &FileItem,
279 cache: &FileHashCache,
280 update: bool,
281 ) -> anyhow::Result<CheckStatus> {
282 assert!(file.path().is_absolute());
283 let computed_hash = self.compute_hash(file)?;
284 let rel_path = file.relative_path(cache.base_dir());
285 let cached_hash = cache.get_by_path(rel_path);
286 let status = match cached_hash {
287 None => CheckStatus::New,
288 Some(cached) => {
289 if computed_hash != cached {
290 CheckStatus::Modified
291 } else {
292 CheckStatus::Unchanged
293 }
294 }
295 };
296 if update {
297 let modified = file.modified();
298 match status {
299 CheckStatus::New | CheckStatus::Modified => {
300 cache.insert(rel_path, modified, computed_hash);
301 }
302 CheckStatus::Unchanged => {
303 if cache.get(rel_path, modified).is_none() {
304 cache.insert(rel_path, modified, computed_hash);
305 }
306 }
307 }
308 }
309 Ok(status)
310 }
311
312 pub fn run(&self) -> anyhow::Result<()> {
314 let start_time = time::Instant::now();
315 let mut duplicates = self.find_duplicates()?;
316 let mut total_wasted_space = 0;
317 if !duplicates.is_empty() {
318 duplicates.sort_by_key(|a| a.size);
319 total_wasted_space = self.print_duplicates_results(&duplicates)?;
320 }
321 self.print_duplicates_summary(&start_time, total_wasted_space)?;
322 Ok(())
323 }
324
325 fn print_duplicates_results(&self, duplicates: &Vec<DuplicatedFiles>) -> anyhow::Result<u64> {
326 let mut total_wasted_space = 0;
327 for dupes in duplicates {
328 dupes.print(self.output_format)?;
329 total_wasted_space += dupes.wasted_size();
330 }
331 Ok(total_wasted_space)
332 }
333
334 fn print_duplicates_summary(
335 &self,
336 start_time: &time::Instant,
337 total_wasted_space: u64,
338 ) -> io::Result<()> {
339 let elapsed = FormattedDuration(start_time.elapsed()).to_string();
340 let num_hashed = self.num_hashed.load(Ordering::Relaxed).to_string();
341 let total_wasted_space = crate::human_readable_size(total_wasted_space);
342 let summary = [
343 ("Elapsed:", elapsed),
344 ("Hash computed:", num_hashed),
345 ("Total wasted space:", total_wasted_space),
346 ];
347 let formatter = ColumnFormatter::new(summary.iter().map(|(s, _)| *s));
348 formatter.write_values(&mut io::stderr(), &summary)
349 }
350
351 pub fn find_duplicates(&self) -> anyhow::Result<Vec<DuplicatedFiles>> {
353 let progress = self
354 .progress
355 .as_ref()
356 .map(|progress| progress.add_spinner())
357 .unwrap_or_else(Progress::none);
358 progress.set_message("Scanning directories...");
359
360 let (tx, rx) = mpsc::channel();
361 let mut by_hash: HashMap<blake3::Hash, DuplicatedFiles> = HashMap::new();
362 std::thread::scope(|scope| {
363 scope.spawn(|| {
364 if let Err(e) = self.find_duplicates_streaming(tx) {
365 log::error!("Error during duplicate finding: {}", e);
366 }
367 });
368
369 while let Ok(event) = rx.recv() {
370 match event {
371 DupEvent::StartHashing => progress.set_message("Hashing files..."),
372 DupEvent::NumFiles(num) => progress.set_length(num as u64),
373 DupEvent::Result(file, hash) => {
374 progress.inc(1);
375 let entry = by_hash.entry(hash).or_insert_with(|| DuplicatedFiles {
376 paths: Vec::new(),
377 size: file.size(),
378 });
379 assert_eq!(
381 entry.size,
382 file.size(),
383 "Hash collision: sizes do not match"
384 );
385 entry.paths.push(file.into_path_buf());
386 }
387 DupEvent::Error => progress.inc(1),
388 }
389 }
390 });
391 progress.finish();
392
393 let mut duplicates = Vec::new();
394 for (_, mut dupes) in by_hash {
395 if dupes.paths.len() > 1 {
396 dupes.paths.sort();
397 duplicates.push(dupes);
398 }
399 }
400 Ok(duplicates)
401 }
402
403 fn find_duplicates_streaming(&self, tx: mpsc::Sender<DupEvent>) -> anyhow::Result<()> {
404 std::thread::scope(|global_scope| {
405 let (it_rx, caches) = self.stream_file_items(global_scope)?;
406 let caches = &caches;
407 let pool = crate::build_thread_pool(self.jobs)?;
408 pool.scope(move |scope| -> anyhow::Result<()> {
409 let mut by_size: HashMap<u64, DupState> = HashMap::new();
410 let mut num_hashed = 0;
411 tx.send(DupEvent::StartHashing)?;
412 for (file, dir_index) in it_rx {
413 let size = file.size();
414 if size == 0 {
415 continue;
416 }
417 let cache = &caches[dir_index];
418 match by_size.entry(size) {
419 std::collections::hash_map::Entry::Occupied(mut occ) => match occ.get_mut()
420 {
421 DupState::Single(file0, dir_index0) => {
422 let cache0 = &caches[*dir_index0];
425 self.send_hash(file0, cache0, &tx, scope);
426 self.send_hash(&file, cache, &tx, scope);
427
428 *occ.get_mut() = DupState::Hashing;
430 num_hashed += 2;
431 }
432 DupState::Hashing => {
433 self.send_hash(&file, cache, &tx, scope);
435 num_hashed += 1;
436 }
437 },
438 std::collections::hash_map::Entry::Vacant(vac) => {
439 vac.insert(DupState::Single(file, dir_index));
440 }
441 }
442 }
443 tx.send(DupEvent::NumFiles(num_hashed))?;
444 Ok(())
445 })?;
446 pool.install(|| caches.into_par_iter().try_for_each(|cache| cache.save()))?;
447 Ok::<(), anyhow::Error>(())
448 })?;
449 Ok(())
450 }
451
452 fn stream_file_items<'scope, 'env>(
453 &'env self,
454 scope: &'scope std::thread::Scope<'scope, 'env>,
455 ) -> anyhow::Result<(mpsc::Receiver<FileWithDirIndex>, Vec<Arc<FileHashCache>>)> {
456 let (it_tx, it_rx) = mpsc::channel();
457 let mut caches = Vec::with_capacity(self.dirs.len());
458 for (dir_index, dir) in self.dirs.iter().enumerate() {
459 let mut it = FileIterator::new(dir);
460 let cache = FileHashCache::find_or_new(dir);
461 it.cache = Some(Arc::clone(&cache));
462 it.exclude = self.exclude.as_ref();
463 let it_tx = it_tx.clone();
464 scope.spawn(move || it.send_to_as(it_tx, |path| (path, dir_index)));
465 caches.push(cache);
466 }
467 Ok((it_rx, caches))
468 }
469
470 fn send_hash<'scope>(
471 &'scope self,
472 file: &FileItem,
473 cache: &Arc<FileHashCache>,
474 tx: &mpsc::Sender<DupEvent>,
475 scope: &rayon::Scope<'scope>,
476 ) {
477 let (hash, relative) = self
478 .get_hash_from_cache(file, cache)
479 .expect("path should be in cache base_dir");
480 if let Some(hash) = hash {
481 let _ = tx.send(DupEvent::Result(file.clone(), hash));
482 return;
483 }
484
485 let file = file.clone();
486 let relative = relative.to_path_buf();
487 let tx = tx.clone();
488 let cache = Arc::clone(cache);
489 scope.spawn(move |_| {
490 if let Ok(hash) = self.compute_hash(&file) {
491 cache.insert(&relative, file.modified(), hash);
492 let _ = tx.send(DupEvent::Result(file, hash));
493 } else {
494 log::error!("Failed to hash file: '{}'", file);
495 let _ = tx.send(DupEvent::Error);
496 }
497 });
498 }
499
500 pub fn get_hash(&self, file: &FileItem) -> anyhow::Result<blake3::Hash> {
502 let cache = self.cache.as_ref().expect("cache should be initialized");
503 let (hash, relative) = self.get_hash_from_cache(file, cache)?;
504 if let Some(hash) = hash {
505 return Ok(hash);
506 }
507
508 let hash = self.compute_hash(file)?;
509 cache.insert(relative, file.modified(), hash);
510 Ok(hash)
511 }
512
513 fn get_hash_from_cache<'a>(
514 &self,
515 file: &'a FileItem,
516 cache: &FileHashCache,
517 ) -> io::Result<(Option<blake3::Hash>, &'a Path)> {
518 let relative = file.relative_path(cache.base_dir());
519 if let Some(hash) = cache.get(relative, file.modified()) {
520 self.num_hash_looked_up.fetch_add(1, Ordering::Relaxed);
521 return Ok((Some(hash), relative));
522 }
523 Ok((None, relative))
524 }
525
526 fn compute_hash(&self, file: &FileItem) -> io::Result<blake3::Hash> {
527 let start_time = time::Instant::now();
528 let mut f = fs::File::open(file.path())?;
529 let progress = self
530 .progress
531 .as_ref()
532 .map(|progress| progress.add_file(file.path(), file.size()))
533 .unwrap_or_else(Progress::none);
534 let mut hasher = blake3::Hasher::new();
535 if self.buffer_size == 0 {
536 if file.size() > 0 {
537 let mmap = unsafe { memmap2::MmapOptions::new().map(&f)? };
538 hasher.update(&mmap[..]);
539 progress.inc(file.size());
540 }
541 } else {
542 let mut buf = vec![0u8; self.buffer_size];
543 loop {
544 let n = f.read(&mut buf)?;
545 if n == 0 {
546 break;
547 }
548 hasher.update(&buf[..n]);
549 progress.inc(n as u64);
550 }
551 }
552 progress.finish();
553 self.num_hashed.fetch_add(1, Ordering::Relaxed);
554 let hash = hasher.finalize();
555 log::debug!(
556 "Computed hash in {}: '{}'",
557 FormattedDuration(start_time.elapsed()),
558 file
559 );
560 Ok(hash)
561 }
562}
563
/// A group of files that share the same content hash.
#[derive(Clone, Debug)]
pub struct DuplicatedFiles {
    /// Paths of the files in the group.
    pub paths: Vec<PathBuf>,
    /// Size in bytes of each file in the group.
    pub size: u64,
}
570
571impl DuplicatedFiles {
572 fn wasted_size(&self) -> u64 {
573 self.size * (self.paths.len() as u64 - 1)
574 }
575
576 fn print(&self, output_format: OutputFormat) -> anyhow::Result<()> {
577 match output_format {
578 OutputFormat::Default => self.write_human(stdout())?,
579 OutputFormat::PowerShell => self.write_pwsh(stdout())?,
580 OutputFormat::Shell => self.write_shell(stdout())?,
581 OutputFormat::Yaml | OutputFormat::Symbol => self.write_yaml(stdout())?,
582 }
583 Ok(())
584 }
585
586 fn write_human(&self, mut writer: impl io::Write) -> anyhow::Result<()> {
587 writeln!(
588 writer,
589 "Identical {} files of {}:",
590 self.paths.len(),
591 crate::human_readable_size(self.size)
592 )?;
593 for path in &self.paths {
594 writeln!(writer, " {}", path.display())?;
595 }
596 Ok(())
597 }
598
599 fn write_yaml(&self, mut writer: impl io::Write) -> anyhow::Result<()> {
600 writeln!(writer, "- paths:")?;
601 for path in &self.paths {
602 writeln!(writer, " - {:?}", path)?;
603 }
604 writeln!(writer, " size: {}", self.size)?;
605 Ok(())
606 }
607
608 fn write_shell(&self, writer: impl io::Write) -> anyhow::Result<()> {
609 self.write_shell_with(writer, "cp", Self::escape_shell)
610 }
611
612 fn write_pwsh(&self, writer: impl io::Write) -> anyhow::Result<()> {
613 self.write_shell_with(writer, "Copy-Item -LiteralPath", Self::escape_shell_double)
614 }
615
616 fn write_shell_with(
617 &self,
618 mut writer: impl io::Write,
619 cmd: &str,
620 stringify: impl Fn(&Path) -> String,
621 ) -> anyhow::Result<()> {
622 let mut iter = self.paths.iter();
623 if let Some(path0) = iter.next() {
624 let path0 = stringify(path0);
625 for path in iter {
626 writeln!(writer, "{cmd} '{path0}' '{}'", stringify(path))?;
627 }
628 }
629 Ok(())
630 }
631
632 fn escape_shell(path: &Path) -> String {
633 path.to_string_lossy().replace('\'', "\'\\'\'")
634 }
635
636 fn escape_shell_double(path: &Path) -> String {
637 path.to_string_lossy().replace('\'', "\'\'")
638 }
639}
640
641#[cfg(test)]
642mod tests {
643 use super::*;
644
645 fn default_exclude() -> globset::GlobSet {
646 let mut builder = globset::GlobSetBuilder::new();
647 builder.add(
648 globset::GlobBuilder::new(".hash_cache")
649 .case_insensitive(true)
650 .build()
651 .unwrap(),
652 );
653 builder.build().unwrap()
654 }
655
656 #[test]
657 fn find_duplicates() -> anyhow::Result<()> {
658 let dir = tempfile::tempdir()?;
659
660 let file1_path = dir.path().join("same1.txt");
661 fs::write(&file1_path, "same content")?;
662
663 let file2_path = dir.path().join("same2.txt");
664 fs::write(&file2_path, "same content")?;
665
666 let diff_path = dir.path().join("diff.txt");
667 fs::write(&diff_path, "different content")?;
668
669 let mut hasher = FileHasher::new(&[dir.path()])?;
670 hasher.buffer_size = 8192;
671 let duplicates = hasher.find_duplicates()?;
672
673 assert_eq!(hasher.num_hashed.load(Ordering::Relaxed), 2);
674 assert_eq!(hasher.num_hash_looked_up.load(Ordering::Relaxed), 0);
675
676 assert_eq!(duplicates.len(), 1);
677 let group = &duplicates[0];
678 assert_eq!(group.paths.len(), 2);
679 assert_eq!(group.size, 12); assert!(group.paths.contains(&file1_path));
682 assert!(group.paths.contains(&file2_path));
683
684 Ok(())
685 }
686
687 #[test]
688 fn find_duplicates_merge_cache() -> anyhow::Result<()> {
689 let dir = tempfile::tempdir()?;
690 let dir_path = dir.path();
691
692 let sub_dir = dir_path.join("a").join("a");
693 fs::create_dir_all(&sub_dir)?;
694
695 let file1_path = sub_dir.join("1");
696 fs::write(&file1_path, "same content")?;
697
698 let file2_path = sub_dir.join("2");
699 fs::write(&file2_path, "same content")?;
700
701 let cache_aa_path = sub_dir.join(FileHashCache::FILE_NAME);
703 fs::File::create(&cache_aa_path)?;
704
705 let hasher_aa = FileHasher::new(&[&sub_dir])?;
707 let duplicates_aa = hasher_aa.find_duplicates()?;
708 assert_eq!(duplicates_aa.len(), 1);
709 assert!(cache_aa_path.exists());
710 assert_eq!(hasher_aa.num_hashed.load(Ordering::Relaxed), 2);
711 assert_eq!(hasher_aa.num_hash_looked_up.load(Ordering::Relaxed), 0);
712
713 let root_a = dir_path.join("a");
715 let cache_a_path = root_a.join(FileHashCache::FILE_NAME);
716 fs::File::create(&cache_a_path)?;
717
718 let hasher_a = FileHasher::new(&[&root_a])?;
720 let duplicates_a = hasher_a.find_duplicates()?;
721 assert_eq!(duplicates_a.len(), 1);
722 assert_eq!(hasher_a.num_hashed.load(Ordering::Relaxed), 0);
723 assert_eq!(hasher_a.num_hash_looked_up.load(Ordering::Relaxed), 2);
724
725 assert!(cache_a_path.exists());
727 assert!(!cache_aa_path.exists());
728
729 Ok(())
730 }
731
732 #[test]
733 fn find_duplicates_with_exclude() -> anyhow::Result<()> {
734 let dir = tempfile::tempdir()?;
735
736 let file1_path = dir.path().join("same1.txt");
737 fs::write(&file1_path, "same content")?;
738
739 let file2_path = dir.path().join("same2.txt");
740 fs::write(&file2_path, "same content")?;
741
742 let exclude_path = dir.path().join("exclude.txt");
743 fs::write(&exclude_path, "same content")?;
744
745 let mut hasher = FileHasher::new(&[dir.path()])?;
746 hasher.buffer_size = 8192;
747 let mut builder = globset::GlobSetBuilder::new();
748 builder.add(
749 globset::GlobBuilder::new("exclude.txt")
750 .case_insensitive(true)
751 .build()?,
752 );
753 let filter = builder.build()?;
754 hasher.exclude = Some(filter);
755
756 let duplicates = hasher.find_duplicates()?;
757 assert_eq!(duplicates.len(), 1);
758 let group = &duplicates[0];
759 assert_eq!(group.paths.len(), 2);
760 assert!(group.paths.contains(&file1_path));
761 assert!(group.paths.contains(&file2_path));
762 assert!(!group.paths.contains(&exclude_path));
763 Ok(())
764 }
765
766 #[test]
767 fn check_mode_empty_cache() -> anyhow::Result<()> {
768 let dir = tempfile::tempdir()?;
769 let dir_path = dir.path().to_path_buf();
770 println!("{:?}", dir_path);
771 let file1_path = dir.path().join("file1.txt");
772 fs::write(&file1_path, "content 1")?;
773 let file2_path = dir.path().join("file2.txt");
774 fs::write(&file2_path, "content 2")?;
775
776 let mut hasher = FileHasher::new(&[&dir_path])?;
777 hasher.exclude = Some(default_exclude());
778 let (tx, rx) = mpsc::channel();
779 hasher.check_streaming(tx, false)?;
780 let mut results = Vec::new();
781 let mut start_seen = false;
782 let mut total_files = None;
783 let mut file_done_count = 0;
784 let mut num_error = 0;
785 while let Ok(event) = rx.recv() {
786 match event {
787 CheckEvent::StartChecking => start_seen = true,
788 CheckEvent::TotalFiles(total) => total_files = Some(total),
789 CheckEvent::Result(path, status) => results.push((path, status)),
790 CheckEvent::FileDone => file_done_count += 1,
791 CheckEvent::Error => num_error += 1,
792 }
793 }
794 assert!(start_seen);
795 assert_eq!(total_files, Some(2));
796 assert_eq!(file_done_count, 0);
797 assert_eq!(num_error, 0);
798
799 results.sort_by(|a, b| a.0.cmp(&b.0));
800 assert_eq!(results.len(), 2);
801 assert_eq!(results[0], (PathBuf::from("file1.txt"), CheckStatus::New));
802 assert_eq!(results[1], (PathBuf::from("file2.txt"), CheckStatus::New));
803
804 assert!(!dir.path().join(FileHashCache::FILE_NAME).exists());
805 Ok(())
806 }
807
808 #[test]
809 fn check_mode_with_cache() -> anyhow::Result<()> {
810 let dir = tempfile::tempdir()?;
811 let dir_path = dir.path().to_path_buf();
812 let file1_path = dir.path().join("file1.txt");
813 let file2_path = dir.path().join("file2.txt");
814 fs::write(&file1_path, "content 1")?;
815 fs::write(&file2_path, "content 2")?;
816 let file1 = FileItem::try_from(file1_path.as_path())?;
817 let file2 = FileItem::try_from(file2_path.as_path())?;
818
819 let mut hasher = FileHasher::new_with_cache(&[&dir_path])?;
820 hasher.exclude = Some(default_exclude());
821 let _hash1 = hasher.get_hash(&file1)?;
822 let _hash2 = hasher.get_hash(&file2)?;
823 hasher.save_cache()?;
824 assert!(dir.path().join(FileHashCache::FILE_NAME).exists());
825
826 let mut hasher = FileHasher::new(&[&dir_path])?;
827 hasher.exclude = Some(default_exclude());
828 let (tx, rx) = mpsc::channel();
829 hasher.check_streaming(tx, false)?;
830 let mut results = Vec::new();
831 let mut file_done_count = 0;
832 while let Ok(event) = rx.recv() {
833 match event {
834 CheckEvent::Result(path, status) => results.push((path, status)),
835 CheckEvent::FileDone => file_done_count += 1,
836 _ => {}
837 }
838 }
839 assert_eq!(results.len(), 0);
840 assert_eq!(file_done_count, 2);
841
842 fs::write(&file1_path, "content 1 modified")?;
843
844 let file2_meta_before = fs::metadata(&file2_path)?;
845 let mtime_before = file2_meta_before.modified()?;
846 std::thread::sleep(time::Duration::from_millis(10));
847 fs::write(&file2_path, "content 2")?;
848 let file2_meta_after = fs::metadata(&file2_path)?;
849 let mtime_after = file2_meta_after.modified()?;
850 assert!(mtime_after > mtime_before);
851
852 let mut hasher = FileHasher::new(&[&dir_path])?;
853 hasher.exclude = Some(default_exclude());
854 let (tx, rx) = mpsc::channel();
855 hasher.check_streaming(tx, false)?;
856 let mut results = Vec::new();
857 let mut file_done_count = 0;
858 while let Ok(event) = rx.recv() {
859 match event {
860 CheckEvent::Result(path, status) => results.push((path, status)),
861 CheckEvent::FileDone => file_done_count += 1,
862 _ => {}
863 }
864 }
865 assert_eq!(results.len(), 1);
866 assert_eq!(
867 results[0],
868 (PathBuf::from("file1.txt"), CheckStatus::Modified)
869 );
870 assert_eq!(file_done_count, 1);
871 Ok(())
872 }
873
874 #[test]
875 fn check_update_mode() -> anyhow::Result<()> {
876 let dir = tempfile::tempdir()?;
877 let dir_path = dir.path().to_path_buf();
878 let file1_path = dir.path().join("file1.txt");
879 fs::write(&file1_path, "content 1")?;
880
881 let mut hasher = FileHasher::new(&[&dir_path])?;
882 hasher.exclude = Some(default_exclude());
883 let (tx, rx) = mpsc::channel();
884 hasher.check_streaming(tx, true)?;
885 while rx.recv().is_ok() {}
886 hasher.save_cache()?;
887 assert!(dir.path().join(FileHashCache::FILE_NAME).exists());
888
889 let cache = FileHashCache::new(&dir_path);
890 let mtime1 = fs::metadata(&file1_path)?.modified()?;
891 let hash1 = cache.get(&PathBuf::from("file1.txt"), mtime1);
892 assert!(hash1.is_some());
893
894 std::thread::sleep(time::Duration::from_millis(10));
895 fs::write(&file1_path, "content 1 modified")?;
896 let mtime1_mod = fs::metadata(&file1_path)?.modified()?;
897
898 let mut hasher = FileHasher::new(&[&dir_path])?;
899 hasher.exclude = Some(default_exclude());
900 let (tx, rx) = mpsc::channel();
901 hasher.check_streaming(tx, true)?;
902 while rx.recv().is_ok() {}
903 hasher.save_cache()?;
904
905 let cache = FileHashCache::new(&dir_path);
906 let hash_mod = cache.get(&PathBuf::from("file1.txt"), mtime1_mod);
907 assert!(hash_mod.is_some());
908 assert_ne!(hash1, hash_mod);
909
910 std::thread::sleep(time::Duration::from_millis(10));
911 fs::write(&file1_path, "content 1 modified")?;
912 let mtime1_mod2 = fs::metadata(&file1_path)?.modified()?;
913 assert!(mtime1_mod2 > mtime1_mod);
914
915 assert!(
916 cache
917 .get(&PathBuf::from("file1.txt"), mtime1_mod2)
918 .is_none()
919 );
920
921 let mut hasher = FileHasher::new(&[&dir_path])?;
922 hasher.exclude = Some(default_exclude());
923 let (tx, rx) = mpsc::channel();
924 hasher.check_streaming(tx, true)?;
925 while rx.recv().is_ok() {}
926 hasher.save_cache()?;
927
928 let cache = FileHashCache::new(&dir_path);
929 assert!(
930 cache
931 .get(&PathBuf::from("file1.txt"), mtime1_mod2)
932 .is_some()
933 );
934 Ok(())
935 }
936
937 #[test]
938 fn check_cleanup_deleted_files() -> anyhow::Result<()> {
939 let dir = tempfile::tempdir()?;
940 let dir_path = dir.path().to_path_buf();
941 let file1_path = dir.path().join("file1.txt");
942 let file2_path = dir.path().join("file2.txt");
943 fs::write(&file1_path, "content 1")?;
944 fs::write(&file2_path, "content 2")?;
945 let mtime1 = fs::metadata(&file1_path)?.modified()?;
946 let mtime2 = fs::metadata(&file2_path)?.modified()?;
947
948 let mut hasher = FileHasher::new(&[&dir_path])?;
949 hasher.exclude = Some(default_exclude());
950 let (tx, rx) = mpsc::channel();
951 hasher.check_streaming(tx, true)?;
952 while rx.recv().is_ok() {}
953 hasher.save_cache()?;
954
955 let cache = FileHashCache::new(&dir_path);
957 assert!(cache.get(&PathBuf::from("file1.txt"), mtime1).is_some());
958 assert!(cache.get(&PathBuf::from("file2.txt"), mtime2).is_some());
959
960 fs::remove_file(&file2_path)?;
962
963 let mut hasher = FileHasher::new(&[&dir_path])?;
965 hasher.exclude = Some(default_exclude());
966 let (tx, rx) = mpsc::channel();
967 hasher.check_streaming(tx, true)?;
968 while rx.recv().is_ok() {}
969 hasher.save_cache()?;
970
971 let cache = FileHashCache::new(&dir_path);
973 assert!(cache.get(&PathBuf::from("file2.txt"), mtime2).is_none());
974 assert!(cache.get(&PathBuf::from("file1.txt"), mtime1).is_some());
975 Ok(())
976 }
977
978 #[test]
979 fn find_duplicates_multiple_dirs() -> anyhow::Result<()> {
980 let tmp = tempfile::tempdir()?;
981 let dir1 = tmp.path().join("dir1");
982 let dir2 = tmp.path().join("dir2");
983 fs::create_dir(&dir1)?;
984 fs::create_dir(&dir2)?;
985 let file1_path = dir1.join("file1.txt");
986 fs::write(&file1_path, "same content")?;
987 let file2_path = dir2.join("file2.txt");
988 fs::write(&file2_path, "same content")?;
989 let hasher = FileHasher::new(&[&dir1, &dir2])?;
990 let duplicates = hasher.find_duplicates()?;
991 assert_eq!(duplicates.len(), 1);
992 let group = &duplicates[0];
993 assert_eq!(group.paths.len(), 2);
994 assert_eq!(group.size, 12);
995 assert!(group.paths.contains(&file1_path));
996 assert!(group.paths.contains(&file2_path));
997
998 Ok(())
999 }
1000
1001 #[test]
1002 fn check_fails_with_multiple_dirs() -> anyhow::Result<()> {
1003 let tmp = tempfile::tempdir()?;
1004 let dir1 = tmp.path().join("dir1");
1005 let dir2 = tmp.path().join("dir2");
1006 fs::create_dir(&dir1)?;
1007 fs::create_dir(&dir2)?;
1008 let hasher = FileHasher::new(&[&dir1, &dir2])?;
1009 assert!(hasher.check(false).is_err());
1010 Ok(())
1011 }
1012
1013 #[test]
1014 fn escape_shell() {
1015 let escape_shell = |p: &str| DuplicatedFiles::escape_shell(Path::new(p));
1016 assert_eq!(escape_shell(""), "");
1017 assert_eq!(escape_shell("abc"), "abc");
1018 assert_eq!(escape_shell("a'b"), "a'\\''b");
1019 assert_eq!(escape_shell("a'b'"), "a'\\''b'\\''");
1020
1021 let escape_shell_double = |p: &str| DuplicatedFiles::escape_shell_double(Path::new(p));
1022 assert_eq!(escape_shell_double(""), "");
1023 assert_eq!(escape_shell_double("abc"), "abc");
1024 assert_eq!(escape_shell_double("a'b"), "a''b");
1025 assert_eq!(escape_shell_double("a'b'"), "a''b''");
1026 }
1027
1028 #[test]
1029 fn write_dups_shell_empty() -> anyhow::Result<()> {
1030 let dup_empty = DuplicatedFiles {
1031 paths: vec![],
1032 size: 100,
1033 };
1034 let mut buf = Vec::new();
1035 dup_empty.write_shell(&mut buf)?;
1036 assert_eq!(String::from_utf8(buf)?, "");
1037 Ok(())
1038 }
1039
1040 #[test]
1041 fn write_dups_shell_one() -> anyhow::Result<()> {
1042 let dup_one = DuplicatedFiles {
1043 paths: vec![PathBuf::from("a.txt")],
1044 size: 100,
1045 };
1046 let mut buf = Vec::new();
1047 dup_one.write_shell(&mut buf)?;
1048 assert_eq!(String::from_utf8(buf)?, "");
1049 Ok(())
1050 }
1051
1052 #[test]
1053 fn write_dups_shell_two() -> anyhow::Result<()> {
1054 let dup_multiple = DuplicatedFiles {
1055 paths: vec![PathBuf::from("a.txt"), PathBuf::from("b.txt")],
1056 size: 100,
1057 };
1058 let mut buf = Vec::new();
1059 dup_multiple.write_shell(&mut buf)?;
1060 assert_eq!(String::from_utf8(buf)?, "cp 'a.txt' 'b.txt'\n");
1061 Ok(())
1062 }
1063
1064 #[test]
1065 fn write_dups_shell_three() -> anyhow::Result<()> {
1066 let dup_multiple = DuplicatedFiles {
1067 paths: vec![
1068 PathBuf::from("a.txt"),
1069 PathBuf::from("b.txt"),
1070 PathBuf::from("c.txt"),
1071 ],
1072 size: 100,
1073 };
1074 let mut buf = Vec::new();
1075 dup_multiple.write_shell(&mut buf)?;
1076 assert_eq!(
1077 String::from_utf8(buf)?,
1078 "cp 'a.txt' 'b.txt'\ncp 'a.txt' 'c.txt'\n"
1079 );
1080 Ok(())
1081 }
1082
1083 #[test]
1084 fn write_dups_shell_quotes() -> anyhow::Result<()> {
1085 let dup_quotes = DuplicatedFiles {
1086 paths: vec![PathBuf::from("a'b.txt"), PathBuf::from("c'd.txt")],
1087 size: 100,
1088 };
1089 let mut buf = Vec::new();
1090 dup_quotes.write_shell(&mut buf)?;
1091 assert_eq!(String::from_utf8(buf)?, "cp 'a'\\''b.txt' 'c'\\''d.txt'\n");
1092
1093 let mut buf = Vec::new();
1094 dup_quotes.write_pwsh(&mut buf)?;
1095 assert_eq!(
1096 String::from_utf8(buf)?,
1097 "Copy-Item -LiteralPath 'a''b.txt' 'c''d.txt'\n"
1098 );
1099 Ok(())
1100 }
1101}