1use crate::{
2 ColumnFormatter, DirectoryComparer, FileComparer, FileHashCache, FileIterator, OutputFormat,
3 Progress, ProgressBuilder,
4};
5use globset::GlobSet;
6use indicatif::FormattedDuration;
7use rayon::prelude::*;
8use simple_path::SimplePath;
9use std::{
10 collections::HashMap,
11 fs,
12 io::{self, Read, stdout},
13 path::{Path, PathBuf},
14 sync::{
15 Arc,
16 atomic::{AtomicUsize, Ordering},
17 mpsc,
18 },
19 time,
20};
21
/// A scanned file path paired with the index, within `FileHasher::dirs`, of the
/// directory whose scan produced it.
type FileItem = (PathBuf, usize);
23
/// Events sent from `find_duplicates_streaming` to the consumer loop in `find_duplicates`.
#[derive(Debug, Clone)]
enum DupEvent {
    /// Sent once before scanned files start being grouped by size.
    StartHashing,
    /// Number of files dispatched for hashing; sent after the scan loop ends and used
    /// by the consumer as the progress length.
    NumFiles(usize),
    /// A hash is available: `(path, file size in bytes, hash)`.
    Result(PathBuf, u64, blake3::Hash),
    /// Hashing one file failed.
    Error,
}
31
/// Outcome of comparing a file's freshly computed hash with the cached one.
///
/// `Ord`/`PartialOrd` are derived, so the declaration order of the variants is significant.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
enum CheckStatus {
    /// The computed hash equals the cached hash.
    Unchanged,
    /// The cache has no hash for this path.
    New,
    /// The computed hash differs from the cached hash.
    Modified,
}
38
/// Events sent from `check_streaming` to the consumer loop in `check`.
#[derive(Debug, PartialEq)]
enum CheckEvent {
    /// Sent once after the directory scan has been started.
    StartChecking,
    /// Total number of files received from the scan; sent after all have been dispatched.
    TotalFiles(usize),
    /// A file that is new or modified: `(path relative to the checked directory, status)`.
    Result(PathBuf, CheckStatus),
    /// A file was checked and found unchanged.
    FileDone,
    /// Checking a file failed.
    Error,
}
47
/// Per-size bookkeeping used by `find_duplicates_streaming`.
enum DupState {
    /// Exactly one file of this size has been seen so far and it has not been hashed
    /// yet: `(path, modification time, directory index)`.
    Single(PathBuf, time::SystemTime, usize),
    /// At least two files of this size were seen; every file of this size is hashed.
    Hashing,
}
52
/// Hashes files under one or more directories, either to find duplicates
/// (`run` / `find_duplicates`) or to compare files against cached hashes (`check`).
pub struct FileHasher {
    /// Directories to scan; `new` rejects an empty list.
    dirs: Vec<PathBuf>,
    /// Size in bytes of the read buffer used by `compute_hash`; `0` makes it
    /// memory-map the file instead of reading it in chunks.
    pub buffer_size: usize,
    /// Hash cache used by `get_hash` and the cache-maintenance methods; `None` until
    /// created by `new_with_cache` or `cache`.
    cache: Option<Arc<FileHashCache>>,
    /// Number of hashes actually computed by `compute_hash`.
    num_hashed: AtomicUsize,
    /// Number of hashes served from a cache by `get_hash_from_cache`.
    num_hash_looked_up: AtomicUsize,
    /// Optional glob set handed to each `FileIterator` as its `exclude` filter.
    pub exclude: Option<GlobSet>,
    /// Optional progress reporter factory; when `None`, `Progress::none()` is used.
    pub progress: Option<Arc<ProgressBuilder>>,
    /// Output format for results.
    pub output_format: OutputFormat,
    /// Value passed to `crate::build_thread_pool` when building the worker pool.
    pub jobs: usize,
}
65
66impl FileHasher {
67 const DEFAULT_JOBS: usize = DirectoryComparer::DEFAULT_JOBS;
68
69 pub fn new<P: AsRef<Path>>(dirs: &[P]) -> anyhow::Result<Self> {
71 if dirs.is_empty() {
72 anyhow::bail!("At least one directory must be specified.");
73 }
74 Ok(Self {
75 dirs: dirs.iter().map(|p| p.as_ref().to_path_buf()).collect(),
76 buffer_size: FileComparer::DEFAULT_BUFFER_SIZE,
77 cache: None,
78 num_hashed: AtomicUsize::new(0),
79 num_hash_looked_up: AtomicUsize::new(0),
80 exclude: None,
81 progress: None,
82 output_format: OutputFormat::Default,
83 jobs: Self::DEFAULT_JOBS,
84 })
85 }
86
87 pub(crate) fn new_with_cache<P: AsRef<Path>>(dirs: &[P]) -> anyhow::Result<Self> {
88 let mut hasher = Self::new(dirs)?;
89 hasher.cache = Some(hasher.new_cache()?);
90 Ok(hasher)
91 }
92
93 fn new_cache(&self) -> anyhow::Result<Arc<FileHashCache>> {
94 let common_ancestor = crate::common_ancestor(&self.dirs)
95 .ok_or_else(|| anyhow::anyhow!("No common ancestor found"))?;
96 Ok(FileHashCache::find_or_new(&common_ancestor))
97 }
98
99 pub(crate) fn cache(&mut self) -> anyhow::Result<Arc<FileHashCache>> {
101 if self.cache.is_none() {
102 self.cache = Some(self.new_cache()?);
103 }
104 Ok(Arc::clone(self.cache.as_ref().unwrap()))
105 }
106
107 pub(crate) fn remove_cache_entry(&mut self, path: &Path) -> anyhow::Result<()> {
109 let cache = self.cache()?;
110 let relative = SimplePath::strip_prefix(path, cache.base_dir())?;
111 cache.remove(relative);
112 Ok(())
113 }
114
115 pub fn save_cache(&self) -> anyhow::Result<()> {
117 log::info!(
118 "Hash stats for {:?}: {} computed, {} looked up",
119 self.dirs,
120 self.num_hashed.load(Ordering::Relaxed),
121 self.num_hash_looked_up.load(Ordering::Relaxed)
122 );
123 if let Some(cache) = &self.cache {
124 cache.save()?;
125 }
126 Ok(())
127 }
128
129 pub(crate) fn clear_cache(&mut self) -> anyhow::Result<()> {
131 let cache = self.cache()?;
132 for dir in &self.dirs {
133 let relative = SimplePath::strip_prefix(dir, cache.base_dir())?;
134 cache.clear(relative);
135 }
136 Ok(())
137 }
138
139 pub fn check(&self, update: bool) -> anyhow::Result<()> {
141 match self.output_format {
142 OutputFormat::Default | OutputFormat::Symbol => {}
143 _ => anyhow::bail!("Check mode only supports default or symbol output format."),
144 }
145 if self.dirs.len() > 1 {
146 anyhow::bail!("Check mode only supports one directory.");
147 }
148 let start_time = time::Instant::now();
149 let progress = self
150 .progress
151 .as_ref()
152 .map(|progress| progress.add_spinner())
153 .unwrap_or_else(Progress::none);
154 progress.set_message("Scanning directory...");
155 let mut num_new = 0;
156 let mut num_modified = 0;
157 let mut num_error = 0;
158 std::thread::scope(|scope| {
159 let (tx, rx) = mpsc::channel();
160 scope.spawn(|| {
161 if let Err(e) = self.check_streaming(tx, update) {
162 log::error!("Error during check: {}", e);
163 }
164 });
165 while let Ok(event) = rx.recv() {
166 match event {
167 CheckEvent::StartChecking => {
168 progress.set_message("Checking files...");
169 }
170 CheckEvent::TotalFiles(total) => {
171 progress.set_length(total as u64);
172 progress.set_message("");
173 }
174 CheckEvent::Result(path, status) => {
175 let symbol = match status {
176 CheckStatus::New => {
177 num_new += 1;
178 '+'
179 }
180 CheckStatus::Modified => {
181 num_modified += 1;
182 '!'
183 }
184 CheckStatus::Unchanged => unreachable!(),
185 };
186 progress.inc(1);
187 progress.suspend_for(stdout(), || {
188 println!("{} {}", symbol, path.display());
189 });
190 }
191 CheckEvent::FileDone => {
192 progress.inc(1);
193 }
194 CheckEvent::Error => {
195 progress.inc(1);
196 num_error += 1;
197 }
198 }
199 }
200 });
201 progress.finish();
202 self.print_check_summary(&start_time, num_new, num_modified, num_error)?;
203 Ok(())
204 }
205
206 fn print_check_summary(
207 &self,
208 start_time: &time::Instant,
209 num_new: usize,
210 num_modified: usize,
211 num_error: usize,
212 ) -> io::Result<()> {
213 let summary = [
214 ("Elapsed:", 0),
215 ("Hash computed:", self.num_hashed.load(Ordering::Relaxed)),
216 ("New files:", num_new),
217 ("Modified files:", num_modified),
218 ("Errors:", num_error),
219 ];
220 let formatter = ColumnFormatter::new(summary.iter().map(|(s, _)| *s));
221 let mut writer = std::io::stderr();
222 formatter.write_value(
223 &mut writer,
224 summary[0].0,
225 FormattedDuration(start_time.elapsed()),
226 )?;
227 formatter.write_values(&mut writer, &summary[1..])
228 }
229
230 fn check_streaming(&self, tx: mpsc::Sender<CheckEvent>, update: bool) -> anyhow::Result<()> {
231 assert_eq!(self.dirs.len(), 1);
232 let cache = self.new_cache()?;
233 let base_dir = &self.dirs[0];
234 let relative = SimplePath::strip_prefix(base_dir, cache.base_dir())?;
235 cache.set_remove_if_no_access(relative);
236 let cache_clone = Arc::clone(&cache);
237 std::thread::scope(|global_scope| {
238 let mut it = FileIterator::new(base_dir);
239 it.cache = Some(Arc::clone(&cache));
240 it.exclude = self.exclude.as_ref();
241 let it_rx = it.spawn_in_scope(global_scope);
242 tx.send(CheckEvent::StartChecking)?;
243 let pool = crate::build_thread_pool(self.jobs)?;
244 pool.scope(move |scope| -> anyhow::Result<()> {
245 let mut total_files = 0;
246 for path in it_rx {
247 total_files += 1;
248 let tx = tx.clone();
249 let cache = Arc::clone(&cache);
250 scope.spawn(move |_| {
251 let status = self.check_file(&path, &cache, update);
252 let event = match status {
253 Ok(CheckStatus::New) | Ok(CheckStatus::Modified) => {
254 let rel_path = SimplePath::strip_prefix(&path, base_dir).unwrap();
255 CheckEvent::Result(rel_path.into(), status.unwrap())
256 }
257 Ok(CheckStatus::Unchanged) => CheckEvent::FileDone,
258 Err(e) => {
259 log::error!("Failed to check file {:?}: {}", path, e);
260 CheckEvent::Error
261 }
262 };
263 if tx.send(event).is_err() {
264 log::error!("Send failed");
265 }
266 });
267 }
268 tx.send(CheckEvent::TotalFiles(total_files))?;
269 Ok(())
270 })
271 })?;
272 cache_clone.save()?;
273 Ok(())
274 }
275
276 fn check_file(
277 &self,
278 abs_path: &Path,
279 cache: &FileHashCache,
280 update: bool,
281 ) -> anyhow::Result<CheckStatus> {
282 assert!(abs_path.is_absolute());
283 let computed_hash = self.compute_hash(abs_path)?;
284 let rel_path = SimplePath::strip_prefix(abs_path, cache.base_dir())?;
285 let cached_hash = cache.get_by_path(rel_path);
286 let status = match cached_hash {
287 None => CheckStatus::New,
288 Some(cached) => {
289 if computed_hash != cached {
290 CheckStatus::Modified
291 } else {
292 CheckStatus::Unchanged
293 }
294 }
295 };
296 if update {
297 let modified = fs::metadata(abs_path)?.modified()?;
298 match status {
299 CheckStatus::New | CheckStatus::Modified => {
300 cache.insert(rel_path, modified, computed_hash);
301 }
302 CheckStatus::Unchanged => {
303 if cache.get(rel_path, modified).is_none() {
304 cache.insert(rel_path, modified, computed_hash);
305 }
306 }
307 }
308 }
309 Ok(status)
310 }
311
312 pub fn run(&self) -> anyhow::Result<()> {
314 let start_time = time::Instant::now();
315 let mut duplicates = self.find_duplicates()?;
316 let mut total_wasted_space = 0;
317 if !duplicates.is_empty() {
318 duplicates.sort_by_key(|a| a.size);
319 total_wasted_space = self.print_duplicates_results(&duplicates)?;
320 }
321 self.print_duplicates_summary(&start_time, total_wasted_space)?;
322 Ok(())
323 }
324
325 fn print_duplicates_results(&self, duplicates: &Vec<DuplicatedFiles>) -> anyhow::Result<u64> {
326 let mut total_wasted_space = 0;
327 for dupes in duplicates {
328 dupes.print(self.output_format)?;
329 total_wasted_space += dupes.wasted_size();
330 }
331 Ok(total_wasted_space)
332 }
333
334 fn print_duplicates_summary(
335 &self,
336 start_time: &time::Instant,
337 total_wasted_space: u64,
338 ) -> io::Result<()> {
339 let elapsed = FormattedDuration(start_time.elapsed()).to_string();
340 let num_hashed = self.num_hashed.load(Ordering::Relaxed).to_string();
341 let total_wasted_space = crate::human_readable_size(total_wasted_space);
342 let summary = [
343 ("Elapsed:", elapsed),
344 ("Hash computed:", num_hashed),
345 ("Total wasted space:", total_wasted_space),
346 ];
347 let formatter = ColumnFormatter::new(summary.iter().map(|(s, _)| *s));
348 formatter.write_values(&mut io::stderr(), &summary)
349 }
350
351 pub fn find_duplicates(&self) -> anyhow::Result<Vec<DuplicatedFiles>> {
353 let progress = self
354 .progress
355 .as_ref()
356 .map(|progress| progress.add_spinner())
357 .unwrap_or_else(Progress::none);
358 progress.set_message("Scanning directories...");
359
360 let (tx, rx) = mpsc::channel();
361 let mut by_hash: HashMap<blake3::Hash, DuplicatedFiles> = HashMap::new();
362 std::thread::scope(|scope| {
363 scope.spawn(|| {
364 if let Err(e) = self.find_duplicates_streaming(tx) {
365 log::error!("Error during duplicate finding: {}", e);
366 }
367 });
368
369 while let Ok(event) = rx.recv() {
370 match event {
371 DupEvent::StartHashing => progress.set_message("Hashing files..."),
372 DupEvent::NumFiles(num) => progress.set_length(num as u64),
373 DupEvent::Result(path, size, hash) => {
374 progress.inc(1);
375 let entry = by_hash.entry(hash).or_insert_with(|| DuplicatedFiles {
376 paths: Vec::new(),
377 size,
378 });
379 assert_eq!(entry.size, size, "Hash collision: sizes do not match");
381 entry.paths.push(path);
382 }
383 DupEvent::Error => progress.inc(1),
384 }
385 }
386 });
387 progress.finish();
388
389 let mut duplicates = Vec::new();
390 for (_, mut dupes) in by_hash {
391 if dupes.paths.len() > 1 {
392 dupes.paths.sort();
393 duplicates.push(dupes);
394 }
395 }
396 Ok(duplicates)
397 }
398
399 fn find_duplicates_streaming(&self, tx: mpsc::Sender<DupEvent>) -> anyhow::Result<()> {
400 std::thread::scope(|global_scope| {
401 let (it_rx, caches) = self.stream_file_items(global_scope)?;
402 let caches = &caches;
403 let pool = crate::build_thread_pool(self.jobs)?;
404 pool.scope(move |scope| -> anyhow::Result<()> {
405 let mut by_size: HashMap<u64, DupState> = HashMap::new();
406 let mut num_hashed = 0;
407 tx.send(DupEvent::StartHashing)?;
408 for (path, dir_index) in it_rx {
409 let meta = fs::metadata(&path)?;
410 let size = meta.len();
411 if size == 0 {
412 continue;
413 }
414 let modified = meta.modified()?;
415 let cache = &caches[dir_index];
416 match by_size.entry(size) {
417 std::collections::hash_map::Entry::Occupied(mut occ) => match occ.get_mut()
418 {
419 DupState::Single(path0, modified0, dir_index0) => {
420 let cache0 = &caches[*dir_index0];
423 self.send_hash(path0, size, *modified0, cache0, &tx, scope);
424 self.send_hash(&path, size, modified, cache, &tx, scope);
425
426 *occ.get_mut() = DupState::Hashing;
428 num_hashed += 2;
429 }
430 DupState::Hashing => {
431 self.send_hash(&path, size, modified, cache, &tx, scope);
433 num_hashed += 1;
434 }
435 },
436 std::collections::hash_map::Entry::Vacant(vac) => {
437 vac.insert(DupState::Single(path, modified, dir_index));
438 }
439 }
440 }
441 tx.send(DupEvent::NumFiles(num_hashed))?;
442 Ok(())
443 })?;
444 pool.install(|| caches.into_par_iter().try_for_each(|cache| cache.save()))?;
445 Ok::<(), anyhow::Error>(())
446 })?;
447 Ok(())
448 }
449
450 fn stream_file_items<'scope, 'env>(
451 &'env self,
452 scope: &'scope std::thread::Scope<'scope, 'env>,
453 ) -> anyhow::Result<(mpsc::Receiver<FileItem>, Vec<Arc<FileHashCache>>)> {
454 let (it_tx, it_rx) = mpsc::channel();
455 let mut caches = Vec::with_capacity(self.dirs.len());
456 for (dir_index, dir) in self.dirs.iter().enumerate() {
457 let mut it = FileIterator::new(dir);
458 let cache = FileHashCache::find_or_new(dir);
459 it.cache = Some(Arc::clone(&cache));
460 it.exclude = self.exclude.as_ref();
461 let it_tx = it_tx.clone();
462 scope.spawn(move || it.send_to_as(it_tx, |path| (path, dir_index)));
463 caches.push(cache);
464 }
465 Ok((it_rx, caches))
466 }
467
468 fn send_hash<'scope>(
469 &'scope self,
470 path: &Path,
471 size: u64,
472 modified: time::SystemTime,
473 cache: &Arc<FileHashCache>,
474 tx: &mpsc::Sender<DupEvent>,
475 scope: &rayon::Scope<'scope>,
476 ) {
477 let (hash, relative) = self
478 .get_hash_from_cache(path, modified, cache)
479 .expect("path should be in cache base_dir");
480 if let Some(hash) = hash {
481 let _ = tx.send(DupEvent::Result(path.to_path_buf(), size, hash));
482 return;
483 }
484
485 let path = path.to_path_buf();
486 let relative = relative.to_path_buf();
487 let tx = tx.clone();
488 let cache = Arc::clone(cache);
489 scope.spawn(move |_| {
490 if let Ok(hash) = self.compute_hash(&path) {
491 cache.insert(&relative, modified, hash);
492 let _ = tx.send(DupEvent::Result(path, size, hash));
493 } else {
494 log::error!("Failed to hash file: {:?}", path);
495 let _ = tx.send(DupEvent::Error);
496 }
497 });
498 }
499
500 pub fn get_hash(&self, path: &Path) -> io::Result<blake3::Hash> {
502 let cache = self.cache.as_ref().expect("cache should be initialized");
503 let meta = fs::metadata(path)?;
504 let modified = meta.modified()?;
505 let (hash, relative) = self.get_hash_from_cache(path, modified, cache)?;
506 if let Some(hash) = hash {
507 return Ok(hash);
508 }
509
510 let hash = self.compute_hash(path)?;
511 cache.insert(relative, modified, hash);
512 Ok(hash)
513 }
514
515 fn get_hash_from_cache<'a>(
516 &self,
517 path: &'a Path,
518 modified: time::SystemTime,
519 cache: &FileHashCache,
520 ) -> io::Result<(Option<blake3::Hash>, &'a Path)> {
521 let relative = SimplePath::strip_prefix(path, cache.base_dir())
522 .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;
523 if let Some(hash) = cache.get(relative, modified) {
524 self.num_hash_looked_up.fetch_add(1, Ordering::Relaxed);
525 return Ok((Some(hash), relative));
526 }
527 Ok((None, relative))
528 }
529
530 fn compute_hash(&self, path: &Path) -> io::Result<blake3::Hash> {
531 let start_time = time::Instant::now();
532 let mut f = fs::File::open(path)?;
533 let len = f.metadata()?.len();
534 let progress = self
535 .progress
536 .as_ref()
537 .map(|progress| progress.add_file(path, len))
538 .unwrap_or_else(Progress::none);
539 let mut hasher = blake3::Hasher::new();
540 if self.buffer_size == 0 {
541 if len > 0 {
542 let mmap = unsafe { memmap2::MmapOptions::new().map(&f)? };
543 hasher.update(&mmap[..]);
544 progress.inc(len);
545 }
546 } else {
547 let mut buf = vec![0u8; self.buffer_size];
548 loop {
549 let n = f.read(&mut buf)?;
550 if n == 0 {
551 break;
552 }
553 hasher.update(&buf[..n]);
554 progress.inc(n as u64);
555 }
556 }
557 progress.finish();
558 self.num_hashed.fetch_add(1, Ordering::Relaxed);
559 let hash = hasher.finalize();
560 log::debug!(
561 "Computed hash in {}: {:?}",
562 FormattedDuration(start_time.elapsed()),
563 path
564 );
565 Ok(hash)
566 }
567}
568
569#[derive(Clone, Debug)]
571pub struct DuplicatedFiles {
572 pub paths: Vec<PathBuf>,
573 pub size: u64,
574}
575
576impl DuplicatedFiles {
577 fn print(&self, output_format: OutputFormat) -> anyhow::Result<()> {
578 match output_format {
579 OutputFormat::Default => self.write_human(stdout())?,
580 OutputFormat::Yaml | OutputFormat::Symbol => self.write_yaml(stdout())?,
581 }
582 Ok(())
583 }
584
585 fn write_human(&self, mut writer: impl io::Write) -> anyhow::Result<()> {
586 writeln!(
587 writer,
588 "Identical {} files of {}:",
589 self.paths.len(),
590 crate::human_readable_size(self.size)
591 )?;
592 for path in &self.paths {
593 writeln!(writer, " {}", path.display())?;
594 }
595 Ok(())
596 }
597
598 fn write_yaml(&self, mut writer: impl io::Write) -> anyhow::Result<()> {
599 writeln!(writer, "- paths:")?;
600 for path in &self.paths {
601 writeln!(writer, " - {:?}", path)?;
602 }
603 writeln!(writer, " size: {}", self.size)?;
604 Ok(())
605 }
606
607 fn wasted_size(&self) -> u64 {
608 self.size * (self.paths.len() as u64 - 1)
609 }
610}
611
612#[cfg(test)]
613mod tests {
614 use super::*;
615
616 fn default_exclude() -> globset::GlobSet {
617 let mut builder = globset::GlobSetBuilder::new();
618 builder.add(
619 globset::GlobBuilder::new(".hash_cache")
620 .case_insensitive(true)
621 .build()
622 .unwrap(),
623 );
624 builder.build().unwrap()
625 }
626
627 #[test]
628 fn find_duplicates() -> anyhow::Result<()> {
629 let dir = tempfile::tempdir()?;
630
631 let file1_path = dir.path().join("same1.txt");
632 fs::write(&file1_path, "same content")?;
633
634 let file2_path = dir.path().join("same2.txt");
635 fs::write(&file2_path, "same content")?;
636
637 let diff_path = dir.path().join("diff.txt");
638 fs::write(&diff_path, "different content")?;
639
640 let mut hasher = FileHasher::new(&[dir.path()])?;
641 hasher.buffer_size = 8192;
642 let duplicates = hasher.find_duplicates()?;
643
644 assert_eq!(hasher.num_hashed.load(Ordering::Relaxed), 2);
645 assert_eq!(hasher.num_hash_looked_up.load(Ordering::Relaxed), 0);
646
647 assert_eq!(duplicates.len(), 1);
648 let group = &duplicates[0];
649 assert_eq!(group.paths.len(), 2);
650 assert_eq!(group.size, 12); assert!(group.paths.contains(&file1_path));
653 assert!(group.paths.contains(&file2_path));
654
655 Ok(())
656 }
657
    // Hashes computed by a run on a nested directory are reused by a later run on its
    // parent, after which only the parent's cache file is expected to remain.
    #[test]
    fn find_duplicates_merge_cache() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;
        let dir_path = dir.path();

        let sub_dir = dir_path.join("a").join("a");
        fs::create_dir_all(&sub_dir)?;

        let file1_path = sub_dir.join("1");
        fs::write(&file1_path, "same content")?;

        let file2_path = sub_dir.join("2");
        fs::write(&file2_path, "same content")?;

        // Pre-create an empty cache file in `a/a`.
        // NOTE(review): presumably this makes `FileHashCache::find_or_new` pick `a/a`
        // as the cache base directory — confirm.
        let cache_aa_path = sub_dir.join(FileHashCache::FILE_NAME);
        fs::File::create(&cache_aa_path)?;

        // First run on `a/a`: nothing is cached yet, so both files are hashed.
        let hasher_aa = FileHasher::new(&[&sub_dir])?;
        let duplicates_aa = hasher_aa.find_duplicates()?;
        assert_eq!(duplicates_aa.len(), 1);
        assert!(cache_aa_path.exists());
        assert_eq!(hasher_aa.num_hashed.load(Ordering::Relaxed), 2);
        assert_eq!(hasher_aa.num_hash_looked_up.load(Ordering::Relaxed), 0);

        // Pre-create an empty cache file in the parent directory `a`.
        let root_a = dir_path.join("a");
        let cache_a_path = root_a.join(FileHashCache::FILE_NAME);
        fs::File::create(&cache_a_path)?;

        // Second run on `a`: both hashes are looked up instead of being recomputed.
        let hasher_a = FileHasher::new(&[&root_a])?;
        let duplicates_a = hasher_a.find_duplicates()?;
        assert_eq!(duplicates_a.len(), 1);
        assert_eq!(hasher_a.num_hashed.load(Ordering::Relaxed), 0);
        assert_eq!(hasher_a.num_hash_looked_up.load(Ordering::Relaxed), 2);

        // The parent's cache file remains; the nested one must be gone.
        assert!(cache_a_path.exists());
        assert!(!cache_aa_path.exists());

        Ok(())
    }
702
703 #[test]
704 fn find_duplicates_with_exclude() -> anyhow::Result<()> {
705 let dir = tempfile::tempdir()?;
706
707 let file1_path = dir.path().join("same1.txt");
708 fs::write(&file1_path, "same content")?;
709
710 let file2_path = dir.path().join("same2.txt");
711 fs::write(&file2_path, "same content")?;
712
713 let exclude_path = dir.path().join("exclude.txt");
714 fs::write(&exclude_path, "same content")?;
715
716 let mut hasher = FileHasher::new(&[dir.path()])?;
717 hasher.buffer_size = 8192;
718 let mut builder = globset::GlobSetBuilder::new();
719 builder.add(
720 globset::GlobBuilder::new("exclude.txt")
721 .case_insensitive(true)
722 .build()?,
723 );
724 let filter = builder.build()?;
725 hasher.exclude = Some(filter);
726
727 let duplicates = hasher.find_duplicates()?;
728 assert_eq!(duplicates.len(), 1);
729 let group = &duplicates[0];
730 assert_eq!(group.paths.len(), 2);
731 assert!(group.paths.contains(&file1_path));
732 assert!(group.paths.contains(&file2_path));
733 assert!(!group.paths.contains(&exclude_path));
734 Ok(())
735 }
736
737 #[test]
738 fn check_mode_empty_cache() -> anyhow::Result<()> {
739 let dir = tempfile::tempdir()?;
740 let dir_path = dir.path().to_path_buf();
741 println!("{:?}", dir_path);
742 let file1_path = dir.path().join("file1.txt");
743 fs::write(&file1_path, "content 1")?;
744 let file2_path = dir.path().join("file2.txt");
745 fs::write(&file2_path, "content 2")?;
746
747 let mut hasher = FileHasher::new(&[&dir_path])?;
748 hasher.exclude = Some(default_exclude());
749 let (tx, rx) = mpsc::channel();
750 hasher.check_streaming(tx, false)?;
751 let mut results = Vec::new();
752 let mut start_seen = false;
753 let mut total_files = None;
754 let mut file_done_count = 0;
755 let mut num_error = 0;
756 while let Ok(event) = rx.recv() {
757 match event {
758 CheckEvent::StartChecking => start_seen = true,
759 CheckEvent::TotalFiles(total) => total_files = Some(total),
760 CheckEvent::Result(path, status) => results.push((path, status)),
761 CheckEvent::FileDone => file_done_count += 1,
762 CheckEvent::Error => num_error += 1,
763 }
764 }
765 assert!(start_seen);
766 assert_eq!(total_files, Some(2));
767 assert_eq!(file_done_count, 0);
768 assert_eq!(num_error, 0);
769
770 results.sort_by(|a, b| a.0.cmp(&b.0));
771 assert_eq!(results.len(), 2);
772 assert_eq!(results[0], (PathBuf::from("file1.txt"), CheckStatus::New));
773 assert_eq!(results[1], (PathBuf::from("file2.txt"), CheckStatus::New));
774
775 assert!(!dir.path().join(FileHashCache::FILE_NAME).exists());
776 Ok(())
777 }
778
    // With a populated cache, unchanged files produce `FileDone`; a content change is
    // reported as `Modified`, while a rewrite with identical content (newer mtime only)
    // still counts as unchanged.
    #[test]
    fn check_mode_with_cache() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;
        let dir_path = dir.path().to_path_buf();
        let file1_path = dir.path().join("file1.txt");
        let file2_path = dir.path().join("file2.txt");
        fs::write(&file1_path, "content 1")?;
        fs::write(&file2_path, "content 2")?;

        // Populate and persist the cache with the hashes of both files.
        let mut hasher = FileHasher::new_with_cache(&[&dir_path])?;
        hasher.exclude = Some(default_exclude());
        let _hash1 = hasher.get_hash(&file1_path)?;
        let _hash2 = hasher.get_hash(&file2_path)?;
        hasher.save_cache()?;
        assert!(dir.path().join(FileHashCache::FILE_NAME).exists());

        // First check: nothing changed, so there are no results and two `FileDone`s.
        let mut hasher = FileHasher::new(&[&dir_path])?;
        hasher.exclude = Some(default_exclude());
        let (tx, rx) = mpsc::channel();
        hasher.check_streaming(tx, false)?;
        let mut results = Vec::new();
        let mut file_done_count = 0;
        while let Ok(event) = rx.recv() {
            match event {
                CheckEvent::Result(path, status) => results.push((path, status)),
                CheckEvent::FileDone => file_done_count += 1,
                _ => {}
            }
        }
        assert_eq!(results.len(), 0);
        assert_eq!(file_done_count, 2);

        // Change the content of file1.
        fs::write(&file1_path, "content 1 modified")?;

        // Rewrite file2 with the same content; the sleep makes its mtime strictly newer.
        let file2_meta_before = fs::metadata(&file2_path)?;
        let mtime_before = file2_meta_before.modified()?;
        std::thread::sleep(time::Duration::from_millis(10));
        fs::write(&file2_path, "content 2")?;
        let file2_meta_after = fs::metadata(&file2_path)?;
        let mtime_after = file2_meta_after.modified()?;
        assert!(mtime_after > mtime_before);

        // Second check: only file1 is reported, as modified; file2 is unchanged.
        let mut hasher = FileHasher::new(&[&dir_path])?;
        hasher.exclude = Some(default_exclude());
        let (tx, rx) = mpsc::channel();
        hasher.check_streaming(tx, false)?;
        let mut results = Vec::new();
        let mut file_done_count = 0;
        while let Ok(event) = rx.recv() {
            match event {
                CheckEvent::Result(path, status) => results.push((path, status)),
                CheckEvent::FileDone => file_done_count += 1,
                _ => {}
            }
        }
        assert_eq!(results.len(), 1);
        assert_eq!(
            results[0],
            (PathBuf::from("file1.txt"), CheckStatus::Modified)
        );
        assert_eq!(file_done_count, 1);
        Ok(())
    }
842
    // With `update = true`, a check writes new hashes, replaces changed hashes and
    // refreshes the entry of an unchanged file whose mtime moved.
    #[test]
    fn check_update_mode() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;
        let dir_path = dir.path().to_path_buf();
        let file1_path = dir.path().join("file1.txt");
        fs::write(&file1_path, "content 1")?;

        // Step 1: the first update run creates the cache file with file1's hash.
        // `check_streaming` saves its own cache; this hasher holds none, so the
        // `save_cache` call below only logs statistics.
        let mut hasher = FileHasher::new(&[&dir_path])?;
        hasher.exclude = Some(default_exclude());
        let (tx, rx) = mpsc::channel();
        hasher.check_streaming(tx, true)?;
        while rx.recv().is_ok() {}
        hasher.save_cache()?;
        assert!(dir.path().join(FileHashCache::FILE_NAME).exists());

        let cache = FileHashCache::new(&dir_path);
        let mtime1 = fs::metadata(&file1_path)?.modified()?;
        let hash1 = cache.get(&PathBuf::from("file1.txt"), mtime1);
        assert!(hash1.is_some());

        // Step 2: change the content; the sleep makes the new mtime distinct.
        std::thread::sleep(time::Duration::from_millis(10));
        fs::write(&file1_path, "content 1 modified")?;
        let mtime1_mod = fs::metadata(&file1_path)?.modified()?;

        let mut hasher = FileHasher::new(&[&dir_path])?;
        hasher.exclude = Some(default_exclude());
        let (tx, rx) = mpsc::channel();
        hasher.check_streaming(tx, true)?;
        while rx.recv().is_ok() {}
        hasher.save_cache()?;

        // The cache now holds a different hash under the new mtime.
        let cache = FileHashCache::new(&dir_path);
        let hash_mod = cache.get(&PathBuf::from("file1.txt"), mtime1_mod);
        assert!(hash_mod.is_some());
        assert_ne!(hash1, hash_mod);

        // Step 3: rewrite the same content so that only the mtime changes.
        std::thread::sleep(time::Duration::from_millis(10));
        fs::write(&file1_path, "content 1 modified")?;
        let mtime1_mod2 = fs::metadata(&file1_path)?.modified()?;
        assert!(mtime1_mod2 > mtime1_mod);

        // Before another update run there is no entry for the newest mtime.
        assert!(
            cache
                .get(&PathBuf::from("file1.txt"), mtime1_mod2)
                .is_none()
        );

        let mut hasher = FileHasher::new(&[&dir_path])?;
        hasher.exclude = Some(default_exclude());
        let (tx, rx) = mpsc::channel();
        hasher.check_streaming(tx, true)?;
        while rx.recv().is_ok() {}
        hasher.save_cache()?;

        // After it, the unchanged file's entry matches the newest mtime.
        let cache = FileHashCache::new(&dir_path);
        assert!(
            cache
                .get(&PathBuf::from("file1.txt"), mtime1_mod2)
                .is_some()
        );
        Ok(())
    }
905
    // An update run drops the cache entry of a file that has been deleted and keeps
    // the entries of the files that still exist.
    #[test]
    fn check_cleanup_deleted_files() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;
        let dir_path = dir.path().to_path_buf();
        let file1_path = dir.path().join("file1.txt");
        let file2_path = dir.path().join("file2.txt");
        fs::write(&file1_path, "content 1")?;
        fs::write(&file2_path, "content 2")?;
        let mtime1 = fs::metadata(&file1_path)?.modified()?;
        let mtime2 = fs::metadata(&file2_path)?.modified()?;

        // First update run caches both files.
        let mut hasher = FileHasher::new(&[&dir_path])?;
        hasher.exclude = Some(default_exclude());
        let (tx, rx) = mpsc::channel();
        hasher.check_streaming(tx, true)?;
        while rx.recv().is_ok() {}
        hasher.save_cache()?;

        let cache = FileHashCache::new(&dir_path);
        assert!(cache.get(&PathBuf::from("file1.txt"), mtime1).is_some());
        assert!(cache.get(&PathBuf::from("file2.txt"), mtime2).is_some());

        // Delete file2, then run the update check again.
        fs::remove_file(&file2_path)?;

        let mut hasher = FileHasher::new(&[&dir_path])?;
        hasher.exclude = Some(default_exclude());
        let (tx, rx) = mpsc::channel();
        hasher.check_streaming(tx, true)?;
        while rx.recv().is_ok() {}
        hasher.save_cache()?;

        // The deleted file's entry is gone; the surviving file's entry is kept.
        let cache = FileHashCache::new(&dir_path);
        assert!(cache.get(&PathBuf::from("file2.txt"), mtime2).is_none());
        assert!(cache.get(&PathBuf::from("file1.txt"), mtime1).is_some());
        Ok(())
    }
946
947 #[test]
948 fn find_duplicates_multiple_dirs() -> anyhow::Result<()> {
949 let tmp = tempfile::tempdir()?;
950 let dir1 = tmp.path().join("dir1");
951 let dir2 = tmp.path().join("dir2");
952 fs::create_dir(&dir1)?;
953 fs::create_dir(&dir2)?;
954 let file1_path = dir1.join("file1.txt");
955 fs::write(&file1_path, "same content")?;
956 let file2_path = dir2.join("file2.txt");
957 fs::write(&file2_path, "same content")?;
958 let hasher = FileHasher::new(&[&dir1, &dir2])?;
959 let duplicates = hasher.find_duplicates()?;
960 assert_eq!(duplicates.len(), 1);
961 let group = &duplicates[0];
962 assert_eq!(group.paths.len(), 2);
963 assert_eq!(group.size, 12);
964 assert!(group.paths.contains(&file1_path));
965 assert!(group.paths.contains(&file2_path));
966
967 Ok(())
968 }
969
970 #[test]
971 fn check_fails_with_multiple_dirs() -> anyhow::Result<()> {
972 let tmp = tempfile::tempdir()?;
973 let dir1 = tmp.path().join("dir1");
974 let dir2 = tmp.path().join("dir2");
975 fs::create_dir(&dir1)?;
976 fs::create_dir(&dir2)?;
977 let hasher = FileHasher::new(&[&dir1, &dir2])?;
978 assert!(hasher.check(false).is_err());
979 Ok(())
980 }
981}