1use crate::{
2 ColumnFormatter, DirectoryComparer, FileComparer, FileHashCache, FileIterator, OutputFormat,
3 Progress, ProgressBuilder,
4};
5use globset::GlobSet;
6use indicatif::FormattedDuration;
7use rayon::prelude::*;
8use std::{
9 collections::HashMap,
10 fs,
11 io::{self, Read, stdout},
12 path::{Path, PathBuf},
13 sync::{
14 Arc,
15 atomic::{AtomicUsize, Ordering},
16 mpsc,
17 },
18 time,
19};
20
/// A scanned file path paired with the index (into `FileHasher::dirs`) of the
/// directory whose scan produced it.
type FileItem = (PathBuf, usize);
22
/// Events sent by the duplicate-finding worker to the thread that drives the
/// progress display and collects results.
#[derive(Debug, Clone)]
enum DupEvent {
    /// Sent once, before any file is dispatched for hashing.
    StartHashing,
    /// Number of files that were dispatched for hashing; sent after the scan ends.
    NumFiles(usize),
    /// A file's path, size in bytes, and hash (computed or taken from the cache).
    Result(PathBuf, u64, blake3::Hash),
    /// Hashing a file failed.
    Error,
}
30
/// Outcome of comparing a file's freshly computed hash with the cached one.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
enum CheckStatus {
    /// The cache has a hash for the path and it equals the computed hash.
    Unchanged,
    /// The cache has no hash for the path.
    New,
    /// The cache has a hash for the path but it differs from the computed hash.
    Modified,
}
37
/// Events sent by the check worker to the thread that prints results.
#[derive(Debug, PartialEq)]
enum CheckEvent {
    /// Sent once, before files are dispatched for checking.
    StartChecking,
    /// Total number of files dispatched; sent after the scan has finished.
    TotalFiles(usize),
    /// A new or modified file: its path relative to the checked directory and its status.
    Result(PathBuf, CheckStatus),
    /// A file was checked and found unchanged.
    FileDone,
    /// Checking a file failed.
    Error,
}
46
/// Per-size bookkeeping used while grouping scanned files by size.
enum DupState {
    /// Exactly one file of this size has been seen so far (path, modification
    /// time, directory index); it has not been sent for hashing yet.
    Single(PathBuf, time::SystemTime, usize),
    /// At least two files of this size have been seen and all of them have
    /// been sent for hashing.
    Hashing,
}
51
/// Hashes files under one or more directories, optionally backed by a hash
/// cache, to find duplicates (`run` / `find_duplicates`) or to check a single
/// directory against previously cached hashes (`check`).
pub struct FileHasher {
    /// Directories to scan; `new` rejects an empty list.
    dirs: Vec<PathBuf>,
    /// Size in bytes of the read buffer used when hashing; `0` hashes through
    /// a memory map instead of buffered reads.
    pub buffer_size: usize,
    /// Hash cache used by `get_hash` and the cache-maintenance methods;
    /// created on demand by `cache()` or eagerly by `new_with_cache`.
    cache: Option<Arc<FileHashCache>>,
    /// Number of hashes computed by `compute_hash`.
    num_hashed: AtomicUsize,
    /// Number of hashes answered from a cache lookup.
    num_hash_looked_up: AtomicUsize,
    /// Optional glob set handed to each `FileIterator` as its `exclude` filter.
    pub exclude: Option<GlobSet>,
    /// Optional progress factory; when `None`, `Progress::none()` is used.
    pub progress: Option<Arc<ProgressBuilder>>,
    /// Output format used when printing results.
    pub output_format: OutputFormat,
    /// Job count passed to `crate::build_thread_pool`.
    pub jobs: usize,
}
64
65impl FileHasher {
    /// Default value of `jobs`; mirrors `DirectoryComparer::DEFAULT_JOBS`.
    const DEFAULT_JOBS: usize = DirectoryComparer::DEFAULT_JOBS;
67
68 pub fn new<P: AsRef<Path>>(dirs: &[P]) -> anyhow::Result<Self> {
70 if dirs.is_empty() {
71 anyhow::bail!("At least one directory must be specified.");
72 }
73 Ok(Self {
74 dirs: dirs.iter().map(|p| p.as_ref().to_path_buf()).collect(),
75 buffer_size: FileComparer::DEFAULT_BUFFER_SIZE,
76 cache: None,
77 num_hashed: AtomicUsize::new(0),
78 num_hash_looked_up: AtomicUsize::new(0),
79 exclude: None,
80 progress: None,
81 output_format: OutputFormat::Default,
82 jobs: Self::DEFAULT_JOBS,
83 })
84 }
85
86 pub(crate) fn new_with_cache<P: AsRef<Path>>(dirs: &[P]) -> anyhow::Result<Self> {
87 let mut hasher = Self::new(dirs)?;
88 hasher.cache = Some(hasher.new_cache()?);
89 Ok(hasher)
90 }
91
92 fn new_cache(&self) -> anyhow::Result<Arc<FileHashCache>> {
93 let common_ancestor = crate::common_ancestor(&self.dirs)
94 .ok_or_else(|| anyhow::anyhow!("No common ancestor found"))?;
95 Ok(FileHashCache::find_or_new(&common_ancestor))
96 }
97
98 pub(crate) fn cache(&mut self) -> anyhow::Result<Arc<FileHashCache>> {
100 if self.cache.is_none() {
101 self.cache = Some(self.new_cache()?);
102 }
103 Ok(Arc::clone(self.cache.as_ref().unwrap()))
104 }
105
106 pub(crate) fn remove_cache_entry(&mut self, path: &Path) -> anyhow::Result<()> {
108 let cache = self.cache()?;
109 let relative = crate::strip_prefix(path, cache.base_dir())?;
110 cache.remove(relative);
111 Ok(())
112 }
113
114 pub fn save_cache(&self) -> anyhow::Result<()> {
116 log::info!(
117 "Hash stats for {:?}: {} computed, {} looked up",
118 self.dirs,
119 self.num_hashed.load(Ordering::Relaxed),
120 self.num_hash_looked_up.load(Ordering::Relaxed)
121 );
122 if let Some(cache) = &self.cache {
123 cache.save()?;
124 }
125 Ok(())
126 }
127
128 pub(crate) fn clear_cache(&mut self) -> anyhow::Result<()> {
130 let cache = self.cache()?;
131 for dir in &self.dirs {
132 let relative = crate::strip_prefix(dir, cache.base_dir())?;
133 cache.clear(relative);
134 }
135 Ok(())
136 }
137
    /// Checks the single configured directory against the hash cache.
    ///
    /// Prints `+ <path>` for each new file and `! <path>` for each modified
    /// file to stdout, then writes a summary to stderr. `update` is forwarded
    /// to `check_streaming`.
    ///
    /// # Errors
    /// Fails when the output format is neither `Default` nor `Symbol`, when
    /// more than one directory is configured, or when writing the summary
    /// fails. An error from `check_streaming` itself is only logged; it does
    /// not make this function fail.
    pub fn check(&self, update: bool) -> anyhow::Result<()> {
        match self.output_format {
            OutputFormat::Default | OutputFormat::Symbol => {}
            _ => anyhow::bail!("Check mode only supports default or symbol output format."),
        }
        if self.dirs.len() > 1 {
            anyhow::bail!("Check mode only supports one directory.");
        }
        let start_time = time::Instant::now();
        let progress = self
            .progress
            .as_ref()
            .map(|progress| progress.add_spinner())
            .unwrap_or_else(Progress::none);
        progress.set_message("Scanning directory...");
        let mut num_new = 0;
        let mut num_modified = 0;
        let mut num_error = 0;
        std::thread::scope(|scope| {
            let (tx, rx) = mpsc::channel();
            // The worker owns `tx`; the receive loop below ends once the
            // worker and all of its clones of the sender are gone.
            scope.spawn(|| {
                if let Err(e) = self.check_streaming(tx, update) {
                    log::error!("Error during check: {}", e);
                }
            });
            while let Ok(event) = rx.recv() {
                match event {
                    CheckEvent::StartChecking => {
                        progress.set_message("Checking files...");
                    }
                    CheckEvent::TotalFiles(total) => {
                        progress.set_length(total as u64);
                        progress.set_message("");
                    }
                    CheckEvent::Result(path, status) => {
                        let symbol = match status {
                            CheckStatus::New => {
                                num_new += 1;
                                '+'
                            }
                            CheckStatus::Modified => {
                                num_modified += 1;
                                '!'
                            }
                            // `check_streaming` reports unchanged files as
                            // `FileDone`, never as `Result`.
                            CheckStatus::Unchanged => unreachable!(),
                        };
                        progress.inc(1);
                        progress.suspend_for(stdout(), || {
                            println!("{} {}", symbol, path.display());
                        });
                    }
                    CheckEvent::FileDone => {
                        progress.inc(1);
                    }
                    CheckEvent::Error => {
                        progress.inc(1);
                        num_error += 1;
                    }
                }
            }
        });
        progress.finish();
        self.print_check_summary(&start_time, num_new, num_modified, num_error)?;
        Ok(())
    }
204
205 fn print_check_summary(
206 &self,
207 start_time: &time::Instant,
208 num_new: usize,
209 num_modified: usize,
210 num_error: usize,
211 ) -> io::Result<()> {
212 let summary = [
213 ("Elapsed:", 0),
214 ("Hash computed:", self.num_hashed.load(Ordering::Relaxed)),
215 ("New files:", num_new),
216 ("Modified files:", num_modified),
217 ("Errors:", num_error),
218 ];
219 let formatter = ColumnFormatter::new(summary.iter().map(|(s, _)| *s));
220 let mut writer = std::io::stderr();
221 formatter.write_value(
222 &mut writer,
223 summary[0].0,
224 FormattedDuration(start_time.elapsed()),
225 )?;
226 formatter.write_values(&mut writer, &summary[1..])
227 }
228
    /// Scans the single configured directory, checks every file on a thread
    /// pool, and streams the outcome of each file through `tx`.
    ///
    /// Events sent: `StartChecking` once, then one of `Result` / `FileDone` /
    /// `Error` per file, and `TotalFiles` after the scan has been fully
    /// consumed (per-file events may arrive before or after it). When `update`
    /// is true, `check_file` records hashes in the cache. The cache's `save`
    /// is called before returning.
    ///
    /// # Panics
    /// Panics when `self.dirs` does not contain exactly one directory.
    ///
    /// # Errors
    /// Fails when the cache or thread pool cannot be created, when sending
    /// `StartChecking` / `TotalFiles` fails, or when saving the cache fails.
    /// Per-file failures are logged and reported as `CheckEvent::Error`.
    fn check_streaming(&self, tx: mpsc::Sender<CheckEvent>, update: bool) -> anyhow::Result<()> {
        assert_eq!(self.dirs.len(), 1);
        let cache = self.new_cache()?;
        let base_dir = &self.dirs[0];
        let relative = crate::strip_prefix(base_dir, cache.base_dir())?;
        // NOTE(review): presumably makes the cache drop entries under
        // `relative` that are not accessed during this run — confirm in
        // `FileHashCache`.
        cache.set_remove_if_no_access(relative);
        // `cache` is moved into the pool scope below; keep a handle for saving.
        let cache_clone = Arc::clone(&cache);
        std::thread::scope(|global_scope| {
            let mut it = FileIterator::new(base_dir);
            it.cache = Some(Arc::clone(&cache));
            it.exclude = self.exclude.as_ref();
            let it_rx = it.spawn_in_scope(global_scope);
            tx.send(CheckEvent::StartChecking)?;
            let pool = crate::build_thread_pool(self.jobs)?;
            pool.scope(move |scope| -> anyhow::Result<()> {
                let mut total_files = 0;
                for path in it_rx {
                    total_files += 1;
                    let tx = tx.clone();
                    let cache = Arc::clone(&cache);
                    scope.spawn(move |_| {
                        let status = self.check_file(&path, &cache, update);
                        let event = match status {
                            Ok(CheckStatus::New) | Ok(CheckStatus::Modified) => {
                                // Reported paths are relative to the checked directory.
                                let rel_path = crate::strip_prefix(&path, base_dir).unwrap();
                                CheckEvent::Result(rel_path.into(), status.unwrap())
                            }
                            Ok(CheckStatus::Unchanged) => CheckEvent::FileDone,
                            Err(e) => {
                                log::error!("Failed to check file {:?}: {}", path, e);
                                CheckEvent::Error
                            }
                        };
                        if tx.send(event).is_err() {
                            log::error!("Send failed");
                        }
                    });
                }
                tx.send(CheckEvent::TotalFiles(total_files))?;
                Ok(())
            })
        })?;
        cache_clone.save()?;
        Ok(())
    }
274
275 fn check_file(
276 &self,
277 abs_path: &Path,
278 cache: &FileHashCache,
279 update: bool,
280 ) -> anyhow::Result<CheckStatus> {
281 assert!(abs_path.is_absolute());
282 let computed_hash = self.compute_hash(abs_path)?;
283 let rel_path = crate::strip_prefix(abs_path, cache.base_dir())?;
284 let cached_hash = cache.get_by_path(rel_path);
285 let status = match cached_hash {
286 None => CheckStatus::New,
287 Some(cached) => {
288 if computed_hash != cached {
289 CheckStatus::Modified
290 } else {
291 CheckStatus::Unchanged
292 }
293 }
294 };
295 if update {
296 let modified = fs::metadata(abs_path)?.modified()?;
297 match status {
298 CheckStatus::New | CheckStatus::Modified => {
299 cache.insert(rel_path, modified, computed_hash);
300 }
301 CheckStatus::Unchanged => {
302 if cache.get(rel_path, modified).is_none() {
303 cache.insert(rel_path, modified, computed_hash);
304 }
305 }
306 }
307 }
308 Ok(status)
309 }
310
311 pub fn run(&self) -> anyhow::Result<()> {
313 let start_time = time::Instant::now();
314 let mut duplicates = self.find_duplicates()?;
315 let mut total_wasted_space = 0;
316 if !duplicates.is_empty() {
317 duplicates.sort_by_key(|a| a.size);
318 total_wasted_space = self.print_duplicates_results(&duplicates)?;
319 }
320 self.print_duplicates_summary(&start_time, total_wasted_space)?;
321 Ok(())
322 }
323
324 fn print_duplicates_results(&self, duplicates: &Vec<DuplicatedFiles>) -> anyhow::Result<u64> {
325 let mut total_wasted_space = 0;
326 for dupes in duplicates {
327 dupes.print(self.output_format)?;
328 total_wasted_space += dupes.wasted_size();
329 }
330 Ok(total_wasted_space)
331 }
332
333 fn print_duplicates_summary(
334 &self,
335 start_time: &time::Instant,
336 total_wasted_space: u64,
337 ) -> io::Result<()> {
338 let elapsed = FormattedDuration(start_time.elapsed()).to_string();
339 let num_hashed = self.num_hashed.load(Ordering::Relaxed).to_string();
340 let total_wasted_space = crate::human_readable_size(total_wasted_space);
341 let summary = [
342 ("Elapsed:", elapsed),
343 ("Hash computed:", num_hashed),
344 ("Total wasted space:", total_wasted_space),
345 ];
346 let formatter = ColumnFormatter::new(summary.iter().map(|(s, _)| *s));
347 formatter.write_values(&mut io::stderr(), &summary)
348 }
349
350 pub fn find_duplicates(&self) -> anyhow::Result<Vec<DuplicatedFiles>> {
352 let progress = self
353 .progress
354 .as_ref()
355 .map(|progress| progress.add_spinner())
356 .unwrap_or_else(Progress::none);
357 progress.set_message("Scanning directories...");
358
359 let (tx, rx) = mpsc::channel();
360 let mut by_hash: HashMap<blake3::Hash, DuplicatedFiles> = HashMap::new();
361 std::thread::scope(|scope| {
362 scope.spawn(|| {
363 if let Err(e) = self.find_duplicates_streaming(tx) {
364 log::error!("Error during duplicate finding: {}", e);
365 }
366 });
367
368 while let Ok(event) = rx.recv() {
369 match event {
370 DupEvent::StartHashing => progress.set_message("Hashing files..."),
371 DupEvent::NumFiles(num) => progress.set_length(num as u64),
372 DupEvent::Result(path, size, hash) => {
373 progress.inc(1);
374 let entry = by_hash.entry(hash).or_insert_with(|| DuplicatedFiles {
375 paths: Vec::new(),
376 size,
377 });
378 assert_eq!(entry.size, size, "Hash collision: sizes do not match");
380 entry.paths.push(path);
381 }
382 DupEvent::Error => progress.inc(1),
383 }
384 }
385 });
386 progress.finish();
387
388 let mut duplicates = Vec::new();
389 for (_, mut dupes) in by_hash {
390 if dupes.paths.len() > 1 {
391 dupes.paths.sort();
392 duplicates.push(dupes);
393 }
394 }
395 Ok(duplicates)
396 }
397
398 fn find_duplicates_streaming(&self, tx: mpsc::Sender<DupEvent>) -> anyhow::Result<()> {
399 std::thread::scope(|global_scope| {
400 let (it_rx, caches) = self.stream_file_items(global_scope)?;
401 let caches = &caches;
402 let pool = crate::build_thread_pool(self.jobs)?;
403 pool.scope(move |scope| -> anyhow::Result<()> {
404 let mut by_size: HashMap<u64, DupState> = HashMap::new();
405 let mut num_hashed = 0;
406 tx.send(DupEvent::StartHashing)?;
407 for (path, dir_index) in it_rx {
408 let meta = fs::metadata(&path)?;
409 let size = meta.len();
410 if size == 0 {
411 continue;
412 }
413 let modified = meta.modified()?;
414 let cache = &caches[dir_index];
415 match by_size.entry(size) {
416 std::collections::hash_map::Entry::Occupied(mut occ) => match occ.get_mut()
417 {
418 DupState::Single(path0, modified0, dir_index0) => {
419 let cache0 = &caches[*dir_index0];
422 self.send_hash(path0, size, *modified0, cache0, &tx, scope);
423 self.send_hash(&path, size, modified, cache, &tx, scope);
424
425 *occ.get_mut() = DupState::Hashing;
427 num_hashed += 2;
428 }
429 DupState::Hashing => {
430 self.send_hash(&path, size, modified, cache, &tx, scope);
432 num_hashed += 1;
433 }
434 },
435 std::collections::hash_map::Entry::Vacant(vac) => {
436 vac.insert(DupState::Single(path, modified, dir_index));
437 }
438 }
439 }
440 tx.send(DupEvent::NumFiles(num_hashed))?;
441 Ok(())
442 })?;
443 pool.install(|| caches.into_par_iter().try_for_each(|cache| cache.save()))?;
444 Ok::<(), anyhow::Error>(())
445 })?;
446 Ok(())
447 }
448
    /// Spawns one scanning thread per configured directory inside `scope` and
    /// returns the shared receiver of `(path, dir_index)` items together with
    /// the hash caches, indexed the same way as `self.dirs`.
    ///
    /// Each directory gets its own cache from `FileHashCache::find_or_new`,
    /// which is also handed to that directory's `FileIterator`. The receiver
    /// yields items until every scanning thread has dropped its sender.
    fn stream_file_items<'scope, 'env>(
        &'env self,
        scope: &'scope std::thread::Scope<'scope, 'env>,
    ) -> anyhow::Result<(mpsc::Receiver<FileItem>, Vec<Arc<FileHashCache>>)> {
        let (it_tx, it_rx) = mpsc::channel();
        let mut caches = Vec::with_capacity(self.dirs.len());
        for (dir_index, dir) in self.dirs.iter().enumerate() {
            let mut it = FileIterator::new(dir);
            let cache = FileHashCache::find_or_new(dir);
            it.cache = Some(Arc::clone(&cache));
            it.exclude = self.exclude.as_ref();
            // Each scanner gets its own sender clone; the original `it_tx` is
            // dropped when this function returns.
            let it_tx = it_tx.clone();
            scope.spawn(move || it.send_to_as(it_tx, |path| (path, dir_index)));
            caches.push(cache);
        }
        Ok((it_rx, caches))
    }
466
467 fn send_hash<'scope>(
468 &'scope self,
469 path: &Path,
470 size: u64,
471 modified: time::SystemTime,
472 cache: &Arc<FileHashCache>,
473 tx: &mpsc::Sender<DupEvent>,
474 scope: &rayon::Scope<'scope>,
475 ) {
476 let (hash, relative) = self
477 .get_hash_from_cache(path, modified, cache)
478 .expect("path should be in cache base_dir");
479 if let Some(hash) = hash {
480 let _ = tx.send(DupEvent::Result(path.to_path_buf(), size, hash));
481 return;
482 }
483
484 let path = path.to_path_buf();
485 let relative = relative.to_path_buf();
486 let tx = tx.clone();
487 let cache = Arc::clone(cache);
488 scope.spawn(move |_| {
489 if let Ok(hash) = self.compute_hash(&path) {
490 cache.insert(&relative, modified, hash);
491 let _ = tx.send(DupEvent::Result(path, size, hash));
492 } else {
493 log::error!("Failed to hash file: {:?}", path);
494 let _ = tx.send(DupEvent::Error);
495 }
496 });
497 }
498
499 pub fn get_hash(&self, path: &Path) -> io::Result<blake3::Hash> {
501 let cache = self.cache.as_ref().expect("cache should be initialized");
502 let meta = fs::metadata(path)?;
503 let modified = meta.modified()?;
504 let (hash, relative) = self.get_hash_from_cache(path, modified, cache)?;
505 if let Some(hash) = hash {
506 return Ok(hash);
507 }
508
509 let hash = self.compute_hash(path)?;
510 cache.insert(relative, modified, hash);
511 Ok(hash)
512 }
513
514 fn get_hash_from_cache<'a>(
515 &self,
516 path: &'a Path,
517 modified: time::SystemTime,
518 cache: &FileHashCache,
519 ) -> io::Result<(Option<blake3::Hash>, &'a Path)> {
520 let relative = crate::strip_prefix(path, cache.base_dir())
521 .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;
522 if let Some(hash) = cache.get(relative, modified) {
523 self.num_hash_looked_up.fetch_add(1, Ordering::Relaxed);
524 return Ok((Some(hash), relative));
525 }
526 Ok((None, relative))
527 }
528
529 fn compute_hash(&self, path: &Path) -> io::Result<blake3::Hash> {
530 let start_time = time::Instant::now();
531 let mut f = fs::File::open(path)?;
532 let len = f.metadata()?.len();
533 let progress = self
534 .progress
535 .as_ref()
536 .map(|progress| progress.add_file(path, len))
537 .unwrap_or_else(Progress::none);
538 let mut hasher = blake3::Hasher::new();
539 if self.buffer_size == 0 {
540 if len > 0 {
541 let mmap = unsafe { memmap2::MmapOptions::new().map(&f)? };
542 hasher.update(&mmap[..]);
543 progress.inc(len);
544 }
545 } else {
546 let mut buf = vec![0u8; self.buffer_size];
547 loop {
548 let n = f.read(&mut buf)?;
549 if n == 0 {
550 break;
551 }
552 hasher.update(&buf[..n]);
553 progress.inc(n as u64);
554 }
555 }
556 progress.finish();
557 self.num_hashed.fetch_add(1, Ordering::Relaxed);
558 let hash = hasher.finalize();
559 log::debug!(
560 "Computed hash in {}: {:?}",
561 FormattedDuration(start_time.elapsed()),
562 path
563 );
564 Ok(hash)
565 }
566}
567
/// A group of files that share the same hash.
#[derive(Clone, Debug)]
pub struct DuplicatedFiles {
    /// Paths of the files in the group.
    pub paths: Vec<PathBuf>,
    /// Size in bytes of each file in the group.
    pub size: u64,
}
574
575impl DuplicatedFiles {
576 fn print(&self, output_format: OutputFormat) -> anyhow::Result<()> {
577 match output_format {
578 OutputFormat::Default => self.write_human(stdout())?,
579 OutputFormat::Yaml | OutputFormat::Symbol => self.write_yaml(stdout())?,
580 }
581 Ok(())
582 }
583
584 fn write_human(&self, mut writer: impl io::Write) -> anyhow::Result<()> {
585 writeln!(
586 writer,
587 "Identical {} files of {}:",
588 self.paths.len(),
589 crate::human_readable_size(self.size)
590 )?;
591 for path in &self.paths {
592 writeln!(writer, " {}", path.display())?;
593 }
594 Ok(())
595 }
596
597 fn write_yaml(&self, mut writer: impl io::Write) -> anyhow::Result<()> {
598 writeln!(writer, "- paths:")?;
599 for path in &self.paths {
600 writeln!(writer, " - {:?}", path)?;
601 }
602 writeln!(writer, " size: {}", self.size)?;
603 Ok(())
604 }
605
606 fn wasted_size(&self) -> u64 {
607 self.size * (self.paths.len() as u64 - 1)
608 }
609}
610
611#[cfg(test)]
612mod tests {
613 use super::*;
614
615 fn default_exclude() -> globset::GlobSet {
616 let mut builder = globset::GlobSetBuilder::new();
617 builder.add(
618 globset::GlobBuilder::new(".hash_cache")
619 .case_insensitive(true)
620 .build()
621 .unwrap(),
622 );
623 builder.build().unwrap()
624 }
625
626 #[test]
627 fn find_duplicates() -> anyhow::Result<()> {
628 let dir = tempfile::tempdir()?;
629
630 let file1_path = dir.path().join("same1.txt");
631 fs::write(&file1_path, "same content")?;
632
633 let file2_path = dir.path().join("same2.txt");
634 fs::write(&file2_path, "same content")?;
635
636 let diff_path = dir.path().join("diff.txt");
637 fs::write(&diff_path, "different content")?;
638
639 let mut hasher = FileHasher::new(&[dir.path()])?;
640 hasher.buffer_size = 8192;
641 let duplicates = hasher.find_duplicates()?;
642
643 assert_eq!(hasher.num_hashed.load(Ordering::Relaxed), 2);
644 assert_eq!(hasher.num_hash_looked_up.load(Ordering::Relaxed), 0);
645
646 assert_eq!(duplicates.len(), 1);
647 let group = &duplicates[0];
648 assert_eq!(group.paths.len(), 2);
649 assert_eq!(group.size, 12); assert!(group.paths.contains(&file1_path));
652 assert!(group.paths.contains(&file2_path));
653
654 Ok(())
655 }
656
657 #[test]
658 fn find_duplicates_merge_cache() -> anyhow::Result<()> {
659 let dir = tempfile::tempdir()?;
660 let dir_path = dir.path();
661
662 let sub_dir = dir_path.join("a").join("a");
663 fs::create_dir_all(&sub_dir)?;
664
665 let file1_path = sub_dir.join("1");
666 fs::write(&file1_path, "same content")?;
667
668 let file2_path = sub_dir.join("2");
669 fs::write(&file2_path, "same content")?;
670
671 let cache_aa_path = sub_dir.join(FileHashCache::FILE_NAME);
673 fs::File::create(&cache_aa_path)?;
674
675 let hasher_aa = FileHasher::new(&[&sub_dir])?;
677 let duplicates_aa = hasher_aa.find_duplicates()?;
678 assert_eq!(duplicates_aa.len(), 1);
679 assert!(cache_aa_path.exists());
680 assert_eq!(hasher_aa.num_hashed.load(Ordering::Relaxed), 2);
681 assert_eq!(hasher_aa.num_hash_looked_up.load(Ordering::Relaxed), 0);
682
683 let root_a = dir_path.join("a");
685 let cache_a_path = root_a.join(FileHashCache::FILE_NAME);
686 fs::File::create(&cache_a_path)?;
687
688 let hasher_a = FileHasher::new(&[&root_a])?;
690 let duplicates_a = hasher_a.find_duplicates()?;
691 assert_eq!(duplicates_a.len(), 1);
692 assert_eq!(hasher_a.num_hashed.load(Ordering::Relaxed), 0);
693 assert_eq!(hasher_a.num_hash_looked_up.load(Ordering::Relaxed), 2);
694
695 assert!(cache_a_path.exists());
697 assert!(!cache_aa_path.exists());
698
699 Ok(())
700 }
701
702 #[test]
703 fn find_duplicates_with_exclude() -> anyhow::Result<()> {
704 let dir = tempfile::tempdir()?;
705
706 let file1_path = dir.path().join("same1.txt");
707 fs::write(&file1_path, "same content")?;
708
709 let file2_path = dir.path().join("same2.txt");
710 fs::write(&file2_path, "same content")?;
711
712 let exclude_path = dir.path().join("exclude.txt");
713 fs::write(&exclude_path, "same content")?;
714
715 let mut hasher = FileHasher::new(&[dir.path()])?;
716 hasher.buffer_size = 8192;
717 let mut builder = globset::GlobSetBuilder::new();
718 builder.add(
719 globset::GlobBuilder::new("exclude.txt")
720 .case_insensitive(true)
721 .build()?,
722 );
723 let filter = builder.build()?;
724 hasher.exclude = Some(filter);
725
726 let duplicates = hasher.find_duplicates()?;
727 assert_eq!(duplicates.len(), 1);
728 let group = &duplicates[0];
729 assert_eq!(group.paths.len(), 2);
730 assert!(group.paths.contains(&file1_path));
731 assert!(group.paths.contains(&file2_path));
732 assert!(!group.paths.contains(&exclude_path));
733 Ok(())
734 }
735
736 #[test]
737 fn check_mode_empty_cache() -> anyhow::Result<()> {
738 let dir = tempfile::tempdir()?;
739 let dir_path = dir.path().to_path_buf();
740 println!("{:?}", dir_path);
741 let file1_path = dir.path().join("file1.txt");
742 fs::write(&file1_path, "content 1")?;
743 let file2_path = dir.path().join("file2.txt");
744 fs::write(&file2_path, "content 2")?;
745
746 let mut hasher = FileHasher::new(&[&dir_path])?;
747 hasher.exclude = Some(default_exclude());
748 let (tx, rx) = mpsc::channel();
749 hasher.check_streaming(tx, false)?;
750 let mut results = Vec::new();
751 let mut start_seen = false;
752 let mut total_files = None;
753 let mut file_done_count = 0;
754 let mut num_error = 0;
755 while let Ok(event) = rx.recv() {
756 match event {
757 CheckEvent::StartChecking => start_seen = true,
758 CheckEvent::TotalFiles(total) => total_files = Some(total),
759 CheckEvent::Result(path, status) => results.push((path, status)),
760 CheckEvent::FileDone => file_done_count += 1,
761 CheckEvent::Error => num_error += 1,
762 }
763 }
764 assert!(start_seen);
765 assert_eq!(total_files, Some(2));
766 assert_eq!(file_done_count, 0);
767 assert_eq!(num_error, 0);
768
769 results.sort_by(|a, b| a.0.cmp(&b.0));
770 assert_eq!(results.len(), 2);
771 assert_eq!(results[0], (PathBuf::from("file1.txt"), CheckStatus::New));
772 assert_eq!(results[1], (PathBuf::from("file2.txt"), CheckStatus::New));
773
774 assert!(!dir.path().join(FileHashCache::FILE_NAME).exists());
775 Ok(())
776 }
777
778 #[test]
779 fn check_mode_with_cache() -> anyhow::Result<()> {
780 let dir = tempfile::tempdir()?;
781 let dir_path = dir.path().to_path_buf();
782 let file1_path = dir.path().join("file1.txt");
783 let file2_path = dir.path().join("file2.txt");
784 fs::write(&file1_path, "content 1")?;
785 fs::write(&file2_path, "content 2")?;
786
787 let mut hasher = FileHasher::new_with_cache(&[&dir_path])?;
788 hasher.exclude = Some(default_exclude());
789 let _hash1 = hasher.get_hash(&file1_path)?;
790 let _hash2 = hasher.get_hash(&file2_path)?;
791 hasher.save_cache()?;
792 assert!(dir.path().join(FileHashCache::FILE_NAME).exists());
793
794 let mut hasher = FileHasher::new(&[&dir_path])?;
795 hasher.exclude = Some(default_exclude());
796 let (tx, rx) = mpsc::channel();
797 hasher.check_streaming(tx, false)?;
798 let mut results = Vec::new();
799 let mut file_done_count = 0;
800 while let Ok(event) = rx.recv() {
801 match event {
802 CheckEvent::Result(path, status) => results.push((path, status)),
803 CheckEvent::FileDone => file_done_count += 1,
804 _ => {}
805 }
806 }
807 assert_eq!(results.len(), 0);
808 assert_eq!(file_done_count, 2);
809
810 fs::write(&file1_path, "content 1 modified")?;
811
812 let file2_meta_before = fs::metadata(&file2_path)?;
813 let mtime_before = file2_meta_before.modified()?;
814 std::thread::sleep(time::Duration::from_millis(10));
815 fs::write(&file2_path, "content 2")?;
816 let file2_meta_after = fs::metadata(&file2_path)?;
817 let mtime_after = file2_meta_after.modified()?;
818 assert!(mtime_after > mtime_before);
819
820 let mut hasher = FileHasher::new(&[&dir_path])?;
821 hasher.exclude = Some(default_exclude());
822 let (tx, rx) = mpsc::channel();
823 hasher.check_streaming(tx, false)?;
824 let mut results = Vec::new();
825 let mut file_done_count = 0;
826 while let Ok(event) = rx.recv() {
827 match event {
828 CheckEvent::Result(path, status) => results.push((path, status)),
829 CheckEvent::FileDone => file_done_count += 1,
830 _ => {}
831 }
832 }
833 assert_eq!(results.len(), 1);
834 assert_eq!(
835 results[0],
836 (PathBuf::from("file1.txt"), CheckStatus::Modified)
837 );
838 assert_eq!(file_done_count, 1);
839 Ok(())
840 }
841
842 #[test]
843 fn check_update_mode() -> anyhow::Result<()> {
844 let dir = tempfile::tempdir()?;
845 let dir_path = dir.path().to_path_buf();
846 let file1_path = dir.path().join("file1.txt");
847 fs::write(&file1_path, "content 1")?;
848
849 let mut hasher = FileHasher::new(&[&dir_path])?;
850 hasher.exclude = Some(default_exclude());
851 let (tx, rx) = mpsc::channel();
852 hasher.check_streaming(tx, true)?;
853 while rx.recv().is_ok() {}
854 hasher.save_cache()?;
855 assert!(dir.path().join(FileHashCache::FILE_NAME).exists());
856
857 let cache = FileHashCache::new(&dir_path);
858 let mtime1 = fs::metadata(&file1_path)?.modified()?;
859 let hash1 = cache.get(&PathBuf::from("file1.txt"), mtime1);
860 assert!(hash1.is_some());
861
862 std::thread::sleep(time::Duration::from_millis(10));
863 fs::write(&file1_path, "content 1 modified")?;
864 let mtime1_mod = fs::metadata(&file1_path)?.modified()?;
865
866 let mut hasher = FileHasher::new(&[&dir_path])?;
867 hasher.exclude = Some(default_exclude());
868 let (tx, rx) = mpsc::channel();
869 hasher.check_streaming(tx, true)?;
870 while rx.recv().is_ok() {}
871 hasher.save_cache()?;
872
873 let cache = FileHashCache::new(&dir_path);
874 let hash_mod = cache.get(&PathBuf::from("file1.txt"), mtime1_mod);
875 assert!(hash_mod.is_some());
876 assert_ne!(hash1, hash_mod);
877
878 std::thread::sleep(time::Duration::from_millis(10));
879 fs::write(&file1_path, "content 1 modified")?;
880 let mtime1_mod2 = fs::metadata(&file1_path)?.modified()?;
881 assert!(mtime1_mod2 > mtime1_mod);
882
883 assert!(
884 cache
885 .get(&PathBuf::from("file1.txt"), mtime1_mod2)
886 .is_none()
887 );
888
889 let mut hasher = FileHasher::new(&[&dir_path])?;
890 hasher.exclude = Some(default_exclude());
891 let (tx, rx) = mpsc::channel();
892 hasher.check_streaming(tx, true)?;
893 while rx.recv().is_ok() {}
894 hasher.save_cache()?;
895
896 let cache = FileHashCache::new(&dir_path);
897 assert!(
898 cache
899 .get(&PathBuf::from("file1.txt"), mtime1_mod2)
900 .is_some()
901 );
902 Ok(())
903 }
904
905 #[test]
906 fn check_cleanup_deleted_files() -> anyhow::Result<()> {
907 let dir = tempfile::tempdir()?;
908 let dir_path = dir.path().to_path_buf();
909 let file1_path = dir.path().join("file1.txt");
910 let file2_path = dir.path().join("file2.txt");
911 fs::write(&file1_path, "content 1")?;
912 fs::write(&file2_path, "content 2")?;
913 let mtime1 = fs::metadata(&file1_path)?.modified()?;
914 let mtime2 = fs::metadata(&file2_path)?.modified()?;
915
916 let mut hasher = FileHasher::new(&[&dir_path])?;
917 hasher.exclude = Some(default_exclude());
918 let (tx, rx) = mpsc::channel();
919 hasher.check_streaming(tx, true)?;
920 while rx.recv().is_ok() {}
921 hasher.save_cache()?;
922
923 let cache = FileHashCache::new(&dir_path);
925 assert!(cache.get(&PathBuf::from("file1.txt"), mtime1).is_some());
926 assert!(cache.get(&PathBuf::from("file2.txt"), mtime2).is_some());
927
928 fs::remove_file(&file2_path)?;
930
931 let mut hasher = FileHasher::new(&[&dir_path])?;
933 hasher.exclude = Some(default_exclude());
934 let (tx, rx) = mpsc::channel();
935 hasher.check_streaming(tx, true)?;
936 while rx.recv().is_ok() {}
937 hasher.save_cache()?;
938
939 let cache = FileHashCache::new(&dir_path);
941 assert!(cache.get(&PathBuf::from("file2.txt"), mtime2).is_none());
942 assert!(cache.get(&PathBuf::from("file1.txt"), mtime1).is_some());
943 Ok(())
944 }
945
946 #[test]
947 fn find_duplicates_multiple_dirs() -> anyhow::Result<()> {
948 let tmp = tempfile::tempdir()?;
949 let dir1 = tmp.path().join("dir1");
950 let dir2 = tmp.path().join("dir2");
951 fs::create_dir(&dir1)?;
952 fs::create_dir(&dir2)?;
953 let file1_path = dir1.join("file1.txt");
954 fs::write(&file1_path, "same content")?;
955 let file2_path = dir2.join("file2.txt");
956 fs::write(&file2_path, "same content")?;
957 let hasher = FileHasher::new(&[&dir1, &dir2])?;
958 let duplicates = hasher.find_duplicates()?;
959 assert_eq!(duplicates.len(), 1);
960 let group = &duplicates[0];
961 assert_eq!(group.paths.len(), 2);
962 assert_eq!(group.size, 12);
963 assert!(group.paths.contains(&file1_path));
964 assert!(group.paths.contains(&file2_path));
965
966 Ok(())
967 }
968
969 #[test]
970 fn check_fails_with_multiple_dirs() -> anyhow::Result<()> {
971 let tmp = tempfile::tempdir()?;
972 let dir1 = tmp.path().join("dir1");
973 let dir2 = tmp.path().join("dir2");
974 fs::create_dir(&dir1)?;
975 fs::create_dir(&dir2)?;
976 let hasher = FileHasher::new(&[&dir1, &dir2])?;
977 assert!(hasher.check(false).is_err());
978 Ok(())
979 }
980}