1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
mod bag;
mod fs;
mod macros;
mod report;
pub use bag::{Duplicates, Fdupes, Machine, TreeBag};
pub use fs::wrapper::DirEntry;
pub use globset;
#[cfg(any(test, feature = "build-bin"))]
pub use hashers::{HighwayHasher, SeaHasher, XxHasher};
pub use regex;
pub use report::Report;
use std::path::Path;
/// Marker trait bundling the capabilities required of a checksum type.
///
/// A type qualifies when it is simultaneously a [`core::hash::Hasher`], a
/// [`std::io::Write`] sink, and constructible through [`Default`]. The trait
/// declares no methods of its own; it only names that combination so it can
/// be used as a single bound (for example on `Config::scan`).
pub trait Hasher: core::hash::Hasher + std::io::Write + core::default::Default {}

/// Blanket implementation: any type that satisfies all three supertraits is
/// automatically a [`Hasher`], so no manual `impl` is ever needed.
impl<T: core::hash::Hasher + core::default::Default + std::io::Write> Hasher for T {}
/// Settings for a duplicate-file scan.
///
/// Values are assembled through the builder generated by
/// `typed_builder::TypedBuilder`. `paths` is the only field without a
/// builder default; every other field is an `Option` that defaults to `None`.
/// The filters are not interpreted here: they are handed unchanged to
/// `fs::find_dupes_partial` when the configuration is executed.
#[derive(Debug, Default, typed_builder::TypedBuilder)]
#[builder(doc)]
pub struct Config<'a, P: AsRef<Path>> {
    // Borrowed slice of the locations to examine.
    #[builder(setter(doc = "Paths that will be checked for duplicate files"))]
    paths: &'a [P],

    // Optional lower bound on file size (`None` = no bound given).
    #[builder(default, setter(into, doc = "Minimum file size"))]
    minimum_file_size: Option<u64>,

    // Optional upper bound on file size (`None` = no bound given).
    #[builder(default, setter(into, doc = "Maximum file size"))]
    maximum_file_size: Option<u64>,

    // Optional regular-expression filter on file names.
    #[builder(default, setter(into, doc = "File name must match this regex"))]
    regex: Option<regex::Regex>,

    // Optional glob filter on file names; compiled to a matcher at scan time.
    #[builder(default, setter(into, doc = "File name must match this glob"))]
    glob: Option<globset::Glob>,
}
impl<P: AsRef<Path>> Config<'_, P> {
    /// Executes the scan described by this configuration.
    ///
    /// The configuration is consumed. Work happens in two stages:
    ///
    /// 1. `fs::find_dupes_partial` receives the paths and all filters (the
    ///    glob, if any, is first compiled into a matcher) and produces a bag
    ///    of candidate files.
    /// 2. `fs::dedupe` takes that bag and yields the final bag, which is
    ///    returned to the caller.
    ///
    /// `H` selects the [`Hasher`] implementation forwarded to both stages.
    ///
    /// Summary statistics are emitted at `Info` level after each stage, and
    /// the full bag is dumped at `Debug` level; the numbers are only computed
    /// when the corresponding log level is enabled.
    pub fn scan<H: Hasher>(self) -> TreeBag<u64, DirEntry> {
        let Self {
            paths,
            minimum_file_size,
            maximum_file_size,
            regex,
            glob,
        } = self;

        // Stage 1: collect candidates that pass the size/name filters.
        let matcher = glob.map(|pattern| pattern.compile_matcher());
        let candidates = fs::find_dupes_partial::<H, P>(
            paths,
            minimum_file_size,
            maximum_file_size,
            regex,
            matcher,
        );

        if log::log_enabled!(log::Level::Info) {
            let scanned: usize = candidates.values().map(|group| group.len()).sum();
            log::info!("scanned {} files", scanned);

            let suspects: usize = candidates
                .duplicates()
                .iter()
                .map(|group| group.len())
                .sum();
            log::info!("found {} possible duplicates after initial scan", suspects);

            if log::log_enabled!(log::Level::Debug) {
                log::debug!("{:?}", candidates);
            }
        }

        // Stage 2: hand the candidates to the checksumming pass.
        let confirmed = fs::dedupe::<H>(candidates);

        if log::log_enabled!(log::Level::Info) {
            let file_total: usize = confirmed
                .duplicates()
                .iter()
                .map(|group| group.len())
                .sum();
            let group_total = confirmed.duplicates().iter().count();
            log::info!(
                "found {} duplicates in {} groups after checksumming",
                file_total,
                group_total,
            );

            if log::log_enabled!(log::Level::Debug) {
                log::debug!("{:?}", confirmed);
            }
        }

        confirmed
    }
}
/// Newtype wrappers around third-party hash implementations.
///
/// Compiled only for tests or when the `build-bin` feature is enabled. Each
/// wrapper is `#[repr(transparent)]`, derives `Default`, and is passed to the
/// crate's `newtype_impl_hasher_and_write!` macro.
// NOTE(review): judging by its name the macro supplies the
// `core::hash::Hasher` and `std::io::Write` impls — confirm in `macros`.
#[cfg(any(test, feature = "build-bin"))]
mod hashers {
    /// Wrapper around `highway::HighwayHasher`.
    #[derive(Default)]
    #[repr(transparent)]
    pub struct HighwayHasher(highway::HighwayHasher);
    super::newtype_impl_hasher_and_write!(HighwayHasher);

    /// Wrapper around `seahash::SeaHasher`.
    #[derive(Default)]
    #[repr(transparent)]
    pub struct SeaHasher(seahash::SeaHasher);
    super::newtype_impl_hasher_and_write!(SeaHasher);

    /// Wrapper around `twox_hash::XxHash64`.
    #[derive(Default)]
    #[repr(transparent)]
    pub struct XxHasher(twox_hash::XxHash64);
    super::newtype_impl_hasher_and_write!(XxHasher);
}