use crate::{FileHashCache, FileIterator, ProgressReporter};
use globset::GlobSet;
use std::collections::HashMap;
use std::fs;
use std::io::{self, Read};
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{Arc, mpsc};

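/// Progress events streamed from the discovery/hashing side to the reporting
/// loop in [`FileHasher::find_duplicates`]. `Result` carries
/// `(path, size, hash, was_cache_hit)` for one finished file.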
#[derive(Debug, Clone)]
enum HashProgress {
    StartDiscovering,
    TotalFiles(usize),
    Result(PathBuf, u64, blake3::Hash, bool),
}

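/// State of a file-size bucket: either a single file of that size has been
/// seen so far (kept with its mtime), or hashing has already been kicked off
/// for that size.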
enum EntryState {
    Single(PathBuf, std::time::SystemTime),
    Hashing,
}

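/// A group of files whose contents hash identically; `size` is the size in
/// bytes of each file in the group.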
#[derive(Debug, Clone)]
pub struct DuplicatedFiles {
    pub paths: Vec<PathBuf>,
    pub size: u64,
}

pub struct FileHasher {
    dir: PathBuf,
    pub buffer_size: usize,
    cache: Arc<FileHashCache>,
    pub(crate) num_hashed: AtomicUsize,
    pub(crate) num_hash_looked_up: AtomicUsize,
    pub exclude: Option<GlobSet>,
}

impl FileHasher {
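    /// Creates a hasher rooted at `dir`, locating an existing hash cache for
    /// the directory or starting a fresh one.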
    pub fn new(dir: PathBuf) -> Self {
        let cache = FileHashCache::find_or_new(&dir);
        Self {
            dir,
            buffer_size: crate::FileComparer::DEFAULT_BUFFER_SIZE,
            cache,
            num_hashed: AtomicUsize::new(0),
            num_hash_looked_up: AtomicUsize::new(0),
            exclude: None,
        }
    }

    pub fn remove_cache_entry(&self, path: &Path) -> anyhow::Result<()> {
        let relative = crate::strip_prefix(path, self.cache.base_dir())?;
        self.cache.remove(relative);
        Ok(())
    }

    pub fn save_cache(&self) -> anyhow::Result<()> {
        log::info!(
            "Hash stats for {:?}: {} computed, {} looked up",
            self.dir,
            self.num_hashed.load(Ordering::Relaxed),
            self.num_hash_looked_up.load(Ordering::Relaxed)
        );
        Ok(self.cache.save()?)
    }

    pub(crate) fn merge_cache(&self, other_cache: &FileHashCache) {
        self.cache.merge(other_cache);
    }

    pub fn clear_cache(&self) -> anyhow::Result<()> {
        let relative = crate::strip_prefix(&self.dir, self.cache.base_dir())?;
        self.cache.clear(relative);
        Ok(())
    }

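    /// Finds duplicates and prints each group to stdout, followed by summary
    /// statistics (wasted space, elapsed time) on stderr.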
    pub fn run(&self) -> anyhow::Result<()> {
        let start_time = std::time::Instant::now();
        let mut duplicates = self.find_duplicates()?;
        if duplicates.is_empty() {
            println!("No duplicates found.");
        } else {
            duplicates.sort_by(|a, b| a.size.cmp(&b.size));
            let mut total_wasted_space = 0;
            for dupes in &duplicates {
                let paths = &dupes.paths;
                let file_size = dupes.size;
                println!(
                    "Identical {} files of {}:",
                    paths.len(),
                    crate::human_readable_size(file_size)
                );
                for path in paths {
                    println!(" {}", path.display());
                }
                total_wasted_space += file_size * (paths.len() as u64 - 1);
            }
            eprintln!(
                "Total wasted space: {}",
                crate::human_readable_size(total_wasted_space)
            );
        }
        eprintln!("Finished in {:?}.", start_time.elapsed());
        Ok(())
    }

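    /// Walks `dir` and returns every group of files with identical contents.
    ///
    /// A background thread discovers and hashes candidates, streaming
    /// [`HashProgress`] events over a channel; the calling thread drives the
    /// progress bar and groups results by hash.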
    pub fn find_duplicates(&self) -> anyhow::Result<Vec<DuplicatedFiles>> {
        let progress = ProgressReporter::new();
        progress.set_message("Scanning directories...");

        let (tx, rx) = mpsc::channel();
        let mut by_hash: HashMap<blake3::Hash, DuplicatedFiles> = HashMap::new();
        let mut num_cache_hits = 0;
        std::thread::scope(|scope| {
            scope.spawn(|| {
                if let Err(e) = self.find_duplicates_internal(tx) {
                    log::error!("Error during duplicate finding: {}", e);
                }
            });

            while let Ok(event) = rx.recv() {
                match event {
                    HashProgress::StartDiscovering => {
                        progress.set_message("Hashing files...");
                    }
                    HashProgress::TotalFiles(total) => {
                        progress.set_length(total as u64);
                        if num_cache_hits > 0 {
                            progress.set_message(format!(" ({} cache hits)", num_cache_hits));
                        }
                    }
                    HashProgress::Result(path, size, hash, is_cache_hit) => {
                        if is_cache_hit {
                            num_cache_hits += 1;
                            if progress.length().is_none() {
                                progress.set_message(format!(
                                    "Hashing files... ({} cache hits)",
                                    num_cache_hits
                                ));
                            } else {
                                progress.set_message(format!(" ({} cache hits)", num_cache_hits));
                            }
                        }

                        progress.inc(1);
                        let entry = by_hash.entry(hash).or_insert_with(|| DuplicatedFiles {
                            paths: Vec::new(),
                            size,
                        });
                        assert_eq!(entry.size, size, "Hash collision: sizes do not match");
                        entry.paths.push(path);
                    }
                }
            }
        });
        progress.finish();

        let mut duplicates = Vec::new();
        for (_, mut dupes) in by_hash {
            if dupes.paths.len() > 1 {
                dupes.paths.sort();
                duplicates.push(dupes);
            }
        }
        Ok(duplicates)
    }

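    /// Discovers files under `dir`, schedules hashing on a rayon scope, and
    /// streams results over `tx`; saves the hash cache once all tasks finish.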
    fn find_duplicates_internal(&self, tx: mpsc::Sender<HashProgress>) -> anyhow::Result<()> {
        tx.send(HashProgress::StartDiscovering)?;
        let mut by_size: HashMap<u64, EntryState> = HashMap::new();
        let mut total_hashed = 0;

        rayon::scope(|scope| -> anyhow::Result<()> {
            let mut it = FileIterator::new(self.dir.clone());
            it.hasher = Some(self);
            it.exclude = self.exclude.as_ref();
            for (_, current_path) in it {
                let meta = fs::metadata(&current_path)?;
                let size = meta.len();
                let modified = meta.modified()?;

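                // Bucket by size: a file is hashed only once a second file of
                // the same size shows up, so unique-sized files are never read.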
                match by_size.entry(size) {
                    std::collections::hash_map::Entry::Occupied(mut occ) => match occ.get_mut() {
                        EntryState::Single(first_path, first_modified) => {
                            self.spawn_hash_task(scope, first_path, size, *first_modified, &tx);
                            self.spawn_hash_task(scope, &current_path, size, modified, &tx);

                            *occ.get_mut() = EntryState::Hashing;
                            total_hashed += 2;
                        }
                        EntryState::Hashing => {
                            self.spawn_hash_task(scope, &current_path, size, modified, &tx);
                            total_hashed += 1;
                        }
                    },
                    std::collections::hash_map::Entry::Vacant(vac) => {
                        vac.insert(EntryState::Single(current_path, modified));
                    }
                }
            }
            tx.send(HashProgress::TotalFiles(total_hashed))?;
            Ok(())
        })?;

        self.save_cache()
    }

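    /// Reports a cached hash immediately when the stored mtime matches;
    /// otherwise spawns a rayon task that hashes the file, updates the cache,
    /// and sends the result through `tx`.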
    fn spawn_hash_task<'scope>(
        &'scope self,
        scope: &rayon::Scope<'scope>,
        path: &Path,
        size: u64,
        modified: std::time::SystemTime,
        tx: &mpsc::Sender<HashProgress>,
    ) {
        let relative = crate::strip_prefix(path, self.cache.base_dir())
            .expect("path should be in cache base_dir");
        if let Some(hash) = self.cache.get(relative, modified) {
            self.num_hash_looked_up.fetch_add(1, Ordering::Relaxed);
            let _ = tx.send(HashProgress::Result(path.to_path_buf(), size, hash, true));
            return;
        }

        let path_owned = path.to_path_buf();
        let relative_owned = relative.to_path_buf();
        let tx_owned = tx.clone();
        let cache_owned = self.cache.clone();
        scope.spawn(move |_| match self.compute_hash(&path_owned) {
            Ok(hash) => {
                self.num_hashed.fetch_add(1, Ordering::Relaxed);
                cache_owned.insert(&relative_owned, modified, hash);
                let _ = tx_owned.send(HashProgress::Result(path_owned, size, hash, false));
            }
            Err(e) => {
                log::warn!("Failed to hash file {:?}: {}", path_owned, e);
            }
        });
    }

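    /// Returns the BLAKE3 hash of `path`, consulting the mtime-keyed cache
    /// first and inserting the freshly computed hash on a miss.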
    pub fn get_hash(&self, path: &Path) -> io::Result<blake3::Hash> {
        let meta = fs::metadata(path)?;
        let modified = meta.modified()?;
        let relative = crate::strip_prefix(path, self.cache.base_dir())
            .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;
        if let Some(hash) = self.cache.get(relative, modified) {
            self.num_hash_looked_up.fetch_add(1, Ordering::Relaxed);
            return Ok(hash);
        }

        let hash = self.compute_hash(path)?;
        self.num_hashed.fetch_add(1, Ordering::Relaxed);
        self.cache.insert(relative, modified, hash);
        Ok(hash)
    }

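    /// Hashes the file contents with BLAKE3. A `buffer_size` of zero
    /// memory-maps the file and hashes it in one pass; otherwise the file is
    /// read through a `buffer_size`-byte buffer.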
    fn compute_hash(&self, path: &Path) -> io::Result<blake3::Hash> {
        let start_time = std::time::Instant::now();
        let mut f = fs::File::open(path)?;
        let mut hasher = blake3::Hasher::new();
        if self.buffer_size == 0 {
            let len = f.metadata()?.len();
            if len > 0 {
                let mmap = unsafe { memmap2::MmapOptions::new().map(&f)? };
                hasher.update(&mmap[..]);
            }
        } else {
            let mut buf = vec![0u8; self.buffer_size];
            loop {
                let n = f.read(&mut buf)?;
                if n == 0 {
                    break;
                }
                hasher.update(&buf[..n]);
            }
        }
        log::trace!("Computed hash in {:?}: {:?}", start_time.elapsed(), path);
        Ok(hasher.finalize())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_find_duplicates() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;

        let file1_path = dir.path().join("same1.txt");
        fs::write(&file1_path, "same content")?;

        let file2_path = dir.path().join("same2.txt");
        fs::write(&file2_path, "same content")?;

        let diff_path = dir.path().join("diff.txt");
        fs::write(&diff_path, "different content")?;

        let mut hasher = FileHasher::new(dir.path().to_path_buf());
        hasher.buffer_size = 8192;
        let duplicates = hasher.find_duplicates()?;

        assert_eq!(hasher.num_hashed.load(Ordering::Relaxed), 2);
        assert_eq!(hasher.num_hash_looked_up.load(Ordering::Relaxed), 0);

        assert_eq!(duplicates.len(), 1);
        let group = &duplicates[0];
        assert_eq!(group.paths.len(), 2);
        assert_eq!(group.size, 12);
        assert!(group.paths.contains(&file1_path));
        assert!(group.paths.contains(&file2_path));

        Ok(())
    }

    #[test]
    fn test_find_duplicates_merge_cache() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;
        let dir_path = dir.path();

        let sub_dir = dir_path.join("a").join("a");
        fs::create_dir_all(&sub_dir)?;

        let file1_path = sub_dir.join("1");
        fs::write(&file1_path, "same content")?;

        let file2_path = sub_dir.join("2");
        fs::write(&file2_path, "same content")?;

        let cache_aa_path = sub_dir.join(FileHashCache::FILE_NAME);
        fs::File::create(&cache_aa_path)?;

        let hasher_aa = FileHasher::new(sub_dir.clone());
        let duplicates_aa = hasher_aa.find_duplicates()?;
        assert_eq!(duplicates_aa.len(), 1);
        assert!(cache_aa_path.exists());
        assert_eq!(hasher_aa.num_hashed.load(Ordering::Relaxed), 2);
        assert_eq!(hasher_aa.num_hash_looked_up.load(Ordering::Relaxed), 0);

        let root_a = dir_path.join("a");
        let cache_a_path = root_a.join(FileHashCache::FILE_NAME);
        fs::File::create(&cache_a_path)?;

        let hasher_a = FileHasher::new(root_a.clone());
        let duplicates_a = hasher_a.find_duplicates()?;
        assert_eq!(duplicates_a.len(), 1);
        assert_eq!(hasher_a.num_hashed.load(Ordering::Relaxed), 0);
        assert_eq!(hasher_a.num_hash_looked_up.load(Ordering::Relaxed), 2);

        assert!(cache_a_path.exists());
        assert!(!cache_aa_path.exists());

        Ok(())
    }

    #[test]
    fn test_find_duplicates_with_exclude() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;

        let file1_path = dir.path().join("same1.txt");
        fs::write(&file1_path, "same content")?;

        let file2_path = dir.path().join("same2.txt");
        fs::write(&file2_path, "same content")?;

        let exclude_path = dir.path().join("exclude.txt");
        fs::write(&exclude_path, "same content")?;

        let mut hasher = FileHasher::new(dir.path().to_path_buf());
        hasher.buffer_size = 8192;
        let mut builder = globset::GlobSetBuilder::new();
        builder.add(
            globset::GlobBuilder::new("exclude.txt")
                .case_insensitive(true)
                .build()?,
        );
        let filter = builder.build()?;
        hasher.exclude = Some(filter);

        let duplicates = hasher.find_duplicates()?;
        assert_eq!(duplicates.len(), 1);
        let group = &duplicates[0];
        assert_eq!(group.paths.len(), 2);
        assert!(group.paths.contains(&file1_path));
        assert!(group.paths.contains(&file2_path));
        assert!(!group.paths.contains(&exclude_path));
        Ok(())
    }
}