use crate::{FileComparer, FileHashCache, FileIterator, Progress, ProgressBuilder};
use globset::GlobSet;
use std::collections::HashMap;
use std::fs;
use std::io::{self, Read};
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{Arc, mpsc};

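/// Progress events sent from the hashing threads to the reporting loop in
/// `find_duplicates`.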
#[derive(Debug, Clone)]
enum HashProgress {
    StartDiscovering,
    TotalFiles(usize),
    Result(PathBuf, u64, blake3::Hash, bool),
}

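/// Tracks whether a file size has been seen once (nothing hashed yet) or more
/// than once (hash tasks have been spawned for every file of that size).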
enum EntryState {
    Single(PathBuf, std::time::SystemTime),
    Hashing,
}

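/// A group of paths whose contents hash to the same value.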
#[derive(Debug, Clone)]
pub struct DuplicatedFiles {
    pub paths: Vec<PathBuf>,
    pub size: u64,
}

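/// Hashes the files below `dir` with BLAKE3, backed by a persistent hash
/// cache, and groups files with identical contents.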
pub struct FileHasher {
    dir: PathBuf,
    pub buffer_size: usize,
    cache: Arc<FileHashCache>,
    pub(crate) num_hashed: AtomicUsize,
    pub(crate) num_hash_looked_up: AtomicUsize,
    pub exclude: Option<GlobSet>,
    pub progress: Option<Arc<ProgressBuilder>>,
}

impl FileHasher {
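    /// Creates a hasher rooted at `dir`, reusing an existing on-disk hash
    /// cache if one is found.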
    pub fn new(dir: PathBuf) -> Self {
        let cache = FileHashCache::find_or_new(&dir);
        Self {
            dir,
            buffer_size: FileComparer::DEFAULT_BUFFER_SIZE,
            cache,
            num_hashed: AtomicUsize::new(0),
            num_hash_looked_up: AtomicUsize::new(0),
            exclude: None,
            progress: None,
        }
    }

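    /// Removes the cache entry for `path`, if one exists.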
    pub fn remove_cache_entry(&self, path: &Path) -> anyhow::Result<()> {
        let relative = crate::strip_prefix(path, self.cache.base_dir())?;
        self.cache.remove(relative);
        Ok(())
    }

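    /// Logs hash statistics and persists the cache to disk.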
    pub fn save_cache(&self) -> anyhow::Result<()> {
        log::info!(
            "Hash stats for {:?}: {} computed, {} looked up",
            self.dir,
            self.num_hashed.load(Ordering::Relaxed),
            self.num_hash_looked_up.load(Ordering::Relaxed)
        );
        Ok(self.cache.save()?)
    }

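    /// Merges the entries of `other_cache` into this hasher's cache.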
    pub(crate) fn merge_cache(&self, other_cache: &FileHashCache) {
        self.cache.merge(other_cache);
    }

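    /// Drops all cache entries below this hasher's root directory.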
    pub fn clear_cache(&self) -> anyhow::Result<()> {
        let relative = crate::strip_prefix(&self.dir, self.cache.base_dir())?;
        self.cache.clear(relative);
        Ok(())
    }

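    /// Finds all duplicate groups and prints them to stdout, followed by a
    /// wasted-space summary and timing on stderr.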
    pub fn run(&self) -> anyhow::Result<()> {
        let start_time = std::time::Instant::now();
        let mut duplicates = self.find_duplicates()?;
        if duplicates.is_empty() {
            println!("No duplicates found.");
        } else {
            duplicates.sort_by_key(|a| a.size);
            let mut total_wasted_space = 0;
            for dupes in &duplicates {
                let paths = &dupes.paths;
                let file_size = dupes.size;
                println!(
                    "Identical {} files of {}:",
                    paths.len(),
                    crate::human_readable_size(file_size)
                );
                for path in paths {
                    println!(" {}", path.display());
                }
                total_wasted_space += file_size * (paths.len() as u64 - 1);
            }
            eprintln!(
                "Total wasted space: {}",
                crate::human_readable_size(total_wasted_space)
            );
        }
        eprintln!("Finished in {:?}.", start_time.elapsed());
        Ok(())
    }

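    /// Hashes candidate files on a background thread, collects results from
    /// the progress channel, and groups paths by hash. Only groups containing
    /// more than one path are returned.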
    pub fn find_duplicates(&self) -> anyhow::Result<Vec<DuplicatedFiles>> {
        let progress = self
            .progress
            .as_ref()
            .map(|progress| progress.add_spinner())
            .unwrap_or_else(Progress::none);
        progress.set_message("Scanning directories...");

        let (tx, rx) = mpsc::channel();
        let mut by_hash: HashMap<blake3::Hash, DuplicatedFiles> = HashMap::new();
        let mut num_cache_hits = 0;
        std::thread::scope(|scope| {
            scope.spawn(|| {
                if let Err(e) = self.find_duplicates_internal(tx) {
                    log::error!("Error during duplicate finding: {}", e);
                }
            });

            while let Ok(event) = rx.recv() {
                match event {
                    HashProgress::StartDiscovering => {
                        progress.set_message("Hashing files...");
                    }
                    HashProgress::TotalFiles(total) => {
                        progress.set_length(total as u64);
                        if num_cache_hits > 0 {
                            progress.set_message(format!(" ({} cache hits)", num_cache_hits));
                        }
                    }
                    HashProgress::Result(path, size, hash, is_cache_hit) => {
                        if is_cache_hit {
                            num_cache_hits += 1;
                            if progress.length().is_none() {
                                progress.set_message(format!(
                                    "Hashing files... ({} cache hits)",
                                    num_cache_hits
                                ));
                            } else {
                                progress.set_message(format!(" ({} cache hits)", num_cache_hits));
                            }
                        }

                        progress.inc(1);
                        let entry = by_hash.entry(hash).or_insert_with(|| DuplicatedFiles {
                            paths: Vec::new(),
                            size,
                        });
                        assert_eq!(entry.size, size, "Hash collision: sizes do not match");
                        entry.paths.push(path);
                    }
                }
            }
        });
        progress.finish();

        let mut duplicates = Vec::new();
        for (_, mut dupes) in by_hash {
            if dupes.paths.len() > 1 {
                dupes.paths.sort();
                duplicates.push(dupes);
            }
        }
        Ok(duplicates)
    }

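    /// Walks the directory tree, grouping files by size. The first time a
    /// size repeats, hash tasks are spawned for both files; later files of
    /// the same size get their own task. Files with a unique size are never
    /// hashed.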
    fn find_duplicates_internal(&self, tx: mpsc::Sender<HashProgress>) -> anyhow::Result<()> {
        tx.send(HashProgress::StartDiscovering)?;
        let mut by_size: HashMap<u64, EntryState> = HashMap::new();
        let mut total_hashed = 0;

        rayon::scope(|scope| -> anyhow::Result<()> {
            let mut it = FileIterator::new(self.dir.clone());
            it.hasher = Some(self);
            it.exclude = self.exclude.as_ref();
            for (_, current_path) in it {
                let meta = fs::metadata(&current_path)?;
                let size = meta.len();
                let modified = meta.modified()?;

                match by_size.entry(size) {
                    std::collections::hash_map::Entry::Occupied(mut occ) => match occ.get_mut() {
                        EntryState::Single(first_path, first_modified) => {
                            self.spawn_hash_task(scope, first_path, size, *first_modified, &tx);
                            self.spawn_hash_task(scope, &current_path, size, modified, &tx);

                            *occ.get_mut() = EntryState::Hashing;
                            total_hashed += 2;
                        }
                        EntryState::Hashing => {
                            self.spawn_hash_task(scope, &current_path, size, modified, &tx);
                            total_hashed += 1;
                        }
                    },
                    std::collections::hash_map::Entry::Vacant(vac) => {
                        vac.insert(EntryState::Single(current_path, modified));
                    }
                }
            }
            tx.send(HashProgress::TotalFiles(total_hashed))?;
            Ok(())
        })?;

        self.save_cache()
    }

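    /// Answers from the cache when the file's modification time matches;
    /// otherwise spawns a task on `scope` that hashes the file, updates the
    /// cache, and reports the result through `tx`.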
    fn spawn_hash_task<'scope>(
        &'scope self,
        scope: &rayon::Scope<'scope>,
        path: &Path,
        size: u64,
        modified: std::time::SystemTime,
        tx: &mpsc::Sender<HashProgress>,
    ) {
        let relative = crate::strip_prefix(path, self.cache.base_dir())
            .expect("path should be in cache base_dir");
        if let Some(hash) = self.cache.get(relative, modified) {
            self.num_hash_looked_up.fetch_add(1, Ordering::Relaxed);
            let _ = tx.send(HashProgress::Result(path.to_path_buf(), size, hash, true));
            return;
        }

        let path_owned = path.to_path_buf();
        let relative_owned = relative.to_path_buf();
        let tx_owned = tx.clone();
        let cache_owned = self.cache.clone();
        scope.spawn(move |_| {
            if let Ok(hash) = self.compute_hash(&path_owned) {
                self.num_hashed.fetch_add(1, Ordering::Relaxed);
                cache_owned.insert(&relative_owned, modified, hash);
                let _ = tx_owned.send(HashProgress::Result(path_owned, size, hash, false));
            } else {
                log::warn!("Failed to hash file: {:?}", path_owned);
            }
        });
    }

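    /// Returns the BLAKE3 hash of `path`, served from the cache when the
    /// file's modification time matches and computed (then cached) otherwise.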
    pub fn get_hash(&self, path: &Path) -> io::Result<blake3::Hash> {
        let meta = fs::metadata(path)?;
        let modified = meta.modified()?;
        let relative = crate::strip_prefix(path, self.cache.base_dir())
            .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;
        if let Some(hash) = self.cache.get(relative, modified) {
            self.num_hash_looked_up.fetch_add(1, Ordering::Relaxed);
            return Ok(hash);
        }

        let hash = self.compute_hash(path)?;
        self.num_hashed.fetch_add(1, Ordering::Relaxed);
        self.cache.insert(relative, modified, hash);
        Ok(hash)
    }

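    /// Hashes the file contents with BLAKE3, memory-mapping the file when
    /// `buffer_size` is 0 and reading in `buffer_size` chunks otherwise.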
    fn compute_hash(&self, path: &Path) -> io::Result<blake3::Hash> {
        let start_time = std::time::Instant::now();
        let mut f = fs::File::open(path)?;
        let len = f.metadata()?.len();
        let progress = self
            .progress
            .as_ref()
            .map(|progress| progress.add_file(path, len))
            .unwrap_or_else(Progress::none);
        let mut hasher = blake3::Hasher::new();
        if self.buffer_size == 0 {
            if len > 0 {
                let mmap = unsafe { memmap2::MmapOptions::new().map(&f)? };
                hasher.update(&mmap[..]);
                progress.inc(len);
            }
        } else {
            let mut buf = vec![0u8; self.buffer_size];
            loop {
                let n = f.read(&mut buf)?;
                if n == 0 {
                    break;
                }
                hasher.update(&buf[..n]);
                progress.inc(n as u64);
            }
        }
        progress.finish();
        log::debug!("Computed hash in {:?}: {:?}", start_time.elapsed(), path);
        Ok(hasher.finalize())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn find_duplicates() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;

        let file1_path = dir.path().join("same1.txt");
        fs::write(&file1_path, "same content")?;

        let file2_path = dir.path().join("same2.txt");
        fs::write(&file2_path, "same content")?;

        let diff_path = dir.path().join("diff.txt");
        fs::write(&diff_path, "different content")?;

        let mut hasher = FileHasher::new(dir.path().to_path_buf());
        hasher.buffer_size = 8192;
        let duplicates = hasher.find_duplicates()?;

        assert_eq!(hasher.num_hashed.load(Ordering::Relaxed), 2);
        assert_eq!(hasher.num_hash_looked_up.load(Ordering::Relaxed), 0);

        assert_eq!(duplicates.len(), 1);
        let group = &duplicates[0];
        assert_eq!(group.paths.len(), 2);
        assert_eq!(group.size, 12);
        assert!(group.paths.contains(&file1_path));
        assert!(group.paths.contains(&file2_path));

        Ok(())
    }

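    // Hashing in `a/a` first creates a cache file there; hashing the parent
    // `a` afterwards should reuse (merge) that cache, so every hash is a
    // lookup and the now-redundant subdirectory cache file disappears.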
    #[test]
    fn find_duplicates_merge_cache() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;
        let dir_path = dir.path();

        let sub_dir = dir_path.join("a").join("a");
        fs::create_dir_all(&sub_dir)?;

        let file1_path = sub_dir.join("1");
        fs::write(&file1_path, "same content")?;

        let file2_path = sub_dir.join("2");
        fs::write(&file2_path, "same content")?;

        let cache_aa_path = sub_dir.join(FileHashCache::FILE_NAME);
        fs::File::create(&cache_aa_path)?;

        let hasher_aa = FileHasher::new(sub_dir.clone());
        let duplicates_aa = hasher_aa.find_duplicates()?;
        assert_eq!(duplicates_aa.len(), 1);
        assert!(cache_aa_path.exists());
        assert_eq!(hasher_aa.num_hashed.load(Ordering::Relaxed), 2);
        assert_eq!(hasher_aa.num_hash_looked_up.load(Ordering::Relaxed), 0);

        let root_a = dir_path.join("a");
        let cache_a_path = root_a.join(FileHashCache::FILE_NAME);
        fs::File::create(&cache_a_path)?;

        let hasher_a = FileHasher::new(root_a.clone());
        let duplicates_a = hasher_a.find_duplicates()?;
        assert_eq!(duplicates_a.len(), 1);
        assert_eq!(hasher_a.num_hashed.load(Ordering::Relaxed), 0);
        assert_eq!(hasher_a.num_hash_looked_up.load(Ordering::Relaxed), 2);

        assert!(cache_a_path.exists());
        assert!(!cache_aa_path.exists());

        Ok(())
    }

    #[test]
    fn find_duplicates_with_exclude() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;

        let file1_path = dir.path().join("same1.txt");
        fs::write(&file1_path, "same content")?;

        let file2_path = dir.path().join("same2.txt");
        fs::write(&file2_path, "same content")?;

        let exclude_path = dir.path().join("exclude.txt");
        fs::write(&exclude_path, "same content")?;

        let mut hasher = FileHasher::new(dir.path().to_path_buf());
        hasher.buffer_size = 8192;
        let mut builder = globset::GlobSetBuilder::new();
        builder.add(
            globset::GlobBuilder::new("exclude.txt")
                .case_insensitive(true)
                .build()?,
        );
        let filter = builder.build()?;
        hasher.exclude = Some(filter);

        let duplicates = hasher.find_duplicates()?;
        assert_eq!(duplicates.len(), 1);
        let group = &duplicates[0];
        assert_eq!(group.paths.len(), 2);
        assert!(group.paths.contains(&file1_path));
        assert!(group.paths.contains(&file2_path));
        assert!(!group.paths.contains(&exclude_path));
        Ok(())
    }
}