compare_dir/file_hasher.rs

use crate::{FileComparer, FileHashCache, FileIterator, Progress, ProgressBuilder};
use globset::GlobSet;
use std::collections::HashMap;
use std::fs;
use std::io::{self, Read};
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{Arc, mpsc};

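/// Progress events sent from the discovery/hashing thread back to the
/// collecting loop in [`FileHasher::find_duplicates`].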
#[derive(Debug, Clone)]
enum HashProgress {
    StartDiscovering,
    TotalFiles(usize),
    Result(PathBuf, u64, blake3::Hash, bool),
}

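/// State of a size bucket: either a single file seen so far (with its
/// modification time), or a bucket whose files are already being hashed.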
enum EntryState {
    Single(PathBuf, std::time::SystemTime),
    Hashing,
}

/// A group of duplicated files and their size.
#[derive(Debug, Clone)]
pub struct DuplicatedFiles {
    pub paths: Vec<PathBuf>,
    pub size: u64,
}

/// A tool for finding duplicated files in a directory.
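///
/// # Example
///
/// A minimal usage sketch; the directory path is hypothetical and the
/// `compare_dir` crate name is assumed from this file's path. Marked
/// `no_run` since it walks a real directory:
///
/// ```no_run
/// use compare_dir::FileHasher;
///
/// let hasher = FileHasher::new("/some/photos".into());
/// for group in hasher.find_duplicates().unwrap() {
///     println!("{} copies of {} bytes", group.paths.len(), group.size);
/// }
/// hasher.save_cache().unwrap();
/// ```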
pub struct FileHasher {
    dir: PathBuf,
    pub buffer_size: usize,
    cache: Arc<FileHashCache>,
    pub(crate) num_hashed: AtomicUsize,
    pub(crate) num_hash_looked_up: AtomicUsize,
    pub exclude: Option<GlobSet>,
    pub progress: Option<Arc<ProgressBuilder>>,
}

impl FileHasher {
    /// Creates a new `FileHasher` for the given directory.
    pub fn new(dir: PathBuf) -> Self {
        let cache = FileHashCache::find_or_new(&dir);
        Self {
            dir,
            buffer_size: FileComparer::DEFAULT_BUFFER_SIZE,
            cache,
            num_hashed: AtomicUsize::new(0),
            num_hash_looked_up: AtomicUsize::new(0),
            exclude: None,
            progress: None,
        }
    }

    /// Removes a cache entry if it exists.
    pub fn remove_cache_entry(&self, path: &Path) -> anyhow::Result<()> {
        let relative = crate::strip_prefix(path, self.cache.base_dir())?;
        self.cache.remove(relative);
        Ok(())
    }

    /// Saves the hash cache if it is dirty.
    pub fn save_cache(&self) -> anyhow::Result<()> {
        log::info!(
            "Hash stats for {:?}: {} computed, {} looked up",
            self.dir,
            self.num_hashed.load(Ordering::Relaxed),
            self.num_hash_looked_up.load(Ordering::Relaxed)
        );
        Ok(self.cache.save()?)
    }

    /// Merges another cache into this hasher's cache.
    pub(crate) fn merge_cache(&self, other_cache: &FileHashCache) {
        self.cache.merge(other_cache);
    }

    /// Clears the loaded hashes in the cache.
    pub fn clear_cache(&self) -> anyhow::Result<()> {
        let relative = crate::strip_prefix(&self.dir, self.cache.base_dir())?;
        self.cache.clear(relative);
        Ok(())
    }

    /// Executes the duplicate file finding process and prints results.
    pub fn run(&self) -> anyhow::Result<()> {
        let start_time = std::time::Instant::now();
        let mut duplicates = self.find_duplicates()?;
        if duplicates.is_empty() {
            println!("No duplicates found.");
        } else {
            duplicates.sort_by_key(|a| a.size);
            let mut total_wasted_space = 0;
            for dupes in &duplicates {
                let paths = &dupes.paths;
                let file_size = dupes.size;
                println!(
                    "Identical {} files of {}:",
                    paths.len(),
                    crate::human_readable_size(file_size)
                );
                for path in paths {
                    println!("  {}", path.display());
                }
                total_wasted_space += file_size * (paths.len() as u64 - 1);
            }
            eprintln!(
                "Total wasted space: {}",
                crate::human_readable_size(total_wasted_space)
            );
        }
        eprintln!("Finished in {:?}.", start_time.elapsed());
        Ok(())
    }

    /// Finds duplicated files and returns a list of duplicate groups.
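    ///
    /// Files are first grouped by size; only files that share a size with at
    /// least one other file are hashed, in parallel, reusing cached hashes
    /// when a file's modification time is unchanged.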
    pub fn find_duplicates(&self) -> anyhow::Result<Vec<DuplicatedFiles>> {
        let progress = self
            .progress
            .as_ref()
            .map(|progress| progress.add_spinner())
            .unwrap_or_else(Progress::none);
        progress.set_message("Scanning directories...");

        let (tx, rx) = mpsc::channel();
        let mut by_hash: HashMap<blake3::Hash, DuplicatedFiles> = HashMap::new();
        let mut num_cache_hits = 0;
        std::thread::scope(|scope| {
            scope.spawn(|| {
                if let Err(e) = self.find_duplicates_internal(tx) {
                    log::error!("Error during duplicate finding: {}", e);
                }
            });

            while let Ok(event) = rx.recv() {
                match event {
                    HashProgress::StartDiscovering => {
                        progress.set_message("Hashing files...");
                    }
                    HashProgress::TotalFiles(total) => {
                        progress.set_length(total as u64);
                        if num_cache_hits > 0 {
                            progress.set_message(format!(" ({} cache hits)", num_cache_hits));
                        }
                    }
                    HashProgress::Result(path, size, hash, is_cache_hit) => {
                        if is_cache_hit {
                            num_cache_hits += 1;
                            if progress.length().is_none() {
                                progress.set_message(format!(
                                    "Hashing files... ({} cache hits)",
                                    num_cache_hits
                                ));
                            } else {
                                progress.set_message(format!(" ({} cache hits)", num_cache_hits));
                            }
                        }

                        progress.inc(1);
                        let entry = by_hash.entry(hash).or_insert_with(|| DuplicatedFiles {
                            paths: Vec::new(),
                            size,
                        });
                        // Files with equal hashes must have equal sizes; a
                        // mismatch would indicate a hash collision.
                        assert_eq!(entry.size, size, "Hash collision: sizes do not match");
                        entry.paths.push(path);
                    }
                }
            }
        });
        progress.finish();

        let mut duplicates = Vec::new();
        for (_, mut dupes) in by_hash {
            if dupes.paths.len() > 1 {
                dupes.paths.sort();
                duplicates.push(dupes);
            }
        }
        Ok(duplicates)
    }

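    /// Walks the directory tree, grouping files by size, and spawns a hash
    /// task as soon as a size bucket holds more than one file; results are
    /// streamed back over `tx`.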
    fn find_duplicates_internal(&self, tx: mpsc::Sender<HashProgress>) -> anyhow::Result<()> {
        tx.send(HashProgress::StartDiscovering)?;
        let mut by_size: HashMap<u64, EntryState> = HashMap::new();
        let mut total_hashed = 0;

        rayon::scope(|scope| -> anyhow::Result<()> {
            let mut it = FileIterator::new(self.dir.clone());
            it.hasher = Some(self);
            it.exclude = self.exclude.as_ref();
            for (_, current_path) in it {
                let meta = fs::metadata(&current_path)?;
                let size = meta.len();
                let modified = meta.modified()?;

                // Note: zero-length files all hash identically and waste no
                // space, but they are handled like any other size for simplicity.
                match by_size.entry(size) {
                    std::collections::hash_map::Entry::Occupied(mut occ) => match occ.get_mut() {
                        EntryState::Single(first_path, first_modified) => {
                            // A second file of the same size: hash both the
                            // first file we recorded and the new one.
                            self.spawn_hash_task(scope, first_path, size, *first_modified, &tx);
                            self.spawn_hash_task(scope, &current_path, size, modified, &tx);

                            // Mark this size bucket as hashing.
                            *occ.get_mut() = EntryState::Hashing;
                            total_hashed += 2;
                        }
                        EntryState::Hashing => {
                            // This bucket is already being hashed; spawn a
                            // task for the new file right away.
                            self.spawn_hash_task(scope, &current_path, size, modified, &tx);
                            total_hashed += 1;
                        }
                    },
                    std::collections::hash_map::Entry::Vacant(vac) => {
                        vac.insert(EntryState::Single(current_path, modified));
                    }
                }
            }
            tx.send(HashProgress::TotalFiles(total_hashed))?;
            Ok(())
        })?;

        // The scope waits for all spawned tasks to complete.
        // Channel `tx` is closed when it is dropped at the end of this function.
        self.save_cache()
    }

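    /// Sends a cached hash immediately when the mtime matches; otherwise
    /// spawns a rayon task that hashes the file and records the result in
    /// the cache.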
    fn spawn_hash_task<'scope>(
        &'scope self,
        scope: &rayon::Scope<'scope>,
        path: &Path,
        size: u64,
        modified: std::time::SystemTime,
        tx: &mpsc::Sender<HashProgress>,
    ) {
        let relative = crate::strip_prefix(path, self.cache.base_dir())
            .expect("path should be in cache base_dir");
        if let Some(hash) = self.cache.get(relative, modified) {
            self.num_hash_looked_up.fetch_add(1, Ordering::Relaxed);
            let _ = tx.send(HashProgress::Result(path.to_path_buf(), size, hash, true));
            return;
        }

        let path_owned = path.to_path_buf();
        let relative_owned = relative.to_path_buf();
        let tx_owned = tx.clone();
        let cache_owned = self.cache.clone();
        scope.spawn(move |_| {
            if let Ok(hash) = self.compute_hash(&path_owned) {
                self.num_hashed.fetch_add(1, Ordering::Relaxed);
                cache_owned.insert(&relative_owned, modified, hash);
                let _ = tx_owned.send(HashProgress::Result(path_owned, size, hash, false));
            } else {
                log::warn!("Failed to hash file: {:?}", path_owned);
            }
        });
    }

    /// Gets the hash of a file, using the cache if available.
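    ///
    /// # Example
    ///
    /// An illustrative sketch; the paths are hypothetical and the crate
    /// name is assumed from this file's path (`no_run`):
    ///
    /// ```no_run
    /// use compare_dir::FileHasher;
    /// use std::path::Path;
    ///
    /// let hasher = FileHasher::new("/data".into());
    /// let hash = hasher.get_hash(Path::new("/data/file.bin")).unwrap();
    /// println!("{hash}");
    /// ```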
    pub fn get_hash(&self, path: &Path) -> io::Result<blake3::Hash> {
        let meta = fs::metadata(path)?;
        let modified = meta.modified()?;
        let relative = crate::strip_prefix(path, self.cache.base_dir())
            .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;
        if let Some(hash) = self.cache.get(relative, modified) {
            self.num_hash_looked_up.fetch_add(1, Ordering::Relaxed);
            return Ok(hash);
        }

        let hash = self.compute_hash(path)?;
        self.num_hashed.fetch_add(1, Ordering::Relaxed);
        self.cache.insert(relative, modified, hash);
        Ok(hash)
    }

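    /// Hashes a file with BLAKE3. A `buffer_size` of 0 selects memory
    /// mapping; otherwise the file is read in `buffer_size`-byte chunks.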
    fn compute_hash(&self, path: &Path) -> io::Result<blake3::Hash> {
        let start_time = std::time::Instant::now();
        let mut f = fs::File::open(path)?;
        let len = f.metadata()?.len();
        let progress = self
            .progress
            .as_ref()
            .map(|progress| progress.add_file(path, len))
            .unwrap_or_else(Progress::none);
        let mut hasher = blake3::Hasher::new();
        if self.buffer_size == 0 {
            if len > 0 {
                let mmap = unsafe { memmap2::MmapOptions::new().map(&f)? };
                hasher.update(&mmap[..]);
                progress.inc(len);
            }
        } else {
            let mut buf = vec![0u8; self.buffer_size];
            loop {
                let n = f.read(&mut buf)?;
                if n == 0 {
                    break;
                }
                hasher.update(&buf[..n]);
                progress.inc(n as u64);
            }
        }
        progress.finish();
        log::debug!("Computed hash in {:?}: {:?}", start_time.elapsed(), path);
        Ok(hasher.finalize())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn find_duplicates() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;

        let file1_path = dir.path().join("same1.txt");
        fs::write(&file1_path, "same content")?;

        let file2_path = dir.path().join("same2.txt");
        fs::write(&file2_path, "same content")?;

        let diff_path = dir.path().join("diff.txt");
        fs::write(&diff_path, "different content")?;

        let mut hasher = FileHasher::new(dir.path().to_path_buf());
        hasher.buffer_size = 8192;
        let duplicates = hasher.find_duplicates()?;

        assert_eq!(hasher.num_hashed.load(Ordering::Relaxed), 2);
        assert_eq!(hasher.num_hash_looked_up.load(Ordering::Relaxed), 0);

        assert_eq!(duplicates.len(), 1);
        let group = &duplicates[0];
        assert_eq!(group.paths.len(), 2);
        assert_eq!(group.size, 12); // "same content" is 12 bytes

        assert!(group.paths.contains(&file1_path));
        assert!(group.paths.contains(&file2_path));

        Ok(())
    }

    #[test]
    fn find_duplicates_merge_cache() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;
        let dir_path = dir.path();

        let sub_dir = dir_path.join("a").join("a");
        fs::create_dir_all(&sub_dir)?;

        let file1_path = sub_dir.join("1");
        fs::write(&file1_path, "same content")?;

        let file2_path = sub_dir.join("2");
        fs::write(&file2_path, "same content")?;

        // Create empty cache file in a/a to force it to be the cache base
        let cache_aa_path = sub_dir.join(FileHashCache::FILE_NAME);
        fs::File::create(&cache_aa_path)?;

        // Run find_duplicates on a/a
        let hasher_aa = FileHasher::new(sub_dir.clone());
        let duplicates_aa = hasher_aa.find_duplicates()?;
        assert_eq!(duplicates_aa.len(), 1);
        assert!(cache_aa_path.exists());
        assert_eq!(hasher_aa.num_hashed.load(Ordering::Relaxed), 2);
        assert_eq!(hasher_aa.num_hash_looked_up.load(Ordering::Relaxed), 0);

        // Create empty cache file in a to force it to be the cache base
        let root_a = dir_path.join("a");
        let cache_a_path = root_a.join(FileHashCache::FILE_NAME);
        fs::File::create(&cache_a_path)?;

        // Run find_duplicates on a
        let hasher_a = FileHasher::new(root_a.clone());
        let duplicates_a = hasher_a.find_duplicates()?;
        assert_eq!(duplicates_a.len(), 1);
        assert_eq!(hasher_a.num_hashed.load(Ordering::Relaxed), 0);
        assert_eq!(hasher_a.num_hash_looked_up.load(Ordering::Relaxed), 2);

        // The merged child cache should be removed.
        assert!(cache_a_path.exists());
        assert!(!cache_aa_path.exists());

        Ok(())
    }

    #[test]
    fn find_duplicates_with_exclude() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;

        let file1_path = dir.path().join("same1.txt");
        fs::write(&file1_path, "same content")?;

        let file2_path = dir.path().join("same2.txt");
        fs::write(&file2_path, "same content")?;

        let exclude_path = dir.path().join("exclude.txt");
        fs::write(&exclude_path, "same content")?;

        let mut hasher = FileHasher::new(dir.path().to_path_buf());
        hasher.buffer_size = 8192;
        let mut builder = globset::GlobSetBuilder::new();
        builder.add(
            globset::GlobBuilder::new("exclude.txt")
                .case_insensitive(true)
                .build()?,
        );
        let filter = builder.build()?;
        hasher.exclude = Some(filter);

        let duplicates = hasher.find_duplicates()?;
        assert_eq!(duplicates.len(), 1);
        let group = &duplicates[0];
        assert_eq!(group.paths.len(), 2);
        assert!(group.paths.contains(&file1_path));
        assert!(group.paths.contains(&file2_path));
        assert!(!group.paths.contains(&exclude_path));
        Ok(())
    }
}