// compare_dir/file_hasher.rs

use crate::{FileHashCache, FileIterator, ProgressReporter};
use globset::GlobSet;
use std::collections::HashMap;
use std::fs;
use std::io::{self, Read};
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{Arc, mpsc};

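/// Progress events sent from the hashing pipeline to the reporting loop in
/// `find_duplicates`. `Result` carries (path, file size, hash, cache hit).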
#[derive(Debug, Clone)]
enum HashProgress {
    StartDiscovering,
    TotalFiles(usize),
    Result(PathBuf, u64, blake3::Hash, bool),
}

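/// State of a file-size bucket: either a single file seen so far, or a
/// bucket whose files are already being hashed.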
enum EntryState {
    Single(PathBuf, std::time::SystemTime),
    Hashing,
}

/// A group of duplicated files and their size.
#[derive(Debug, Clone)]
pub struct DuplicatedFiles {
    pub paths: Vec<PathBuf>,
    pub size: u64,
}

/// A tool for finding duplicated files in a directory.
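///
/// # Example
///
/// A minimal usage sketch, assuming this crate is named `compare_dir` (the
/// scanned path is hypothetical):
///
/// ```no_run
/// use compare_dir::FileHasher;
///
/// let hasher = FileHasher::new("/data/photos".into());
/// hasher.run().expect("duplicate scan failed");
/// ```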
pub struct FileHasher {
    dir: PathBuf,
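    /// Read buffer size in bytes; `0` switches hashing to memory-mapped I/O.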
    pub buffer_size: usize,
    cache: Arc<FileHashCache>,
    pub(crate) num_hashed: AtomicUsize,
    pub(crate) num_hash_looked_up: AtomicUsize,
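    /// Optional glob set; matching entries are skipped during the scan.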
    pub exclude: Option<GlobSet>,
}

impl FileHasher {
    /// Creates a new `FileHasher` for the given directory.
    pub fn new(dir: PathBuf) -> Self {
        let cache = FileHashCache::find_or_new(&dir);
        Self {
            dir,
            buffer_size: crate::FileComparer::DEFAULT_BUFFER_SIZE,
            cache,
            num_hashed: AtomicUsize::new(0),
            num_hash_looked_up: AtomicUsize::new(0),
            exclude: None,
        }
    }

    /// Removes a cache entry if it exists.
    pub fn remove_cache_entry(&self, path: &Path) -> anyhow::Result<()> {
        let relative = crate::strip_prefix(path, self.cache.base_dir())?;
        self.cache.remove(relative);
        Ok(())
    }

    /// Saves the hash cache if it is dirty.
    pub fn save_cache(&self) -> anyhow::Result<()> {
        log::info!(
            "Hash stats for {:?}: {} computed, {} looked up",
            self.dir,
            self.num_hashed.load(Ordering::Relaxed),
            self.num_hash_looked_up.load(Ordering::Relaxed)
        );
        Ok(self.cache.save()?)
    }

    /// Merges another cache into this hasher's cache.
    pub(crate) fn merge_cache(&self, other_cache: &FileHashCache) {
        self.cache.merge(other_cache);
    }

    /// Clears the loaded hashes in the cache.
    pub fn clear_cache(&self) -> anyhow::Result<()> {
        let relative = crate::strip_prefix(&self.dir, self.cache.base_dir())?;
        self.cache.clear(relative);
        Ok(())
    }

    /// Executes the duplicate-finding process and prints the results.
    pub fn run(&self) -> anyhow::Result<()> {
        let start_time = std::time::Instant::now();
        let mut duplicates = self.find_duplicates()?;
        if duplicates.is_empty() {
            println!("No duplicates found.");
        } else {
            duplicates.sort_by(|a, b| a.size.cmp(&b.size));
            let mut total_wasted_space = 0;
            for dupes in &duplicates {
                let paths = &dupes.paths;
                let file_size = dupes.size;
                println!(
                    "Identical {} files of {}:",
                    paths.len(),
                    crate::human_readable_size(file_size)
                );
                for path in paths {
                    println!("  {}", path.display());
                }
                total_wasted_space += file_size * (paths.len() as u64 - 1);
            }
            eprintln!(
                "Total wasted space: {}",
                crate::human_readable_size(total_wasted_space)
            );
        }
        eprintln!("Finished in {:?}.", start_time.elapsed());
        Ok(())
    }

    /// Finds duplicated files and returns a list of duplicate groups.
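    ///
    /// Each returned group contains at least two paths. A sketch of consuming
    /// the result (crate name `compare_dir` is an assumption):
    ///
    /// ```no_run
    /// # use compare_dir::FileHasher;
    /// let hasher = FileHasher::new("/data/photos".into());
    /// for group in hasher.find_duplicates()? {
    ///     println!("{} copies of {} bytes:", group.paths.len(), group.size);
    ///     for path in &group.paths {
    ///         println!("  {}", path.display());
    ///     }
    /// }
    /// # Ok::<(), anyhow::Error>(())
    /// ```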
    pub fn find_duplicates(&self) -> anyhow::Result<Vec<DuplicatedFiles>> {
        let progress = ProgressReporter::new();
        progress.set_message("Scanning directories...");

        let (tx, rx) = mpsc::channel();
        let mut by_hash: HashMap<blake3::Hash, DuplicatedFiles> = HashMap::new();
        let mut num_cache_hits = 0;
        std::thread::scope(|scope| {
            scope.spawn(|| {
                if let Err(e) = self.find_duplicates_internal(tx) {
                    log::error!("Error during duplicate finding: {}", e);
                }
            });

            while let Ok(event) = rx.recv() {
                match event {
                    HashProgress::StartDiscovering => {
                        progress.set_message("Hashing files...");
                    }
                    HashProgress::TotalFiles(total) => {
                        progress.set_length(total as u64);
                        if num_cache_hits > 0 {
                            progress.set_message(format!(" ({} cache hits)", num_cache_hits));
                        }
                    }
                    HashProgress::Result(path, size, hash, is_cache_hit) => {
                        if is_cache_hit {
                            num_cache_hits += 1;
                            if progress.length().is_none() {
                                progress.set_message(format!(
                                    "Hashing files... ({} cache hits)",
                                    num_cache_hits
                                ));
                            } else {
                                progress.set_message(format!(" ({} cache hits)", num_cache_hits));
                            }
                        }

                        progress.inc(1);
                        let entry = by_hash.entry(hash).or_insert_with(|| DuplicatedFiles {
                            paths: Vec::new(),
                            size,
                        });
                        // BLAKE3 collisions are not expected; if one ever occurred,
                        // this size check would catch it.
                        assert_eq!(entry.size, size, "Hash collision: sizes do not match");
                        entry.paths.push(path);
                    }
                }
            }
        });
        progress.finish();

        let mut duplicates = Vec::new();
        for (_, mut dupes) in by_hash {
            if dupes.paths.len() > 1 {
                dupes.paths.sort();
                duplicates.push(dupes);
            }
        }
        Ok(duplicates)
    }

    fn find_duplicates_internal(&self, tx: mpsc::Sender<HashProgress>) -> anyhow::Result<()> {
        tx.send(HashProgress::StartDiscovering)?;
        let mut by_size: HashMap<u64, EntryState> = HashMap::new();
        let mut total_hashed = 0;

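        // Files are bucketed by size; hashing starts only once a second file
        // of the same size appears, so files with a unique size are never read.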
        rayon::scope(|scope| -> anyhow::Result<()> {
            let mut it = FileIterator::new(self.dir.clone());
            it.hasher = Some(self);
            it.exclude = self.exclude.as_ref();
            for (_, current_path) in it {
                let meta = fs::metadata(&current_path)?;
                let size = meta.len();
                let modified = meta.modified()?;

                // Note: zero-byte files waste no space as duplicates, but they
                // are handled uniformly here to keep the logic simple.
                match by_size.entry(size) {
                    std::collections::hash_map::Entry::Occupied(mut occ) => match occ.get_mut() {
                        EntryState::Single(first_path, first_modified) => {
                            // A second file of this size was found: hash both the
                            // original file and the new one.
                            self.spawn_hash_task(scope, first_path, size, *first_modified, &tx);
                            self.spawn_hash_task(scope, &current_path, size, modified, &tx);

                            // Mark this size bucket as hashing.
                            *occ.get_mut() = EntryState::Hashing;
                            total_hashed += 2;
                        }
                        EntryState::Hashing => {
                            // This size bucket is already being hashed; hash the
                            // new file immediately.
                            self.spawn_hash_task(scope, &current_path, size, modified, &tx);
                            total_hashed += 1;
                        }
                    },
                    std::collections::hash_map::Entry::Vacant(vac) => {
                        vac.insert(EntryState::Single(current_path, modified));
                    }
                }
            }
            tx.send(HashProgress::TotalFiles(total_hashed))?;
            Ok(())
        })?;

        // The scope waits for all spawned tasks to complete; `tx` is closed
        // when it is dropped at the end of this function.
        self.save_cache()
    }

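    /// Looks up `path` in the cache; on a miss, spawns a scoped task that
    /// hashes the file and reports the result over `tx`.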
    fn spawn_hash_task<'scope>(
        &'scope self,
        scope: &rayon::Scope<'scope>,
        path: &Path,
        size: u64,
        modified: std::time::SystemTime,
        tx: &mpsc::Sender<HashProgress>,
    ) {
        let relative = crate::strip_prefix(path, self.cache.base_dir())
            .expect("path should be in cache base_dir");
        if let Some(hash) = self.cache.get(relative, modified) {
            self.num_hash_looked_up.fetch_add(1, Ordering::Relaxed);
            let _ = tx.send(HashProgress::Result(path.to_path_buf(), size, hash, true));
            return;
        }

        let path_owned = path.to_path_buf();
        let relative_owned = relative.to_path_buf();
        let tx_owned = tx.clone();
        let cache_owned = self.cache.clone();
        scope.spawn(move |_| {
            if let Ok(hash) = self.compute_hash(&path_owned) {
                self.num_hashed.fetch_add(1, Ordering::Relaxed);
                cache_owned.insert(&relative_owned, modified, hash);
                let _ = tx_owned.send(HashProgress::Result(path_owned, size, hash, false));
            } else {
                log::warn!("Failed to hash file: {:?}", path_owned);
            }
        });
    }

    /// Gets the hash of a file, using the cache if available.
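    ///
    /// A sketch of comparing two files by content hash (paths are
    /// hypothetical; crate name `compare_dir` is an assumption):
    ///
    /// ```no_run
    /// # use compare_dir::FileHasher;
    /// use std::path::Path;
    ///
    /// let hasher = FileHasher::new("/data".into());
    /// let a = hasher.get_hash(Path::new("/data/a.bin"))?;
    /// let b = hasher.get_hash(Path::new("/data/b.bin"))?;
    /// if a == b {
    ///     println!("identical contents");
    /// }
    /// # Ok::<(), std::io::Error>(())
    /// ```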
    pub fn get_hash(&self, path: &Path) -> io::Result<blake3::Hash> {
        let meta = fs::metadata(path)?;
        let modified = meta.modified()?;
        let relative = crate::strip_prefix(path, self.cache.base_dir())
            .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;
        if let Some(hash) = self.cache.get(relative, modified) {
            self.num_hash_looked_up.fetch_add(1, Ordering::Relaxed);
            return Ok(hash);
        }

        let hash = self.compute_hash(path)?;
        self.num_hashed.fetch_add(1, Ordering::Relaxed);
        self.cache.insert(relative, modified, hash);
        Ok(hash)
    }

    fn compute_hash(&self, path: &Path) -> io::Result<blake3::Hash> {
        let start_time = std::time::Instant::now();
        let mut f = fs::File::open(path)?;
        let mut hasher = blake3::Hasher::new();
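        // A buffer size of 0 selects memory-mapped hashing of the whole file;
        // any other value streams the file in fixed-size chunks.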
        if self.buffer_size == 0 {
            let len = f.metadata()?.len();
            if len > 0 {
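                // SAFETY: the mapping assumes the file is not modified while
                // it is being hashed.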
                let mmap = unsafe { memmap2::MmapOptions::new().map(&f)? };
                hasher.update(&mmap[..]);
            }
        } else {
            let mut buf = vec![0u8; self.buffer_size];
            loop {
                let n = f.read(&mut buf)?;
                if n == 0 {
                    break;
                }
                hasher.update(&buf[..n]);
            }
        }
        log::trace!("Computed hash in {:?}: {:?}", start_time.elapsed(), path);
        Ok(hasher.finalize())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_find_duplicates() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;

        let file1_path = dir.path().join("same1.txt");
        fs::write(&file1_path, "same content")?;

        let file2_path = dir.path().join("same2.txt");
        fs::write(&file2_path, "same content")?;

        let diff_path = dir.path().join("diff.txt");
        fs::write(&diff_path, "different content")?;

        let mut hasher = FileHasher::new(dir.path().to_path_buf());
        hasher.buffer_size = 8192;
        let duplicates = hasher.find_duplicates()?;

        assert_eq!(hasher.num_hashed.load(Ordering::Relaxed), 2);
        assert_eq!(hasher.num_hash_looked_up.load(Ordering::Relaxed), 0);

        assert_eq!(duplicates.len(), 1);
        let group = &duplicates[0];
        assert_eq!(group.paths.len(), 2);
        assert_eq!(group.size, 12); // "same content" is 12 bytes

        assert!(group.paths.contains(&file1_path));
        assert!(group.paths.contains(&file2_path));

        Ok(())
    }

    #[test]
    fn test_find_duplicates_merge_cache() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;
        let dir_path = dir.path();

        let sub_dir = dir_path.join("a").join("a");
        fs::create_dir_all(&sub_dir)?;

        let file1_path = sub_dir.join("1");
        fs::write(&file1_path, "same content")?;

        let file2_path = sub_dir.join("2");
        fs::write(&file2_path, "same content")?;

        // Create empty cache file in a/a to force it to be the cache base
        let cache_aa_path = sub_dir.join(FileHashCache::FILE_NAME);
        fs::File::create(&cache_aa_path)?;

        // Run find_duplicates on a/a
        let hasher_aa = FileHasher::new(sub_dir.clone());
        let duplicates_aa = hasher_aa.find_duplicates()?;
        assert_eq!(duplicates_aa.len(), 1);
        assert!(cache_aa_path.exists());
        assert_eq!(hasher_aa.num_hashed.load(Ordering::Relaxed), 2);
        assert_eq!(hasher_aa.num_hash_looked_up.load(Ordering::Relaxed), 0);

        // Create empty cache file in a to force it to be the cache base
        let root_a = dir_path.join("a");
        let cache_a_path = root_a.join(FileHashCache::FILE_NAME);
        fs::File::create(&cache_a_path)?;

        // Run find_duplicates on a
        let hasher_a = FileHasher::new(root_a.clone());
        let duplicates_a = hasher_a.find_duplicates()?;
        assert_eq!(duplicates_a.len(), 1);
        assert_eq!(hasher_a.num_hashed.load(Ordering::Relaxed), 0);
        assert_eq!(hasher_a.num_hash_looked_up.load(Ordering::Relaxed), 2);

        // The merged child cache should be removed.
        assert!(cache_a_path.exists());
        assert!(!cache_aa_path.exists());

        Ok(())
    }

    #[test]
    fn test_find_duplicates_with_exclude() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;

        let file1_path = dir.path().join("same1.txt");
        fs::write(&file1_path, "same content")?;

        let file2_path = dir.path().join("same2.txt");
        fs::write(&file2_path, "same content")?;

        let exclude_path = dir.path().join("exclude.txt");
        fs::write(&exclude_path, "same content")?;

        let mut hasher = FileHasher::new(dir.path().to_path_buf());
        hasher.buffer_size = 8192;
        let mut builder = globset::GlobSetBuilder::new();
        builder.add(
            globset::GlobBuilder::new("exclude.txt")
                .case_insensitive(true)
                .build()?,
        );
        let filter = builder.build()?;
        hasher.exclude = Some(filter);

        let duplicates = hasher.find_duplicates()?;
        assert_eq!(duplicates.len(), 1);
        let group = &duplicates[0];
        assert_eq!(group.paths.len(), 2);
        assert!(group.paths.contains(&file1_path));
        assert!(group.paths.contains(&file2_path));
        assert!(!group.paths.contains(&exclude_path));
        Ok(())
    }
}