compare_dir/file_hasher.rs

use crate::{FileHashCache, FileIterator, ProgressReporter};
use std::collections::HashMap;
use std::fs;
use std::io::{self, Read};
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{Arc, mpsc};

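/// Progress events sent from the background hashing work to the progress loop.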
#[derive(Debug, Clone)]
enum HashProgress {
    StartDiscovering,
    TotalFiles(usize),
    Result(PathBuf, u64, blake3::Hash, bool),
}

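/// State of a size bucket: either a single file seen so far (with its path and
/// mtime), or a bucket whose files are already being hashed.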
enum EntryState {
    Single(PathBuf, std::time::SystemTime),
    Hashing,
}

/// A group of duplicated files and their size.
#[derive(Debug, Clone)]
pub struct DuplicatedFiles {
    pub paths: Vec<PathBuf>,
    pub size: u64,
}

/// A tool for finding duplicated files in a directory.
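///
/// # Example
///
/// A minimal usage sketch (the `compare_dir` crate name is assumed from the
/// repository path):
///
/// ```no_run
/// # use compare_dir::FileHasher;
/// let hasher = FileHasher::new("/some/dir".into());
/// for group in hasher.find_duplicates().unwrap() {
///     println!("{} copies of {} bytes", group.paths.len(), group.size);
/// }
/// ```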
pub struct FileHasher {
    dir: PathBuf,
    pub buffer_size: usize,
    cache: Arc<FileHashCache>,
    pub(crate) num_hashed: AtomicUsize,
    pub(crate) num_hash_looked_up: AtomicUsize,
}

impl FileHasher {
    /// Creates a new `FileHasher` for the given directory.
    pub fn new(dir: PathBuf) -> Self {
        let cache = FileHashCache::find_or_new(&dir);
        Self {
            dir,
            buffer_size: crate::FileComparer::DEFAULT_BUFFER_SIZE,
            cache,
            num_hashed: AtomicUsize::new(0),
            num_hash_looked_up: AtomicUsize::new(0),
        }
    }

    /// Removes a cache entry if it exists.
    pub fn remove_cache_entry(&self, path: &Path) -> anyhow::Result<()> {
        let relative = path.strip_prefix(self.cache.base_dir())?;
        self.cache.remove(relative);
        Ok(())
    }

    /// Saves the hash cache if it is dirty.
    pub fn save_cache(&self) -> anyhow::Result<()> {
        log::info!(
            "Hash stats for {:?}: {} computed, {} looked up",
            self.dir,
            self.num_hashed.load(Ordering::Relaxed),
            self.num_hash_looked_up.load(Ordering::Relaxed)
        );
        Ok(self.cache.save()?)
    }

    /// Merges another cache into this hasher's cache.
    pub(crate) fn merge_cache(&self, other_cache: &FileHashCache) {
        self.cache.merge(other_cache);
    }

    /// Clears the loaded hashes in the cache.
    pub fn clear_cache(&self) -> anyhow::Result<()> {
        let relative = self.dir.strip_prefix(self.cache.base_dir())?;
        self.cache.clear(relative);
        Ok(())
    }

    /// Executes the duplicate file finding process and prints results.
    pub fn run(&self) -> anyhow::Result<()> {
        let start_time = std::time::Instant::now();
        let mut duplicates = self.find_duplicates()?;
        if duplicates.is_empty() {
            println!("No duplicates found.");
        } else {
            duplicates.sort_by_key(|d| d.size);
            let mut total_wasted_space = 0;
            for dupes in &duplicates {
                let paths = &dupes.paths;
                let file_size = dupes.size;
                println!(
                    "Identical {} files of {}:",
                    paths.len(),
                    crate::human_readable_size(file_size)
                );
                for path in paths {
                    println!("  {}", path.display());
                }
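                // Each group wastes size * (count - 1) bytes: one copy is kept,
                // the rest are redundant.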
                total_wasted_space += file_size * (paths.len() as u64 - 1);
            }
            eprintln!(
                "Total wasted space: {}",
                crate::human_readable_size(total_wasted_space)
            );
        }
        eprintln!("Finished in {:?}.", start_time.elapsed());
        Ok(())
    }

    /// Finds duplicated files and returns a list of duplicate groups.
    pub fn find_duplicates(&self) -> anyhow::Result<Vec<DuplicatedFiles>> {
        let progress = ProgressReporter::new();
        progress.set_message("Scanning directories...");

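        // Producer/consumer split: a scoped thread walks the tree and hashes
        // files, while this thread drains the channel and groups results by hash.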
        let (tx, rx) = mpsc::channel();
        let mut by_hash: HashMap<blake3::Hash, DuplicatedFiles> = HashMap::new();
        let mut num_cache_hits = 0;
        std::thread::scope(|scope| {
            scope.spawn(|| {
                if let Err(e) = self.find_duplicates_internal(tx) {
                    log::error!("Error during duplicate finding: {}", e);
                }
            });

            while let Ok(event) = rx.recv() {
                match event {
                    HashProgress::StartDiscovering => {
                        progress.set_message("Hashing files...");
                    }
                    HashProgress::TotalFiles(total) => {
                        progress.set_length(total as u64);
                        if num_cache_hits > 0 {
                            progress.set_message(format!(" ({} cache hits)", num_cache_hits));
                        }
                    }
                    HashProgress::Result(path, size, hash, is_cache_hit) => {
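                        // Until the total is known the bar has no length, so the
                        // full message is shown; afterwards only the cache-hit
                        // counter is refreshed.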
                        if is_cache_hit {
                            num_cache_hits += 1;
                            if progress.length().is_none() {
                                progress.set_message(format!(
                                    "Hashing files... ({} cache hits)",
                                    num_cache_hits
                                ));
                            } else {
                                progress.set_message(format!(" ({} cache hits)", num_cache_hits));
                            }
                        }

                        progress.inc(1);
                        let entry = by_hash.entry(hash).or_insert_with(|| DuplicatedFiles {
                            paths: Vec::new(),
                            size,
                        });
                        // A shared hash should only come from identical content, so the
                        // sizes must match; a mismatch would indicate a hash collision.
                        assert_eq!(entry.size, size, "Hash collision: sizes do not match");
                        entry.paths.push(path);
                    }
                }
            }
        });
        progress.finish();

        let mut duplicates = Vec::new();
        for (_, mut dupes) in by_hash {
            if dupes.paths.len() > 1 {
                dupes.paths.sort();
                duplicates.push(dupes);
            }
        }
        Ok(duplicates)
    }

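    /// Walks the directory, bucketing files by size. Hashing is deferred until a
    /// second file of the same size appears, so files with a unique size are
    /// never read at all.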
    fn find_duplicates_internal(&self, tx: mpsc::Sender<HashProgress>) -> anyhow::Result<()> {
        tx.send(HashProgress::StartDiscovering)?;
        let mut by_size: HashMap<u64, EntryState> = HashMap::new();
        let mut total_hashed = 0;

        rayon::scope(|scope| -> anyhow::Result<()> {
            let mut it = FileIterator::new(self.dir.clone());
            it.hasher = Some(self);
            for (_, current_path) in it {
                let meta = fs::metadata(&current_path)?;
                let size = meta.len();
                let modified = meta.modified()?;

                // Zero-length files could be skipped here (they waste no space as
                // duplicates), but they are kept in the unified flow for now.
                match by_size.entry(size) {
                    std::collections::hash_map::Entry::Occupied(mut occ) => match occ.get_mut() {
                        EntryState::Single(first_path, first_modified) => {
                            // A second file of this size was found: hash both the
                            // original file and the new one.
                            self.spawn_hash_task(scope, first_path, size, *first_modified, &tx);
                            self.spawn_hash_task(scope, &current_path, size, modified, &tx);

                            // Mark this size bucket as hashing so later files of the
                            // same size are hashed directly.
                            *occ.get_mut() = EntryState::Hashing;
                            total_hashed += 2;
                        }
                        EntryState::Hashing => {
                            // This size bucket is already being hashed; spawn a task
                            // for the new file immediately.
                            self.spawn_hash_task(scope, &current_path, size, modified, &tx);
                            total_hashed += 1;
                        }
                    },
                    std::collections::hash_map::Entry::Vacant(vac) => {
                        vac.insert(EntryState::Single(current_path, modified));
                    }
                }
            }
            tx.send(HashProgress::TotalFiles(total_hashed))?;
            Ok(())
        })?;

        // The rayon scope waits for all spawned tasks to complete. The channel's
        // `tx` is closed naturally when it is dropped at the end of this function.
        self.save_cache()
    }

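    /// Checks the cache first; on a miss, spawns a rayon task that hashes the
    /// file and reports the result back over `tx`.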
    fn spawn_hash_task<'scope>(
        &'scope self,
        scope: &rayon::Scope<'scope>,
        path: &Path,
        size: u64,
        modified: std::time::SystemTime,
        tx: &mpsc::Sender<HashProgress>,
    ) {
        let relative = path
            .strip_prefix(self.cache.base_dir())
            .expect("path should be in cache base_dir");
        if let Some(hash) = self.cache.get(relative, modified) {
            self.num_hash_looked_up.fetch_add(1, Ordering::Relaxed);
            let _ = tx.send(HashProgress::Result(path.to_path_buf(), size, hash, true));
            return;
        }

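        // Take owned copies for the spawned task; the borrowed `path`, `relative`,
        // and `tx` parameters do not live for the full 'scope lifetime.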
        let path_owned = path.to_path_buf();
        let relative_owned = relative.to_path_buf();
        let tx_owned = tx.clone();
        let cache_owned = self.cache.clone();
        scope.spawn(move |_| {
            if let Ok(hash) = self.compute_hash(&path_owned) {
                self.num_hashed.fetch_add(1, Ordering::Relaxed);
                cache_owned.insert(&relative_owned, modified, hash);
                let _ = tx_owned.send(HashProgress::Result(path_owned, size, hash, false));
            } else {
                log::warn!("Failed to hash file: {:?}", path_owned);
            }
        });
    }

    /// Gets the hash of a file, using the cache if available.
    pub fn get_hash(&self, path: &Path) -> io::Result<blake3::Hash> {
        let meta = fs::metadata(path)?;
        let modified = meta.modified()?;
        let relative = path
            .strip_prefix(self.cache.base_dir())
            .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;

        if let Some(hash) = self.cache.get(relative, modified) {
            self.num_hash_looked_up.fetch_add(1, Ordering::Relaxed);
            return Ok(hash);
        }

        let hash = self.compute_hash(path)?;
        self.num_hashed.fetch_add(1, Ordering::Relaxed);
        self.cache.insert(relative, modified, hash);
        Ok(hash)
    }

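    /// A `buffer_size` of 0 selects memory-mapped hashing; otherwise the file is
    /// read and hashed in `buffer_size`-sized chunks.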
    fn compute_hash(&self, path: &Path) -> io::Result<blake3::Hash> {
        let start_time = std::time::Instant::now();
        let mut f = fs::File::open(path)?;
        let mut hasher = blake3::Hasher::new();
        if self.buffer_size == 0 {
            let len = f.metadata()?.len();
            if len > 0 {
                // Mapping a zero-length file fails, hence the length check.
                let mmap = unsafe { memmap2::MmapOptions::new().map(&f)? };
                hasher.update(&mmap[..]);
            }
        } else {
            let mut buf = vec![0u8; self.buffer_size];
            loop {
                let n = f.read(&mut buf)?;
                if n == 0 {
                    break;
                }
                hasher.update(&buf[..n]);
            }
        }
        log::trace!("Hashed in {:?}: {:?}", start_time.elapsed(), path);
        Ok(hasher.finalize())
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    fn create_file(path: &Path, content: &str) -> io::Result<()> {
        let mut file = fs::File::create(path)?;
        file.write_all(content.as_bytes())?;
        Ok(())
    }

    #[test]
    fn test_find_duplicates() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;

        let file1_path = dir.path().join("same1.txt");
        create_file(&file1_path, "same content")?;

        let file2_path = dir.path().join("same2.txt");
        create_file(&file2_path, "same content")?;

        let diff_path = dir.path().join("diff.txt");
        create_file(&diff_path, "different content")?;

        let mut hasher = FileHasher::new(dir.path().to_path_buf());
        hasher.buffer_size = 8192;
        let duplicates = hasher.find_duplicates()?;

        assert_eq!(hasher.num_hashed.load(Ordering::Relaxed), 2);
        assert_eq!(hasher.num_hash_looked_up.load(Ordering::Relaxed), 0);

        assert_eq!(duplicates.len(), 1);
        let group = &duplicates[0];
        assert_eq!(group.paths.len(), 2);
        assert_eq!(group.size, 12); // "same content" is 12 bytes

        assert!(group.paths.contains(&file1_path));
        assert!(group.paths.contains(&file2_path));

        Ok(())
    }

    #[test]
    fn test_find_duplicates_merge_cache() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;
        let dir_path = dir.path();

        let sub_dir = dir_path.join("a").join("a");
        fs::create_dir_all(&sub_dir)?;

        let file1_path = sub_dir.join("1");
        create_file(&file1_path, "same content")?;

        let file2_path = sub_dir.join("2");
        create_file(&file2_path, "same content")?;

        // Create an empty cache file in a/a to force it to be the cache base.
        let cache_aa_path = sub_dir.join(FileHashCache::FILE_NAME);
        fs::File::create(&cache_aa_path)?;

        // Run find_duplicates on a/a.
        let hasher_aa = FileHasher::new(sub_dir.clone());
        let duplicates_aa = hasher_aa.find_duplicates()?;
        assert_eq!(duplicates_aa.len(), 1);
        assert!(cache_aa_path.exists());

        // Create an empty cache file in a to force it to be the cache base.
        let root_a = dir_path.join("a");
        let cache_a_path = root_a.join(FileHashCache::FILE_NAME);
        fs::File::create(&cache_a_path)?;

        // Run find_duplicates on a.
        let hasher_a = FileHasher::new(root_a.clone());
        let duplicates_a = hasher_a.find_duplicates()?;
        assert_eq!(duplicates_a.len(), 1);

        // It should have merged and removed the child cache.
        assert!(cache_a_path.exists());
        assert!(!cache_aa_path.exists());

        Ok(())
    }
}