Skip to main content

compare_dir/
file_hasher.rs

1use crate::file_hash_cache::FileHashCache;
2use indicatif::{ProgressBar, ProgressStyle};
3use std::collections::HashMap;
4use std::fs;
5use std::io::{self, Read};
6use std::path::{Path, PathBuf};
7use std::sync::{Arc, mpsc};
8use walkdir::WalkDir;
9
/// Progress events streamed from the hashing pipeline to the UI thread.
#[derive(Debug, Clone)]
enum HashProgress {
    /// Directory walk has begun; the total file count is not yet known.
    StartDiscovering,
    /// Total number of files queued for hashing (sent once the walk completes).
    TotalFiles(usize),
    /// One file finished: path, size in bytes, BLAKE3 hash, and whether it was a cache hit.
    Result(PathBuf, u64, blake3::Hash, bool),
}
16
/// State of a file-size bucket while walking the directory.
enum EntryState {
    /// Only one file of this size seen so far; kept pending with its mtime
    /// so it can be hashed later if a second file of the same size appears.
    Single(PathBuf, std::time::SystemTime),
    /// Two or more files of this size seen; new arrivals are hashed immediately.
    Hashing,
}
21
/// A group of duplicated files and their size.
#[derive(Debug, Clone)]
pub struct DuplicatedFiles {
    /// Paths of the files that share the same content hash.
    pub paths: Vec<PathBuf>,
    /// Size in bytes of a single file in the group (all members are equal-sized).
    pub size: u64,
}
28
/// A tool for finding duplicated files in a directory.
pub struct FileHasher {
    // Root directory that is scanned for duplicates.
    dir: PathBuf,
    /// Read-buffer size used while hashing; `0` selects memory-mapped I/O
    /// (see `compute_hash`).
    pub buffer_size: usize,
    // Persistent path -> (mtime, hash) cache, shared with hashing tasks.
    cache: Arc<FileHashCache>,
}
35
36impl FileHasher {
37    /// Creates a new `FileHasher` for the given directory.
38    pub fn new(dir: PathBuf) -> Self {
39        let cache = FileHashCache::find_or_new(&dir);
40        Self {
41            dir,
42            buffer_size: crate::FileComparer::DEFAULT_BUFFER_SIZE,
43            cache,
44        }
45    }
46
47    /// Save the hash cache if it is dirty.
48    pub fn save_cache(&self) -> anyhow::Result<()> {
49        Ok(self.cache.save()?)
50    }
51
52    /// Clears the loaded hashes in the cache.
53    pub fn clear_cache(&self) -> anyhow::Result<()> {
54        let relative = self.dir.strip_prefix(self.cache.base_dir())?;
55        self.cache.clear(relative);
56        Ok(())
57    }
58
59    /// Executes the duplicate file finding process and prints results.
60    pub fn run(&self) -> anyhow::Result<()> {
61        let start_time = std::time::Instant::now();
62        let mut duplicates = self.find_duplicates()?;
63        if duplicates.is_empty() {
64            println!("No duplicates found.");
65        } else {
66            duplicates.sort_by(|a, b| a.size.cmp(&b.size));
67            let mut total_wasted_space = 0;
68            for dupes in &duplicates {
69                let paths = &dupes.paths;
70                let file_size = dupes.size;
71                println!(
72                    "Identical {} files of {}:",
73                    paths.len(),
74                    crate::human_readable_size(file_size)
75                );
76                for path in paths {
77                    println!("  {}", path.display());
78                }
79                total_wasted_space += file_size * (paths.len() as u64 - 1);
80            }
81            eprintln!(
82                "Total wasted space: {}",
83                crate::human_readable_size(total_wasted_space)
84            );
85        }
86        eprintln!("Finished in {:?}.", start_time.elapsed());
87        Ok(())
88    }
89
90    /// Finds duplicated files and returns a list of duplicate groups.
91    pub fn find_duplicates(&self) -> anyhow::Result<Vec<DuplicatedFiles>> {
92        let progress = ProgressBar::new_spinner();
93        progress.enable_steady_tick(std::time::Duration::from_millis(120));
94        progress.set_style(
95            ProgressStyle::with_template("[{elapsed_precise}] {spinner:.green} {msg}").unwrap(),
96        );
97        progress.set_message("Discovering and hashing files...");
98
99        let (tx, rx) = mpsc::channel();
100        let mut by_hash: HashMap<blake3::Hash, DuplicatedFiles> = HashMap::new();
101        let mut hashed_count = 0;
102        let mut cache_hits = 0;
103        std::thread::scope(|scope| {
104            scope.spawn(|| {
105                if let Err(e) = self.find_duplicates_internal(tx) {
106                    log::error!("Error during duplicate finding: {}", e);
107                }
108            });
109
110            while let Ok(event) = rx.recv() {
111                match event {
112                    HashProgress::StartDiscovering => {
113                        progress.set_message("Discovering and hashing files...");
114                    }
115                    HashProgress::TotalFiles(total) => {
116                        progress.set_length(total as u64);
117                        if total > 0 {
118                            progress.set_style(
119                                ProgressStyle::with_template(
120                                    "[{elapsed_precise}] {bar:40.cyan/blue} {percent}% {pos:>7}/{len:7} {msg}",
121                                )
122                                .unwrap(),
123                            );
124                        }
125                        progress.set_message(format!(" ({} cache hits)", cache_hits));
126                    }
127                    HashProgress::Result(path, size, hash, is_cache_hit) => {
128                        hashed_count += 1;
129                        if is_cache_hit {
130                            cache_hits += 1;
131                        }
132
133                        // Avoid overwriting the precise progress bar message once total length is known
134                        if progress.length().is_none() {
135                            progress.set_message(format!(
136                                "Hashed {} files ({} cache hits)...",
137                                hashed_count, cache_hits
138                            ));
139                        } else if cache_hits > 0 {
140                            progress.set_message(format!(" ({} cache hits)", cache_hits));
141                        }
142
143                        progress.inc(1);
144                        let entry = by_hash.entry(hash).or_insert_with(|| DuplicatedFiles {
145                            paths: Vec::new(),
146                            size,
147                        });
148                        // Hash collisions shouldn't happen, but if they do, sizes shouldn't mismatch.
149                        assert_eq!(entry.size, size, "Hash collision: sizes do not match");
150                        entry.paths.push(path);
151                    }
152                }
153            }
154        });
155        progress.finish();
156        log::info!("Hashed {} files ({} cache hits).", hashed_count, cache_hits);
157
158        let mut duplicates = Vec::new();
159        for (_, mut dupes) in by_hash {
160            if dupes.paths.len() > 1 {
161                dupes.paths.sort();
162                duplicates.push(dupes);
163            }
164        }
165        Ok(duplicates)
166    }
167
168    fn find_duplicates_internal(&self, tx: mpsc::Sender<HashProgress>) -> anyhow::Result<()> {
169        tx.send(HashProgress::StartDiscovering)?;
170        let mut by_size: HashMap<u64, EntryState> = HashMap::new();
171        let mut total_hashed = 0;
172
173        rayon::scope(|scope| -> anyhow::Result<()> {
174            for entry in WalkDir::new(&self.dir).into_iter().filter_map(|e| e.ok()) {
175                if !entry.file_type().is_file() {
176                    continue;
177                }
178                let meta = entry.metadata()?;
179                let size = meta.len();
180                let modified = meta.modified()?;
181                // Small optimization: If file size is 0, it's not really worth treating
182                // as wasted space duplicates in the same way, but keeping it unified for now.
183                let current_path = entry.path().to_path_buf();
184
185                match by_size.entry(size) {
186                    std::collections::hash_map::Entry::Occupied(mut occ) => match occ.get_mut() {
187                        EntryState::Single(first_path, first_modified) => {
188                            // We found a second file of identical size.
189                            // Time to start hashing both the *original* matching file and the *new* one!
190                            self.spawn_hash_task(scope, first_path, size, *first_modified, &tx);
191                            self.spawn_hash_task(scope, &current_path, size, modified, &tx);
192
193                            // Modify the state to indicate we are now fully hashing this size bucket.
194                            *occ.get_mut() = EntryState::Hashing;
195                            total_hashed += 2;
196                        }
197                        EntryState::Hashing => {
198                            // File size bucket already hashing; just dynamically spawn the new file immediately.
199                            self.spawn_hash_task(scope, &current_path, size, modified, &tx);
200                            total_hashed += 1;
201                        }
202                    },
203                    std::collections::hash_map::Entry::Vacant(vac) => {
204                        vac.insert(EntryState::Single(current_path, modified));
205                    }
206                }
207            }
208            tx.send(HashProgress::TotalFiles(total_hashed))?;
209            Ok(())
210        })?;
211
212        // The scope waits for all spawned tasks to complete.
213        // Channel `tx` gets naturally closed when it drops at the end of this function.
214        self.save_cache()
215    }
216
217    fn spawn_hash_task<'scope>(
218        &'scope self,
219        scope: &rayon::Scope<'scope>,
220        path: &Path,
221        size: u64,
222        modified: std::time::SystemTime,
223        tx: &mpsc::Sender<HashProgress>,
224    ) {
225        let relative = path
226            .strip_prefix(self.cache.base_dir())
227            .expect("path should be in cache base_dir");
228        if let Some(hash) = self.cache.get(relative, modified) {
229            let _ = tx.send(HashProgress::Result(path.to_path_buf(), size, hash, true));
230            return;
231        }
232
233        let path_owned = path.to_path_buf();
234        let relative_owned = relative.to_path_buf();
235        let tx_owned = tx.clone();
236        let cache_owned = self.cache.clone();
237        scope.spawn(move |_| {
238            if let Ok(hash) = Self::compute_hash(&path_owned, self.buffer_size) {
239                cache_owned.insert(&relative_owned, modified, hash);
240                let _ = tx_owned.send(HashProgress::Result(path_owned, size, hash, false));
241            } else {
242                log::warn!("Failed to hash file: {:?}", path_owned);
243            }
244        });
245    }
246
247    /// Gets the hash of a file, using the cache if available.
248    pub fn get_hash(&self, path: &Path) -> io::Result<blake3::Hash> {
249        let meta = fs::metadata(path)?;
250        let modified = meta.modified()?;
251        let relative = path
252            .strip_prefix(self.cache.base_dir())
253            .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;
254
255        if let Some(hash) = self.cache.get(relative, modified) {
256            return Ok(hash);
257        }
258
259        let hash = Self::compute_hash(path, self.buffer_size)?;
260        self.cache.insert(relative, modified, hash);
261        Ok(hash)
262    }
263
264    fn compute_hash(path: &Path, buffer_size: usize) -> io::Result<blake3::Hash> {
265        let start_time = std::time::Instant::now();
266        let mut f = fs::File::open(path)?;
267        let mut hasher = blake3::Hasher::new();
268        if buffer_size == 0 {
269            let len = f.metadata()?.len();
270            if len > 0 {
271                let mmap = unsafe { memmap2::MmapOptions::new().map(&f)? };
272                hasher.update(&mmap[..]);
273            }
274        } else {
275            let mut buf = vec![0u8; buffer_size];
276            loop {
277                let n = f.read(&mut buf)?;
278                if n == 0 {
279                    break;
280                }
281                hasher.update(&buf[..n]);
282            }
283        }
284        log::trace!("Hashed in {:?}: {:?}", start_time.elapsed(), path);
285        Ok(hasher.finalize())
286    }
287}
288
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    /// Writes `content` into a fresh file at `path`.
    fn create_file(path: &Path, content: &str) -> io::Result<()> {
        fs::File::create(path)?.write_all(content.as_bytes())?;
        Ok(())
    }

    #[test]
    fn test_find_duplicates() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;

        let twin_a = dir.path().join("same1.txt");
        let twin_b = dir.path().join("same2.txt");
        let odd_one = dir.path().join("diff.txt");
        create_file(&twin_a, "same content")?;
        create_file(&twin_b, "same content")?;
        create_file(&odd_one, "different content")?;

        let mut hasher = FileHasher::new(dir.path().to_path_buf());
        hasher.buffer_size = 8192;
        let duplicates = hasher.find_duplicates()?;

        // Exactly one group: the two files with identical content.
        assert_eq!(duplicates.len(), 1);
        let group = &duplicates[0];
        assert_eq!(group.paths.len(), 2);
        assert_eq!(group.size, "same content".len() as u64);
        assert!(group.paths.contains(&twin_a));
        assert!(group.paths.contains(&twin_b));

        Ok(())
    }
}