//! compare_dir/file_hasher.rs — finds duplicated files in a directory tree
//! by bucketing files by size and hashing candidates with BLAKE3.
1use crate::file_hash_cache::FileHashCache;
2use indicatif::{ProgressBar, ProgressStyle};
3use std::collections::HashMap;
4use std::fs;
5use std::io::{self, Read};
6use std::path::{Path, PathBuf};
7use std::sync::mpsc;
8use walkdir::WalkDir;
9
/// Progress events sent from the hashing worker thread to the UI thread.
#[derive(Debug, Clone)]
pub(crate) enum HashProgress {
    /// The directory walk has begun.
    StartDiscovering,
    /// Total number of files scheduled for hashing; only known once the
    /// directory walk has completed.
    TotalFiles(usize),
    /// A file was hashed: its path, size in bytes, BLAKE3 hash, and whether
    /// the hash was served from the cache instead of being recomputed.
    Result(PathBuf, u64, blake3::Hash, bool),
}
16
/// State of one file-size bucket during discovery.
enum EntryState {
    /// Only one file of this size seen so far; its path and mtime are kept
    /// so it can be hashed later if a second file of the same size appears.
    Single(PathBuf, std::time::SystemTime),
    /// Two or more files of this size seen; all of them get hashed.
    Hashing,
}
21
/// A group of duplicated files and their size.
#[derive(Debug, Clone)]
pub struct DuplicatedFiles {
    /// Paths of all files sharing the same content hash.
    pub paths: Vec<PathBuf>,
    /// Size in bytes of each individual file in the group.
    pub size: u64,
}
28
/// A tool for finding duplicated files in a directory.
pub struct FileHasher {
    // Root directory that is walked recursively.
    dir: PathBuf,
    /// Read buffer size in bytes used while hashing; a value of 0 switches
    /// to memory-mapped hashing (see `compute_hash`).
    pub buffer_size: usize,
}
34
35impl FileHasher {
36    /// Creates a new `FileHasher` for the given directory.
37    pub fn new(dir: PathBuf) -> Self {
38        Self {
39            dir,
40            buffer_size: crate::FileComparer::DEFAULT_BUFFER_SIZE,
41        }
42    }
43
44    /// Executes the duplicate file finding process and prints results.
45    pub fn run(&self) -> anyhow::Result<()> {
46        let start_time = std::time::Instant::now();
47        let mut duplicates = self.find_duplicates()?;
48        if duplicates.is_empty() {
49            println!("No duplicates found.");
50        } else {
51            duplicates.sort_by(|a, b| a.size.cmp(&b.size));
52            let mut total_wasted_space = 0;
53            for dupes in &duplicates {
54                let paths = &dupes.paths;
55                let file_size = dupes.size;
56                println!(
57                    "Identical {} files of {}:",
58                    paths.len(),
59                    crate::human_readable_size(file_size)
60                );
61                for path in paths {
62                    println!("  {}", path.display());
63                }
64                total_wasted_space += file_size * (paths.len() as u64 - 1);
65            }
66            eprintln!(
67                "Total wasted space: {}",
68                crate::human_readable_size(total_wasted_space)
69            );
70        }
71        eprintln!("Finished in {:?}.", start_time.elapsed());
72        Ok(())
73    }
74
75    /// Finds duplicated files and returns a list of duplicate groups.
76    pub fn find_duplicates(&self) -> anyhow::Result<Vec<DuplicatedFiles>> {
77        let progress = ProgressBar::new_spinner();
78        progress.enable_steady_tick(std::time::Duration::from_millis(120));
79        progress.set_style(
80            ProgressStyle::with_template("[{elapsed_precise}] {spinner:.green} {msg}").unwrap(),
81        );
82        progress.set_message("Discovering and hashing files...");
83
84        let (tx, rx) = mpsc::channel();
85        let mut by_hash: HashMap<blake3::Hash, DuplicatedFiles> = HashMap::new();
86        let mut hashed_count = 0;
87        let mut cache_hits = 0;
88        std::thread::scope(|scope| {
89            scope.spawn(|| {
90                if let Err(e) = self.find_duplicates_internal(tx) {
91                    log::error!("Error during duplicate finding: {}", e);
92                }
93            });
94
95            while let Ok(event) = rx.recv() {
96                match event {
97                    HashProgress::StartDiscovering => {
98                        progress.set_message("Discovering and hashing files...");
99                    }
100                    HashProgress::TotalFiles(total) => {
101                        progress.set_length(total as u64);
102                        if total > 0 {
103                            progress.set_style(
104                                ProgressStyle::with_template(
105                                    "[{elapsed_precise}] {bar:40.cyan/blue} {percent}% {pos:>7}/{len:7} {msg}",
106                                )
107                                .unwrap(),
108                            );
109                        }
110                        progress.set_message(format!(" ({} cache hits)", cache_hits));
111                    }
112                    HashProgress::Result(path, size, hash, is_cache_hit) => {
113                        hashed_count += 1;
114                        if is_cache_hit {
115                            cache_hits += 1;
116                        }
117
118                        // Avoid overwriting the precise progress bar message once total length is known
119                        if progress.length().is_none() {
120                            progress.set_message(format!(
121                                "Hashed {} files ({} cache hits)...",
122                                hashed_count, cache_hits
123                            ));
124                        } else if cache_hits > 0 {
125                            progress.set_message(format!(" ({} cache hits)", cache_hits));
126                        }
127
128                        progress.inc(1);
129                        let entry = by_hash.entry(hash).or_insert_with(|| DuplicatedFiles {
130                            paths: Vec::new(),
131                            size,
132                        });
133                        // Hash collisions shouldn't happen, but if they do, sizes shouldn't mismatch.
134                        assert_eq!(entry.size, size, "Hash collision: sizes do not match");
135                        entry.paths.push(path);
136                    }
137                }
138            }
139        });
140        progress.finish();
141        log::info!("Hashed {} files ({} cache hits).", hashed_count, cache_hits);
142
143        let mut duplicates = Vec::new();
144        for (_, mut dupes) in by_hash {
145            if dupes.paths.len() > 1 {
146                dupes.paths.sort();
147                duplicates.push(dupes);
148            }
149        }
150        Ok(duplicates)
151    }
152
    /// Walks `self.dir`, bucketing regular files by size; as soon as a size
    /// bucket gains a second file, both files (and any later files of that
    /// size) are scheduled for hashing on the rayon pool.
    ///
    /// Progress is reported through `tx`. Dropping `tx` when this function
    /// returns closes the channel and ends the receive loop in
    /// `find_duplicates`.
    fn find_duplicates_internal(&self, tx: mpsc::Sender<HashProgress>) -> anyhow::Result<()> {
        tx.send(HashProgress::StartDiscovering)?;
        // size -> either the only file of that size seen so far, or a marker
        // that files of this size are already being hashed.
        let mut by_size: HashMap<u64, EntryState> = HashMap::new();
        let mut total_hashed = 0;
        let cache = FileHashCache::find_or_new(&self.dir);

        rayon::scope(|scope| -> anyhow::Result<()> {
            // Walk errors (unreadable entries) are silently skipped.
            for entry in WalkDir::new(&self.dir).into_iter().filter_map(|e| e.ok()) {
                if !entry.file_type().is_file() {
                    continue;
                }
                let meta = entry.metadata()?;
                let size = meta.len();
                let modified = meta.modified()?;
                // Small optimization: If file size is 0, it's not really worth treating
                // as wasted space duplicates in the same way, but keeping it unified for now.
                let current_path = entry.path().to_path_buf();

                match by_size.entry(size) {
                    std::collections::hash_map::Entry::Occupied(mut occ) => match occ.get_mut() {
                        EntryState::Single(first_path, first_modified) => {
                            // We found a second file of identical size.
                            // Time to start hashing both the *original* matching file and the *new* one!
                            self.spawn_hash_task(
                                scope,
                                first_path,
                                size,
                                *first_modified,
                                &tx,
                                &cache,
                            );
                            self.spawn_hash_task(scope, &current_path, size, modified, &tx, &cache);

                            // Modify the state to indicate we are now fully hashing this size bucket.
                            *occ.get_mut() = EntryState::Hashing;
                            total_hashed += 2;
                        }
                        EntryState::Hashing => {
                            // File size bucket already hashing; just dynamically spawn the new file immediately.
                            self.spawn_hash_task(scope, &current_path, size, modified, &tx, &cache);
                            total_hashed += 1;
                        }
                    },
                    std::collections::hash_map::Entry::Vacant(vac) => {
                        vac.insert(EntryState::Single(current_path, modified));
                    }
                }
            }
            // The total only becomes known after the walk; results may
            // already have been sent before this event (cache hits are
            // reported synchronously by `spawn_hash_task`).
            tx.send(HashProgress::TotalFiles(total_hashed))?;
            Ok(())
        })?;

        // The scope waits for all spawned tasks to complete.
        // Channel `tx` gets naturally closed when it drops at the end of this function.
        // Best effort: a failed cache save only costs future cache hits.
        let _ = cache.save();
        Ok(())
    }
210
    /// Produces a hash result for `path`, either synchronously from the
    /// cache or by spawning a hashing task on the rayon `scope`.
    ///
    /// `'scope` ties `&self` (and the data captured by the spawned closure)
    /// to the rayon scope's lifetime. Send errors are deliberately ignored
    /// (`let _`).
    fn spawn_hash_task<'scope>(
        &'scope self,
        scope: &rayon::Scope<'scope>,
        path: &Path,
        size: u64,
        modified: std::time::SystemTime,
        tx: &mpsc::Sender<HashProgress>,
        cache: &std::sync::Arc<FileHashCache>,
    ) {
        // The cache is keyed by the path relative to its base directory;
        // all walked paths live under `self.dir`, so the prefix must strip.
        let relative = path
            .strip_prefix(cache.base_dir())
            .expect("path should be in cache base_dir");
        // Cache fast path: report immediately without touching the pool.
        if let Some(hash) = cache.get(relative, modified) {
            let _ = tx.send(HashProgress::Result(path.to_path_buf(), size, hash, true));
            return;
        }

        // Clone everything the task needs so it can outlive this call frame.
        let path_owned = path.to_path_buf();
        let relative_owned = relative.to_path_buf();
        let tx_owned = tx.clone();
        let cache_owned = cache.clone();
        scope.spawn(move |_| {
            if let Ok(hash) = Self::compute_hash(&path_owned, self.buffer_size) {
                cache_owned.insert(&relative_owned, modified, hash);
                let _ = tx_owned.send(HashProgress::Result(path_owned, size, hash, false));
            } else {
                // A file that fails to hash is logged and simply never
                // appears in any duplicate group.
                log::warn!("Failed to hash file: {:?}", path_owned);
            }
        });
    }
241
242    fn compute_hash(path: &Path, buffer_size: usize) -> io::Result<blake3::Hash> {
243        let start_time = std::time::Instant::now();
244        let mut f = fs::File::open(path)?;
245        let mut hasher = blake3::Hasher::new();
246        if buffer_size == 0 {
247            let len = f.metadata()?.len();
248            if len > 0 {
249                let mmap = unsafe { memmap2::MmapOptions::new().map(&f)? };
250                hasher.update(&mmap[..]);
251            }
252        } else {
253            let mut buf = vec![0u8; buffer_size];
254            loop {
255                let n = f.read(&mut buf)?;
256                if n == 0 {
257                    break;
258                }
259                hasher.update(&buf[..n]);
260            }
261        }
262        log::trace!("Hashed in {:?}: {:?}", start_time.elapsed(), path);
263        Ok(hasher.finalize())
264    }
265}
266
#[cfg(test)]
mod tests {
    use super::*;

    /// Two byte-identical files plus one distinct file must yield exactly
    /// one duplicate group containing the two identical paths.
    #[test]
    fn test_find_duplicates() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;

        // `fs::write` replaces the hand-rolled create-and-write helper.
        let file1_path = dir.path().join("same1.txt");
        fs::write(&file1_path, "same content")?;

        let file2_path = dir.path().join("same2.txt");
        fs::write(&file2_path, "same content")?;

        let diff_path = dir.path().join("diff.txt");
        fs::write(&diff_path, "different content")?;

        let mut hasher = FileHasher::new(dir.path().to_path_buf());
        hasher.buffer_size = 8192;
        let duplicates = hasher.find_duplicates()?;

        assert_eq!(duplicates.len(), 1);
        let group = &duplicates[0];
        assert_eq!(group.paths.len(), 2);
        assert_eq!(group.size, 12); // "same content" is 12 bytes

        assert!(group.paths.contains(&file1_path));
        assert!(group.paths.contains(&file2_path));

        Ok(())
    }
}
305}