1use crate::{FileHashCache, FileIterator, ProgressReporter};
2use std::collections::HashMap;
3use std::fs;
4use std::io::{self, Read};
5use std::path::{Path, PathBuf};
6use std::sync::atomic::{AtomicUsize, Ordering};
7use std::sync::{Arc, mpsc};
8
/// Progress events sent from the hashing worker back to the consumer loop.
#[derive(Debug, Clone)]
enum HashProgress {
    /// Directory discovery has started; hash tasks may begin arriving.
    StartDiscovering,
    /// Total number of files that will be hashed, known once discovery ends.
    TotalFiles(usize),
    /// One file finished: path, size in bytes, its hash, and whether the
    /// hash came from the cache rather than being computed.
    Result(PathBuf, u64, blake3::Hash, bool),
}
15
/// Bookkeeping per file size while grouping candidates for hashing.
enum EntryState {
    /// Exactly one file of this size seen so far (path + mtime); not hashed yet.
    Single(PathBuf, std::time::SystemTime),
    /// Two or more files of this size exist; hash tasks are being spawned.
    Hashing,
}
20
/// A group of files whose blake3 hashes are identical.
#[derive(Debug, Clone)]
pub struct DuplicatedFiles {
    /// All paths in the group; sorted before being returned by `find_duplicates`.
    pub paths: Vec<PathBuf>,
    /// Size in bytes of each file in the group (all members share it).
    pub size: u64,
}
27
/// Hashes files under a directory and locates duplicates, backed by a
/// persistent hash cache and simple instrumentation counters.
pub struct FileHasher {
    // Root directory being scanned.
    dir: PathBuf,
    // Read-buffer size in bytes; 0 selects memory-mapped hashing (see compute_hash).
    pub buffer_size: usize,
    // Shared hash cache, keyed by path relative to its base dir plus mtime.
    cache: Arc<FileHashCache>,
    // Count of hashes actually computed (cache misses).
    pub(crate) num_hashed: AtomicUsize,
    // Count of hashes served from the cache.
    pub(crate) num_hash_looked_up: AtomicUsize,
}
36
37impl FileHasher {
38 pub fn new(dir: PathBuf) -> Self {
40 let cache = FileHashCache::find_or_new(&dir);
41 Self {
42 dir,
43 buffer_size: crate::FileComparer::DEFAULT_BUFFER_SIZE,
44 cache,
45 num_hashed: AtomicUsize::new(0),
46 num_hash_looked_up: AtomicUsize::new(0),
47 }
48 }
49
50 pub fn remove_cache_entry(&self, path: &Path) -> anyhow::Result<()> {
52 let relative = crate::strip_prefix(path, self.cache.base_dir())?;
53 self.cache.remove(relative);
54 Ok(())
55 }
56
57 pub fn save_cache(&self) -> anyhow::Result<()> {
59 log::info!(
60 "Hash stats for {:?}: {} computed, {} looked up",
61 self.dir,
62 self.num_hashed.load(Ordering::Relaxed),
63 self.num_hash_looked_up.load(Ordering::Relaxed)
64 );
65 Ok(self.cache.save()?)
66 }
67
    /// Folds the entries of `other_cache` into this hasher's cache.
    pub(crate) fn merge_cache(&self, other_cache: &FileHashCache) {
        self.cache.merge(other_cache);
    }
72
73 pub fn clear_cache(&self) -> anyhow::Result<()> {
75 let relative = crate::strip_prefix(&self.dir, self.cache.base_dir())?;
76 self.cache.clear(relative);
77 Ok(())
78 }
79
80 pub fn run(&self) -> anyhow::Result<()> {
82 let start_time = std::time::Instant::now();
83 let mut duplicates = self.find_duplicates()?;
84 if duplicates.is_empty() {
85 println!("No duplicates found.");
86 } else {
87 duplicates.sort_by(|a, b| a.size.cmp(&b.size));
88 let mut total_wasted_space = 0;
89 for dupes in &duplicates {
90 let paths = &dupes.paths;
91 let file_size = dupes.size;
92 println!(
93 "Identical {} files of {}:",
94 paths.len(),
95 crate::human_readable_size(file_size)
96 );
97 for path in paths {
98 println!(" {}", path.display());
99 }
100 total_wasted_space += file_size * (paths.len() as u64 - 1);
101 }
102 eprintln!(
103 "Total wasted space: {}",
104 crate::human_readable_size(total_wasted_space)
105 );
106 }
107 eprintln!("Finished in {:?}.", start_time.elapsed());
108 Ok(())
109 }
110
111 pub fn find_duplicates(&self) -> anyhow::Result<Vec<DuplicatedFiles>> {
113 let progress = ProgressReporter::new();
114 progress.set_message("Scanning directories...");
115
116 let (tx, rx) = mpsc::channel();
117 let mut by_hash: HashMap<blake3::Hash, DuplicatedFiles> = HashMap::new();
118 let mut num_cache_hits = 0;
119 std::thread::scope(|scope| {
120 scope.spawn(|| {
121 if let Err(e) = self.find_duplicates_internal(tx) {
122 log::error!("Error during duplicate finding: {}", e);
123 }
124 });
125
126 while let Ok(event) = rx.recv() {
127 match event {
128 HashProgress::StartDiscovering => {
129 progress.set_message("Hashing files...");
130 }
131 HashProgress::TotalFiles(total) => {
132 progress.set_length(total as u64);
133 if num_cache_hits > 0 {
134 progress.set_message(format!(" ({} cache hits)", num_cache_hits));
135 }
136 }
137 HashProgress::Result(path, size, hash, is_cache_hit) => {
138 if is_cache_hit {
139 num_cache_hits += 1;
140 if progress.length().is_none() {
141 progress.set_message(format!(
142 "Hashing files... ({} cache hits)",
143 num_cache_hits
144 ));
145 } else {
146 progress.set_message(format!(" ({} cache hits)", num_cache_hits));
147 }
148 }
149
150 progress.inc(1);
151 let entry = by_hash.entry(hash).or_insert_with(|| DuplicatedFiles {
152 paths: Vec::new(),
153 size,
154 });
155 assert_eq!(entry.size, size, "Hash collision: sizes do not match");
157 entry.paths.push(path);
158 }
159 }
160 }
161 });
162 progress.finish();
163
164 let mut duplicates = Vec::new();
165 for (_, mut dupes) in by_hash {
166 if dupes.paths.len() > 1 {
167 dupes.paths.sort();
168 duplicates.push(dupes);
169 }
170 }
171 Ok(duplicates)
172 }
173
174 fn find_duplicates_internal(&self, tx: mpsc::Sender<HashProgress>) -> anyhow::Result<()> {
175 tx.send(HashProgress::StartDiscovering)?;
176 let mut by_size: HashMap<u64, EntryState> = HashMap::new();
177 let mut total_hashed = 0;
178
179 rayon::scope(|scope| -> anyhow::Result<()> {
180 let mut it = FileIterator::new(self.dir.clone());
181 it.hasher = Some(self);
182 for (_, current_path) in it {
183 let meta = fs::metadata(¤t_path)?;
184 let size = meta.len();
185 let modified = meta.modified()?;
186
187 match by_size.entry(size) {
190 std::collections::hash_map::Entry::Occupied(mut occ) => match occ.get_mut() {
191 EntryState::Single(first_path, first_modified) => {
192 self.spawn_hash_task(scope, first_path, size, *first_modified, &tx);
195 self.spawn_hash_task(scope, ¤t_path, size, modified, &tx);
196
197 *occ.get_mut() = EntryState::Hashing;
199 total_hashed += 2;
200 }
201 EntryState::Hashing => {
202 self.spawn_hash_task(scope, ¤t_path, size, modified, &tx);
204 total_hashed += 1;
205 }
206 },
207 std::collections::hash_map::Entry::Vacant(vac) => {
208 vac.insert(EntryState::Single(current_path, modified));
209 }
210 }
211 }
212 tx.send(HashProgress::TotalFiles(total_hashed))?;
213 Ok(())
214 })?;
215
216 self.save_cache()
219 }
220
221 fn spawn_hash_task<'scope>(
222 &'scope self,
223 scope: &rayon::Scope<'scope>,
224 path: &Path,
225 size: u64,
226 modified: std::time::SystemTime,
227 tx: &mpsc::Sender<HashProgress>,
228 ) {
229 let relative = crate::strip_prefix(path, self.cache.base_dir())
230 .expect("path should be in cache base_dir");
231 if let Some(hash) = self.cache.get(relative, modified) {
232 self.num_hash_looked_up.fetch_add(1, Ordering::Relaxed);
233 let _ = tx.send(HashProgress::Result(path.to_path_buf(), size, hash, true));
234 return;
235 }
236
237 let path_owned = path.to_path_buf();
238 let relative_owned = relative.to_path_buf();
239 let tx_owned = tx.clone();
240 let cache_owned = self.cache.clone();
241 scope.spawn(move |_| {
242 if let Ok(hash) = self.compute_hash(&path_owned) {
243 self.num_hashed.fetch_add(1, Ordering::Relaxed);
244 cache_owned.insert(&relative_owned, modified, hash);
245 let _ = tx_owned.send(HashProgress::Result(path_owned, size, hash, false));
246 } else {
247 log::warn!("Failed to hash file: {:?}", path_owned);
248 }
249 });
250 }
251
252 pub fn get_hash(&self, path: &Path) -> io::Result<blake3::Hash> {
254 let meta = fs::metadata(path)?;
255 let modified = meta.modified()?;
256 let relative = crate::strip_prefix(path, self.cache.base_dir())
257 .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;
258 if let Some(hash) = self.cache.get(relative, modified) {
259 self.num_hash_looked_up.fetch_add(1, Ordering::Relaxed);
260 return Ok(hash);
261 }
262
263 let hash = self.compute_hash(path)?;
264 self.num_hashed.fetch_add(1, Ordering::Relaxed);
265 self.cache.insert(relative, modified, hash);
266 Ok(hash)
267 }
268
269 fn compute_hash(&self, path: &Path) -> io::Result<blake3::Hash> {
270 let start_time = std::time::Instant::now();
271 let mut f = fs::File::open(path)?;
272 let mut hasher = blake3::Hasher::new();
273 if self.buffer_size == 0 {
274 let len = f.metadata()?.len();
275 if len > 0 {
276 let mmap = unsafe { memmap2::MmapOptions::new().map(&f)? };
277 hasher.update(&mmap[..]);
278 }
279 } else {
280 let mut buf = vec![0u8; self.buffer_size];
281 loop {
282 let n = f.read(&mut buf)?;
283 if n == 0 {
284 break;
285 }
286 hasher.update(&buf[..n]);
287 }
288 }
289 log::trace!("Hashed in {:?}: {:?}", start_time.elapsed(), path);
290 Ok(hasher.finalize())
291 }
292}
293
#[cfg(test)]
mod tests {
    use super::*;

    /// Two files with identical contents plus one distinct file must yield
    /// exactly one duplicate group containing both identical files.
    #[test]
    fn test_find_duplicates() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;

        let file1_path = dir.path().join("same1.txt");
        fs::write(&file1_path, "same content")?;

        let file2_path = dir.path().join("same2.txt");
        fs::write(&file2_path, "same content")?;

        // Different size, so this file is never even hashed.
        let diff_path = dir.path().join("diff.txt");
        fs::write(&diff_path, "different content")?;

        let mut hasher = FileHasher::new(dir.path().to_path_buf());
        hasher.buffer_size = 8192;
        let duplicates = hasher.find_duplicates()?;

        // Fresh cache: both same-size files are computed, nothing is looked up.
        assert_eq!(hasher.num_hashed.load(Ordering::Relaxed), 2);
        assert_eq!(hasher.num_hash_looked_up.load(Ordering::Relaxed), 0);

        assert_eq!(duplicates.len(), 1);
        let group = &duplicates[0];
        assert_eq!(group.paths.len(), 2);
        // "same content" is 12 bytes.
        assert_eq!(group.size, 12); assert!(group.paths.contains(&file1_path));
        assert!(group.paths.contains(&file2_path));

        Ok(())
    }

    /// A second run rooted one level up should reuse (merge) the child cache:
    /// all hashes come from lookups, and the child cache file is consumed.
    #[test]
    fn test_find_duplicates_merge_cache() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;
        let dir_path = dir.path();

        let sub_dir = dir_path.join("a").join("a");
        fs::create_dir_all(&sub_dir)?;

        let file1_path = sub_dir.join("1");
        fs::write(&file1_path, "same content")?;

        let file2_path = sub_dir.join("2");
        fs::write(&file2_path, "same content")?;

        // Pre-create an empty cache file so find_or_new roots the cache here.
        let cache_aa_path = sub_dir.join(FileHashCache::FILE_NAME);
        fs::File::create(&cache_aa_path)?;

        // First run over a/a: everything is computed from scratch.
        let hasher_aa = FileHasher::new(sub_dir.clone());
        let duplicates_aa = hasher_aa.find_duplicates()?;
        assert_eq!(duplicates_aa.len(), 1);
        assert!(cache_aa_path.exists());
        assert_eq!(hasher_aa.num_hashed.load(Ordering::Relaxed), 2);
        assert_eq!(hasher_aa.num_hash_looked_up.load(Ordering::Relaxed), 0);

        let root_a = dir_path.join("a");
        let cache_a_path = root_a.join(FileHashCache::FILE_NAME);
        fs::File::create(&cache_a_path)?;

        // Second run over a/: both hashes are served from the merged cache.
        let hasher_a = FileHasher::new(root_a.clone());
        let duplicates_a = hasher_a.find_duplicates()?;
        assert_eq!(duplicates_a.len(), 1);
        assert_eq!(hasher_a.num_hashed.load(Ordering::Relaxed), 0);
        assert_eq!(hasher_a.num_hash_looked_up.load(Ordering::Relaxed), 2);

        // The child cache is absorbed into the parent cache file.
        assert!(cache_a_path.exists());
        assert!(!cache_aa_path.exists());

        Ok(())
    }
}