1use crate::file_hash_cache::FileHashCache;
2use indicatif::{ProgressBar, ProgressStyle};
3use std::collections::HashMap;
4use std::fs;
5use std::io::{self, Read};
6use std::path::{Path, PathBuf};
7use std::sync::{Arc, mpsc};
8use walkdir::WalkDir;
9
/// Progress events streamed from the hashing worker thread to the
/// progress-bar/aggregation loop in `find_duplicates`.
#[derive(Debug, Clone)]
enum HashProgress {
    /// Directory walk has begun; the total file count is not yet known.
    StartDiscovering,
    /// Walk finished: this many files were scheduled for hashing.
    TotalFiles(usize),
    /// One file hashed: (path, size in bytes, blake3 hash, served-from-cache flag).
    Result(PathBuf, u64, blake3::Hash, bool),
}
16
/// Per-file-size bookkeeping during the directory walk. A file with a
/// unique size cannot have a duplicate, so hashing the first file of a
/// given size is deferred until a second file with the same size appears.
enum EntryState {
    /// Exactly one file of this size seen so far: its path and mtime,
    /// held back in case a size collision occurs later.
    Single(PathBuf, std::time::SystemTime),
    /// Two or more files of this size seen; hash tasks already spawned.
    Hashing,
}
21
/// A group of files with identical content (same blake3 hash).
#[derive(Debug, Clone)]
pub struct DuplicatedFiles {
    /// Paths of all files in the group (sorted by `find_duplicates`).
    pub paths: Vec<PathBuf>,
    /// Size in bytes of each file in the group (all are equal).
    pub size: u64,
}
28
/// Finds duplicate files under a directory by hashing candidate files,
/// backed by a persistent hash cache keyed on relative path + mtime.
pub struct FileHasher {
    /// Root directory to scan.
    dir: PathBuf,
    /// Read-chunk size for hashing; 0 selects memory-mapped hashing.
    pub buffer_size: usize,
    /// Shared persistent cache of previously computed hashes.
    cache: Arc<FileHashCache>,
}
35
36impl FileHasher {
37 pub fn new(dir: PathBuf) -> Self {
39 let cache = FileHashCache::find_or_new(&dir);
40 Self {
41 dir,
42 buffer_size: crate::FileComparer::DEFAULT_BUFFER_SIZE,
43 cache,
44 }
45 }
46
47 pub fn save_cache(&self) -> anyhow::Result<()> {
49 Ok(self.cache.save()?)
50 }
51
52 pub fn clear_cache(&self) -> anyhow::Result<()> {
54 let relative = self.dir.strip_prefix(self.cache.base_dir())?;
55 self.cache.clear(relative);
56 Ok(())
57 }
58
59 pub fn run(&self) -> anyhow::Result<()> {
61 let start_time = std::time::Instant::now();
62 let mut duplicates = self.find_duplicates()?;
63 if duplicates.is_empty() {
64 println!("No duplicates found.");
65 } else {
66 duplicates.sort_by(|a, b| a.size.cmp(&b.size));
67 let mut total_wasted_space = 0;
68 for dupes in &duplicates {
69 let paths = &dupes.paths;
70 let file_size = dupes.size;
71 println!(
72 "Identical {} files of {}:",
73 paths.len(),
74 crate::human_readable_size(file_size)
75 );
76 for path in paths {
77 println!(" {}", path.display());
78 }
79 total_wasted_space += file_size * (paths.len() as u64 - 1);
80 }
81 eprintln!(
82 "Total wasted space: {}",
83 crate::human_readable_size(total_wasted_space)
84 );
85 }
86 eprintln!("Finished in {:?}.", start_time.elapsed());
87 Ok(())
88 }
89
90 pub fn find_duplicates(&self) -> anyhow::Result<Vec<DuplicatedFiles>> {
92 let progress = ProgressBar::new_spinner();
93 progress.enable_steady_tick(std::time::Duration::from_millis(120));
94 progress.set_style(
95 ProgressStyle::with_template("[{elapsed_precise}] {spinner:.green} {msg}").unwrap(),
96 );
97 progress.set_message("Discovering and hashing files...");
98
99 let (tx, rx) = mpsc::channel();
100 let mut by_hash: HashMap<blake3::Hash, DuplicatedFiles> = HashMap::new();
101 let mut hashed_count = 0;
102 let mut cache_hits = 0;
103 std::thread::scope(|scope| {
104 scope.spawn(|| {
105 if let Err(e) = self.find_duplicates_internal(tx) {
106 log::error!("Error during duplicate finding: {}", e);
107 }
108 });
109
110 while let Ok(event) = rx.recv() {
111 match event {
112 HashProgress::StartDiscovering => {
113 progress.set_message("Discovering and hashing files...");
114 }
115 HashProgress::TotalFiles(total) => {
116 progress.set_length(total as u64);
117 if total > 0 {
118 progress.set_style(
119 ProgressStyle::with_template(
120 "[{elapsed_precise}] {bar:40.cyan/blue} {percent}% {pos:>7}/{len:7} {msg}",
121 )
122 .unwrap(),
123 );
124 }
125 progress.set_message(format!(" ({} cache hits)", cache_hits));
126 }
127 HashProgress::Result(path, size, hash, is_cache_hit) => {
128 hashed_count += 1;
129 if is_cache_hit {
130 cache_hits += 1;
131 }
132
133 if progress.length().is_none() {
135 progress.set_message(format!(
136 "Hashed {} files ({} cache hits)...",
137 hashed_count, cache_hits
138 ));
139 } else if cache_hits > 0 {
140 progress.set_message(format!(" ({} cache hits)", cache_hits));
141 }
142
143 progress.inc(1);
144 let entry = by_hash.entry(hash).or_insert_with(|| DuplicatedFiles {
145 paths: Vec::new(),
146 size,
147 });
148 assert_eq!(entry.size, size, "Hash collision: sizes do not match");
150 entry.paths.push(path);
151 }
152 }
153 }
154 });
155 progress.finish();
156 log::info!("Hashed {} files ({} cache hits).", hashed_count, cache_hits);
157
158 let mut duplicates = Vec::new();
159 for (_, mut dupes) in by_hash {
160 if dupes.paths.len() > 1 {
161 dupes.paths.sort();
162 duplicates.push(dupes);
163 }
164 }
165 Ok(duplicates)
166 }
167
168 fn find_duplicates_internal(&self, tx: mpsc::Sender<HashProgress>) -> anyhow::Result<()> {
169 tx.send(HashProgress::StartDiscovering)?;
170 let mut by_size: HashMap<u64, EntryState> = HashMap::new();
171 let mut total_hashed = 0;
172
173 rayon::scope(|scope| -> anyhow::Result<()> {
174 for entry in WalkDir::new(&self.dir).into_iter().filter_map(|e| e.ok()) {
175 if !entry.file_type().is_file() {
176 continue;
177 }
178 let meta = entry.metadata()?;
179 let size = meta.len();
180 let modified = meta.modified()?;
181 let current_path = entry.path().to_path_buf();
184
185 match by_size.entry(size) {
186 std::collections::hash_map::Entry::Occupied(mut occ) => match occ.get_mut() {
187 EntryState::Single(first_path, first_modified) => {
188 self.spawn_hash_task(scope, first_path, size, *first_modified, &tx);
191 self.spawn_hash_task(scope, ¤t_path, size, modified, &tx);
192
193 *occ.get_mut() = EntryState::Hashing;
195 total_hashed += 2;
196 }
197 EntryState::Hashing => {
198 self.spawn_hash_task(scope, ¤t_path, size, modified, &tx);
200 total_hashed += 1;
201 }
202 },
203 std::collections::hash_map::Entry::Vacant(vac) => {
204 vac.insert(EntryState::Single(current_path, modified));
205 }
206 }
207 }
208 tx.send(HashProgress::TotalFiles(total_hashed))?;
209 Ok(())
210 })?;
211
212 self.save_cache()
215 }
216
217 fn spawn_hash_task<'scope>(
218 &'scope self,
219 scope: &rayon::Scope<'scope>,
220 path: &Path,
221 size: u64,
222 modified: std::time::SystemTime,
223 tx: &mpsc::Sender<HashProgress>,
224 ) {
225 let relative = path
226 .strip_prefix(self.cache.base_dir())
227 .expect("path should be in cache base_dir");
228 if let Some(hash) = self.cache.get(relative, modified) {
229 let _ = tx.send(HashProgress::Result(path.to_path_buf(), size, hash, true));
230 return;
231 }
232
233 let path_owned = path.to_path_buf();
234 let relative_owned = relative.to_path_buf();
235 let tx_owned = tx.clone();
236 let cache_owned = self.cache.clone();
237 scope.spawn(move |_| {
238 if let Ok(hash) = Self::compute_hash(&path_owned, self.buffer_size) {
239 cache_owned.insert(&relative_owned, modified, hash);
240 let _ = tx_owned.send(HashProgress::Result(path_owned, size, hash, false));
241 } else {
242 log::warn!("Failed to hash file: {:?}", path_owned);
243 }
244 });
245 }
246
247 pub fn get_hash(&self, path: &Path) -> io::Result<blake3::Hash> {
249 let meta = fs::metadata(path)?;
250 let modified = meta.modified()?;
251 let relative = path
252 .strip_prefix(self.cache.base_dir())
253 .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;
254
255 if let Some(hash) = self.cache.get(relative, modified) {
256 return Ok(hash);
257 }
258
259 let hash = Self::compute_hash(path, self.buffer_size)?;
260 self.cache.insert(relative, modified, hash);
261 Ok(hash)
262 }
263
264 fn compute_hash(path: &Path, buffer_size: usize) -> io::Result<blake3::Hash> {
265 let start_time = std::time::Instant::now();
266 let mut f = fs::File::open(path)?;
267 let mut hasher = blake3::Hasher::new();
268 if buffer_size == 0 {
269 let len = f.metadata()?.len();
270 if len > 0 {
271 let mmap = unsafe { memmap2::MmapOptions::new().map(&f)? };
272 hasher.update(&mmap[..]);
273 }
274 } else {
275 let mut buf = vec![0u8; buffer_size];
276 loop {
277 let n = f.read(&mut buf)?;
278 if n == 0 {
279 break;
280 }
281 hasher.update(&buf[..n]);
282 }
283 }
284 log::trace!("Hashed in {:?}: {:?}", start_time.elapsed(), path);
285 Ok(hasher.finalize())
286 }
287}
288
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    /// Creates a file at `path` containing exactly `content`.
    fn create_file(path: &Path, content: &str) -> io::Result<()> {
        fs::File::create(path)?.write_all(content.as_bytes())
    }

    #[test]
    fn test_find_duplicates() -> anyhow::Result<()> {
        // Two files with identical content plus one unique file.
        let dir = tempfile::tempdir()?;
        let root = dir.path();

        let dup_a = root.join("same1.txt");
        let dup_b = root.join("same2.txt");
        let unique = root.join("diff.txt");
        create_file(&dup_a, "same content")?;
        create_file(&dup_b, "same content")?;
        create_file(&unique, "different content")?;

        let mut hasher = FileHasher::new(root.to_path_buf());
        hasher.buffer_size = 8192;
        let duplicates = hasher.find_duplicates()?;

        // Exactly one duplicate group: the two identical files.
        assert_eq!(duplicates.len(), 1);
        let group = &duplicates[0];
        assert_eq!(group.paths.len(), 2);
        // "same content" is 12 bytes.
        assert_eq!(group.size, 12);
        assert!(group.paths.contains(&dup_a));
        assert!(group.paths.contains(&dup_b));

        Ok(())
    }
}