1use crate::file_hash_cache::FileHashCache;
2use indicatif::{ProgressBar, ProgressStyle};
3use std::collections::HashMap;
4use std::fs;
5use std::io::{self, Read};
6use std::path::{Path, PathBuf};
7use std::sync::mpsc;
8use walkdir::WalkDir;
9
/// Progress events streamed from the discovery/hashing workers back to the
/// aggregation loop in `FileHasher::find_duplicates`.
#[derive(Debug, Clone)]
pub(crate) enum HashProgress {
    /// The directory walk has started; the total file count is not yet known.
    StartDiscovering,
    /// The walk finished; this many files were scheduled for hashing.
    TotalFiles(usize),
    /// One file was hashed: (path, file size in bytes, content hash,
    /// whether the hash was served from the on-disk cache).
    Result(PathBuf, u64, blake3::Hash, bool),
}
16
/// Per-file-size state used during discovery: a file is only worth hashing
/// once at least two files share its size.
enum EntryState {
    /// Exactly one file of this size seen so far; its path and mtime are
    /// parked until a second file of the same size appears.
    Single(PathBuf, std::time::SystemTime),
    /// Two or more files of this size exist; all of them have been scheduled.
    Hashing,
}
21
/// A group of files whose contents hash to the same blake3 value.
#[derive(Debug, Clone)]
pub struct DuplicatedFiles {
    /// Paths of all files in the group; sorted, and always at least two
    /// entries when returned from `FileHasher::find_duplicates`.
    pub paths: Vec<PathBuf>,
    /// Size in bytes of each individual file in the group.
    pub size: u64,
}
28
/// Finds duplicate files under a directory by pre-filtering on file size and
/// hashing size-colliding files in parallel (with an on-disk hash cache).
pub struct FileHasher {
    // Root directory to scan recursively.
    dir: PathBuf,
    /// Read buffer size in bytes used while hashing; `0` selects the
    /// memory-mapped path in `compute_hash` instead of buffered reads.
    pub buffer_size: usize,
}
34
35impl FileHasher {
36 pub fn new(dir: PathBuf) -> Self {
38 Self {
39 dir,
40 buffer_size: crate::FileComparer::DEFAULT_BUFFER_SIZE,
41 }
42 }
43
44 pub fn run(&self) -> anyhow::Result<()> {
46 let start_time = std::time::Instant::now();
47 let mut duplicates = self.find_duplicates()?;
48 if duplicates.is_empty() {
49 println!("No duplicates found.");
50 } else {
51 duplicates.sort_by(|a, b| a.size.cmp(&b.size));
52 let mut total_wasted_space = 0;
53 for dupes in &duplicates {
54 let paths = &dupes.paths;
55 let file_size = dupes.size;
56 println!(
57 "Identical {} files of {}:",
58 paths.len(),
59 crate::human_readable_size(file_size)
60 );
61 for path in paths {
62 println!(" {}", path.display());
63 }
64 total_wasted_space += file_size * (paths.len() as u64 - 1);
65 }
66 eprintln!(
67 "Total wasted space: {}",
68 crate::human_readable_size(total_wasted_space)
69 );
70 }
71 eprintln!("Finished in {:?}.", start_time.elapsed());
72 Ok(())
73 }
74
75 pub fn find_duplicates(&self) -> anyhow::Result<Vec<DuplicatedFiles>> {
77 let progress = ProgressBar::new_spinner();
78 progress.enable_steady_tick(std::time::Duration::from_millis(120));
79 progress.set_style(
80 ProgressStyle::with_template("[{elapsed_precise}] {spinner:.green} {msg}").unwrap(),
81 );
82 progress.set_message("Discovering and hashing files...");
83
84 let (tx, rx) = mpsc::channel();
85 let mut by_hash: HashMap<blake3::Hash, DuplicatedFiles> = HashMap::new();
86 let mut hashed_count = 0;
87 let mut cache_hits = 0;
88 std::thread::scope(|scope| {
89 scope.spawn(|| {
90 if let Err(e) = self.find_duplicates_internal(tx) {
91 log::error!("Error during duplicate finding: {}", e);
92 }
93 });
94
95 while let Ok(event) = rx.recv() {
96 match event {
97 HashProgress::StartDiscovering => {
98 progress.set_message("Discovering and hashing files...");
99 }
100 HashProgress::TotalFiles(total) => {
101 progress.set_length(total as u64);
102 if total > 0 {
103 progress.set_style(
104 ProgressStyle::with_template(
105 "[{elapsed_precise}] {bar:40.cyan/blue} {percent}% {pos:>7}/{len:7} {msg}",
106 )
107 .unwrap(),
108 );
109 }
110 progress.set_message(format!(" ({} cache hits)", cache_hits));
111 }
112 HashProgress::Result(path, size, hash, is_cache_hit) => {
113 hashed_count += 1;
114 if is_cache_hit {
115 cache_hits += 1;
116 }
117
118 if progress.length().is_none() {
120 progress.set_message(format!(
121 "Hashed {} files ({} cache hits)...",
122 hashed_count, cache_hits
123 ));
124 } else if cache_hits > 0 {
125 progress.set_message(format!(" ({} cache hits)", cache_hits));
126 }
127
128 progress.inc(1);
129 let entry = by_hash.entry(hash).or_insert_with(|| DuplicatedFiles {
130 paths: Vec::new(),
131 size,
132 });
133 assert_eq!(entry.size, size, "Hash collision: sizes do not match");
135 entry.paths.push(path);
136 }
137 }
138 }
139 });
140 progress.finish();
141 log::info!("Hashed {} files ({} cache hits).", hashed_count, cache_hits);
142
143 let mut duplicates = Vec::new();
144 for (_, mut dupes) in by_hash {
145 if dupes.paths.len() > 1 {
146 dupes.paths.sort();
147 duplicates.push(dupes);
148 }
149 }
150 Ok(duplicates)
151 }
152
153 fn find_duplicates_internal(&self, tx: mpsc::Sender<HashProgress>) -> anyhow::Result<()> {
154 tx.send(HashProgress::StartDiscovering)?;
155 let mut by_size: HashMap<u64, EntryState> = HashMap::new();
156 let mut total_hashed = 0;
157 let cache = FileHashCache::find_or_new(&self.dir);
158
159 rayon::scope(|scope| -> anyhow::Result<()> {
160 for entry in WalkDir::new(&self.dir).into_iter().filter_map(|e| e.ok()) {
161 if !entry.file_type().is_file() {
162 continue;
163 }
164 let meta = entry.metadata()?;
165 let size = meta.len();
166 let modified = meta.modified()?;
167 let current_path = entry.path().to_path_buf();
170
171 match by_size.entry(size) {
172 std::collections::hash_map::Entry::Occupied(mut occ) => match occ.get_mut() {
173 EntryState::Single(first_path, first_modified) => {
174 self.spawn_hash_task(
177 scope,
178 first_path,
179 size,
180 *first_modified,
181 &tx,
182 &cache,
183 );
184 self.spawn_hash_task(scope, ¤t_path, size, modified, &tx, &cache);
185
186 *occ.get_mut() = EntryState::Hashing;
188 total_hashed += 2;
189 }
190 EntryState::Hashing => {
191 self.spawn_hash_task(scope, ¤t_path, size, modified, &tx, &cache);
193 total_hashed += 1;
194 }
195 },
196 std::collections::hash_map::Entry::Vacant(vac) => {
197 vac.insert(EntryState::Single(current_path, modified));
198 }
199 }
200 }
201 tx.send(HashProgress::TotalFiles(total_hashed))?;
202 Ok(())
203 })?;
204
205 let _ = cache.save();
208 Ok(())
209 }
210
211 fn spawn_hash_task<'scope>(
212 &'scope self,
213 scope: &rayon::Scope<'scope>,
214 path: &Path,
215 size: u64,
216 modified: std::time::SystemTime,
217 tx: &mpsc::Sender<HashProgress>,
218 cache: &std::sync::Arc<FileHashCache>,
219 ) {
220 let relative = path
221 .strip_prefix(cache.base_dir())
222 .expect("path should be in cache base_dir");
223 if let Some(hash) = cache.get(relative, modified) {
224 let _ = tx.send(HashProgress::Result(path.to_path_buf(), size, hash, true));
225 return;
226 }
227
228 let path_owned = path.to_path_buf();
229 let relative_owned = relative.to_path_buf();
230 let tx_owned = tx.clone();
231 let cache_owned = cache.clone();
232 scope.spawn(move |_| {
233 if let Ok(hash) = Self::compute_hash(&path_owned, self.buffer_size) {
234 cache_owned.insert(&relative_owned, modified, hash);
235 let _ = tx_owned.send(HashProgress::Result(path_owned, size, hash, false));
236 } else {
237 log::warn!("Failed to hash file: {:?}", path_owned);
238 }
239 });
240 }
241
242 fn compute_hash(path: &Path, buffer_size: usize) -> io::Result<blake3::Hash> {
243 let start_time = std::time::Instant::now();
244 let mut f = fs::File::open(path)?;
245 let mut hasher = blake3::Hasher::new();
246 if buffer_size == 0 {
247 let len = f.metadata()?.len();
248 if len > 0 {
249 let mmap = unsafe { memmap2::MmapOptions::new().map(&f)? };
250 hasher.update(&mmap[..]);
251 }
252 } else {
253 let mut buf = vec![0u8; buffer_size];
254 loop {
255 let n = f.read(&mut buf)?;
256 if n == 0 {
257 break;
258 }
259 hasher.update(&buf[..n]);
260 }
261 }
262 log::trace!("Hashed in {:?}: {:?}", start_time.elapsed(), path);
263 Ok(hasher.finalize())
264 }
265}
266
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    /// Writes `content` to a freshly created file at `path`.
    fn create_file(path: &Path, content: &str) -> io::Result<()> {
        fs::File::create(path)?.write_all(content.as_bytes())
    }

    #[test]
    fn test_find_duplicates() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;

        // Two byte-identical files plus one unrelated file.
        let file1_path = dir.path().join("same1.txt");
        let file2_path = dir.path().join("same2.txt");
        let diff_path = dir.path().join("diff.txt");
        create_file(&file1_path, "same content")?;
        create_file(&file2_path, "same content")?;
        create_file(&diff_path, "different content")?;

        let mut hasher = FileHasher::new(dir.path().to_path_buf());
        hasher.buffer_size = 8192;
        let duplicates = hasher.find_duplicates()?;

        // Exactly one group: the two identical files; "diff.txt" must not
        // appear anywhere in the report.
        assert_eq!(duplicates.len(), 1);
        let group = &duplicates[0];
        assert_eq!(group.paths.len(), 2);
        assert_eq!(group.size, 12);
        assert!(group.paths.contains(&file1_path));
        assert!(group.paths.contains(&file2_path));

        Ok(())
    }
}