1use crate::file::{FileContent, FileSet};
2use crate::metadata::Metadata;
3use crate::reflink::{LinkType, reflink, reflink_or_hardlink};
4use std::cell::RefCell;
5use std::cmp;
6use std::collections::btree_map::Entry as BTreeEntry;
7use std::collections::hash_map::Entry as HashEntry;
8use std::collections::BTreeMap;
9use std::collections::BinaryHeap;
10use std::collections::HashMap;
11use std::collections::HashSet;
12use std::ffi::OsString;
13use std::fmt::Debug;
14use std::fs;
15use std::io;
16#[cfg(unix)]
17use std::os::unix::fs::MetadataExt;
18use std::path::Path;
19use std::rc::Rc;
20use std::sync::atomic::AtomicU32;
21use std::sync::atomic::Ordering;
22use std::time::{Duration, Instant};
23
24#[cfg(unix)]
26fn get_inode(metadata: &fs::Metadata) -> u64 {
27 metadata.ino()
28}
29
30#[cfg(windows)]
31fn get_inode(metadata: &fs::Metadata) -> u64 {
32 use std::collections::hash_map::DefaultHasher;
36 use std::hash::{Hash, Hasher};
37
38 let mut hasher = DefaultHasher::new();
39 metadata.size().hash(&mut hasher);
40 metadata.modified().unwrap_or(std::time::SystemTime::UNIX_EPOCH).hash(&mut hasher);
41 hasher.finish()
42}
43
44#[cfg(unix)]
45fn get_device(metadata: &fs::Metadata) -> u64 {
46 metadata.dev()
47}
48
49#[cfg(windows)]
50fn get_device(_metadata: &fs::Metadata) -> u64 {
51 0
55}
56
57#[cfg(unix)]
59fn get_size(metadata: &fs::Metadata) -> u64 {
60 metadata.size()
61}
62
63#[cfg(windows)]
64fn get_size(metadata: &fs::Metadata) -> u64 {
65 let len = metadata.size();
67 const BLOCK_SIZE: u64 = 4096;
68 ((len + BLOCK_SIZE - 1) / BLOCK_SIZE) * BLOCK_SIZE
69}
70
71#[derive(Debug, Copy, Clone, Eq, PartialEq)]
72pub enum RunMode {
73 DryRun,
75 DryRunNoMerging,
77 Hardlink,
78 Reflink,
79 ReflinkOrHardlink,
81}
82
83#[derive(Debug)]
84pub struct Settings {
85 pub ignore_small: bool,
88 pub run_mode: RunMode,
89
90 pub break_on: Option<&'static AtomicU32>,
92}
93
94impl Settings {
95 pub fn breaks(&self) -> u32 {
96 if let Some(break_on) = self.break_on {
97 break_on.load(Ordering::SeqCst)
98 } else {
99 0
100 }
101 }
102}
103
104#[derive(Debug, Default, Copy, Clone)]
105#[cfg_attr(feature = "json", derive(serde_derive::Serialize))]
106pub struct Stats {
107 pub added: usize,
108 pub skipped: usize,
109 pub dupes: usize,
110 pub bytes_deduplicated: usize,
111 pub hardlinks: usize,
112 pub bytes_saved_by_hardlinks: usize,
113 pub reflinks: usize,
114 pub bytes_saved_by_reflinks: usize,
115}
116
117pub trait ScanListener: Debug {
118 fn file_scanned(&mut self, path: &Path, stats: &Stats);
119 fn scan_over(&self, scanner: &Scanner, stats: &Stats, scan_duration: Duration);
120 fn hardlinked(&mut self, src: &Path, dst: &Path);
121 fn reflinked(&mut self, src: &Path, dst: &Path);
122 fn duplicate_found(&mut self, src: &Path, dst: &Path);
123}
124
125#[derive(Debug)]
126struct SilentListener;
127impl ScanListener for SilentListener {
128 fn file_scanned(&mut self, _: &Path, _: &Stats) {}
129
130 fn scan_over(&self, _: &Scanner, _: &Stats, _: Duration) {}
131
132 fn hardlinked(&mut self, _: &Path, _: &Path) {}
133
134 fn reflinked(&mut self, _: &Path, _: &Path) {}
135
136 fn duplicate_found(&mut self, _: &Path, _: &Path) {}
137}
138
139type RcFileSet = Rc<RefCell<FileSet>>;
140
141#[derive(Debug)]
142pub struct Scanner {
143 by_inode: HashMap<(u64, u64), RcFileSet>,
145 by_content: BTreeMap<FileContent, Vec<RcFileSet>>,
147 to_scan: BinaryHeap<(u64, Box<Path>)>,
151
152 scan_listener: Box<dyn ScanListener>,
153 stats: Stats,
154 exclude: HashSet<OsString>,
155 pub settings: Settings,
156
157 deferred_count: usize,
158 next_deferred_count: usize,
159}
160
161impl Scanner {
162 #[must_use]
163 pub fn new() -> Self {
164 Self {
165 settings: Settings {
166 ignore_small: true,
167 run_mode: RunMode::Hardlink,
168 break_on: None,
169 },
170 by_inode: HashMap::new(),
171 by_content: BTreeMap::new(),
172 to_scan: BinaryHeap::new(),
173 scan_listener: Box::new(SilentListener),
174 stats: Stats::default(),
175 exclude: HashSet::new(),
176 deferred_count: 0,
177 next_deferred_count: 4096,
178 }
179 }
180
181 pub fn exclude(&mut self, exclude: Vec<String>) {
182 self.exclude = exclude.into_iter().map(From::from).collect();
183 }
184
185 pub fn set_listener(&mut self, listener: Box<dyn ScanListener>) {
188 self.scan_listener = listener;
189 }
190
191 pub fn scan(&mut self, path: impl AsRef<Path>) -> io::Result<()> {
194 self.enqueue(path)?;
195 self.flush()?;
196 Ok(())
197 }
198
199 pub fn enqueue(&mut self, path: impl AsRef<Path>) -> io::Result<()> {
200 let path = fs::canonicalize(path)?.into_boxed_path();
201 let metadata = fs::symlink_metadata(&path)?;
202 self.add(path, &metadata)?;
203 Ok(())
204 }
205
206 pub fn flush(&mut self) -> io::Result<()> {
208 let start_time = Instant::now();
209 while let Some((_, path)) = self.to_scan.pop() {
210 if let Err(err) = self.scan_dir(&path) {
211 eprintln!("Error scanning {}: {}", path.display(), err);
212 self.stats.skipped += 1;
213 }
214 if self.settings.breaks() > 0 {
215 eprintln!("Stopping scan");
216 break;
217 }
218 }
219 self.flush_deferred();
220 let scan_duration = Instant::now().duration_since(start_time);
221 self.scan_listener.scan_over(self, &self.stats, scan_duration);
222 Ok(())
223 }
224
225 fn scan_dir(&mut self, path: &Path) -> io::Result<()> {
226 for entry in fs::read_dir(path)?.filter_map(|p| p.ok()) {
230 if self.settings.breaks() > 0 {
231 break;
232 }
233
234 let path = entry.path();
235 if let Some(file_name) = path.file_name() {
236 if self.exclude.contains(file_name) {
237 self.stats.skipped += 1;
238 continue;
239 }
240 }
241 if let Err(err) = self.add(path.into_boxed_path(), &entry.metadata()?) {
242 eprintln!("{}: {}", entry.path().display(), err);
243 }
244 }
245 Ok(())
246 }
247
248 fn add(&mut self, path: Box<Path>, metadata: &fs::Metadata) -> io::Result<()> {
249 self.scan_listener.file_scanned(&path, &self.stats);
250
251 let ty = metadata.file_type();
252 if ty.is_dir() {
253 let order_key = !(get_inode(metadata) >> 8);
257 self.to_scan.push((order_key, path));
258 return Ok(());
259 } else if ty.is_symlink() || !ty.is_file() {
260 self.stats.skipped += 1;
263 return Ok(());
264 }
265
266 #[cfg(unix)]
269 let small_size = cmp::min(16 * 1024, metadata.blksize());
270 #[cfg(windows)]
271 let small_size = cmp::min(16 * 1024, 4096u64); if get_size(metadata) == 0 || (self.settings.ignore_small && get_size(metadata) < small_size) {
274 self.stats.skipped += 1;
275 return Ok(());
276 }
277 self.stats.added += 1;
278
279 if let Some(fileset) = self.new_fileset(&path, metadata) {
280 self.dedupe_by_content(fileset, path, metadata)?;
281 } else {
282 self.stats.hardlinks += 1;
283 self.stats.bytes_saved_by_hardlinks += get_size(metadata) as usize;
284 }
285 Ok(())
286 }
287
288 fn new_fileset(&mut self, path: &Path, metadata: &fs::Metadata) -> Option<RcFileSet> {
291 let path: Box<Path> = path.into();
292
293 #[cfg(windows)]
296 {
297 let fileset = Rc::new(RefCell::new(FileSet::new(path, 1u64)));
298 Some(fileset)
299 }
300
301 #[cfg(unix)]
302 {
303 let device_inode = (get_device(metadata), get_inode(metadata));
304
305 match self.by_inode.entry(device_inode) {
306 HashEntry::Vacant(e) => {
307 let links = metadata.nlink();
308 let fileset = Rc::new(RefCell::new(FileSet::new(path, links)));
309 e.insert(Rc::clone(&fileset)); Some(fileset)
311 },
312 HashEntry::Occupied(mut e) => {
313 let mut t = e.get_mut().borrow_mut();
316 t.push(path);
317 None
318 },
319 }
320 }
321 }
322
323 fn dedupe_by_content(&mut self, fileset: RcFileSet, path: Box<Path>, metadata: &fs::Metadata) -> io::Result<()> {
325 let mut deferred = false;
326 match self.by_content.entry(FileContent::new(path, Metadata::new(metadata))) {
327 BTreeEntry::Vacant(e) => {
328 e.insert(vec![fileset]);
330 },
331 BTreeEntry::Occupied(mut e) => {
332 self.stats.dupes += 1;
334 self.stats.bytes_deduplicated += get_size(metadata) as usize;
335 let filesets = e.get_mut();
336 filesets.push(fileset);
337 if filesets.iter().all(|set| set.borrow().links() == 1) {
341 Self::dedupe(filesets, self.settings.run_mode, &mut *self.scan_listener, &mut self.stats)?;
342 } else {
343 deferred = true;
344 }
345 },
346 }
347
348 if deferred {
352 self.deferred_count += 1;
353 if self.deferred_count >= self.next_deferred_count {
354 self.next_deferred_count *= 2;
355 self.deferred_count = 0;
356 self.flush_deferred();
357 }
358 }
359 Ok(())
360 }
361
362 fn flush_deferred(&mut self) {
363 for filesets in self.by_content.values_mut() {
364 if self.settings.breaks() > 1 {
365 eprintln!("Aborting");
366 break;
367 }
368 if let Err(err) = Self::dedupe(filesets, self.settings.run_mode, &mut *self.scan_listener, &mut self.stats) {
369 eprintln!("{err}");
370 }
371 }
372 }
373
374 fn dedupe(filesets: &[RcFileSet], run_mode: RunMode, scan_listener: &mut dyn ScanListener, stats: &mut Stats) -> io::Result<()> {
375 if run_mode == RunMode::DryRunNoMerging {
376 return Ok(());
377 }
378
379 let mut largest_idx = 0;
381 let mut largest_links = 0;
382 let mut nonempty_filesets = 0;
383 for (idx, fileset) in filesets.iter().enumerate() {
384 let fileset = fileset.borrow();
385 if !fileset.paths.is_empty() {
386 nonempty_filesets += 1;
388 }
389 let links = fileset.links();
390 if links > largest_links {
391 largest_idx = idx;
392 largest_links = links;
393 }
394 }
395
396 if nonempty_filesets == 0 {
397 return Ok(()); }
399
400 let merged_paths = &mut { filesets[largest_idx].borrow_mut() }.paths;
402 let source_path = merged_paths[0].clone();
403
404 let file_size = get_size(&fs::symlink_metadata(&source_path)?) as usize;
406
407 for (i, set) in filesets.iter().enumerate() {
408 if i == largest_idx {
410 continue;
411 }
412
413 let paths = &mut set.borrow_mut().paths;
414 for dest_path in paths.drain(..) {
416 assert_ne!(&source_path, &dest_path);
417 debug_assert_ne!(get_inode(&fs::symlink_metadata(&source_path)?), get_inode(&fs::symlink_metadata(&dest_path)?));
418
419 if run_mode == RunMode::DryRun {
420 scan_listener.duplicate_found(&dest_path, &source_path);
421 merged_paths.push(dest_path);
422 continue;
423 }
424
425 let temp_path = dest_path.with_file_name(".tmp-dupe-e1iIQcBFn5pC4MUSm-xkcd-221");
426 debug_assert!(!temp_path.exists());
427 debug_assert!(source_path.exists());
428 debug_assert!(dest_path.exists());
429
430 match run_mode {
431 RunMode::Hardlink => {
432 if let Err(err) = fs::hard_link(&source_path, &temp_path) {
434 eprintln!("unable to hardlink {} {} due to {}", source_path.display(), temp_path.display(), err);
435 let _ = fs::remove_file(temp_path);
436 return Err(err);
437 }
438 if let Err(err) = fs::rename(&temp_path, &dest_path) {
439 eprintln!("unable to rename {} {} due to {}", temp_path.display(), dest_path.display(), err);
440 let _ = fs::remove_file(temp_path);
441 return Err(err);
442 }
443 scan_listener.hardlinked(&dest_path, &source_path);
444 },
445 RunMode::Reflink => {
446 if let Err(err) = reflink(&source_path, &temp_path) {
448 eprintln!("unable to reflink {} {} due to {}", source_path.display(), temp_path.display(), err);
449 let _ = fs::remove_file(temp_path);
450 return Err(err);
451 }
452 if let Err(err) = fs::rename(&temp_path, &dest_path) {
453 eprintln!("unable to rename {} {} due to {}", temp_path.display(), dest_path.display(), err);
454 let _ = fs::remove_file(temp_path);
455 return Err(err);
456 }
457 scan_listener.reflinked(&dest_path, &source_path);
458 stats.reflinks += 1;
459 stats.bytes_saved_by_reflinks += file_size;
460 },
461 RunMode::ReflinkOrHardlink => {
462 match reflink_or_hardlink(&source_path, &temp_path)? {
464 LinkType::Reflink => {
465 if let Err(err) = fs::rename(&temp_path, &dest_path) {
466 eprintln!("unable to rename {} {} due to {}", temp_path.display(), dest_path.display(), err);
467 let _ = fs::remove_file(temp_path);
468 return Err(err);
469 }
470 scan_listener.reflinked(&dest_path, &source_path);
471 stats.reflinks += 1;
472 stats.bytes_saved_by_reflinks += file_size;
473 },
474 LinkType::Hardlink => {
475 if let Err(err) = fs::rename(&temp_path, &dest_path) {
476 eprintln!("unable to rename {} {} due to {}", temp_path.display(), dest_path.display(), err);
477 let _ = fs::remove_file(temp_path);
478 return Err(err);
479 }
480 scan_listener.hardlinked(&dest_path, &source_path);
481 },
482 }
483 },
484 _ => unreachable!("Invalid run mode for linking operation"),
485 }
486
487 debug_assert!(!temp_path.exists());
488 debug_assert!(source_path.exists());
489 debug_assert!(dest_path.exists());
490 merged_paths.push(dest_path);
491 }
492 }
493 Ok(())
494 }
495
496 #[must_use]
497 pub fn dupes(&self) -> Vec<Vec<FileSet>> {
498 self.by_content.values().map(|filesets| {
499 filesets.iter().map(|d|{
500 let tmp = d.borrow();
501 (*tmp).clone()
502 }).collect()
503 }).collect()
504 }
505}
506