// dupe_krill/scanner.rs

1use crate::file::{FileContent, FileSet};
2use crate::metadata::Metadata;
3use crate::reflink::{LinkType, reflink, reflink_or_hardlink};
4use std::cell::RefCell;
5use std::cmp;
6use std::collections::btree_map::Entry as BTreeEntry;
7use std::collections::hash_map::Entry as HashEntry;
8use std::collections::BTreeMap;
9use std::collections::BinaryHeap;
10use std::collections::HashMap;
11use std::collections::HashSet;
12use std::ffi::OsString;
13use std::fmt::Debug;
14use std::fs;
15use std::io;
16#[cfg(unix)]
17use std::os::unix::fs::MetadataExt;
18use std::path::Path;
19use std::rc::Rc;
20use std::sync::atomic::AtomicU32;
21use std::sync::atomic::Ordering;
22use std::time::{Duration, Instant};
23
// Platform-specific metadata access functions
#[cfg(unix)]
fn get_inode(metadata: &fs::Metadata) -> u64 {
    // Unix exposes the inode number directly; combined with the device id
    // (see `get_device`) it uniquely identifies a file on disk.
    metadata.ino()
}
29
#[cfg(windows)]
fn get_inode(metadata: &fs::Metadata) -> u64 {
    // Windows doesn't have inodes, but we can create a simple hash-based substitute.
    // This is a simplified approach - for production use, more sophisticated methods
    // (e.g. the NTFS file index via `std::os::windows::fs::MetadataExt::file_index`)
    // might be needed to ensure uniqueness.
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let mut hasher = DefaultHasher::new();
    // Bug fix: `size()` is a Unix-only extension method (`std::os::unix::fs::MetadataExt`)
    // and does not exist on Windows `fs::Metadata`; `len()` is the portable equivalent.
    metadata.len().hash(&mut hasher);
    metadata.modified().unwrap_or(std::time::SystemTime::UNIX_EPOCH).hash(&mut hasher);
    hasher.finish()
}
43
#[cfg(unix)]
fn get_device(metadata: &fs::Metadata) -> u64 {
    // Device id of the filesystem holding the file; hardlinks can only exist
    // within a single device, so this is part of the `by_inode` map key.
    metadata.dev()
}
48
#[cfg(windows)]
fn get_device(_metadata: &fs::Metadata) -> u64 {
    // On Windows, we'll use a simple constant for device identification.
    // This means hardlinking across different drives won't work properly,
    // but that's expected behavior and matches filesystem limitations.
    // NOTE(review): the volume serial number would be a more faithful stand-in
    // (`std::os::windows::fs::MetadataExt::volume_serial_number`) — TODO confirm.
    0
}
56
// Helper functions to get the proper size (accounting for block overhead)
#[cfg(unix)]
fn get_size(metadata: &fs::Metadata) -> u64 {
    // On Unix this is the apparent byte length (st_size).
    metadata.size()
}
62
#[cfg(windows)]
fn get_size(metadata: &fs::Metadata) -> u64 {
    // Windows polyfill: round up to the next 4KB block to account for block overhead.
    // Bug fix: `size()` is a Unix-only extension method and does not compile on
    // Windows `fs::Metadata`; use the portable `len()` instead.
    let len = metadata.len();
    const BLOCK_SIZE: u64 = 4096;
    ((len + BLOCK_SIZE - 1) / BLOCK_SIZE) * BLOCK_SIZE
}
70
/// What the scanner does when it finds duplicate content.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum RunMode {
    /// Merges paths in memory, but not on disk. Gives realistic UI output.
    DryRun,
    /// Like dry run, but completely skips deduping, with no UI for dupes.
    DryRunNoMerging,
    /// Replace duplicates with hardlinks to a single file.
    Hardlink,
    /// Replace duplicates with reflinks only (no hardlink fallback).
    Reflink,
    /// Try reflinking first, fall back to hardlinking if reflinking fails
    ReflinkOrHardlink,
}
82
/// Scanner configuration.
#[derive(Debug)]
pub struct Settings {
    /// Ignore files smaller than a filesystem block.
    /// Deduping of such files is unlikely to save space.
    pub ignore_small: bool,
    /// What to do with duplicates once found.
    pub run_mode: RunMode,

    // Externally-incremented interrupt counter (presumably a Ctrl-C handler —
    // TODO confirm against the caller).
    // If 1, go to flush. If > 1, abort immediately.
    pub break_on: Option<&'static AtomicU32>,
}
93
94impl Settings {
95    pub fn breaks(&self) -> u32 {
96        if let Some(break_on) = self.break_on {
97            break_on.load(Ordering::SeqCst)
98        } else {
99            0
100        }
101    }
102}
103
/// Counters accumulated over a scan and passed to the `ScanListener`.
#[derive(Debug, Default, Copy, Clone)]
#[cfg_attr(feature = "json", derive(serde_derive::Serialize))]
pub struct Stats {
    /// Regular files accepted for dedupe consideration.
    pub added: usize,
    /// Entries skipped: excluded names, non-regular files, too-small files, or errors.
    pub skipped: usize,
    /// Duplicate contents found.
    pub dupes: usize,
    /// Total apparent size of the duplicates found.
    pub bytes_deduplicated: usize,
    /// Pre-existing hardlinks of already-seen inodes (counted in `add`).
    pub hardlinks: usize,
    pub bytes_saved_by_hardlinks: usize,
    /// Reflinks created by this run (counted in `dedupe`).
    pub reflinks: usize,
    pub bytes_saved_by_reflinks: usize,
}
116
/// Callback interface for observing scan progress and dedupe events.
pub trait ScanListener: Debug {
    /// Called for every path visited, including ones that are later skipped.
    fn file_scanned(&mut self, path: &Path, stats: &Stats);
    /// Called once after the scan queue has been drained.
    fn scan_over(&self, scanner: &Scanner, stats: &Stats, scan_duration: Duration);
    /// Called after a duplicate was replaced with a hardlink.
    /// NOTE(review): `dedupe` invokes this as `(dest, source)` — the parameter
    /// names appear swapped relative to usage; confirm intended convention.
    fn hardlinked(&mut self, src: &Path, dst: &Path);
    /// Called after a duplicate was replaced with a reflink (same argument
    /// convention as `hardlinked`).
    fn reflinked(&mut self, src: &Path, dst: &Path);
    /// Called in dry-run mode instead of actually linking.
    fn duplicate_found(&mut self, src: &Path, dst: &Path);
}
124
/// Default listener that ignores every event; used until `set_listener` is called.
#[derive(Debug)]
struct SilentListener;
impl ScanListener for SilentListener {
    fn file_scanned(&mut self, _: &Path, _: &Stats) {}

    fn scan_over(&self, _: &Scanner, _: &Stats, _: Duration) {}

    fn hardlinked(&mut self, _: &Path, _: &Path) {}

    fn reflinked(&mut self, _: &Path, _: &Path) {}

    fn duplicate_found(&mut self, _: &Path, _: &Path) {}
}
138
139type RcFileSet = Rc<RefCell<FileSet>>;
140
#[derive(Debug)]
pub struct Scanner {
    /// All hardlinks of the same inode have to be treated as the same file.
    /// Keyed by `(device, inode)` — see `new_fileset`.
    by_inode: HashMap<(u64, u64), RcFileSet>,
    /// See Hasher for explanation
    by_content: BTreeMap<FileContent, Vec<RcFileSet>>,
    /// Directories left to scan. Sorted by inode number.
    /// I'm assuming scanning in this order is faster, since inode is related to file's age,
    /// which is related to its physical position on disk, which makes the scan more sequential.
    to_scan: BinaryHeap<(u64, Box<Path>)>,

    /// Receives progress/dedupe callbacks; defaults to the no-op `SilentListener`.
    scan_listener: Box<dyn ScanListener>,
    /// Running totals reported to the listener.
    stats: Stats,
    /// File names (not full paths) to skip during directory scans.
    exclude: HashSet<OsString>,
    pub settings: Settings,

    /// Dupes whose deduping was deferred since the last flush.
    deferred_count: usize,
    /// Threshold at which deferred dupes are flushed; doubles after each flush.
    next_deferred_count: usize,
}
160
161impl Scanner {
162    #[must_use] 
163    pub fn new() -> Self {
164        Self {
165            settings: Settings {
166                ignore_small: true,
167                run_mode: RunMode::Hardlink,
168                break_on: None,
169            },
170            by_inode: HashMap::new(),
171            by_content: BTreeMap::new(),
172            to_scan: BinaryHeap::new(),
173            scan_listener: Box::new(SilentListener),
174            stats: Stats::default(),
175            exclude: HashSet::new(),
176            deferred_count: 0,
177            next_deferred_count: 4096,
178        }
179    }
180
181    pub fn exclude(&mut self, exclude: Vec<String>) {
182        self.exclude = exclude.into_iter().map(From::from).collect();
183    }
184
    /// Set the scan listener. Caution: This overrides previously set listeners!
    /// Use a multiplexing listener if multiple listeners are required.
    /// The previous listener (initially `SilentListener`) is dropped.
    pub fn set_listener(&mut self, listener: Box<dyn ScanListener>) {
        self.scan_listener = listener;
    }
190
191    /// Scan any file or directory for dupes.
192    /// Dedupe is done within the path as well as against all previously added paths.
193    pub fn scan(&mut self, path: impl AsRef<Path>) -> io::Result<()> {
194        self.enqueue(path)?;
195        self.flush()?;
196        Ok(())
197    }
198
199    pub fn enqueue(&mut self, path: impl AsRef<Path>) -> io::Result<()> {
200        let path = fs::canonicalize(path)?.into_boxed_path();
201        let metadata = fs::symlink_metadata(&path)?;
202        self.add(path, &metadata)?;
203        Ok(())
204    }
205
206    /// Drains the queue of directories to scan
207    pub fn flush(&mut self) -> io::Result<()> {
208        let start_time = Instant::now();
209        while let Some((_, path)) = self.to_scan.pop() {
210            if let Err(err) = self.scan_dir(&path) {
211                eprintln!("Error scanning {}: {}", path.display(), err);
212                self.stats.skipped += 1;
213            }
214            if self.settings.breaks() > 0 {
215                eprintln!("Stopping scan");
216                break;
217            }
218        }
219        self.flush_deferred();
220        let scan_duration = Instant::now().duration_since(start_time);
221        self.scan_listener.scan_over(self, &self.stats, scan_duration);
222        Ok(())
223    }
224
225    fn scan_dir(&mut self, path: &Path) -> io::Result<()> {
226        // Errors are ignored here, since it's super common to find permission denied and unreadable symlinks,
227        // and it'd be annoying if that aborted the whole operation.
228        // FIXME: store the errors somehow to report them in a controlled manner
229        for entry in fs::read_dir(path)?.filter_map(|p| p.ok()) {
230            if self.settings.breaks() > 0 {
231                break;
232            }
233
234            let path = entry.path();
235            if let Some(file_name) = path.file_name() {
236                if self.exclude.contains(file_name) {
237                    self.stats.skipped += 1;
238                    continue;
239                }
240            }
241            if let Err(err) = self.add(path.into_boxed_path(), &entry.metadata()?) {
242                eprintln!("{}: {}", entry.path().display(), err);
243            }
244        }
245        Ok(())
246    }
247
    /// Classifies one path: queues directories for later traversal, skips
    /// symlinks/non-files and too-small files, and feeds regular files into
    /// the dedupe bookkeeping.
    fn add(&mut self, path: Box<Path>, metadata: &fs::Metadata) -> io::Result<()> {
        self.scan_listener.file_scanned(&path, &self.stats);

        let ty = metadata.file_type();
        if ty.is_dir() {
            // Inode is truncated to group scanning of roughly close inodes together,
            // But still preserve some directory traversal order.
            // Negation to scan from the highest (assuming latest) first.
            let order_key = !(get_inode(metadata) >> 8);
            self.to_scan.push((order_key, path));
            return Ok(());
        } else if ty.is_symlink() || !ty.is_file() {
            // Support for traversing symlinks would require preventing loops
            // Deduping /dev/ would be funny
            self.stats.skipped += 1;
            return Ok(());
        }

        // APFS reports 4*MB* block size
        // On Windows, use a reasonable default block size since blksize() doesn't exist
        #[cfg(unix)]
        let small_size = cmp::min(16 * 1024, metadata.blksize());
        #[cfg(windows)]
        let small_size = cmp::min(16 * 1024, 4096u64); // Assume 4KB blocks on Windows

        // Empty files are always skipped; small ones only when `ignore_small` is set.
        if get_size(metadata) == 0 || (self.settings.ignore_small && get_size(metadata) < small_size) {
            self.stats.skipped += 1;
            return Ok(());
        }
        self.stats.added += 1;

        if let Some(fileset) = self.new_fileset(&path, metadata) {
            self.dedupe_by_content(fileset, path, metadata)?;
        } else {
            // `None` means this path is another hardlink of an inode we've already
            // seen, so count the space the pre-existing hardlink already saves.
            self.stats.hardlinks += 1;
            self.stats.bytes_saved_by_hardlinks += get_size(metadata) as usize;
        }
        Ok(())
    }
287
    /// Creates a new fileset if it's a new file.
    /// Returns None if it's a hardlink of a file already seen.
    fn new_fileset(&mut self, path: &Path, metadata: &fs::Metadata) -> Option<RcFileSet> {
        let path: Box<Path> = path.into();

        // On Windows, skip the by_inode check entirely since Windows doesn't have
        // proper inodes and hardlink counts
        // NOTE(review): as a consequence, pre-existing hardlinks are never detected
        // on Windows — every path gets its own fileset with a link count of 1,
        // and `metadata` is unused in that build.
        // Only one of the two cfg blocks survives compilation, so each is the
        // function's tail expression on its platform.
        #[cfg(windows)]
        {
            let fileset = Rc::new(RefCell::new(FileSet::new(path, 1u64)));
            Some(fileset)
        }

        #[cfg(unix)]
        {
            let device_inode = (get_device(metadata), get_inode(metadata));

            match self.by_inode.entry(device_inode) {
                HashEntry::Vacant(e) => {
                    // First sighting of this inode: start a fresh fileset carrying
                    // the filesystem's hardlink count.
                    let links = metadata.nlink();
                    let fileset = Rc::new(RefCell::new(FileSet::new(path, links)));
                    e.insert(Rc::clone(&fileset)); // clone just bumps a refcount here
                    Some(fileset)
                },
                HashEntry::Occupied(mut e) => {
                    // This case may require a deferred deduping later,
                    // if the new link belongs to an old fileset that has already been deduped.
                    let mut t = e.get_mut().borrow_mut();
                    t.push(path);
                    None
                },
            }
        }
    }
322
    /// Here's where all the magic happens
    ///
    /// Registers the file under its `FileContent` key in `by_content`; landing on
    /// an occupied entry means duplicate content. Duplicates are deduped
    /// immediately when no member has pre-existing hardlinks, otherwise deferred
    /// (to `flush_deferred`) until all hardlinks of the group are known.
    fn dedupe_by_content(&mut self, fileset: RcFileSet, path: Box<Path>, metadata: &fs::Metadata) -> io::Result<()> {
        let mut deferred = false;
        match self.by_content.entry(FileContent::new(path, Metadata::new(metadata))) {
            BTreeEntry::Vacant(e) => {
                // Seems unique so far
                e.insert(vec![fileset]);
            },
            BTreeEntry::Occupied(mut e) => {
                // Found a dupe!
                self.stats.dupes += 1;
                self.stats.bytes_deduplicated += get_size(metadata) as usize;
                let filesets = e.get_mut();
                filesets.push(fileset);
                // Deduping can either be done immediately or later. Immediate is more cache-friendly and interactive,
                // but for files that already have hardlinks it can cause unnecessary re-linking. So if there are
                // hardlinks in the set, wait until the end to dedupe when all hardlinks are known.
                if filesets.iter().all(|set| set.borrow().links() == 1) {
                    Self::dedupe(filesets, self.settings.run_mode, &mut *self.scan_listener, &mut self.stats)?;
                } else {
                    deferred = true;
                }
            },
        }

        // Periodically flush deferred files to avoid building a huge queue
        // (the growing limit is a compromise between responsiveness
        // and potential to hit a pathological case of hardlinking with wrong hardlink groups)
        if deferred {
            self.deferred_count += 1;
            if self.deferred_count >= self.next_deferred_count {
                // Threshold doubles each time, so flushes become rarer as the scan grows.
                self.next_deferred_count *= 2;
                self.deferred_count = 0;
                self.flush_deferred();
            }
        }
        Ok(())
    }
361
    /// Runs the dedupe pass over every content group, picking up groups whose
    /// deduping was deferred. Called at scan end and periodically during scanning
    /// (see `dedupe_by_content`).
    fn flush_deferred(&mut self) {
        for filesets in self.by_content.values_mut() {
            // A second break request means "abort now" rather than "finish up".
            if self.settings.breaks() > 1 {
                eprintln!("Aborting");
                break;
            }
            if let Err(err) = Self::dedupe(filesets, self.settings.run_mode, &mut *self.scan_listener, &mut self.stats) {
                eprintln!("{err}");
            }
        }
    }
373
    /// Merges every path in `filesets` into one link group, according to
    /// `run_mode` (or only records them in memory for `DryRun`).
    ///
    /// Safe to call repeatedly on the same group: merged paths are drained into
    /// the largest set, so already-processed sets end up empty and the
    /// `nonempty_filesets == 0` early-out fires on the next call.
    ///
    /// An associated function (not a method) so callers can pass disjoint
    /// `&mut` borrows of the scanner's fields.
    fn dedupe(filesets: &[RcFileSet], run_mode: RunMode, scan_listener: &mut dyn ScanListener, stats: &mut Stats) -> io::Result<()> {
        if run_mode == RunMode::DryRunNoMerging {
            return Ok(());
        }

        // Find file with the largest number of hardlinks, since it's less work to merge a small group into a large group
        let mut largest_idx = 0;
        let mut largest_links = 0;
        let mut nonempty_filesets = 0;
        for (idx, fileset) in filesets.iter().enumerate() {
            let fileset = fileset.borrow();
            if !fileset.paths.is_empty() {
                // Only actual paths we can merge matter here
                nonempty_filesets += 1;
            }
            let links = fileset.links();
            if links > largest_links {
                largest_idx = idx;
                largest_links = links;
            }
        }

        if nonempty_filesets == 0 {
            return Ok(()); // Already merged
        }

        // The set is still going to be in use! So everything has to be updated to make sense for the next call
        // NOTE: this RefMut on the largest set is held for the rest of the function;
        // the `i == largest_idx` skip below is what prevents a double borrow.
        let merged_paths = &mut { filesets[largest_idx].borrow_mut() }.paths;
        let source_path = merged_paths[0].clone();

        // Get the file size for statistics tracking
        let file_size = get_size(&fs::symlink_metadata(&source_path)?) as usize;

        for (i, set) in filesets.iter().enumerate() {
            // We don't want to merge the set with itself
            if i == largest_idx {
                continue;
            }

            let paths = &mut set.borrow_mut().paths;
            // dest_path will be "lost" on error, but that's fine, since we don't want to dedupe it if it causes errors
            for dest_path in paths.drain(..) {
                assert_ne!(&source_path, &dest_path);
                debug_assert_ne!(get_inode(&fs::symlink_metadata(&source_path)?), get_inode(&fs::symlink_metadata(&dest_path)?));

                if run_mode == RunMode::DryRun {
                    scan_listener.duplicate_found(&dest_path, &source_path);
                    merged_paths.push(dest_path);
                    continue;
                }

                // Link into a fixed-name temp file in the destination's directory,
                // then rename it over the destination, so `dest_path` itself is
                // never missing mid-operation.
                let temp_path = dest_path.with_file_name(".tmp-dupe-e1iIQcBFn5pC4MUSm-xkcd-221");
                debug_assert!(!temp_path.exists());
                debug_assert!(source_path.exists());
                debug_assert!(dest_path.exists());

                match run_mode {
                    RunMode::Hardlink => {
                        // Traditional hardlink behavior
                        // NOTE(review): unlike the reflink arms below, this arm bumps no
                        // stats counters — confirm whether `stats.hardlinks` is meant to
                        // count only pre-existing hardlinks (as in `add`).
                        if let Err(err) = fs::hard_link(&source_path, &temp_path) {
                            eprintln!("unable to hardlink {} {} due to {}", source_path.display(), temp_path.display(), err);
                            let _ = fs::remove_file(temp_path);
                            return Err(err);
                        }
                        if let Err(err) = fs::rename(&temp_path, &dest_path) {
                            eprintln!("unable to rename {} {} due to {}", temp_path.display(), dest_path.display(), err);
                            let _ = fs::remove_file(temp_path);
                            return Err(err);
                        }
                        scan_listener.hardlinked(&dest_path, &source_path);
                    },
                    RunMode::Reflink => {
                        // Only try reflink
                        if let Err(err) = reflink(&source_path, &temp_path) {
                            eprintln!("unable to reflink {} {} due to {}", source_path.display(), temp_path.display(), err);
                            let _ = fs::remove_file(temp_path);
                            return Err(err);
                        }
                        if let Err(err) = fs::rename(&temp_path, &dest_path) {
                            eprintln!("unable to rename {} {} due to {}", temp_path.display(), dest_path.display(), err);
                            let _ = fs::remove_file(temp_path);
                            return Err(err);
                        }
                        scan_listener.reflinked(&dest_path, &source_path);
                        stats.reflinks += 1;
                        stats.bytes_saved_by_reflinks += file_size;
                    },
                    RunMode::ReflinkOrHardlink => {
                        // Try reflink first, fallback to hardlink
                        match reflink_or_hardlink(&source_path, &temp_path)? {
                            LinkType::Reflink => {
                                if let Err(err) = fs::rename(&temp_path, &dest_path) {
                                    eprintln!("unable to rename {} {} due to {}", temp_path.display(), dest_path.display(), err);
                                    let _ = fs::remove_file(temp_path);
                                    return Err(err);
                                }
                                scan_listener.reflinked(&dest_path, &source_path);
                                stats.reflinks += 1;
                                stats.bytes_saved_by_reflinks += file_size;
                            },
                            LinkType::Hardlink => {
                                if let Err(err) = fs::rename(&temp_path, &dest_path) {
                                    eprintln!("unable to rename {} {} due to {}", temp_path.display(), dest_path.display(), err);
                                    let _ = fs::remove_file(temp_path);
                                    return Err(err);
                                }
                                scan_listener.hardlinked(&dest_path, &source_path);
                            },
                        }
                    },
                    // Dry-run modes were handled above; no other variants exist.
                    _ => unreachable!("Invalid run mode for linking operation"),
                }

                debug_assert!(!temp_path.exists());
                debug_assert!(source_path.exists());
                debug_assert!(dest_path.exists());
                merged_paths.push(dest_path);
            }
        }
        Ok(())
    }
495
496    #[must_use]
497    pub fn dupes(&self) -> Vec<Vec<FileSet>> {
498        self.by_content.values().map(|filesets| {
499            filesets.iter().map(|d|{
500                let tmp = d.borrow();
501                (*tmp).clone()
502            }).collect()
503        }).collect()
504    }
505}
506