use std::ffi::OsString;
use crate::file::{FileContent, FileSet};
use crate::metadata::Metadata;
use std::cell::RefCell;
use std::cmp;
use std::collections::btree_map::Entry as BTreeEntry;
use std::collections::hash_map::Entry as HashEntry;
use std::collections::BTreeMap;
use std::collections::BinaryHeap;
use std::collections::HashMap;
use std::collections::HashSet;
use std::fmt::Debug;
use std::fs;
use std::io;
use std::os::unix::fs::MetadataExt;
use std::path::{Path, PathBuf};
use std::rc::Rc;
use std::sync::atomic::AtomicU32;
use std::sync::atomic::Ordering;
use std::time::{Duration, Instant};
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum RunMode {
DryRun,
DryRunNoMerging,
Hardlink,
}
#[derive(Debug)]
pub struct Settings {
pub ignore_small: bool,
pub run_mode: RunMode,
pub break_on: Option<&'static AtomicU32>,
}
impl Settings {
pub fn breaks(&self) -> u32 {
if let Some(break_on) = self.break_on {
break_on.load(Ordering::SeqCst)
} else {
0
}
}
}
#[derive(Debug, Default, Copy, Clone)]
#[cfg_attr(feature = "json", derive(serde_derive::Serialize))]
pub struct Stats {
pub added: usize,
pub skipped: usize,
pub dupes: usize,
pub hardlinks: usize,
}
pub trait ScanListener: Debug {
fn file_scanned(&mut self, path: &Path, stats: &Stats);
fn scan_over(&self, scanner: &Scanner, stats: &Stats, scan_duration: Duration);
fn hardlinked(&mut self, src: &Path, dst: &Path);
fn duplicate_found(&mut self, src: &Path, dst: &Path);
}
#[derive(Debug)]
struct SilentListener;
impl ScanListener for SilentListener {
fn file_scanned(&mut self, _: &Path, _: &Stats) {}
fn scan_over(&self, _: &Scanner, _: &Stats, _: Duration) {}
fn hardlinked(&mut self, _: &Path, _: &Path) {}
fn duplicate_found(&mut self, _: &Path, _: &Path) {}
}
type RcFileSet = Rc<RefCell<FileSet>>;
#[derive(Debug)]
pub struct Scanner {
by_inode: HashMap<(u64, u64), RcFileSet>,
by_content: BTreeMap<FileContent, Vec<RcFileSet>>,
to_scan: BinaryHeap<(u64, PathBuf)>,
scan_listener: Box<dyn ScanListener>,
stats: Stats,
exclude: HashSet<OsString>,
pub settings: Settings,
deferred_count: usize,
next_deferred_count: usize,
}
impl Scanner {
pub fn new() -> Self {
Scanner {
settings: Settings {
ignore_small: true,
run_mode: RunMode::Hardlink,
break_on: None,
},
by_inode: HashMap::new(),
by_content: BTreeMap::new(),
to_scan: BinaryHeap::new(),
scan_listener: Box::new(SilentListener),
stats: Stats::default(),
exclude: HashSet::new(),
deferred_count: 0,
next_deferred_count: 4096,
}
}
pub fn exclude(&mut self, exclude: Vec<String>) {
self.exclude = exclude.into_iter().map(From::from).collect();
}
pub fn set_listener(&mut self, listener: Box<dyn ScanListener>) {
self.scan_listener = listener;
}
pub fn scan(&mut self, path: impl AsRef<Path>) -> io::Result<()> {
self.enqueue(path)?;
self.flush()?;
Ok(())
}
pub fn enqueue(&mut self, path: impl AsRef<Path>) -> io::Result<()> {
let path = fs::canonicalize(path)?;
let metadata = fs::symlink_metadata(&path)?;
self.add(path, &metadata)?;
Ok(())
}
pub fn flush(&mut self) -> io::Result<()> {
let start_time = Instant::now();
while let Some((_, path)) = self.to_scan.pop() {
if let Err(err) = self.scan_dir(&path) {
eprintln!("Error scanning {}: {}", path.display(), err);
self.stats.skipped += 1;
}
if self.settings.breaks() > 0 {
eprintln!("Stopping scan");
break;
}
}
self.flush_deferred()?;
let scan_duration = Instant::now().duration_since(start_time);
self.scan_listener.scan_over(self, &self.stats, scan_duration);
Ok(())
}
fn scan_dir(&mut self, path: &Path) -> io::Result<()> {
for entry in fs::read_dir(path)?.filter_map(|p| p.ok()) {
if self.settings.breaks() > 0 {
break;
}
let path = entry.path();
if let Some(file_name) = path.file_name() {
if self.exclude.contains(file_name) {
self.stats.skipped += 1;
continue;
}
}
if let Err(err) = self.add(path, &entry.metadata()?) {
eprintln!("{}: {}", entry.path().display(), err);
}
}
Ok(())
}
fn add(&mut self, path: PathBuf, metadata: &fs::Metadata) -> io::Result<()> {
self.scan_listener.file_scanned(&path, &self.stats);
let ty = metadata.file_type();
if ty.is_dir() {
let order_key = !(metadata.ino() >> 8);
self.to_scan.push((order_key, path));
return Ok(());
} else if ty.is_symlink() {
self.stats.skipped += 1;
return Ok(());
} else if !ty.is_file() {
self.stats.skipped += 1;
return Ok(());
}
let small_size = cmp::min(16 * 1024, metadata.blksize());
if metadata.size() == 0 || (self.settings.ignore_small && metadata.size() < small_size) {
self.stats.skipped += 1;
return Ok(());
}
self.stats.added += 1;
if let Some(fileset) = self.new_fileset(&path, metadata) {
self.dedupe_by_content(fileset, path, metadata)?;
} else {
self.stats.hardlinks += 1;
}
Ok(())
}
fn new_fileset(&mut self, path: &Path, metadata: &fs::Metadata) -> Option<RcFileSet> {
let path = path.to_path_buf();
let device_inode = (metadata.dev(), metadata.ino());
match self.by_inode.entry(device_inode) {
HashEntry::Vacant(e) => {
let fileset = Rc::new(RefCell::new(FileSet::new(path, metadata.nlink())));
e.insert(Rc::clone(&fileset));
Some(fileset)
},
HashEntry::Occupied(mut e) => {
let mut t = e.get_mut().borrow_mut();
t.push(path);
None
},
}
}
fn dedupe_by_content(&mut self, fileset: RcFileSet, path: PathBuf, metadata: &fs::Metadata) -> io::Result<()> {
let mut deferred = false;
match self.by_content.entry(FileContent::new(path, Metadata::new(metadata))) {
BTreeEntry::Vacant(e) => {
e.insert(vec![fileset]);
},
BTreeEntry::Occupied(mut e) => {
self.stats.dupes += 1;
let filesets = e.get_mut();
filesets.push(fileset);
if filesets.iter().all(|set| set.borrow().links() == 1) {
Self::dedupe(filesets, self.settings.run_mode, &mut *self.scan_listener)?;
} else {
deferred = true;
}
},
}
if deferred {
self.deferred_count += 1;
if self.deferred_count >= self.next_deferred_count {
self.next_deferred_count *= 2;
self.deferred_count = 0;
self.flush_deferred()?;
}
}
Ok(())
}
fn flush_deferred(&mut self) -> io::Result<()> {
for (_, filesets) in &mut self.by_content {
if self.settings.breaks() > 1 {
eprintln!("Aborting");
break;
}
if let Err(err) = Self::dedupe(filesets, self.settings.run_mode, &mut *self.scan_listener) {
eprintln!("{}", err);
}
}
Ok(())
}
fn dedupe(filesets: &mut Vec<RcFileSet>, run_mode: RunMode, scan_listener: &mut dyn ScanListener) -> io::Result<()> {
if run_mode == RunMode::DryRunNoMerging {
return Ok(());
}
let mut largest_idx = 0;
let mut largest_links = 0;
let mut nonempty_filesets = 0;
for (idx, fileset) in filesets.iter().enumerate() {
let fileset = fileset.borrow();
if !fileset.paths.is_empty() {
nonempty_filesets += 1;
}
let links = fileset.links();
if links > largest_links {
largest_idx = idx;
largest_links = links;
}
}
if nonempty_filesets == 0 {
return Ok(());
}
let merged_paths = &mut { filesets[largest_idx].borrow_mut() }.paths;
let source_path = merged_paths[0].clone();
for (i, set) in filesets.iter().enumerate() {
if i == largest_idx {
continue;
}
let paths = &mut set.borrow_mut().paths;
for dest_path in paths.drain(..) {
assert_ne!(&source_path, &dest_path);
debug_assert_ne!(fs::symlink_metadata(&source_path)?.ino(), fs::symlink_metadata(&dest_path)?.ino());
if run_mode == RunMode::DryRun {
scan_listener.duplicate_found(&dest_path, &source_path);
merged_paths.push(dest_path);
continue;
}
let temp_path = dest_path.with_file_name(".tmp-dupe-e1iIQcBFn5pC4MUSm-xkcd-221");
debug_assert!(!temp_path.exists());
debug_assert!(source_path.exists());
debug_assert!(dest_path.exists());
if let Err(err) = fs::hard_link(&source_path, &temp_path) {
eprintln!("unable to hardlink {} {} due to {}", source_path.display(), temp_path.display(), err);
let _ = fs::remove_file(temp_path);
return Err(err);
}
if let Err(err) = fs::rename(&temp_path, &dest_path) {
eprintln!("unable to rename {} {} due to {}", temp_path.display(), dest_path.display(), err);
let _ = fs::remove_file(temp_path);
return Err(err);
}
debug_assert!(!temp_path.exists());
debug_assert!(source_path.exists());
debug_assert!(dest_path.exists());
scan_listener.hardlinked(&dest_path, &source_path);
merged_paths.push(dest_path);
}
}
Ok(())
}
pub fn dupes(&self) -> Vec<Vec<FileSet>> {
self.by_content.iter().map(|(_,filesets)|{
filesets.iter().map(|d|{
let tmp = d.borrow();
(*tmp).clone()
}).collect()
}).collect()
}
}