use super::fsutil;
use super::index;
use itertools::Itertools;
use plmap::PipelineMap;
use std::collections::{HashMap, VecDeque};
use std::ffi::OsString;
use std::os::unix::ffi::{OsStrExt, OsStringExt};
use std::os::unix::fs::FileTypeExt;
use std::os::unix::fs::MetadataExt;
use std::os::unix::fs::PermissionsExt;
use std::path::{Path, PathBuf};
use std::sync::Arc;
pub struct FsWalkerOptions {
pub exclusions: globset::GlobSet,
pub exclusion_markers: std::collections::HashSet<std::ffi::OsString>,
pub one_file_system: bool,
pub ignore_permission_errors: bool,
pub file_action_log_fn: Option<Arc<index::FileActionLogFn>>,
}
pub struct FsWalker {
opts: FsWalkerOptions,
base_dev: u64,
prefilled_children: HashMap<PathBuf, Vec<(PathBuf, std::fs::FileType)>>,
buffered: VecDeque<PathBuf>,
work_stack: Vec<Vec<PathBuf>>,
}
impl FsWalker {
pub fn new(
paths: &[PathBuf],
opts: FsWalkerOptions,
) -> Result<(PathBuf, FsWalker), anyhow::Error> {
if paths.is_empty() {
anyhow::bail!("no paths specified");
}
let mut absolute_paths = Vec::new();
for p in paths.iter() {
let p = match fsutil::absolute_path(p) {
Ok(p) => p,
Err(err) => anyhow::bail!("unable to get absolute path of {:?}: {}", p, err),
};
absolute_paths.push(p)
}
if absolute_paths.len() == 1 {
match absolute_paths[0].symlink_metadata() {
Ok(stat) => {
if !stat.is_dir() {
return Ok((
absolute_paths[0].clone(),
FsWalker {
opts,
base_dev: stat.dev(),
prefilled_children: HashMap::new(),
buffered: VecDeque::from(absolute_paths),
work_stack: Vec::new(),
},
));
}
}
Err(err) => anyhow::bail!("unable to stat {:?}: {}", absolute_paths[0], err),
};
}
absolute_paths.sort_by(|l, r| index::path_cmp(l, r));
absolute_paths.dedup();
let mut root_paths = Vec::new();
let mut i = 0;
while i < absolute_paths.len() {
let mut j = i + 1;
loop {
match (&absolute_paths[i], absolute_paths.get(j)) {
(_, None) => break,
(a, Some(b)) => {
if fsutil::common_path(a, b).unwrap() != *a {
break;
}
}
}
j += 1;
}
root_paths.push(absolute_paths[i].clone());
i = j;
}
let base = fsutil::common_path_all(&root_paths).unwrap();
let mut filler_dirs = std::collections::HashSet::new();
if root_paths.len() > 1 {
filler_dirs.insert(base.clone());
for p in root_paths.iter() {
let mut p = p.clone();
p.pop();
while p != base {
filler_dirs.insert(p.clone());
p.pop();
}
}
}
let mut prefilled_children = HashMap::<PathBuf, Vec<(PathBuf, std::fs::FileType)>>::new();
for p in filler_dirs.drain().chain(root_paths.drain(..)) {
let stat = match p.symlink_metadata() {
Ok(stat) => stat,
Err(err) => anyhow::bail!("unable to stat {:?}: {}", p, err),
};
let parent = if let Some(parent) = p.parent() {
parent.to_owned()
} else {
"".into()
};
match prefilled_children.get_mut(&parent) {
Some(v) => {
v.push((p, stat.file_type()));
}
None => {
prefilled_children.insert(parent, vec![(p, stat.file_type())]);
}
}
}
for (_, v) in prefilled_children.iter_mut() {
v.sort_by(|l, r| index::path_cmp(&l.0, &r.0))
}
let base_md = match base.symlink_metadata() {
Ok(md) => md,
Err(err) => anyhow::bail!("unable to stat {:?}: {}", base, err),
};
let start_dir = if let Some(base_parent) = base.parent() {
base_parent.to_owned()
} else {
"".into()
};
Ok((
base,
FsWalker {
opts,
base_dev: base_md.dev(),
prefilled_children,
buffered: VecDeque::new(),
work_stack: vec![vec![start_dir]],
},
))
}
fn walk_dir(&mut self, dir_path: PathBuf) -> Result<(), std::io::Error> {
let mut children = match self.prefilled_children.remove(&dir_path) {
Some(children) => children,
None => match std::fs::read_dir(&dir_path) {
Ok(dir_ents) => {
let mut children = Vec::with_capacity(8);
for dir_ent in dir_ents {
let dir_ent = dir_ent?;
children.push((dir_ent.path(), dir_ent.file_type()?))
}
children
}
Err(err)
if fsutil::likely_smear_error(&err)
|| (self.opts.ignore_permission_errors
&& err.kind() == std::io::ErrorKind::PermissionDenied) =>
{
Vec::new()
}
Err(err) => {
return Err(std::io::Error::new(
std::io::ErrorKind::Other,
format!("unable to walk {}: {}", dir_path.display(), err),
));
}
},
};
let mut to_recurse: Vec<PathBuf> = Vec::new();
if !self.opts.exclusion_markers.is_empty() {
for p in children.iter() {
if self
.opts
.exclusion_markers
.contains(p.0.file_name().unwrap())
{
children.retain(|p| {
if !self
.opts
.exclusion_markers
.contains(p.0.file_name().unwrap())
{
if let Some(log) = &self.opts.file_action_log_fn {
let _ = log('x', '-', &p.0);
}
return false;
};
true
});
break;
}
}
}
children.retain(|p| {
if self.opts.exclusions.is_match(&p.0) {
if let Some(log) = &self.opts.file_action_log_fn {
let _ = log('x', '-', &p.0);
}
return false;
};
true
});
children.sort_by(|l, r| {
index::path_cmp(
l.0.file_name().unwrap().as_ref(),
r.0.file_name().unwrap().as_ref(),
)
});
for (child_path, child_file_type) in children.drain(..) {
if child_file_type.is_socket() {
continue;
}
if child_file_type.is_dir() {
if self.opts.one_file_system {
match child_path.symlink_metadata() {
Ok(stat) if stat.dev() == self.base_dev => {
to_recurse.push(child_path.clone())
}
Ok(_) => {}
Err(err)
if fsutil::likely_smear_error(&err)
|| (self.opts.ignore_permission_errors
&& err.kind() == std::io::ErrorKind::PermissionDenied) => {}
Err(err) => {
return Err(std::io::Error::new(
std::io::ErrorKind::Other,
format!("unable to walk {}: {}", child_path.display(), err),
))
}
};
} else {
to_recurse.push(child_path.clone());
}
}
self.buffered.push_back(child_path);
}
if !to_recurse.is_empty() {
to_recurse.reverse();
self.work_stack.push(to_recurse)
}
Ok(())
}
}
impl Iterator for FsWalker {
type Item = Result<PathBuf, std::io::Error>;
fn next(&mut self) -> Option<Result<PathBuf, std::io::Error>> {
loop {
if let Some(v) = self.buffered.pop_front() {
return Some(Ok(v));
}
match self.work_stack.last_mut() {
Some(v) => {
if let Some(p) = v.pop() {
match self.walk_dir(p) {
Ok(_) => continue,
Err(err) => return Some(Err(err)),
}
} else {
self.work_stack.pop();
continue;
}
}
None => return None,
}
}
}
}
#[derive(Clone)]
struct MetadataCollector {
base_path: PathBuf,
want_xattrs: bool,
want_sparseness: bool,
want_hash: bool,
ignore_permission_errors: bool,
}
impl MetadataCollector {
#[cfg(any(
target_os = "dragonfly",
target_os = "freebsd",
target_os = "illumos",
target_os = "linux",
target_os = "solaris"
))]
fn probe_sparse(&self, path: &Path, stat: &std::fs::Metadata) -> std::io::Result<bool> {
use std::os::unix::io::AsRawFd;
if (stat.blocks() * 512) >= stat.size() {
return Ok(false);
}
let mut offset = 0;
let f = std::fs::File::open(path)?;
match nix::unistd::lseek(f.as_raw_fd(), offset, nix::unistd::Whence::SeekHole) {
Ok(next) => offset = next,
Err(_) => return Ok(false),
}
let hole_start = offset;
match nix::unistd::lseek(f.as_raw_fd(), offset, nix::unistd::Whence::SeekData) {
Ok(next) => offset = next,
Err(nix::Error::ENXIO) => {
match nix::unistd::lseek(f.as_raw_fd(), 0, nix::unistd::Whence::SeekEnd) {
Ok(next) => offset = next,
Err(err) => return Err(err.into()),
}
}
Err(_) => return Ok(false),
}
let hole_size = offset - hole_start;
Ok(hole_size != 0)
}
#[cfg(not(any(
target_os = "dragonfly",
target_os = "freebsd",
target_os = "illumos",
target_os = "linux",
target_os = "solaris"
)))]
fn probe_sparse(&self, _path: &Path, stat: &std::fs::Metadata) -> std::io::Result<bool> {
Ok((stat.blocks() * 512) < stat.size())
}
fn index_entry_from_fs(&self, path: &Path) -> std::io::Result<index::IndexEntry> {
let mut xattrs = None;
let stat = path.symlink_metadata()?;
if self.want_xattrs && (stat.is_file() || stat.is_dir()) {
match xattr::list(path) {
Ok(attrs) => {
for attr in attrs {
match xattr::get(path, &attr) {
Ok(Some(value)) => {
if xattrs.is_none() {
xattrs = Some(index::Xattrs::new())
}
match xattrs {
Some(ref mut xattrs) => {
xattrs.insert(attr.into_vec(), value);
}
_ => unreachable!(),
}
}
Ok(None) => (), Err(err) if fsutil::likely_smear_error(&err) => (), Err(err)
if self.ignore_permission_errors
&& err.kind() == std::io::ErrorKind::PermissionDenied => {}
Err(err) => return Err(err),
}
}
}
Err(err) if fsutil::likely_smear_error(&err) => (), Err(err)
if self.ignore_permission_errors
&& err.kind() == std::io::ErrorKind::PermissionDenied => {}
Err(err) => return Err(err),
}
}
let data_hash = if stat.is_file() && self.want_hash {
let mut hasher = blake3::Hasher::new();
let mut f = std::fs::File::open(path)?;
std::io::copy(&mut f, &mut hasher)?;
index::ContentCryptoHash::Blake3(hasher.finalize().into())
} else {
index::ContentCryptoHash::None
};
let sparse = if self.want_sparseness && stat.is_file() {
match self.probe_sparse(path, &stat) {
Ok(sparse) => sparse,
Err(err) if fsutil::likely_smear_error(&err) => false,
Err(err)
if self.ignore_permission_errors
&& err.kind() == std::io::ErrorKind::PermissionDenied =>
{
false
}
Err(err) => return Err(err),
}
} else {
false
};
let ft = stat.file_type();
let (dev_major, dev_minor) = if ft.is_block_device() || ft.is_char_device() {
(
fsutil::dev_major(stat.rdev()),
fsutil::dev_minor(stat.rdev()),
)
} else {
(0, 0)
};
let link_target = if ft.is_symlink() {
Some(std::fs::read_link(path)?)
} else {
None
};
let index_path = {
let base_path_bytes = self.base_path.as_os_str().as_bytes();
let path_bytes = path.as_os_str().as_bytes();
if base_path_bytes.len() == path_bytes.len() {
if ft.is_dir() {
PathBuf::from(".")
} else {
PathBuf::from(path.file_name().unwrap())
}
} else {
PathBuf::from(OsString::from_vec(
path_bytes[base_path_bytes.len() + 1..].to_vec(),
))
}
};
Ok(index::IndexEntry {
path: index_path,
size: serde_bare::Uint(if stat.is_file() { stat.size() } else { 0 }),
uid: serde_bare::Uint(stat.uid() as u64),
gid: serde_bare::Uint(stat.gid() as u64),
mode: serde_bare::Uint(stat.permissions().mode() as u64),
ctime: serde_bare::Uint(stat.ctime() as u64),
ctime_nsec: serde_bare::Uint(stat.ctime_nsec() as u64),
mtime: serde_bare::Uint(stat.mtime() as u64),
mtime_nsec: serde_bare::Uint(stat.mtime_nsec() as u64),
nlink: serde_bare::Uint(stat.nlink()),
dev_major: serde_bare::Uint(dev_major as u64),
dev_minor: serde_bare::Uint(dev_minor as u64),
norm_dev: serde_bare::Uint(stat.dev()),
ino: serde_bare::Uint(stat.ino()),
data_cursor: index::RelativeDataCursor {
chunk_delta: serde_bare::Uint(0),
start_byte_offset: serde_bare::Uint(0),
end_byte_offset: serde_bare::Uint(0),
},
sparse,
xattrs,
link_target,
data_hash,
})
}
}
impl plmap::Mapper<Vec<Result<PathBuf, std::io::Error>>> for MetadataCollector {
type Out = Vec<Result<(PathBuf, index::IndexEntry), std::io::Error>>;
fn apply(&mut self, paths: Vec<Result<PathBuf, std::io::Error>>) -> Self::Out {
let mut ents = Vec::with_capacity(paths.len());
for path in paths {
match path {
Ok(path) => match self.index_entry_from_fs(&path) {
Ok(ent) => ents.push(Ok((path, ent))),
Err(err) => ents.push(Err(err)),
},
Err(err) => ents.push(Err(err)),
}
}
ents
}
}
#[derive(Default)]
pub struct DevNormalizer {
count: u64,
tab: HashMap<u64, u64>,
}
impl DevNormalizer {
pub fn new() -> Self {
DevNormalizer {
count: 0,
tab: HashMap::new(),
}
}
pub fn normalize(&mut self, dev: u64) -> u64 {
match self.tab.get(&dev) {
Some(nd) => *nd,
None => {
let nd = self.count;
self.count += 1;
self.tab.insert(dev, nd);
nd
}
}
}
}
pub struct FsIndexerOptions {
pub exclusions: globset::GlobSet,
pub exclusion_markers: std::collections::HashSet<std::ffi::OsString>,
pub want_xattrs: bool,
pub want_sparseness: bool,
pub want_hash: bool,
pub one_file_system: bool,
pub ignore_permission_errors: bool,
pub file_action_log_fn: Option<Arc<index::FileActionLogFn>>,
pub threads: usize,
}
pub struct FsIndexer {
index_ents: Box<dyn Iterator<Item = Result<(PathBuf, index::IndexEntry), std::io::Error>>>,
dev_normalizer: DevNormalizer,
ignore_permission_errors: bool,
}
impl FsIndexer {
pub fn new(paths: &[PathBuf], mut opts: FsIndexerOptions) -> Result<FsIndexer, anyhow::Error> {
let (base_path, fs_walker) = FsWalker::new(
paths,
FsWalkerOptions {
exclusions: std::mem::take(&mut opts.exclusions),
exclusion_markers: std::mem::take(&mut opts.exclusion_markers),
one_file_system: opts.one_file_system,
ignore_permission_errors: opts.ignore_permission_errors,
file_action_log_fn: opts.file_action_log_fn.take(),
},
)?;
let batched_paths = fs_walker.batching(|it| {
const METADATA_BATCH_SIZE: usize = 256;
let mut batch = Vec::with_capacity(METADATA_BATCH_SIZE);
while batch.len() < METADATA_BATCH_SIZE {
match it.next() {
Some(p) => batch.push(p),
None if batch.is_empty() => return None,
None => break,
}
}
Some(batch)
});
let metadata_collector = MetadataCollector {
base_path,
want_xattrs: opts.want_xattrs,
want_sparseness: opts.want_sparseness,
want_hash: opts.want_hash,
ignore_permission_errors: opts.ignore_permission_errors,
};
let index_ents = batched_paths
.plmap(opts.threads, metadata_collector)
.flatten();
Ok(FsIndexer {
index_ents: Box::new(index_ents),
dev_normalizer: DevNormalizer::new(),
ignore_permission_errors: opts.ignore_permission_errors,
})
}
}
impl Iterator for FsIndexer {
type Item = Result<(PathBuf, index::IndexEntry), anyhow::Error>;
fn next(&mut self) -> Option<Result<(PathBuf, index::IndexEntry), anyhow::Error>> {
match self.index_ents.next() {
Some(Ok((path, mut ent))) => {
ent.norm_dev.0 = self.dev_normalizer.normalize(ent.norm_dev.0);
Some(Ok((path, ent)))
}
Some(Err(err)) if fsutil::likely_smear_error(&err) => self.next(),
Some(Err(err))
if self.ignore_permission_errors
&& err.kind() == std::io::ErrorKind::PermissionDenied =>
{
self.next()
}
Some(Err(err)) => Some(Err(err.into())),
None => None,
}
}
}