use std::cmp::{max, min, Reverse};
use std::collections::HashMap;
use std::fmt::{Display, Formatter};
use std::hash::{Hash, Hasher};
use std::io::{ErrorKind, Write};
use std::ops::{Add, AddAssign};
use std::sync::mpsc::channel;
use std::sync::Arc;
use std::time::SystemTime;
use std::{fmt, fs, io};
use chrono::{DateTime, FixedOffset, Local};
use priority_queue::PriorityQueue;
use rand::distributions::Alphanumeric;
use rand::Rng;
use rayon::iter::{IntoParallelIterator, ParallelBridge, ParallelIterator};
use crate::config::{DedupeConfig, Priority};
use crate::device::DiskDevices;
use crate::file::{FileId, FileLen, FileMetadata};
use crate::group::{FileGroup, FileSubGroup};
use crate::lock::FileLock;
use crate::log::{Log, LogExt};
use crate::path::Path;
use crate::util::{max_result, min_result, try_sort_by_key};
use crate::{Error, TIMESTAMP_FMT};
/// Defines what to do with redundant files found during deduplication.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum DedupeOp {
    /// Removes redundant files.
    Remove,
    /// Moves redundant files to a different dir.
    Move(Arc<Path>),
    /// Replaces redundant files with symbolic links.
    SymbolicLink,
    /// Replaces redundant files with hard links.
    HardLink,
    /// Replaces redundant files with reflinks (copy-on-write clones).
    RefLink,
}
/// A file path bundled together with its metadata.
/// Caching the metadata avoids re-reading it from the file system
/// every time it is needed.
#[derive(Clone, Debug)]
pub struct PathAndMetadata {
    pub path: Path,
    pub metadata: FileMetadata,
}
impl PathAndMetadata {
    /// Reads the metadata of the file under `path` and bundles it with the path.
    /// On failure, returns an error that includes the offending path in its message.
    pub fn new(path: Path) -> io::Result<PathAndMetadata> {
        match FileMetadata::new(&path) {
            Ok(metadata) => Ok(PathAndMetadata { metadata, path }),
            Err(e) => Err(io::Error::new(
                e.kind(),
                format!("Failed to read metadata of {}: {}", path.display(), e),
            )),
        }
    }
}
// Identity conversion, so generic code bounded on `AsRef<PathAndMetadata>`
// can accept `PathAndMetadata` values directly.
impl AsRef<PathAndMetadata> for PathAndMetadata {
    fn as_ref(&self) -> &PathAndMetadata {
        self
    }
}
// Allows borrowing just the path component.
impl AsRef<Path> for PathAndMetadata {
    fn as_ref(&self) -> &Path {
        &self.path
    }
}
// Allows borrowing the file identifier, delegated to the metadata.
impl AsRef<FileId> for PathAndMetadata {
    fn as_ref(&self) -> &FileId {
        self.metadata.as_ref()
    }
}
// Extracts the path, discarding the cached metadata.
impl From<PathAndMetadata> for Path {
    fn from(value: PathAndMetadata) -> Self {
        value.path
    }
}
impl Display for PathAndMetadata {
    /// Formats as the path only; `f.pad` honors width/alignment flags.
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        f.pad(self.path.display().as_str())
    }
}
/// A single file system operation of the deduplication script.
/// Commands are either executed directly (see `FsCommand::execute`)
/// or rendered as shell commands (see `FsCommand::to_shell_str`).
#[derive(Debug)]
pub enum FsCommand {
    /// Removes the file.
    Remove {
        file: PathAndMetadata,
    },
    /// Moves the file to `target`; renames when `use_rename` is set,
    /// with a copy + delete fallback.
    Move {
        source: PathAndMetadata,
        target: Path,
        use_rename: bool,
    },
    /// Replaces `link` with a symbolic link pointing to `target`.
    SoftLink {
        target: Arc<PathAndMetadata>,
        link: PathAndMetadata,
    },
    /// Replaces `link` with a hard link pointing to `target`.
    HardLink {
        target: Arc<PathAndMetadata>,
        link: PathAndMetadata,
    },
    /// Replaces `link` with a reflink (copy-on-write clone) of `target`.
    RefLink {
        target: Arc<PathAndMetadata>,
        link: PathAndMetadata,
    },
}
impl FsCommand {
fn maybe_lock(path: &Path, lock: bool) -> io::Result<Option<FileLock>> {
if lock {
match FileLock::new(path) {
Ok(lock) => Ok(Some(lock)),
Err(e) if e.kind() == ErrorKind::Unsupported => Ok(None),
Err(e) => Err(e),
}
} else {
Ok(None)
}
}
pub fn remove(path: &Path) -> io::Result<()> {
fs::remove_file(path.to_path_buf()).map_err(|e| {
io::Error::new(
e.kind(),
format!("Failed to remove file {}: {}", path.display(), e),
)
})
}
#[cfg(unix)]
fn symlink_internal(target: &std::path::Path, link: &std::path::Path) -> io::Result<()> {
std::os::unix::fs::symlink(target, link)
}
#[cfg(windows)]
fn symlink_internal(target: &std::path::Path, link: &std::path::Path) -> io::Result<()> {
std::os::windows::fs::symlink_file(target, link)
}
fn symlink(target: &Path, link: &Path) -> io::Result<()> {
Self::symlink_internal(&target.to_path_buf(), &link.to_path_buf()).map_err(|e| {
io::Error::new(
e.kind(),
format!(
"Failed to create symbolic link {} -> {}: {}",
link.display(),
target.display(),
e
),
)
})
}
fn hardlink(target: &Path, link: &Path) -> io::Result<()> {
fs::hard_link(target.to_path_buf(), link.to_path_buf()).map_err(|e| {
io::Error::new(
e.kind(),
format!(
"Failed to create hard link {} -> {}: {}",
link.display(),
target.display(),
e
),
)
})
}
fn check_can_rename(source: &Path, target: &Path) -> io::Result<()> {
if target.to_path_buf().exists() {
return Err(io::Error::new(
ErrorKind::AlreadyExists,
format!(
"Cannot move {} to {}: Target already exists",
source.display(),
target.display()
),
));
}
Ok(())
}
fn mkdirs(path: &Path) -> io::Result<()> {
fs::create_dir_all(path.to_path_buf()).map_err(|e| {
io::Error::new(
e.kind(),
format!("Failed to create directory {}: {}", path.display(), e),
)
})
}
pub fn unsafe_rename(source: &Path, target: &Path) -> io::Result<()> {
fs::rename(source.to_path_buf(), target.to_path_buf()).map_err(|e| {
io::Error::new(
e.kind(),
format!(
"Failed to rename file from {} to {}: {}",
source.display(),
target.display(),
e
),
)
})
}
fn unsafe_copy(source: &Path, target: &Path) -> io::Result<()> {
fs::copy(source.to_path_buf(), target.to_path_buf()).map_err(|e| {
io::Error::new(
e.kind(),
format!(
"Failed to copy file from {} to {}: {}",
source.display(),
target.display(),
e
),
)
})?;
Ok(())
}
fn move_rename(source: &Path, target: &Path) -> io::Result<()> {
Self::check_can_rename(source, target)?;
Self::mkdirs(target.parent().unwrap())?;
Self::unsafe_rename(source, target)?;
Ok(())
}
fn move_copy(source: &Path, target: &Path) -> io::Result<()> {
Self::check_can_rename(source, target)?;
Self::mkdirs(target.parent().unwrap())?;
Self::unsafe_copy(source, target)?;
Self::remove(source)?;
Ok(())
}
pub fn temp_file(path: &Path) -> Path {
let mut name = path
.file_name()
.expect("must be a regular file with a name");
name.push(".");
name.push(
rand::thread_rng()
.sample_iter(&Alphanumeric)
.take(24)
.map(char::from)
.collect::<String>(),
);
match path.parent() {
Some(parent) => parent.join(Path::from(name)),
None => Path::from(name),
}
}
pub fn safe_remove<R>(
path: &Path,
f: impl FnOnce(&Path) -> io::Result<R>,
log: &dyn Log,
) -> io::Result<R> {
let tmp = Self::temp_file(path);
Self::unsafe_rename(path, &tmp)?;
let result = match f(path) {
Ok(result) => result,
Err(e) => {
if let Err(remove_err) = Self::unsafe_rename(&tmp, path) {
log.warn(format!(
"Failed to undo move from {} to {}: {}",
&path.display(),
&tmp.display(),
remove_err
))
}
return Err(e);
}
};
if let Err(e) = Self::remove(&tmp) {
log.warn(format!(
"Failed to remove temporary {}: {}",
&tmp.display(),
e
))
}
Ok(result)
}
pub fn execute(&self, should_lock: bool, log: &dyn Log) -> io::Result<FileLen> {
match self {
FsCommand::Remove { file } => {
let _ = Self::maybe_lock(&file.path, should_lock)?;
Self::remove(&file.path)?;
Ok(file.metadata.len())
}
FsCommand::SoftLink { target, link } => {
let _ = Self::maybe_lock(&link.path, should_lock)?;
Self::safe_remove(&link.path, |link| Self::symlink(&target.path, link), log)?;
Ok(link.metadata.len())
}
FsCommand::HardLink { target, link } => {
let _ = Self::maybe_lock(&link.path, should_lock)?;
Self::safe_remove(&link.path, |link| Self::hardlink(&target.path, link), log)?;
Ok(link.metadata.len())
}
FsCommand::RefLink { target, link } => {
let _ = Self::maybe_lock(&link.path, should_lock)?;
crate::reflink::reflink(target, link, log)?;
Ok(link.metadata.len())
}
FsCommand::Move {
source,
target,
use_rename,
} => {
let _ = Self::maybe_lock(&source.path, should_lock);
let len = source.metadata.len();
if *use_rename && Self::move_rename(&source.path, target).is_ok() {
return Ok(len);
}
Self::move_copy(&source.path, target)?;
Ok(len)
}
}
}
pub fn file_to_remove(&self) -> &Path {
match self {
FsCommand::Remove { file, .. }
| FsCommand::SoftLink { link: file, .. }
| FsCommand::HardLink { link: file, .. }
| FsCommand::RefLink { link: file, .. }
| FsCommand::Move { source: file, .. } => &file.path,
}
}
pub fn space_to_reclaim(&self) -> FileLen {
match self {
FsCommand::Remove { file, .. }
| FsCommand::SoftLink { link: file, .. }
| FsCommand::HardLink { link: file, .. }
| FsCommand::RefLink { link: file, .. }
| FsCommand::Move { source: file, .. } => file.metadata.len(),
}
}
#[cfg(unix)]
pub fn to_shell_str(&self) -> Vec<String> {
let mut result = Vec::new();
match self {
FsCommand::Remove { file, .. } => {
let path = file.path.quote();
result.push(format!("rm {path}"));
}
FsCommand::SoftLink { target, link, .. } => {
let tmp = Self::temp_file(&link.path);
let target = target.path.quote();
let link = link.path.quote();
result.push(format!("mv {} {}", link, tmp.quote()));
result.push(format!("ln -s {target} {link}"));
result.push(format!("rm {}", tmp.quote()));
}
FsCommand::HardLink { target, link, .. } => {
let tmp = Self::temp_file(&link.path);
let target = target.path.quote();
let link = link.path.quote();
result.push(format!("mv {} {}", link, tmp.quote()));
result.push(format!("ln {target} {link}"));
result.push(format!("rm {}", tmp.quote()));
}
FsCommand::RefLink { target, link, .. } => {
let tmp = Self::temp_file(&link.path);
let target = target.path.quote();
let link = link.path.quote();
result.push(format!("mv {} {}", link, tmp.quote()));
result.push(format!("cp --reflink=always {target} {link}"));
result.push(format!("rm {}", tmp.quote()));
}
FsCommand::Move {
source,
target,
use_rename,
} => {
let source = source.path.quote();
let target = target.quote();
if *use_rename {
result.push(format!("mv {} {}", &source, &target));
} else {
result.push(format!("cp {} {}", &source, &target));
result.push(format!("rm {}", &source));
}
}
}
result
}
#[cfg(windows)]
pub fn to_shell_str(&self) -> Vec<String> {
let mut result = Vec::new();
match self {
FsCommand::Remove { file, .. } => {
let path = file.path.quote();
result.push(format!("del {}", path));
}
FsCommand::SoftLink { target, link, .. } => {
let tmp = Self::temp_file(&link.path);
let target = target.path.quote();
let link = link.path.quote();
result.push(format!("move {} {}", link, tmp.quote()));
result.push(format!("mklink {} {}", target, link));
result.push(format!("del {}", tmp.quote()));
}
FsCommand::HardLink { target, link, .. } => {
let tmp = Self::temp_file(&link.path);
let target = target.path.quote();
let link = link.path.quote();
result.push(format!("move {} {}", link, tmp.quote()));
result.push(format!("mklink /H {} {}", target, link));
result.push(format!("del {}", tmp.quote()));
}
FsCommand::RefLink { target, link, .. } => {
result.push(format!(":: deduplicate {} {}", link, target));
}
FsCommand::Move {
source,
target,
use_rename,
} => {
let source = source.path.quote();
let target = target.quote();
if *use_rename {
result.push(format!("move {} {}", &source, &target));
} else {
result.push(format!("copy {} {}", &source, &target));
result.push(format!("del {}", &source));
}
}
}
result
}
}
/// Summary of a deduplication run: how many commands were executed
/// successfully and how much space they reclaimed.
#[derive(Default)]
pub struct DedupeResult {
    pub processed_count: u64,
    pub reclaimed_space: FileLen,
}
impl Add<DedupeResult> for DedupeResult {
type Output = DedupeResult;
fn add(self, rhs: Self) -> Self::Output {
DedupeResult {
processed_count: self.processed_count + rhs.processed_count,
reclaimed_space: self.reclaimed_space + rhs.reclaimed_space,
}
}
}
impl AddAssign for DedupeResult {
    /// Accumulates another partial result into this one.
    fn add_assign(&mut self, other: Self) {
        self.processed_count += other.processed_count;
        self.reclaimed_space += other.reclaimed_space;
    }
}
/// Returns true if any of the files was modified after the given timestamp
/// or if its modification time could not be read.
/// All files are inspected (no short-circuiting) so that every offending
/// file gets a warning in the log.
fn was_modified(files: &[PathAndMetadata], after: DateTime<FixedOffset>, log: &dyn Log) -> bool {
    let threshold: DateTime<Local> = after.into();
    let mut any_modified = false;
    for file in files {
        match file.metadata.modified() {
            Ok(timestamp) => {
                let timestamp: DateTime<Local> = timestamp.into();
                if timestamp > threshold {
                    log.warn(format!(
                        "File {} was updated after {} (at {})",
                        file.path.display(),
                        threshold.format(TIMESTAMP_FMT),
                        timestamp.format(TIMESTAMP_FMT)
                    ));
                    any_modified = true;
                }
            }
            Err(e) => {
                // Treat unreadable timestamps conservatively as "modified".
                log.warn(format!(
                    "Failed to read modification time of file {}: {}",
                    file.path.display(),
                    e
                ));
                any_modified = true;
            }
        }
    }
    any_modified
}
/// Returns true if the file must not be dropped because its name or path
/// matches one of the configured keep patterns.
fn should_keep(path: &Path, config: &DedupeConfig) -> bool {
    // Check the file name (if any) against the keep-name patterns first.
    let kept_by_name = match path.file_name_cstr() {
        Some(name) => {
            let name = name.to_string_lossy();
            config
                .keep_name_patterns
                .iter()
                .any(|p| p.matches(name.as_ref()))
        }
        None => false,
    };
    if kept_by_name {
        return true;
    }
    // Fall back to matching the full path against the keep-path patterns.
    let path_buf = path.to_path_buf();
    config
        .keep_path_patterns
        .iter()
        .any(|p| p.matches_path(&path_buf))
}
/// Returns true if the file is allowed to be dropped.
/// With no drop patterns configured, every file may be dropped; otherwise
/// the file name or path must match at least one pattern.
fn may_drop(path: &Path, config: &DedupeConfig) -> bool {
    if config.name_patterns.is_empty() && config.path_patterns.is_empty() {
        return true;
    }
    let dropped_by_name = match path.file_name_cstr() {
        Some(name) => {
            let name = name.to_string_lossy();
            config
                .name_patterns
                .iter()
                .any(|p| p.matches(name.as_ref()))
        }
        None => false,
    };
    dropped_by_name
        || config
            .path_patterns
            .iter()
            .any(|p| p.matches_path(&path.to_path_buf()))
}
impl<P: AsRef<PathAndMetadata>> FileSubGroup<P> {
pub fn created(&self) -> Result<SystemTime, Error> {
Ok(min_result(self.files.iter().map(|f| {
let f = f.as_ref();
f.metadata.created().map_err(|e| {
format!(
"Failed to read creation time of file {}: {}",
f.path.display(),
e
)
})
}))?
.unwrap())
}
pub fn modified(&self) -> Result<SystemTime, Error> {
Ok(max_result(self.files.iter().map(|f| {
let f = f.as_ref();
f.metadata.modified().map_err(|e| {
format!(
"Failed to read modification time of file {}: {}",
f.path.display(),
e
)
})
}))?
.unwrap())
}
pub fn accessed(&self) -> Result<SystemTime, Error> {
Ok(max_result(self.files.iter().map(|f| {
let f = f.as_ref();
f.metadata.accessed().map_err(|e| {
format!(
"Failed to read access time of file {}: {}",
f.path.display(),
e
)
})
}))?
.unwrap())
}
pub fn should_keep(&self, config: &DedupeConfig) -> bool {
self.files
.iter()
.any(|f| should_keep(&f.as_ref().path, config))
}
pub fn may_drop(&self, config: &DedupeConfig) -> bool {
self.files
.iter()
.all(|f| may_drop(&f.as_ref().path, config))
}
pub fn min_nesting(&self) -> usize {
self.files
.iter()
.map(|f| f.as_ref().path.component_count())
.min()
.unwrap()
}
pub fn max_nesting(&self) -> usize {
self.files
.iter()
.map(|f| f.as_ref().path.component_count())
.max()
.unwrap()
}
}
pub fn sort_by_priority<P>(files: &mut [FileSubGroup<P>], priority: &Priority) -> Vec<Error>
where
P: AsRef<PathAndMetadata>,
{
match priority {
Priority::Top => {
files.reverse();
vec![]
}
Priority::Bottom => vec![],
Priority::Newest => try_sort_by_key(files, |m| m.created()),
Priority::Oldest => try_sort_by_key(files, |m| m.created().map(Reverse)),
Priority::MostRecentlyModified => try_sort_by_key(files, |m| m.modified()),
Priority::LeastRecentlyModified => try_sort_by_key(files, |m| m.modified().map(Reverse)),
Priority::MostRecentlyAccessed => try_sort_by_key(files, |m| m.accessed()),
Priority::LeastRecentlyAccessed => try_sort_by_key(files, |m| m.accessed().map(Reverse)),
Priority::MostNested => {
files.sort_by_key(|m| m.max_nesting());
vec![]
}
Priority::LeastNested => {
files.sort_by_key(|m| Reverse(m.min_nesting()));
vec![]
}
}
}
/// The result of partitioning a group of identical files:
/// `to_keep` remain untouched, `to_drop` are subject to the dedupe operation.
#[derive(Debug)]
pub struct PartitionedFileGroup {
    pub to_keep: Vec<PathAndMetadata>,
    pub to_drop: Vec<PathAndMetadata>,
}
impl PartitionedFileGroup {
    /// Computes where to move `source_path` inside `target_dir`.
    /// The source root (drive letter / leading slash) is flattened into a
    /// plain directory name so different roots don't collide.
    fn move_target(target_dir: &Arc<Path>, source_path: &Path) -> Path {
        let suffix = source_path.strip_root();
        let root = source_path
            .root()
            .map(|r| r.to_string_lossy().replace(['/', '\\', ':'], ""));
        match root {
            Some(root) => Arc::new(target_dir.join(Path::from(root))).join(suffix),
            None => target_dir.join(suffix),
        }
    }

    /// True if both paths live on the same mount point (so a rename suffices).
    fn are_on_same_mount(devices: &DiskDevices, file1: &Path, file2: &Path) -> bool {
        devices.get_mount_point(file1) == devices.get_mount_point(file2)
    }

    /// Turns this partitioned group into the list of file system commands
    /// implementing the given dedupe strategy. The first retained file serves
    /// as the link/reflink target for all dropped files.
    ///
    /// # Panics
    /// Panics if there are files to drop but none to keep.
    pub fn dedupe_script(mut self, strategy: &DedupeOp, devices: &DiskDevices) -> Vec<FsCommand> {
        if self.to_drop.is_empty() {
            return Vec::new();
        }
        assert!(
            !self.to_keep.is_empty(),
            "No files would be left after deduplicating"
        );
        let retained_file = Arc::new(self.to_keep.swap_remove(0));
        self.to_drop
            .into_iter()
            .map(|dropped_file| match strategy {
                DedupeOp::SymbolicLink => FsCommand::SoftLink {
                    target: retained_file.clone(),
                    link: dropped_file,
                },
                DedupeOp::HardLink => FsCommand::HardLink {
                    target: retained_file.clone(),
                    link: dropped_file,
                },
                DedupeOp::RefLink => FsCommand::RefLink {
                    target: retained_file.clone(),
                    link: dropped_file,
                },
                DedupeOp::Remove => FsCommand::Remove { file: dropped_file },
                DedupeOp::Move(target_dir) => {
                    // Renaming works only within a mount point; otherwise
                    // the executor falls back to copy + delete.
                    let use_rename =
                        Self::are_on_same_mount(devices, &dropped_file.path, target_dir);
                    let target = Self::move_target(target_dir, &dropped_file.path);
                    FsCommand::Move {
                        source: dropped_file,
                        target,
                        use_rename,
                    }
                }
            })
            .collect()
    }
}
/// Fetches the metadata of all files in the group.
/// Returns `None` if the metadata of any file could not be read;
/// each failure is logged as a warning.
fn fetch_files_metadata<P>(group: FileGroup<P>, log: &dyn Log) -> Option<FileGroup<PathAndMetadata>>
where
    P: Into<Path>,
{
    let mapped = group.try_map_all(|p| match PathAndMetadata::new(p.into()) {
        Ok(file) => Ok(file),
        Err(e) => {
            log.warn(&e);
            Err(())
        }
    });
    mapped.ok()
}
/// Splits a group of identical files into the files to keep and the files
/// that can be dropped, honoring the configured priorities and patterns.
///
/// The steps, in order: drop non-regular files, drop files whose length no
/// longer matches the group, bail out if any file changed after
/// `config.modified_before`, group by isolated roots / hard links, sort by
/// the configured priorities, then split on keep/drop patterns and top up
/// the kept set to the required replication factor.
fn partition(
    group: FileGroup<PathAndMetadata>,
    config: &DedupeConfig,
    log: &dyn Log,
) -> Result<PartitionedFileGroup, Error> {
    let file_len = group.file_len;
    let file_hash = group.file_hash.clone();
    let mut files = group.files;
    // Uniform error constructor identifying this group in the message.
    let error = |msg: &str| {
        Err(Error::from(format!(
            "Could not determine files to drop in group with hash {} and len {}: {}",
            file_hash, file_len.0, msg
        )))
    };
    // Only regular files can be deduplicated; warn about and skip the rest.
    files.retain(|m| {
        let is_file = m.metadata.is_file();
        if !is_file {
            log.warn(format!(
                "Skipping file {}: Not a regular file",
                m.path.display()
            ));
        }
        is_file
    });
    // A changed length means the file was modified since grouping; skip it.
    if !config.no_check_size {
        files.retain(|m| {
            let len_ok = m.metadata.len() == file_len;
            if !len_ok {
                log.warn(format!(
                    "Skipping file {} with length {} different than the group length {}",
                    m.path.display(),
                    m.metadata.len(),
                    file_len.0,
                ));
            }
            len_ok
        });
    }
    // Refuse to act on groups touched after the recorded run timestamp.
    if let Some(max_timestamp) = config.modified_before {
        if was_modified(&files, max_timestamp, log) {
            return error("Some files could be updated since the previous run of fclones");
        }
    }
    let mut file_sub_groups =
        FileSubGroup::group(files, &config.isolated_roots, !config.match_links);
    // Priorities are applied in reverse; assuming the sorts are stable, the
    // first configured priority ends up dominant — TODO confirm stability.
    let mut sort_errors = Vec::new();
    for priority in config.priority.iter().rev() {
        sort_errors.extend(sort_by_priority(&mut file_sub_groups, priority));
    }
    if !sort_errors.is_empty() {
        for e in sort_errors {
            log.warn(e);
        }
        return error("Metadata of some files could not be read.");
    }
    // Subgroups matching a keep pattern, or not matching any drop pattern,
    // are retained unconditionally.
    let (mut to_retain, mut to_drop): (Vec<_>, Vec<_>) = file_sub_groups
        .into_iter()
        .partition(|m| m.should_keep(config) || !m.may_drop(config));
    // Keep at least `rf_over` (minimum 1) subgroups: move the highest-priority
    // droppable subgroups over to the retained set if needed.
    let n = max(1, config.rf_over.unwrap_or(1));
    let missing_count = min(to_drop.len(), n.saturating_sub(to_retain.len()));
    to_retain.extend(to_drop.drain(0..missing_count));
    assert!(to_retain.len() >= n || to_drop.is_empty());
    Ok(PartitionedFileGroup {
        to_keep: to_retain.into_iter().flat_map(|g| g.files).collect(),
        to_drop: to_drop.into_iter().flat_map(|g| g.files).collect(),
    })
}
/// Generates a parallel stream of deduplication commands for the given groups.
///
/// Does not modify the file system — the returned commands must be executed
/// (e.g. by `run_script`) to take effect. Each emitted item carries the
/// original group index so callers can restore input order after the
/// parallel processing scrambles it.
pub fn dedupe<'a, I, P>(
    groups: I,
    op: DedupeOp,
    config: &'a DedupeConfig,
    log: &'a dyn Log,
) -> impl ParallelIterator<Item = (usize, Vec<FsCommand>)> + Send + 'a
where
    I: IntoIterator<Item = FileGroup<P>> + 'a,
    I::IntoIter: Send,
    P: Into<Path> + AsRef<Path> + fmt::Debug + Send + 'a,
{
    let devices = DiskDevices::new(&HashMap::new());
    // Hard links and reflinks cannot span devices, so such groups are split
    // per device id below.
    let disallow_cross_device = op == DedupeOp::HardLink || op == DedupeOp::RefLink;
    groups
        .into_iter()
        .enumerate()
        .par_bridge()
        .map(move |(i, group)| {
            let mut commands = Vec::new();
            // Groups with unreadable metadata are skipped (already logged).
            if let Some(group) = fetch_files_metadata(group, log) {
                let groups = if disallow_cross_device {
                    group.partition_by_key(|p| p.metadata.device_id())
                } else {
                    vec![group]
                };
                for group in groups {
                    match partition(group, config, log) {
                        Ok(group) => commands.extend(group.dedupe_script(&op, &devices)),
                        Err(e) => log.warn(e),
                    }
                }
            }
            (i, commands)
        })
}
/// Executes the deduplication script and returns the total number of
/// commands that succeeded together with the space they reclaimed.
/// Failed commands are logged as warnings and excluded from the totals.
pub fn run_script<I>(script: I, should_lock: bool, log: &dyn Log) -> DedupeResult
where
    I: IntoParallelIterator<Item = (usize, Vec<FsCommand>)>,
{
    script
        .into_par_iter()
        .flat_map(|(_, commands)| commands)
        .filter_map(|cmd| match cmd.execute(should_lock, log) {
            Ok(reclaimed) => Some(DedupeResult {
                processed_count: 1,
                reclaimed_space: reclaimed,
            }),
            Err(e) => {
                log.warn(e);
                None
            }
        })
        .reduce(DedupeResult::default, |a, b| a + b)
}
/// A batch of commands tagged with the index of the group it came from.
/// Used by `log_script` to restore the original group order after
/// parallel processing.
struct FsCommandGroup {
    index: usize,
    commands: Vec<FsCommand>,
}
impl FsCommandGroup {
pub fn new(index: usize, commands: Vec<FsCommand>) -> FsCommandGroup {
FsCommandGroup { index, commands }
}
}
// Groups are identified solely by their index; the commands are ignored.
impl PartialEq<Self> for FsCommandGroup {
    fn eq(&self, other: &Self) -> bool {
        self.index.eq(&other.index)
    }
}
// Sound because equality delegates to `usize` equality on `index`.
impl Eq for FsCommandGroup {}
impl Hash for FsCommandGroup {
    /// Hashes only the index, consistent with `PartialEq`.
    fn hash<H: Hasher>(&self, hasher: &mut H) {
        self.index.hash(hasher)
    }
}
/// Writes the shell-script form of the deduplication commands to `out`,
/// preserving the original group order, and returns the would-be totals.
///
/// Commands arrive out of order from the parallel producer; a priority
/// queue keyed by `Reverse(group_index)` buffers them until the next
/// expected index is available, then flushes consecutive groups in order.
pub fn log_script(
    script: impl IntoParallelIterator<Item = (usize, Vec<FsCommand>)> + Send,
    mut out: impl Write + Send,
) -> io::Result<DedupeResult> {
    let (tx, rx) = channel();
    crossbeam_utils::thread::scope(move |s| {
        // Producer thread: forward command groups from the parallel iterator.
        s.spawn(move |_| {
            script
                .into_par_iter()
                .for_each_with(tx, |tx, item| tx.send(item).unwrap())
        });
        let mut queue = PriorityQueue::new();
        let mut next_group_index = 0;
        let mut processed_count = 0;
        let mut reclaimed_space = FileLen(0);
        // Consumer loop: runs until the producer drops the sender.
        while let Ok((group_index, commands)) = rx.recv() {
            // `Reverse` makes the queue pop the smallest index first.
            queue.push(
                FsCommandGroup::new(group_index, commands),
                Reverse(group_index),
            );
            // Flush every buffered group that is next in line.
            while let Some((group, _)) = queue.peek() {
                if group.index != next_group_index {
                    break;
                }
                next_group_index += 1;
                let cmd_vec = queue.pop().unwrap().0.commands;
                for cmd in cmd_vec {
                    processed_count += 1;
                    reclaimed_space += cmd.space_to_reclaim();
                    for line in cmd.to_shell_str() {
                        writeln!(out, "{line}")?;
                    }
                }
            }
        }
        Ok(DedupeResult {
            processed_count,
            reclaimed_space,
        })
    })
    .unwrap()
}
#[cfg(test)]
mod test {
use std::collections::HashSet;
use std::default::Default;
use std::fs::{create_dir, create_dir_all};
use std::path::PathBuf;
use std::str::FromStr;
use std::{thread, time};
use chrono::Duration;
use itertools::Itertools;
use crate::config::GroupConfig;
use crate::file::FileHash;
use crate::group_files;
use crate::log::StdLog;
use crate::pattern::Pattern;
use crate::util::test::{create_file, create_file_newer_than, read_file, with_dir, write_file};
use super::*;
#[test]
fn test_temp_file_name_generation() {
let path = Path::from("/foo/bar");
let temp = FsCommand::temp_file(&path);
assert_ne!(path, temp);
assert_ne!(
path.file_name().unwrap().len(),
temp.file_name().unwrap().len()
);
assert_eq!(path.parent(), temp.parent());
}
#[test]
fn test_remove_command_removes_file() {
with_dir("dedupe/remove_cmd", |root| {
let log = StdLog::new();
let file_path = root.join("file");
create_file(&file_path);
let file = PathAndMetadata::new(Path::from(&file_path)).unwrap();
let cmd = FsCommand::Remove { file };
cmd.execute(true, &log).unwrap();
assert!(!file_path.exists())
})
}
#[test]
fn test_move_command_moves_file_by_rename() {
with_dir("dedupe/move_rename_cmd", |root| {
let log = StdLog::new();
let file_path = root.join("file");
let target = Path::from(root.join("target"));
create_file(&file_path);
let file = PathAndMetadata::new(Path::from(&file_path)).unwrap();
let cmd = FsCommand::Move {
source: file,
target: target.clone(),
use_rename: true,
};
cmd.execute(true, &log).unwrap();
assert!(!file_path.exists());
assert!(target.to_path_buf().exists());
})
}
#[test]
fn test_move_command_moves_file_by_copy() {
with_dir("dedupe/move_copy_cmd", |root| {
let log = StdLog::new();
let file_path = root.join("file");
let target = Path::from(root.join("target"));
create_file(&file_path);
let file = PathAndMetadata::new(Path::from(&file_path)).unwrap();
let cmd = FsCommand::Move {
source: file,
target: target.clone(),
use_rename: false,
};
cmd.execute(true, &log).unwrap();
assert!(!file_path.exists());
assert!(target.to_path_buf().exists());
})
}
#[test]
fn test_move_fails_if_target_exists() {
with_dir("dedupe/move_target_exists", |root| {
let log = StdLog::new();
let file_path = root.join("file");
let target = root.join("target");
create_file(&file_path);
create_file(&target);
let file = PathAndMetadata::new(Path::from(&file_path)).unwrap();
let cmd = FsCommand::Move {
source: file,
target: Path::from(&target),
use_rename: false,
};
assert!(cmd.execute(true, &log).is_err());
})
}
#[test]
fn test_soft_link_command_replaces_file_with_a_link() {
with_dir("dedupe/soft_link_cmd", |root| {
let log = StdLog::new();
let file_path_1 = root.join("file_1");
let file_path_2 = root.join("file_2");
write_file(&file_path_1, "foo");
write_file(&file_path_2, "");
let file_1 = PathAndMetadata::new(Path::from(&file_path_1)).unwrap();
let file_2 = PathAndMetadata::new(Path::from(&file_path_2)).unwrap();
let cmd = FsCommand::SoftLink {
target: Arc::new(file_1),
link: file_2,
};
cmd.execute(true, &log).unwrap();
assert!(file_path_1.exists());
assert!(file_path_2.exists());
assert!(fs::symlink_metadata(&file_path_2)
.unwrap()
.file_type()
.is_symlink());
assert_eq!(read_file(&file_path_2), "foo");
})
}
#[test]
fn test_hard_link_command_replaces_file_with_a_link() {
with_dir("dedupe/hard_link_cmd", |root| {
let log = StdLog::new();
let file_path_1 = root.join("file_1");
let file_path_2 = root.join("file_2");
write_file(&file_path_1, "foo");
write_file(&file_path_2, "");
let file_1 = PathAndMetadata::new(Path::from(&file_path_1)).unwrap();
let file_2 = PathAndMetadata::new(Path::from(&file_path_2)).unwrap();
let cmd = FsCommand::HardLink {
target: Arc::new(file_1),
link: file_2,
};
cmd.execute(true, &log).unwrap();
assert!(file_path_1.exists());
assert!(file_path_2.exists());
assert_eq!(read_file(&file_path_2), "foo");
})
}
fn make_group(root: &PathBuf, file_hash: FileHash) -> FileGroup<Path> {
create_dir_all(root).unwrap();
let file_1 = root.join("file_1");
let file_2 = root.join("file_2");
let file_3 = root.join("file_3");
create_file(&file_1);
let ctime_1 = fs::metadata(&file_1).unwrap().modified().unwrap();
let ctime_2 = create_file_newer_than(&file_2, ctime_1);
create_file_newer_than(&file_3, ctime_2);
FileGroup {
file_len: FileLen(0),
file_hash,
files: vec![
Path::from(&file_1),
Path::from(&file_2),
Path::from(&file_3),
],
}
}
#[test]
fn test_partition_selects_files_for_removal() {
with_dir("dedupe/partition/basic", |root| {
let group = make_group(root, FileHash::from_str("00").unwrap());
let group = group.map(|p| PathAndMetadata::new(p).unwrap());
let config = DedupeConfig::default();
let partitioned = partition(group, &config, &StdLog::new()).unwrap();
assert_eq!(partitioned.to_keep.len(), 1);
assert_eq!(partitioned.to_drop.len(), 2);
})
}
#[test]
fn test_partition_bails_out_if_file_modified_too_late() {
with_dir("dedupe/partition/modification", |root| {
let group = make_group(root, FileHash::from_str("00").unwrap());
let group = group.map(|p| PathAndMetadata::new(p).unwrap());
let config = DedupeConfig {
modified_before: Some(DateTime::from(Local::now() - Duration::days(1))),
..DedupeConfig::default()
};
let partitioned = partition(group, &config, &StdLog::new());
assert!(partitioned.is_err());
})
}
#[test]
fn test_partition_skips_file_with_different_len() {
with_dir("dedupe/partition/file_len", |root| {
let group = make_group(root, FileHash::from_str("00").unwrap());
let path = group.files[0].clone();
write_file(&path.to_path_buf(), "foo");
let config = DedupeConfig {
priority: vec![Priority::MostRecentlyModified],
..DedupeConfig::default()
};
let group = group.map(|p| PathAndMetadata::new(p).unwrap());
let partitioned = partition(group, &config, &StdLog::new()).unwrap();
assert!(!partitioned.to_drop.iter().any(|m| m.path == path));
assert!(!partitioned.to_keep.iter().any(|m| m.path == path));
})
}
fn path_set(v: &[PathAndMetadata]) -> HashSet<&Path> {
v.iter().map(|f| &f.path).collect()
}
#[test]
fn test_partition_respects_creation_time_priority() {
with_dir("dedupe/partition/ctime_priority", |root| {
if fs::metadata(root).unwrap().created().is_err() {
return;
}
let group = make_group(root, FileHash::from_str("00").unwrap());
let mut config = DedupeConfig {
priority: vec![Priority::Newest],
..DedupeConfig::default()
};
let group = group.map(|p| PathAndMetadata::new(p).unwrap());
let partitioned_1 = partition(group.clone(), &config, &StdLog::new()).unwrap();
config.priority = vec![Priority::Oldest];
let partitioned_2 = partition(group, &config, &StdLog::new()).unwrap();
assert_ne!(
path_set(&partitioned_1.to_keep),
path_set(&partitioned_2.to_keep)
);
assert_ne!(
path_set(&partitioned_1.to_drop),
path_set(&partitioned_2.to_drop)
);
});
}
#[test]
fn test_partition_respects_modification_time_priority() {
with_dir("dedupe/partition/mtime_priority", |root| {
let group = make_group(root, FileHash::from_str("00").unwrap());
thread::sleep(time::Duration::from_millis(10));
let path = group.files[0].clone();
write_file(&path.to_path_buf(), "foo");
let group = group.map(|p| PathAndMetadata::new(p).unwrap());
let config = DedupeConfig {
priority: vec![Priority::MostRecentlyModified],
..DedupeConfig::default()
};
let partitioned_1 = partition(group.clone(), &config, &StdLog::new()).unwrap();
let config = DedupeConfig {
priority: vec![Priority::LeastRecentlyModified],
..DedupeConfig::default()
};
let partitioned_2 = partition(group, &config, &StdLog::new()).unwrap();
assert_ne!(
path_set(&partitioned_1.to_keep),
path_set(&partitioned_2.to_keep)
);
assert_ne!(
path_set(&partitioned_1.to_drop),
path_set(&partitioned_2.to_drop)
);
});
}
#[test]
fn test_partition_respects_keep_patterns() {
with_dir("dedupe/partition/keep", |root| {
let group = make_group(root, FileHash::from_str("00").unwrap());
let group = group.map(|p| PathAndMetadata::new(p).unwrap());
let mut config = DedupeConfig {
priority: vec![Priority::LeastRecentlyModified],
keep_name_patterns: vec![Pattern::glob("*_1").unwrap()],
..DedupeConfig::default()
};
let p = partition(group.clone(), &config, &StdLog::new()).unwrap();
assert_eq!(p.to_keep.len(), 1);
assert_eq!(&p.to_keep[0].path, &group.files[0].path);
config.keep_name_patterns = vec![];
config.keep_path_patterns = vec![Pattern::glob("**/file_1").unwrap()];
let p = partition(group.clone(), &config, &StdLog::new()).unwrap();
assert_eq!(p.to_keep.len(), 1);
assert_eq!(&p.to_keep[0].path, &group.files[0].path);
})
}
#[test]
fn test_partition_respects_drop_patterns() {
with_dir("dedupe/partition/drop", |root| {
let group = make_group(root, FileHash::from_str("00").unwrap());
let group = group.map(|p| PathAndMetadata::new(p).unwrap());
let mut config = DedupeConfig {
priority: vec![Priority::LeastRecentlyModified],
name_patterns: vec![Pattern::glob("*_3").unwrap()],
..DedupeConfig::default()
};
let p = partition(group.clone(), &config, &StdLog::new()).unwrap();
assert_eq!(p.to_drop.len(), 1);
assert_eq!(&p.to_drop[0].path, &group.files[2].path);
config.name_patterns = vec![];
config.path_patterns = vec![Pattern::glob("**/file_3").unwrap()];
let p = partition(group.clone(), &config, &StdLog::new()).unwrap();
assert_eq!(p.to_drop.len(), 1);
assert_eq!(&p.to_drop[0].path, &group.files[2].path);
})
}
#[test]
fn test_partition_respects_isolated_roots() {
with_dir("dedupe/partition/isolated_roots", |root| {
let root1 = root.join("root1");
let root2 = root.join("root2");
create_dir(&root1).unwrap();
create_dir(&root2).unwrap();
let group1 = make_group(&root1, FileHash::from_str("00").unwrap());
let group2 = make_group(&root2, FileHash::from_str("00").unwrap());
let group = FileGroup {
file_len: group1.file_len,
file_hash: group1.file_hash,
files: group1.files.into_iter().chain(group2.files).collect(),
};
let config = DedupeConfig {
isolated_roots: vec![Path::from(&root1), Path::from(&root2)],
..DedupeConfig::default()
};
let group = group.map(|p| PathAndMetadata::new(p).unwrap());
let p = partition(group, &config, &StdLog::new()).unwrap();
assert_eq!(p.to_drop.len(), 3);
assert!(p
.to_drop
.iter()
.all(|f| f.path.to_path_buf().starts_with(&root2)));
assert_eq!(p.to_keep.len(), 3);
assert!(p
.to_keep
.iter()
.all(|f| f.path.to_path_buf().starts_with(&root1)));
})
}
#[test]
fn test_partition_respects_links() {
    with_dir("dedupe/partition/links", |root| {
        let dir_a = root.join("root_a");
        let dir_b = root.join("root_b");
        create_dir(&dir_a).unwrap();
        create_dir(&dir_b).unwrap();
        // Two hard-linked pairs with identical contents, one pair per directory.
        let file_a1 = dir_a.join("file_a1");
        let file_a2 = dir_a.join("file_a2");
        write_file(&file_a1, "aaa");
        fs::hard_link(&file_a1, &file_a2).unwrap();
        let file_b1 = dir_b.join("file_b1");
        let file_b2 = dir_b.join("file_b2");
        write_file(&file_b1, "aaa");
        fs::hard_link(&file_b1, &file_b2).unwrap();
        // Paths are deliberately interleaved across the two pairs.
        let group = FileGroup {
            file_len: FileLen(3),
            file_hash: FileHash::from_str("00").unwrap(),
            files: vec![
                Path::from(&file_b1),
                Path::from(&file_a2),
                Path::from(&file_a1),
                Path::from(&file_b2),
            ],
        };
        let group = group.map(|p| PathAndMetadata::new(p).unwrap());
        let partitioned = partition(group, &DedupeConfig::default(), &StdLog::new()).unwrap();
        // Both members of a hard-link pair end up on the same side of the split.
        assert_eq!(partitioned.to_drop.len(), 2);
        for f in partitioned.to_drop.iter() {
            assert!(f.path.to_path_buf().starts_with(&dir_a));
        }
        assert_eq!(partitioned.to_keep.len(), 2);
        for f in partitioned.to_keep.iter() {
            assert!(f.path.to_path_buf().starts_with(&dir_b));
        }
    })
}
#[test]
fn test_run_dedupe_script() {
    with_dir("dedupe/partition/run_dedupe_script", |root| {
        // Quiet logger that redirects stderr output to stdout for test capture.
        let log = {
            let mut log = StdLog::new();
            log.no_progress = true;
            log.log_stderr_to_stdout = true;
            log
        };
        let group = make_group(root, FileHash::from_str("00").unwrap());
        let config = DedupeConfig {
            priority: vec![Priority::LeastRecentlyModified],
            ..DedupeConfig::default()
        };
        let script = dedupe(vec![group], DedupeOp::Remove, &config, &log);
        let result = run_script(script, !config.no_lock, &log);
        // Two of the three duplicates are removed; file_3 remains on disk.
        assert_eq!(result.processed_count, 2);
        assert!(!root.join("file_1").exists());
        assert!(!root.join("file_2").exists());
        assert!(root.join("file_3").exists());
    });
}
#[test]
fn test_log_dedupe_script() {
    with_dir("dedupe/partition/log_dedupe_script", |root| {
        // Quiet logger that redirects stderr output to stdout for test capture.
        let log = {
            let mut log = StdLog::new();
            log.no_progress = true;
            log.log_stderr_to_stdout = true;
            log
        };
        // Three groups with distinct hashes, one directory per group.
        let groups = vec![
            make_group(&root.join("group_1"), FileHash::from_str("00").unwrap()),
            make_group(&root.join("group_2"), FileHash::from_str("01").unwrap()),
            make_group(&root.join("group_3"), FileHash::from_str("02").unwrap()),
        ];
        let config = DedupeConfig {
            priority: vec![Priority::LeastRecentlyModified],
            ..DedupeConfig::default()
        };
        let script = dedupe(groups, DedupeOp::Remove, &config, &log);
        // Log the script into a buffer instead of executing it.
        let mut out = Vec::new();
        let result = log_script(script, &mut out).unwrap();
        assert_eq!(result.processed_count, 6);
        let out = String::from_utf8(out).unwrap();
        let lines = out.lines().collect_vec();
        assert_eq!(lines.len(), 6);
        // Two consecutive output lines per group, emitted in group order:
        // lines 0-1 mention group_1, 2-3 group_2, 4-5 group_3.
        for (i, line) in lines.iter().enumerate() {
            assert!(line.contains(&format!("group_{}", i / 2 + 1)));
        }
    });
}
#[test]
fn test_hard_link_merges_subgroups_of_hard_links() {
    with_dir("dedupe/merge_subgroups_of_hardlinks", |root| {
        // Quiet logger that redirects stderr output to stdout for test capture.
        let log = {
            let mut log = StdLog::new();
            log.no_progress = true;
            log.log_stderr_to_stdout = true;
            log
        };
        // Two hard-link pairs holding equal contents but distinct file ids.
        let file_a1 = root.join("file_a1");
        let file_a2 = root.join("file_a2");
        let file_b1 = root.join("file_b1");
        let file_b2 = root.join("file_b2");
        write_file(&file_a1, "foo");
        write_file(&file_b1, "foo");
        let expected_id = FileId::new(&Path::from(&file_a1)).unwrap();
        fs::hard_link(&file_a1, &file_a2).unwrap();
        fs::hard_link(&file_b1, &file_b2).unwrap();
        let group_config = GroupConfig {
            paths: vec![Path::from(root)],
            ..GroupConfig::default()
        };
        let groups = group_files(&group_config, &log).unwrap();
        let script = dedupe(groups, DedupeOp::HardLink, &DedupeConfig::default(), &log);
        let result = run_script(script, false, &log);
        assert_eq!(result.processed_count, 2);
        // All four paths remain present...
        for f in [&file_a1, &file_a2, &file_b1, &file_b2] {
            assert!(f.exists());
        }
        assert_eq!(read_file(&file_a1), "foo");
        // ...and now all report the same file id as file_a1.
        for f in [&file_a2, &file_b1, &file_b2] {
            assert_eq!(FileId::new(&Path::from(f)).unwrap(), expected_id);
        }
    })
}
#[test]
#[cfg(unix)]
fn test_remove_removes_subgroups_of_soft_links() {
    // Shadows the top-level `fs` import to get access to `symlink`.
    use std::os::unix::fs;
    with_dir("dedupe/remove_subgroups_with_symlinks", |root| {
        // Quiet logger that redirects stderr output to stdout for test capture.
        let log = {
            let mut log = StdLog::new();
            log.no_progress = true;
            log.log_stderr_to_stdout = true;
            log
        };
        let file_a1 = root.join("file_a1");
        let file_a2 = root.join("file_a2");
        let file_b1 = root.join("file_b1");
        let file_b2 = root.join("file_b2");
        write_file(&file_a1, "foo");
        write_file(&file_b1, "foo");
        // Each regular file gets a symlink pointing at it.
        fs::symlink(&file_a1, &file_a2).unwrap();
        fs::symlink(&file_b1, &file_b2).unwrap();
        let group_config = GroupConfig {
            paths: vec![Path::from(root)],
            symbolic_links: true,
            ..GroupConfig::default()
        };
        let groups = group_files(&group_config, &log).unwrap();
        let script = dedupe(groups, DedupeOp::Remove, &DedupeConfig::default(), &log);
        let result = run_script(script, false, &log);
        assert_eq!(result.processed_count, 2);
        // The `a` file and its symlink survive...
        assert!(file_a1.exists());
        assert!(file_a2.exists());
        // ...while the `b` file and its symlink are both removed.
        assert!(!file_b1.exists());
        assert!(!file_b2.exists());
        assert_eq!(read_file(&file_a1), "foo");
        assert_eq!(read_file(&file_a2), "foo");
    })
}
}