use std::collections::HashMap;
use std::error::Error;
use std::fs;
use std::fs::File;
use std::io::{BufReader, Read, Write};
use std::path::PathBuf;
use clap::Parser;
use data_encoding::HEXUPPER;
use indicatif::ParallelProgressIterator;
use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
use ring::digest::{Context, Digest, SHA256};
use walkdir::WalkDir;
use wax::{Glob, Pattern};
// Threshold used by `dedupe`: once this many files of one duplicate group have
// been linked, it stops using the group's first file as the link source.
// NOTE(review): presumably this guards against per-file hard-link limits of the
// underlying filesystem — confirm.
const MAX_LINKS: usize = 1000;
/// Computes the SHA-256 digest of everything `reader` yields until end of stream.
///
/// The input is consumed in fixed-size chunks, so memory use does not grow with
/// the amount of data read.
///
/// # Errors
///
/// Returns the first I/O error reported by `reader`. Reads that fail with
/// `ErrorKind::Interrupted` are retried instead of being reported.
pub fn sha256_digest<R: Read>(mut reader: R) -> Result<Digest, std::io::Error> {
    let mut context = Context::new(&SHA256);
    let mut buffer = [0u8; 1024];
    loop {
        let count = match reader.read(&mut buffer) {
            // A zero-length read signals end of stream.
            Ok(0) => break,
            Ok(count) => count,
            // An interrupted read is transient; the `Read` contract says to retry.
            Err(err) if err.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(err) => return Err(err),
        };
        context.update(&buffer[..count]);
    }
    Ok(context.finish())
}
/// A list of file paths.
type PathVec = Vec<PathBuf>;
/// Maps a content hash (stored as a string) to every path whose data produced that hash.
type DedupeMap = HashMap<String, PathVec>;
fn hash_source_files(src_dir: &PathBuf, include: &Option<Vec<String>>) -> Result<(DedupeMap, PathVec), Box<dyn Error>> {
let include_glob: Option<wax::Any> = match include {
Some(include) => {
let patterns: Vec<&str> = include.iter().map(|s| &s[..]).collect();
Some(wax::any::<Glob, _>(patterns).unwrap())
}
_ => None
};
let files = WalkDir::new(src_dir)
.into_iter()
.filter_map(Result::ok)
.filter(|e| !e.file_type().is_dir())
.map(|entry| entry.path().to_path_buf())
.collect::<Vec<_>>();
let mut ignore = vec![false; files.len()];
if let Some(glob) = include_glob {
ignore = files.par_iter().progress_count(files.len() as u64)
.map(|f_path| !glob.is_match(f_path.to_str().unwrap()))
.collect();
}
let (ignore_list, files): (Vec<_>, Vec<_>) = files.iter().enumerate()
.partition(|(i, _)| ignore[*i]);
let ignore_list: PathVec = ignore_list.into_iter().map(|(_, f)| f.to_owned()).collect();
let files: PathVec = files.into_iter().map(|(_, f)| f.to_owned()).collect();
let file_hashes: Vec<String> = files.par_iter().progress_count(files.len() as u64)
.map(|f_path| {
let reader = BufReader::new(File::open(f_path).unwrap());
let digest = sha256_digest(reader).unwrap();
HEXUPPER.encode(digest.as_ref())
}).collect();
let mut hashes: DedupeMap = HashMap::new();
files.into_iter().zip(file_hashes.into_iter())
.for_each(|(f_path, hash)| {
let entry = hashes.entry(hash).or_insert(Vec::new());
entry.push(f_path);
});
Ok((hashes, ignore_list))
}
/// Computes where `path` (located under `src_dir`) belongs under `dst_dir` and
/// makes sure the parent directory of that location exists.
fn prepare_link_target(
    path: &std::path::Path,
    src_dir: &std::path::Path,
    dst_dir: &std::path::Path,
) -> Result<PathBuf, Box<dyn Error>> {
    // Work on path components rather than on strings: a textual replace would
    // also rewrite later occurrences of the source string inside the path and
    // mishandles a trailing separator on `src_dir`.
    let relative = path.strip_prefix(src_dir)?;
    let ln_target = if relative.as_os_str().is_empty() {
        dst_dir.to_path_buf()
    } else {
        dst_dir.join(relative)
    };
    if let Some(parent_dir) = ln_target.parent() {
        fs::create_dir_all(parent_dir)?;
    }
    Ok(ln_target)
}

/// Creates a hard-linked mirror of `src_dir` inside `dst_dir` in which files with
/// identical content share the same underlying file.
///
/// Files selected by `include` (all files when it is `None`) are grouped by
/// content hash and every member of a group is linked to one representative of
/// that group. Files excluded by `include` are linked one-to-one. The source
/// directory itself is only read and linked from.
///
/// # Errors
///
/// Fails when indexing the source fails, when a path does not lie under
/// `src_dir`, or when a directory or hard link cannot be created.
pub fn dedupe(src_dir: PathBuf, dst_dir: PathBuf, include: Option<Vec<String>>) -> Result<(), Box<dyn Error>> {
    println!("Indexing files in {:?}", src_dir);
    let (hashes, ignore_list) = hash_source_files(&src_dir, &include)?;
    println!("De-duping files with same data");
    for (hash, paths) in hashes {
        for (i, path) in paths.iter().enumerate() {
            // Each run of MAX_LINKS duplicates shares one link source, so a single
            // file never receives more than MAX_LINKS new links. `i / MAX_LINKS`
            // is always <= i, hence a valid index into `paths`.
            let ln_source = paths[i / MAX_LINKS].as_path();
            let ln_target = prepare_link_target(path, &src_dir, &dst_dir)?;
            println!("{}: {:?} -> {:?}", hash, ln_source, ln_target);
            fs::hard_link(ln_source, &ln_target)?;
        }
    }
    // Files that were not selected for de-duplication are still mirrored.
    for path in &ignore_list {
        let ln_source = path.as_path();
        let ln_target = prepare_link_target(path, &src_dir, &dst_dir)?;
        println!("{:?} -> {:?}", ln_source, ln_target);
        fs::hard_link(ln_source, &ln_target)?;
    }
    Ok(())
}
// Command-line arguments, parsed through clap's derive API.
//
// Plain `//` comments are used here on purpose: clap's derive macros pick up
// `///` doc comments as help text, which would change the program's output.
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
pub struct Args {
    // First positional argument; checked by `Args::validate_src_dir`.
    #[arg(
        value_name = "SRC",
        help = "The source directory to de-dupe",
        value_parser = Args::validate_src_dir
    )]
    pub source: PathBuf,
    // Second positional argument; checked by `Args::validate_dst_dir`.
    #[arg(
        value_name = "DST",
        help = "The destination directory where the de-duped copy will be located",
        value_parser = Args::validate_dst_dir,
    )]
    pub destination: PathBuf,
    // When set, `Args::confirm_action` returns without prompting.
    #[arg(
        short = 'y',
        long = "yes",
        help = "Don't ask for confirmation before creating the de-duped directory"
    )]
    pub yes: bool,
    // Glob patterns; `None` when the option is not given.
    #[arg(
        short = 'i',
        long = "include",
        help = "Optional glob patterns for files to include in the deduplication"
    )]
    pub include: Option<Vec<String>>,
}
impl Args {
    /// Value parser for the source argument: accepts only an existing directory.
    ///
    /// Returns the path on success, or a message describing why it was rejected.
    fn validate_src_dir(src_dir: &str) -> Result<PathBuf, String> {
        let src_dir = PathBuf::from(src_dir);
        if !src_dir.exists() {
            Err(format!("Source directory {:?} doesn't exist", src_dir))
        } else if !src_dir.is_dir() {
            Err(format!("Source directory {:?} is not a directory", src_dir))
        } else {
            Ok(src_dir)
        }
    }

    /// Value parser for the destination argument.
    ///
    /// Accepts an existing empty directory, or a not-yet-existing path whose
    /// parent directory exists; in the latter case the directory is created.
    /// Every failure is reported as an error message instead of a panic.
    fn validate_dst_dir(dst_dir: &str) -> Result<PathBuf, String> {
        let dst_dir = PathBuf::from(dst_dir);
        if dst_dir.exists() {
            if !dst_dir.is_dir() {
                return Err(format!("Destination {:?} is not a directory.", dst_dir));
            }
            let mut entries = dst_dir
                .read_dir()
                .map_err(|err| format!("Destination {:?} cannot be read: {}", dst_dir, err))?;
            if entries.next().is_some() {
                return Err(format!("Destination {:?} is not empty.", dst_dir));
            }
            return Ok(dst_dir);
        }
        // For a bare relative name such as `out`, `parent()` is the empty path,
        // which stands for the current directory; `exists()` on an empty path is
        // always false, so it has to be mapped to `.` first. `parent()` is `None`
        // for an empty path, which has no usable parent at all.
        let parent_exists = dst_dir.parent().map_or(false, |parent| {
            if parent.as_os_str().is_empty() {
                std::path::Path::new(".").exists()
            } else {
                parent.exists()
            }
        });
        if !parent_exists {
            return Err(format!(
                "Destination {:?} must have an existing parent directory.",
                dst_dir
            ));
        }
        fs::create_dir_all(&dst_dir)
            .map_err(|err| format!("Destination {:?} cannot be created: {}", dst_dir, err))?;
        Ok(dst_dir)
    }

    /// Asks the user to confirm the operation unless `yes` is set.
    ///
    /// Returns `Ok(())` when the user answers with something starting with `y`/`Y`.
    /// An answer starting with `n`/`N`, an empty line, or end of input (the
    /// default shown in the prompt is "no") prints `Aborting` and exits the
    /// process with status 0. Anything else asks again.
    ///
    /// # Errors
    ///
    /// Fails when stdout cannot be flushed or stdin cannot be read.
    pub fn confirm_action(self: &Args) -> Result<(), Box<dyn Error>> {
        if self.yes {
            return Ok(());
        }
        println!(
            "This will create a hard-linked copy of {:?} in {:?}",
            self.source, self.destination
        );
        println!("All files with the same data will be hard-linked to the same file.");
        println!("This will not modify the source directory.");
        println!();
        print!("Continue? [y/N] ");
        let mut input = String::new();
        loop {
            std::io::stdout().flush()?;
            input.clear();
            // At end of input nothing is appended, so `input` stays empty and is
            // treated like an empty answer below instead of as a confirmation.
            std::io::stdin().read_line(&mut input)?;
            match input.trim().chars().next() {
                Some('y') | Some('Y') => return Ok(()),
                Some('n') | Some('N') | None => {
                    println!("Aborting");
                    std::process::exit(0);
                }
                Some(_) => println!("Please enter y or n"),
            }
        }
    }
}