#![allow(warnings)]
use std::collections::HashMap;
use std::env;
use std::error::Error;
use std::fs::{self, File, metadata};
use std::io::{self, Read};
use std::io::BufReader;
#[cfg(unix)]
use std::os::unix::fs::PermissionsExt;
use std::path::PathBuf;
use std::sync::Arc;
use std::sync::mpsc::{channel, Receiver, Sender};
use blake2::{Blake2b, Digest};
use blake2::digest::FixedOutput;
use dashmap::DashMap;
use env_logger::*;
use generic_array::{GenericArray, typenum::U64};
use human_bytes::human_bytes;
use ignore::WalkBuilder;
use log::*;
use rayon::prelude::*;
use term_size::dimensions;
/// A `(hash, path)` pair: a file's hex-encoded digest together with the file it was computed from.
type ChannelData = (String, PathBuf);
/// Result of the size-grouping pass: `(files keyed by size in bytes, total file count, total size in bytes)`.
type GroupingRetVal = (HashMap<u64, Vec<PathBuf>>, u128, u128);
/// Size in bytes of the read buffer used by the hashing functions; the partial hash reads into one
/// buffer of this size.
pub const BUFFER_SIZE: usize = 512;
/// Type-level length of the digest array produced by the hasher (`typenum::U64`).
type OutSize = U64;
/// Scans `path` for duplicate files, reports them, and optionally deletes the extra copies.
///
/// The log level for the `fdups` module is taken from the `FDUPS_LOG` environment variable and
/// defaults to `error` when the variable is unset or unreadable.
///
/// The work is done in four stages: group files by size, keep only sizes shared by more than one
/// file, confirm duplicates by hashing, then print (and, when `delete` is `true`, remove) them.
///
/// # Returns
///
/// `(total_files, duplicate_count, total_file_sizes_in_bytes, size_saved)`.
///
/// # Errors
///
/// Returns an error when `FDUPS_LOG` holds an invalid level name or when any stage fails.
pub fn find_remove_duplicates(
    path: &str,
    delete: bool,
) -> Result<(u128, u128, u128, u128), Box<dyn Error>> {
    let filter_level = env::var("FDUPS_LOG").unwrap_or_else(|_| "error".to_owned());
    let mut builder = Builder::new();
    builder.filter_module("fdups", filter_level.parse::<LevelFilter>()?);
    // `init()` panics when a global logger is already installed (for example on a second call to
    // this function within the same process). An existing logger is fine, so the error is ignored.
    let _ = builder.try_init();

    debug!("Grouping files by size");
    let (files_by_size, total_files, total_file_sizes_in_bytes) =
        group_files_by_size(path, true, true, true, true)?;

    debug!("Finding potential duplicates");
    let potential_duplicates = find_potential_duplicates(files_by_size);

    debug!("Finding duplicates");
    let duplicates = find_duplicates_with_hasher_cache(potential_duplicates)?;

    debug!("Processing duplicates");
    let (duplicate_count, size_saved) = process_duplicates(duplicates, delete)?;

    // An empty tree has a zero denominator; report 0% instead of NaN.
    let percent = |part: u128, whole: u128| -> f64 {
        if whole == 0 {
            0.0
        } else {
            (part as f64 / whole as f64) * 100.0
        }
    };

    info!(
        "Duplicate Ratio: {}/{} ({:.2}%)",
        duplicate_count,
        total_files,
        percent(duplicate_count, total_files)
    );
    info!(
        "Size Reduction: {}/{} ({:.2}%)",
        human_bytes(size_saved as f64),
        human_bytes(total_file_sizes_in_bytes as f64),
        percent(size_saved, total_file_sizes_in_bytes)
    );

    Ok((
        total_files,
        duplicate_count,
        total_file_sizes_in_bytes,
        size_saved,
    ))
}
pub fn find_duplicates_with_hasher_cache(
potential_duplicates: Vec<PathBuf>,
) -> Result<HashMap<String, Vec<PathBuf>>, Box<dyn Error>> {
let mut duplicates: HashMap<String, Vec<PathBuf>> = HashMap::new(); let hasher_cache: Arc<DashMap<PathBuf, String>> = Arc::new(DashMap::new());
let (tx, rx): (Sender<ChannelData>, Receiver<ChannelData>) = channel();
potential_duplicates
.into_par_iter()
.for_each_with(tx.clone(), |tx, file_path| {
if let Some(cached_hash) = hasher_cache.get(&file_path) {
tx.send((cached_hash.clone(), file_path)).unwrap();
} else {
let hash_result = partial_hash_file(&file_path);
if let Ok(hash) = hash_result {
hasher_cache.insert(file_path.clone(), hash.clone());
tx.send((hash, file_path)).unwrap();
}
}
});
drop(tx);
for (hash, file_path) in rx {
let file_list = duplicates.entry(hash).or_insert_with(Vec::new); if !file_list.is_empty() {
let full_hash1 = full_hash_file(&file_path)?;
let mut is_duplicate = false;
for dup_path in file_list.iter() {
let full_hash2 = full_hash_file(dup_path)?;
if full_hash1 == full_hash2 {
is_duplicate = true;
break;
}
}
if is_duplicate {
file_list.push(file_path); }
} else {
file_list.push(file_path); }
}
Ok(duplicates)
}
/// Hashes at most the first [`BUFFER_SIZE`] bytes of the file and returns the digest as lowercase
/// hex.
///
/// A single `read` call is allowed to return fewer bytes than requested even when more data is
/// available, which would make the prefix hash of identical files differ. Reading through
/// `take(..).read_to_end(..)` keeps reading until the limit or end of file is reached.
///
/// # Errors
///
/// Returns the underlying I/O error when the file cannot be opened or read.
pub fn partial_hash_file(file_path: &PathBuf) -> io::Result<String> {
    let mut prefix = Vec::with_capacity(BUFFER_SIZE);
    File::open(file_path)?
        .take(BUFFER_SIZE as u64)
        .read_to_end(&mut prefix)?;
    let mut hasher = Blake2b::new();
    hasher.update(prefix.as_slice());
    let hash: GenericArray<u8, OutSize> = hasher.finalize_fixed();
    Ok(format!("{hash:x}"))
}
/// Walks `path` recursively and groups every regular file by its size in bytes.
///
/// Each `include_*` flag is passed negated to the corresponding `WalkBuilder` filter, so `true`
/// switches that filter off:
///
/// * `include_hidden` - `hidden` filter
/// * `include_git_ignore` - `git_ignore` filter
/// * `include_git_ignore_global` - `git_global` filter
/// * `include_git_exclude` - `git_exclude` filter
///
/// Entries the walker fails to yield are skipped silently; files whose metadata cannot be read
/// are skipped with a debug log.
///
/// # Returns
///
/// `(files keyed by size, number of files grouped, sum of their sizes in bytes)`. Only regular
/// files that were actually grouped are counted; directories and skipped files are not.
///
/// # Errors
///
/// Currently never returns `Err`; the `Result` is kept for interface stability.
#[inline]
pub fn group_files_by_size(
    path: &str,
    include_hidden: bool,
    include_git_ignore: bool,
    include_git_ignore_global: bool,
    include_git_exclude: bool,
) -> Result<GroupingRetVal, Box<dyn Error>> {
    let mut group_files_by_size: HashMap<u64, Vec<PathBuf>> = HashMap::new();
    let mut total_size_in_bytes: u128 = 0;
    let mut total_files: u128 = 0;

    let walker = WalkBuilder::new(path)
        .hidden(!include_hidden)
        .git_ignore(!include_git_ignore)
        .git_global(!include_git_ignore_global)
        .git_exclude(!include_git_exclude)
        .build();

    for entry in walker.filter_map(Result::ok) {
        // Directories, symlinks and entries without a file type are not candidates.
        let is_regular_file = entry.file_type().map_or(false, |kind| kind.is_file());
        if !is_regular_file {
            continue;
        }

        let file_metadata = match entry.metadata() {
            Ok(meta) => meta,
            Err(err) => {
                debug!("Skipping file {} with err: {}", entry.path().display(), err);
                continue;
            }
        };

        let size = file_metadata.len();
        // Count only here so the total equals the number of paths stored in the map.
        total_files += 1;
        total_size_in_bytes += u128::from(size);
        group_files_by_size
            .entry(size)
            .or_default()
            .push(entry.path().to_owned());
    }

    Ok((group_files_by_size, total_files, total_size_in_bytes))
}
/// Flattens the size groups into one list, keeping only files whose size is shared with at least
/// one other file.
///
/// Groups holding zero or one path are discarded, since a file with a unique size cannot have a
/// duplicate. The order of the returned paths follows the map's iteration order and is therefore
/// unspecified.
#[must_use]
pub fn find_potential_duplicates(files_by_size: HashMap<u64, Vec<PathBuf>>) -> Vec<PathBuf> {
    files_by_size
        .into_values()
        .filter(|same_size| same_size.len() > 1)
        .flatten()
        .collect()
}
/// Prints one report line for a path, tagged by its role in a duplicate group.
///
/// * `print_filepath!(path, original)` - prints a row of `=` as wide as the terminal (80 columns
///   when the width is unavailable), then the path tagged `[Original]`.
/// * `print_filepath!(path, duplicate)` - prints the path tagged `[Duplicate]`.
/// * `print_filepath!(path, deleted)` - prints the path tagged `[Deleted]`.
///
/// `path` must have a `display()` method. The `original` arm calls `dimensions()` unqualified, so
/// that function has to be in scope wherever the macro is invoked.
#[macro_export]
macro_rules! print_filepath {
    ( $arg:expr,original ) => {{
        let separator_len = match dimensions() {
            Some((columns, _rows)) => columns,
            None => 80,
        };
        println!("{}", "=".repeat(separator_len));
        println!("\t- {} [Original]", $arg.display());
    }};
    ( $arg:expr,duplicate ) => {{
        println!("\t- {} [Duplicate]", $arg.display());
    }};
    ( $arg:expr,deleted ) => {{
        println!("\t- {} [Deleted]", $arg.display());
    }};
}
/// Reports every duplicate group and, when `delete` is `true`, removes all but the oldest file.
///
/// Groups with fewer than two files are ignored. Within a group the files are ordered by creation
/// time, falling back to modification time where creation time is unavailable; the first file is
/// printed as the original and every other file as a duplicate (or as deleted).
///
/// Each file's metadata is read exactly once, before sorting, so an unreadable file produces an
/// `Err` instead of a panic inside the sort comparator.
///
/// # Returns
///
/// `(duplicate_count, size_saved)`: the number of redundant files and their combined size in
/// bytes. The size is accumulated whether or not the files are actually deleted.
///
/// # Errors
///
/// Returns an error when metadata or timestamps cannot be read, or when changing permissions or
/// removing a file fails. Files handled before the failure stay deleted.
pub fn process_duplicates(
    duplicates: HashMap<String, Vec<PathBuf>>,
    delete: bool,
) -> Result<(u128, u128), Box<dyn Error>> {
    let mut duplicate_count: u128 = 0;
    let mut size_saved: u128 = 0;

    for (_hash, files) in duplicates {
        if files.len() < 2 {
            continue;
        }

        // Stat every file once and keep the result alongside the path.
        let mut dated = Vec::with_capacity(files.len());
        for file in files {
            let meta = metadata(&file)?;
            let stamp = meta.created().or_else(|_| meta.modified())?;
            dated.push((stamp, meta, file));
        }
        // Stable sort: files with equal timestamps keep their incoming order.
        dated.sort_by_key(|(stamp, _, _)| *stamp);

        duplicate_count += (dated.len() - 1) as u128;

        let mut entries = dated.into_iter();
        if let Some((_, _, keeper)) = entries.next() {
            print_filepath!(keeper, original);
        }

        for (_, meta, file) in entries {
            size_saved += u128::from(meta.len());
            if delete {
                #[cfg(unix)]
                {
                    let mut permissions = meta.permissions();
                    permissions.set_mode(0o644);
                    fs::set_permissions(&file, permissions)?;
                }
                fs::remove_file(&file)?;
                print_filepath!(file, deleted);
            } else {
                print_filepath!(file, duplicate);
            }
        }
    }

    Ok((duplicate_count, size_saved))
}
/// Hashes the entire content of the file and returns the digest as lowercase hex.
///
/// The file is streamed through a buffered reader in chunks of [`BUFFER_SIZE`] bytes, so memory
/// use does not grow with file size.
///
/// # Errors
///
/// Returns the underlying I/O error when the file cannot be opened or read.
pub fn full_hash_file(file_path: &PathBuf) -> io::Result<String> {
    let mut reader = BufReader::new(File::open(file_path)?);
    let mut state = Blake2b::new();
    let mut chunk = [0; BUFFER_SIZE];

    // A read of zero bytes signals end of file.
    let mut filled = reader.read(&mut chunk)?;
    while filled != 0 {
        state.update(&chunk[..filled]);
        filled = reader.read(&mut chunk)?;
    }

    let digest: GenericArray<u8, OutSize> = state.finalize_fixed();
    Ok(format!("{digest:x}"))
}