use crate::binary::is_binary;
use crate::config::Config;
use crate::error::{FsearchError, FsearchResult};
use glob::Pattern;
use md5::{Digest as _, Md5};
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use sha2::Sha256;
use std::collections::HashMap;
use std::fs;
use std::io::{BufReader, Read};
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use walkdir::WalkDir;
/// One set of files that were judged to be duplicates of each other.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DuplicateGroup {
/// Key the group was formed on: the hex content digest in content mode,
/// `name:<file name>` in name mode, or `size:<bytes>` in size mode.
pub hash: String,
/// File size in bytes recorded for the group (in name mode this is the size
/// of the first file collected for that name).
pub size: u64,
/// Every file belonging to the group; groups are only emitted with two or
/// more paths.
pub paths: Vec<PathBuf>,
/// `size` multiplied by the number of redundant copies (`paths.len() - 1`).
pub wasted_bytes: u64,
}
impl DuplicateGroup {
pub fn wasted_human(&self) -> String {
human_bytes(self.wasted_bytes)
}
pub fn size_human(&self) -> String {
human_bytes(self.size)
}
}
/// Aggregate figures for one `find_duplicates` run.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct DuplicateSummary {
/// Number of files that passed the collection filters and were considered.
pub files_scanned: usize,
/// Number of groups returned (counted after any `max_results` truncation).
pub groups_found: usize,
/// Sum over the returned groups of `paths.len() - 1`, i.e. redundant copies.
pub duplicate_files: usize,
/// Sum of `wasted_bytes` over the returned groups.
pub wasted_bytes: u64,
}
impl DuplicateSummary {
    /// Formats the total `wasted_bytes` as a human-readable string via
    /// [`human_bytes`].
    pub fn wasted_human(&self) -> String {
        let total = self.wasted_bytes;
        human_bytes(total)
    }
}
/// Digest algorithm used by `hash_file` when comparing file contents.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
pub enum HashAlgorithm {
/// MD5; parsed from / rendered as `"md5"`.
Md5,
/// SHA-256; parsed from / rendered as `"sha256"`. This is the default.
#[default]
Sha256,
}
impl HashAlgorithm {
pub fn as_str(&self) -> &'static str {
match self {
Self::Md5 => "md5",
Self::Sha256 => "sha256",
}
}
}
impl std::str::FromStr for HashAlgorithm {
    type Err = FsearchError;

    /// Parses an algorithm name case-insensitively.
    ///
    /// # Errors
    /// Returns `FsearchError::UnsupportedHashAlgorithm` carrying the
    /// lowercased input when the name is neither `md5` nor `sha256`.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let normalized = s.to_lowercase();
        if normalized == "md5" {
            return Ok(Self::Md5);
        }
        if normalized == "sha256" {
            return Ok(Self::Sha256);
        }
        Err(FsearchError::UnsupportedHashAlgorithm(
            normalized.as_str().into(),
        ))
    }
}
/// Criterion used by `find_duplicates` to decide that files belong together.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
pub enum DuplicateMode {
/// Files of equal size are hashed and grouped by digest. This is the default.
#[default]
Content,
/// Files are grouped by their file name only.
Name,
/// Files are grouped by their size only, without reading contents.
Size,
}
/// Settings controlling one duplicate search.
#[derive(Debug, Clone)]
pub struct DuplicateOptions {
/// Directory the walk starts from; must exist and be a directory.
pub base_dir: PathBuf,
/// Depth limit; the walker is configured with `max_depth + 1`.
pub max_depth: u32,
/// Grouping criterion (content, name, or size).
pub mode: DuplicateMode,
/// Digest used in content mode.
pub algorithm: HashAlgorithm,
/// Size in bytes of the read buffer handed to `hash_file`.
pub buffer_size: usize,
/// Files smaller than this many bytes are skipped; `0` disables the check.
pub min_size: u64,
/// Files larger than this many bytes are skipped; `0` disables the check.
pub max_size: u64,
/// When true, files for which `is_binary` returns true are skipped.
pub skip_binary: bool,
/// Second argument forwarded to `is_binary` (presumably the number of
/// leading bytes it samples — confirm against `crate::binary`).
pub binary_check_bytes: usize,
/// Glob patterns matched against the file name; an empty list accepts
/// every file.
pub include_patterns: Vec<String>,
/// Directory names to prune, matched as glob patterns or literal names.
pub exclude_dirs: Vec<String>,
/// Maximum number of groups returned; `0` means no limit.
pub max_results: usize,
}
impl DuplicateOptions {
    /// Derives search options from the application configuration.
    ///
    /// The mode is always `DuplicateMode::Content` and binary skipping is
    /// off; everything else is read from `cfg`.
    ///
    /// # Errors
    /// Propagates the parse error when `cfg.hash_algorithm` does not name a
    /// supported algorithm.
    pub fn from_config(cfg: &Config, base_dir: PathBuf) -> FsearchResult<Self> {
        // Parse first so an unsupported algorithm aborts before any other
        // configuration helper is invoked.
        let algorithm: HashAlgorithm = cfg.hash_algorithm.parse()?;
        let include_patterns = crate::config::split_csv(&cfg.default_include);
        let exclude_dirs = cfg.excluded_dirs();

        let options = Self {
            base_dir,
            max_depth: cfg.default_depth,
            mode: DuplicateMode::Content,
            algorithm,
            buffer_size: cfg.hash_buffer_size,
            min_size: cfg.dup_min_size,
            max_size: cfg.dup_max_size,
            skip_binary: false,
            binary_check_bytes: cfg.binary_check_bytes,
            include_patterns,
            exclude_dirs,
            max_results: cfg.max_results,
        };
        Ok(options)
    }

    /// Starts a [`DuplicateOptionsBuilder`] rooted at `base_dir`, pre-filled
    /// with the builder's defaults.
    pub fn builder(base_dir: impl Into<PathBuf>) -> DuplicateOptionsBuilder {
        let root: PathBuf = base_dir.into();
        DuplicateOptionsBuilder::new(root)
    }
}
/// Fluent builder for [`DuplicateOptions`]; obtain one through
/// `DuplicateOptions::builder` and finish with `build`.
pub struct DuplicateOptionsBuilder(DuplicateOptions);
impl DuplicateOptionsBuilder {
    /// Creates a builder with the default settings: content mode, SHA-256,
    /// 64 KiB read buffer, depth 10, minimum size 1 byte, no maximum size,
    /// no result limit, and a stock list of excluded directories.
    fn new(base_dir: PathBuf) -> Self {
        Self(DuplicateOptions {
            base_dir,
            max_depth: 10,
            mode: DuplicateMode::Content,
            algorithm: HashAlgorithm::Sha256,
            buffer_size: 65_536,
            min_size: 1,
            max_size: 0,
            skip_binary: false,
            binary_check_bytes: 1024,
            include_patterns: vec![],
            exclude_dirs: vec![
                ".git".into(),
                "node_modules".into(),
                "target".into(),
                ".svn".into(),
                "__pycache__".into(),
                ".hg".into(),
                ".cache".into(),
            ],
            max_results: 0,
        })
    }

    /// Sets the depth limit.
    pub fn max_depth(mut self, d: u32) -> Self {
        self.0.max_depth = d;
        self
    }

    /// Sets the grouping criterion.
    pub fn mode(mut self, m: DuplicateMode) -> Self {
        self.0.mode = m;
        self
    }

    /// Sets the digest algorithm used in content mode.
    pub fn algorithm(mut self, a: HashAlgorithm) -> Self {
        self.0.algorithm = a;
        self
    }

    /// Sets the read-buffer size in bytes used while hashing.
    pub fn buffer_size(mut self, b: usize) -> Self {
        self.0.buffer_size = b;
        self
    }

    /// Sets the minimum file size in bytes (`0` disables the check).
    pub fn min_size(mut self, s: u64) -> Self {
        self.0.min_size = s;
        self
    }

    /// Sets the maximum file size in bytes (`0` disables the check).
    pub fn max_size(mut self, s: u64) -> Self {
        self.0.max_size = s;
        self
    }

    /// Enables or disables skipping of files detected as binary.
    pub fn skip_binary(mut self, v: bool) -> Self {
        self.0.skip_binary = v;
        self
    }

    /// Sets the value forwarded to the binary detector when `skip_binary`
    /// is enabled. Previously this field could only be changed by mutating
    /// the built options directly.
    pub fn binary_check_bytes(mut self, n: usize) -> Self {
        self.0.binary_check_bytes = n;
        self
    }

    /// Replaces the file-name include patterns (empty accepts every file).
    pub fn include_patterns(mut self, p: Vec<String>) -> Self {
        self.0.include_patterns = p;
        self
    }

    /// Replaces the list of excluded directory names / patterns.
    pub fn exclude_dirs(mut self, d: Vec<String>) -> Self {
        self.0.exclude_dirs = d;
        self
    }

    /// Sets the maximum number of groups returned (`0` means no limit).
    pub fn max_results(mut self, n: usize) -> Self {
        self.0.max_results = n;
        self
    }

    /// Consumes the builder and returns the configured options.
    pub fn build(self) -> DuplicateOptions {
        self.0
    }
}
/// Walks `opts.base_dir`, groups the collected files according to
/// `opts.mode`, and returns the groups (largest waste first) together with a
/// summary.
///
/// # Errors
/// * `FsearchError::DirectoryNotFound` when the base directory does not exist.
/// * `FsearchError::NotADirectory` when it exists but is not a directory.
/// * `FsearchError::Interrupted` when the flag is set once collection ends.
/// * Any error produced by content grouping.
pub fn find_duplicates(
    opts: &DuplicateOptions,
    interrupted: Arc<AtomicBool>,
) -> FsearchResult<(Vec<DuplicateGroup>, DuplicateSummary)> {
    let root = &opts.base_dir;
    if !root.exists() {
        return Err(FsearchError::DirectoryNotFound(
            root.display().to_string(),
        ));
    }
    if !root.is_dir() {
        return Err(FsearchError::NotADirectory(root.display().to_string()));
    }

    let files = collect_files(opts, &interrupted);
    let files_scanned = files.len();
    if interrupted.load(Ordering::Relaxed) {
        return Err(FsearchError::Interrupted);
    }

    let mut groups = match opts.mode {
        DuplicateMode::Content => by_content(files, opts, &interrupted)?,
        DuplicateMode::Name => by_name(files, opts),
        DuplicateMode::Size => by_size_only(files, opts),
    };

    // Most wasteful groups first; a zero limit keeps every group.
    groups.sort_unstable_by_key(|g| std::cmp::Reverse(g.wasted_bytes));
    if opts.max_results > 0 {
        groups.truncate(opts.max_results);
    }

    // Totals are taken over the (possibly truncated) list that is returned.
    let mut summary = DuplicateSummary {
        files_scanned,
        groups_found: groups.len(),
        ..Default::default()
    };
    for group in &groups {
        summary.duplicate_files += group.paths.len() - 1;
        summary.wasted_bytes += group.wasted_bytes;
    }

    Ok((groups, summary))
}
/// A regular file found by `collect_files`, with the size read from its
/// metadata at collection time.
struct FileEntry {
/// Path as yielded by the directory walker.
path: PathBuf,
/// Length in bytes reported by the file's metadata.
size: u64,
}
/// Walks `opts.base_dir` and returns every regular file that passes the
/// include-pattern, size, and (optional) binary filters.
///
/// Excluded directories below the root are pruned, so their contents are
/// never visited. Symbolic links are not followed. Entries the walker fails
/// to read, and files whose metadata cannot be fetched, are skipped.
///
/// When `interrupted` becomes true the walk stops immediately; the files
/// gathered so far are returned and the caller is expected to check the flag.
fn collect_files(opts: &DuplicateOptions, interrupted: &AtomicBool) -> Vec<FileEntry> {
    WalkDir::new(&opts.base_dir)
        // saturating_add: avoids overflow on targets where usize is 32 bits.
        .max_depth((opts.max_depth as usize).saturating_add(1))
        .follow_links(false)
        .into_iter()
        .filter_entry(|e| {
            // Never prune the root itself (depth 0), only sub-directories.
            let is_subdir = e.file_type().is_dir() && e.depth() > 0;
            !(is_subdir
                && is_excluded_dir_dup(&e.file_name().to_string_lossy(), &opts.exclude_dirs))
        })
        // Stop traversing altogether once interrupted, instead of walking the
        // rest of the tree and discarding every entry.
        .take_while(|_| !interrupted.load(Ordering::Relaxed))
        .filter_map(Result::ok)
        .filter_map(|e| {
            if !e.file_type().is_file() {
                return None;
            }
            if !matches_include_dup(&e.file_name().to_string_lossy(), &opts.include_patterns) {
                return None;
            }
            // Read metadata once so the size that is filtered on is the size
            // that is recorded.
            let size = e.metadata().ok()?.len();
            if opts.min_size > 0 && size < opts.min_size {
                return None;
            }
            if opts.max_size > 0 && size > opts.max_size {
                return None;
            }
            // Binary sniffing reads the file, so it runs last.
            if opts.skip_binary && is_binary(e.path(), opts.binary_check_bytes) {
                return None;
            }
            Some(FileEntry {
                path: e.into_path(),
                size,
            })
        })
        .collect()
}
fn by_content(
files: Vec<FileEntry>,
opts: &DuplicateOptions,
interrupted: &AtomicBool,
) -> FsearchResult<Vec<DuplicateGroup>> {
let mut size_map: HashMap<u64, Vec<PathBuf>> = HashMap::new();
for f in files {
size_map.entry(f.size).or_default().push(f.path);
}
let candidates: Vec<(u64, PathBuf)> = size_map
.into_iter()
.filter(|(_, v)| v.len() > 1)
.flat_map(|(sz, paths)| paths.into_iter().map(move |p| (sz, p)))
.collect();
if candidates.is_empty() {
return Ok(vec![]);
}
let buf = opts.buffer_size;
let algo = opts.algorithm;
let hashed: Vec<(String, u64, PathBuf)> = candidates
.into_par_iter()
.filter_map(|(size, path)| {
if interrupted.load(Ordering::Relaxed) {
return None;
}
let digest = hash_file(&path, algo, buf).ok()?;
Some((digest, size, path))
})
.collect();
let mut hash_map: HashMap<String, (u64, Vec<PathBuf>)> = HashMap::new();
for (hash, size, path) in hashed {
let entry = hash_map.entry(hash).or_insert_with(|| (size, vec![]));
entry.1.push(path);
}
Ok(hash_map
.into_iter()
.filter(|(_, (_, paths))| paths.len() > 1)
.map(|(hash, (size, paths))| {
let wasted = size * (paths.len() as u64 - 1);
DuplicateGroup {
hash,
size,
paths,
wasted_bytes: wasted,
}
})
.collect())
}
fn by_name(files: Vec<FileEntry>, opts: &DuplicateOptions) -> Vec<DuplicateGroup> {
let mut name_map: HashMap<String, Vec<(PathBuf, u64)>> = HashMap::new();
for f in files {
let name = f
.path
.file_name()
.map(|n| n.to_string_lossy().to_string())
.unwrap_or_default();
name_map.entry(name).or_default().push((f.path, f.size));
}
name_map
.into_iter()
.filter(|(_, v)| v.len() > 1)
.map(|(name, entries)| {
let size = entries.first().map(|(_, s)| *s).unwrap_or(0);
let paths: Vec<PathBuf> = entries.into_iter().map(|(p, _)| p).collect();
let wasted = size.saturating_mul(paths.len() as u64 - 1);
DuplicateGroup {
hash: format!("name:{}", name),
size,
wasted_bytes: if opts.mode == DuplicateMode::Name {
wasted
} else {
0
},
paths,
}
})
.collect()
}
/// Groups files purely by size, without reading their contents.
///
/// Each size shared by two or more files yields one group keyed
/// `size:<bytes>`, with `wasted_bytes = size * (copies - 1)`. The
/// multiplication saturates, matching `by_name`, so absurdly large reported
/// sizes (for example sparse files) cannot overflow.
fn by_size_only(files: Vec<FileEntry>, _opts: &DuplicateOptions) -> Vec<DuplicateGroup> {
    let mut size_map: HashMap<u64, Vec<PathBuf>> = HashMap::new();
    for f in files {
        size_map.entry(f.size).or_default().push(f.path);
    }
    size_map
        .into_iter()
        .filter(|(_, v)| v.len() > 1)
        .map(|(size, paths)| {
            // paths.len() >= 2 here, so the subtraction cannot underflow.
            let wasted = size.saturating_mul(paths.len() as u64 - 1);
            DuplicateGroup {
                hash: format!("size:{}", size),
                size,
                paths,
                wasted_bytes: wasted,
            }
        })
        .collect()
}
/// Computes the lowercase hex digest of the file at `path`.
///
/// * `algo` selects MD5 or SHA-256.
/// * `buf_size` is the read-buffer size in bytes. A value of `0` is treated
///   as `1`: with an empty buffer the first `read` would return `0`, which
///   looks like end-of-file and would give every file the digest of empty
///   input.
///
/// # Errors
/// Returns `FsearchError::Io` (carrying the path) when the file cannot be
/// opened or a read fails. Reads interrupted by a signal are retried.
pub fn hash_file(path: &Path, algo: HashAlgorithm, buf_size: usize) -> FsearchResult<String> {
    let buf_size = buf_size.max(1);
    let file = fs::File::open(path).map_err(|e| FsearchError::Io {
        path: path.display().to_string(),
        source: e,
    })?;
    let reader = BufReader::with_capacity(buf_size, file);
    match algo {
        HashAlgorithm::Md5 => {
            let mut h = Md5::new();
            feed_chunks(path, reader, buf_size, |chunk| {
                md5::Digest::update(&mut h, chunk)
            })?;
            Ok(format!("{:x}", md5::Digest::finalize(h)))
        }
        HashAlgorithm::Sha256 => {
            let mut h = Sha256::new();
            feed_chunks(path, reader, buf_size, |chunk| {
                sha2::Digest::update(&mut h, chunk)
            })?;
            Ok(format!("{:x}", sha2::Digest::finalize(h)))
        }
    }
}

/// Reads `reader` to end-of-file in chunks of at most `buf_size` bytes,
/// passing each non-empty chunk to `sink`. Read errors are reported as
/// `FsearchError::Io` for `path`, except `ErrorKind::Interrupted`, which is
/// retried.
fn feed_chunks<R: Read>(
    path: &Path,
    mut reader: R,
    buf_size: usize,
    mut sink: impl FnMut(&[u8]),
) -> FsearchResult<()> {
    let mut buf = vec![0u8; buf_size];
    loop {
        match reader.read(&mut buf) {
            Ok(0) => return Ok(()),
            Ok(n) => sink(&buf[..n]),
            Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(e) => {
                return Err(FsearchError::Io {
                    path: path.display().to_string(),
                    source: e,
                })
            }
        }
    }
}
/// Reports whether directory `name` is excluded: true when any entry of
/// `excludes` matches it as a glob pattern or equals it literally. An entry
/// that is not a valid glob can still match by literal equality.
fn is_excluded_dir_dup(name: &str, excludes: &[String]) -> bool {
    for ex in excludes {
        let glob_hit = match Pattern::new(ex) {
            Ok(glob) => glob.matches(name),
            Err(_) => false,
        };
        if glob_hit || ex == name {
            return true;
        }
    }
    false
}
/// Reports whether file `name` is accepted by the include filter: an empty
/// pattern list accepts everything, otherwise at least one valid glob must
/// match. Entries that are not valid globs never match.
fn matches_include_dup(name: &str, patterns: &[String]) -> bool {
    patterns.is_empty()
        || patterns
            .iter()
            .any(|raw| matches!(Pattern::new(raw), Ok(glob) if glob.matches(name)))
}
/// Formats a byte count using binary units (`B`, `KiB`, `MiB`, `GiB`, `TiB`).
///
/// Counts below 1024 are printed as whole bytes (`"512 B"`); larger counts
/// are printed with one decimal place (`"1.5 KiB"`). Values of 1024 TiB and
/// above stay in TiB. Zero is rendered as `"0 B"`.
pub fn human_bytes(bytes: u64) -> String {
    const UNITS: &[&str] = &["B", "KiB", "MiB", "GiB", "TiB"];
    if bytes == 0 {
        return "0 B".into();
    }
    // floor(log2(bytes)) / 10 is exactly floor(log1024(bytes)). Integer bit
    // arithmetic avoids the rounding error a floating-point logarithm can
    // introduce at exact powers of 1024.
    let log2 = 63 - bytes.leading_zeros();
    let exp = ((log2 / 10) as usize).min(UNITS.len() - 1);
    if exp == 0 {
        format!("{} {}", bytes, UNITS[0])
    } else {
        let val = bytes as f64 / 1024_f64.powi(exp as i32);
        format!("{:.1} {}", val, UNITS[exp])
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::TempDir;

    /// Writes `content` to a new file `name` inside `dir` and returns its path.
    fn make_file(dir: &Path, name: &str, content: &[u8]) -> PathBuf {
        let target = dir.join(name);
        let mut handle = fs::File::create(&target).unwrap();
        handle.write_all(content).unwrap();
        target
    }

    /// An interrupt flag that is never raised.
    fn idle_flag() -> Arc<AtomicBool> {
        Arc::new(AtomicBool::new(false))
    }

    #[test]
    fn human_bytes_formatting() {
        let cases: [(u64, &str); 4] = [
            (0, "0 B"),
            (512, "512 B"),
            (1024, "1.0 KiB"),
            (1024 * 1024, "1.0 MiB"),
        ];
        for (input, expected) in cases {
            assert_eq!(human_bytes(input), expected);
        }
    }

    #[test]
    fn detects_identical_files() {
        let workdir = TempDir::new().unwrap();
        make_file(workdir.path(), "a.txt", b"hello world");
        make_file(workdir.path(), "b.txt", b"hello world");
        make_file(workdir.path(), "c.txt", b"different content");

        let opts = DuplicateOptions::builder(workdir.path())
            .max_depth(1)
            .build();
        let (groups, summary) = find_duplicates(&opts, idle_flag()).unwrap();

        assert_eq!(groups.len(), 1, "one duplicate group expected");
        assert_eq!(groups[0].paths.len(), 2);
        assert_eq!(summary.duplicate_files, 1);
    }

    #[test]
    fn no_false_positives_for_unique_files() {
        let workdir = TempDir::new().unwrap();
        make_file(workdir.path(), "x.txt", b"aaa");
        make_file(workdir.path(), "y.txt", b"bbb");

        let opts = DuplicateOptions::builder(workdir.path()).build();
        let (groups, _) = find_duplicates(&opts, idle_flag()).unwrap();

        assert!(groups.is_empty());
    }

    #[test]
    fn name_mode_groups_by_filename() {
        let workdir = TempDir::new().unwrap();
        let nested = workdir.path().join("sub");
        fs::create_dir_all(&nested).unwrap();
        make_file(workdir.path(), "readme.txt", b"v1");
        make_file(&nested, "readme.txt", b"v2");

        let opts = DuplicateOptions::builder(workdir.path())
            .max_depth(2)
            .mode(DuplicateMode::Name)
            .build();
        let (groups, _) = find_duplicates(&opts, idle_flag()).unwrap();

        assert!(!groups.is_empty(), "name duplicates expected");
    }

    #[test]
    fn hash_file_md5_and_sha256() {
        let workdir = TempDir::new().unwrap();
        let sample = make_file(workdir.path(), "f.txt", b"test content");

        let md5 = hash_file(&sample, HashAlgorithm::Md5, 4096).unwrap();
        let sha256 = hash_file(&sample, HashAlgorithm::Sha256, 4096).unwrap();

        assert_eq!(md5.len(), 32, "md5 should be 32 hex chars");
        assert_eq!(sha256.len(), 64, "sha256 should be 64 hex chars");
    }

    #[test]
    fn size_filter_excludes_small_files() {
        let workdir = TempDir::new().unwrap();
        make_file(workdir.path(), "small1.txt", b"hi");
        make_file(workdir.path(), "small2.txt", b"hi");

        let opts = DuplicateOptions::builder(workdir.path())
            .min_size(1000)
            .build();
        let (groups, _) = find_duplicates(&opts, idle_flag()).unwrap();

        assert!(groups.is_empty(), "small files should be filtered out");
    }
}