1use crate::binary::is_binary;
15use crate::config::Config;
16use crate::error::{FsearchError, FsearchResult};
17use glob::Pattern;
18use md5::{Digest as _, Md5};
19use rayon::prelude::*;
20use serde::{Deserialize, Serialize};
21use sha2::Sha256;
22use std::collections::HashMap;
23use std::fs;
24use std::io::{BufReader, Read};
25use std::path::{Path, PathBuf};
26use std::sync::atomic::{AtomicBool, Ordering};
27use std::sync::Arc;
28use walkdir::WalkDir;
29
30#[derive(Debug, Clone, Serialize, Deserialize)]
34pub struct DuplicateGroup {
35 pub hash: String,
37 pub size: u64,
39 pub paths: Vec<PathBuf>,
41 pub wasted_bytes: u64,
43}
44
45impl DuplicateGroup {
46 pub fn wasted_human(&self) -> String {
48 human_bytes(self.wasted_bytes)
49 }
50
51 pub fn size_human(&self) -> String {
53 human_bytes(self.size)
54 }
55}
56
57#[derive(Debug, Clone, Default, Serialize, Deserialize)]
59pub struct DuplicateSummary {
60 pub files_scanned: usize,
62 pub groups_found: usize,
64 pub duplicate_files: usize,
66 pub wasted_bytes: u64,
68}
69
70impl DuplicateSummary {
71 pub fn wasted_human(&self) -> String {
72 human_bytes(self.wasted_bytes)
73 }
74}
75
76#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
80pub enum HashAlgorithm {
81 Md5,
82 #[default]
83 Sha256,
84}
85
86impl HashAlgorithm {
87 pub fn as_str(&self) -> &'static str {
88 match self {
89 Self::Md5 => "md5",
90 Self::Sha256 => "sha256",
91 }
92 }
93}
94
95impl std::str::FromStr for HashAlgorithm {
96 type Err = FsearchError;
97
98 fn from_str(s: &str) -> Result<Self, Self::Err> {
99 match s.to_lowercase().as_str() {
100 "md5" => Ok(Self::Md5),
101 "sha256" => Ok(Self::Sha256),
102 other => Err(FsearchError::UnsupportedHashAlgorithm(other.into())),
103 }
104 }
105}
106
107#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
109pub enum DuplicateMode {
110 #[default]
112 Content,
113 Name,
115 Size,
117}
118
119#[derive(Debug, Clone)]
121pub struct DuplicateOptions {
122 pub base_dir: PathBuf,
124 pub max_depth: u32,
126 pub mode: DuplicateMode,
128 pub algorithm: HashAlgorithm,
130 pub buffer_size: usize,
132 pub min_size: u64,
134 pub max_size: u64,
136 pub skip_binary: bool,
138 pub binary_check_bytes: usize,
140 pub include_patterns: Vec<String>,
142 pub exclude_dirs: Vec<String>,
144 pub max_results: usize,
146}
147
148impl DuplicateOptions {
149 pub fn from_config(cfg: &Config, base_dir: PathBuf) -> FsearchResult<Self> {
151 Ok(Self {
152 base_dir,
153 max_depth: cfg.default_depth,
154 mode: DuplicateMode::Content,
155 algorithm: cfg.hash_algorithm.parse::<HashAlgorithm>()?,
156 buffer_size: cfg.hash_buffer_size,
157 min_size: cfg.dup_min_size,
158 max_size: cfg.dup_max_size,
159 skip_binary: false,
160 binary_check_bytes: cfg.binary_check_bytes,
161 include_patterns: crate::config::split_csv(&cfg.default_include),
162 exclude_dirs: cfg.excluded_dirs(),
163 max_results: cfg.max_results,
164 })
165 }
166
167 pub fn builder(base_dir: impl Into<PathBuf>) -> DuplicateOptionsBuilder {
169 DuplicateOptionsBuilder::new(base_dir.into())
170 }
171}
172
173pub struct DuplicateOptionsBuilder(DuplicateOptions);
175
176impl DuplicateOptionsBuilder {
177 fn new(base_dir: PathBuf) -> Self {
178 Self(DuplicateOptions {
179 base_dir,
180 max_depth: 10,
181 mode: DuplicateMode::Content,
182 algorithm: HashAlgorithm::Sha256,
183 buffer_size: 65_536,
184 min_size: 1,
185 max_size: 0,
186 skip_binary: false,
187 binary_check_bytes: 1024,
188 include_patterns: vec![],
189 exclude_dirs: vec![
190 ".git".into(),
191 "node_modules".into(),
192 "target".into(),
193 ".svn".into(),
194 "__pycache__".into(),
195 ".hg".into(),
196 ".cache".into(),
197 ],
198 max_results: 0,
199 })
200 }
201 pub fn max_depth(mut self, d: u32) -> Self {
202 self.0.max_depth = d;
203 self
204 }
205 pub fn mode(mut self, m: DuplicateMode) -> Self {
206 self.0.mode = m;
207 self
208 }
209 pub fn algorithm(mut self, a: HashAlgorithm) -> Self {
210 self.0.algorithm = a;
211 self
212 }
213 pub fn buffer_size(mut self, b: usize) -> Self {
214 self.0.buffer_size = b;
215 self
216 }
217 pub fn min_size(mut self, s: u64) -> Self {
218 self.0.min_size = s;
219 self
220 }
221 pub fn max_size(mut self, s: u64) -> Self {
222 self.0.max_size = s;
223 self
224 }
225 pub fn skip_binary(mut self, v: bool) -> Self {
226 self.0.skip_binary = v;
227 self
228 }
229 pub fn include_patterns(mut self, p: Vec<String>) -> Self {
230 self.0.include_patterns = p;
231 self
232 }
233 pub fn exclude_dirs(mut self, d: Vec<String>) -> Self {
234 self.0.exclude_dirs = d;
235 self
236 }
237 pub fn max_results(mut self, n: usize) -> Self {
238 self.0.max_results = n;
239 self
240 }
241 pub fn build(self) -> DuplicateOptions {
242 self.0
243 }
244}
245
246pub fn find_duplicates(
254 opts: &DuplicateOptions,
255 interrupted: Arc<AtomicBool>,
256) -> FsearchResult<(Vec<DuplicateGroup>, DuplicateSummary)> {
257 if !opts.base_dir.exists() {
258 return Err(FsearchError::DirectoryNotFound(
259 opts.base_dir.display().to_string(),
260 ));
261 }
262 if !opts.base_dir.is_dir() {
263 return Err(FsearchError::NotADirectory(
264 opts.base_dir.display().to_string(),
265 ));
266 }
267
268 let files = collect_files(opts, &interrupted);
270 let total_scanned = files.len();
271
272 if interrupted.load(Ordering::Relaxed) {
273 return Err(FsearchError::Interrupted);
274 }
275
276 let mut groups = match opts.mode {
278 DuplicateMode::Content => by_content(files, opts, &interrupted)?,
279 DuplicateMode::Name => by_name(files, opts),
280 DuplicateMode::Size => by_size_only(files, opts),
281 };
282
283 groups.sort_unstable_by(|a, b| b.wasted_bytes.cmp(&a.wasted_bytes));
285
286 if opts.max_results > 0 && groups.len() > opts.max_results {
288 groups.truncate(opts.max_results);
289 }
290
291 let summary = DuplicateSummary {
292 files_scanned: total_scanned,
293 groups_found: groups.len(),
294 duplicate_files: groups.iter().map(|g| g.paths.len() - 1).sum(),
295 wasted_bytes: groups.iter().map(|g| g.wasted_bytes).sum(),
296 };
297
298 Ok((groups, summary))
299}
300
301struct FileEntry {
304 path: PathBuf,
305 size: u64,
306}
307
308fn collect_files(opts: &DuplicateOptions, interrupted: &AtomicBool) -> Vec<FileEntry> {
309 WalkDir::new(&opts.base_dir)
310 .max_depth(opts.max_depth as usize + 1)
311 .follow_links(false)
312 .into_iter()
313 .filter_entry(|e| {
314 if e.file_type().is_dir() && e.depth() > 0 {
315 let name = e.file_name().to_string_lossy().to_string();
316 if is_excluded_dir_dup(&name, &opts.exclude_dirs) {
317 return false;
318 }
319 }
320 true
321 })
322 .filter_map(|e| e.ok())
323 .filter(|e| {
324 if interrupted.load(Ordering::Relaxed) {
325 return false;
326 }
327 if !e.file_type().is_file() {
328 return false;
329 }
330
331 let name = e.file_name().to_string_lossy().to_string();
332 if !matches_include_dup(&name, &opts.include_patterns) {
333 return false;
334 }
335
336 if let Ok(meta) = e.metadata() {
337 let sz = meta.len();
338 if opts.min_size > 0 && sz < opts.min_size {
339 return false;
340 }
341 if opts.max_size > 0 && sz > opts.max_size {
342 return false;
343 }
344 }
345
346 if opts.skip_binary && is_binary(e.path(), opts.binary_check_bytes) {
347 return false;
348 }
349 true
350 })
351 .filter_map(|e| {
352 let size = e.metadata().ok()?.len();
353 Some(FileEntry {
354 path: e.path().to_path_buf(),
355 size,
356 })
357 })
358 .collect()
359}
360
361fn by_content(
364 files: Vec<FileEntry>,
365 opts: &DuplicateOptions,
366 interrupted: &AtomicBool,
367) -> FsearchResult<Vec<DuplicateGroup>> {
368 let mut size_map: HashMap<u64, Vec<PathBuf>> = HashMap::new();
370 for f in files {
371 size_map.entry(f.size).or_default().push(f.path);
372 }
373 let candidates: Vec<(u64, PathBuf)> = size_map
374 .into_iter()
375 .filter(|(_, v)| v.len() > 1)
376 .flat_map(|(sz, paths)| paths.into_iter().map(move |p| (sz, p)))
377 .collect();
378
379 if candidates.is_empty() {
380 return Ok(vec![]);
381 }
382
383 let buf = opts.buffer_size;
385 let algo = opts.algorithm;
386
387 let hashed: Vec<(String, u64, PathBuf)> = candidates
388 .into_par_iter()
389 .filter_map(|(size, path)| {
390 if interrupted.load(Ordering::Relaxed) {
391 return None;
392 }
393 let digest = hash_file(&path, algo, buf).ok()?;
394 Some((digest, size, path))
395 })
396 .collect();
397
398 let mut hash_map: HashMap<String, (u64, Vec<PathBuf>)> = HashMap::new();
400 for (hash, size, path) in hashed {
401 let entry = hash_map.entry(hash).or_insert_with(|| (size, vec![]));
402 entry.1.push(path);
403 }
404
405 Ok(hash_map
406 .into_iter()
407 .filter(|(_, (_, paths))| paths.len() > 1)
408 .map(|(hash, (size, paths))| {
409 let wasted = size * (paths.len() as u64 - 1);
410 DuplicateGroup {
411 hash,
412 size,
413 paths,
414 wasted_bytes: wasted,
415 }
416 })
417 .collect())
418}
419
420fn by_name(files: Vec<FileEntry>, opts: &DuplicateOptions) -> Vec<DuplicateGroup> {
423 let mut name_map: HashMap<String, Vec<(PathBuf, u64)>> = HashMap::new();
424 for f in files {
425 let name = f
426 .path
427 .file_name()
428 .map(|n| n.to_string_lossy().to_string())
429 .unwrap_or_default();
430 name_map.entry(name).or_default().push((f.path, f.size));
431 }
432 name_map
433 .into_iter()
434 .filter(|(_, v)| v.len() > 1)
435 .map(|(name, entries)| {
436 let size = entries.first().map(|(_, s)| *s).unwrap_or(0);
437 let paths: Vec<PathBuf> = entries.into_iter().map(|(p, _)| p).collect();
438 let wasted = size.saturating_mul(paths.len() as u64 - 1);
439 DuplicateGroup {
440 hash: format!("name:{}", name),
441 size,
442 wasted_bytes: if opts.mode == DuplicateMode::Name {
443 wasted
444 } else {
445 0
446 },
447 paths,
448 }
449 })
450 .collect()
451}
452
453fn by_size_only(files: Vec<FileEntry>, _opts: &DuplicateOptions) -> Vec<DuplicateGroup> {
456 let mut size_map: HashMap<u64, Vec<PathBuf>> = HashMap::new();
457 for f in files {
458 size_map.entry(f.size).or_default().push(f.path);
459 }
460 size_map
461 .into_iter()
462 .filter(|(_, v)| v.len() > 1)
463 .map(|(size, paths)| {
464 let wasted = size * (paths.len() as u64 - 1);
465 DuplicateGroup {
466 hash: format!("size:{}", size),
467 size,
468 paths,
469 wasted_bytes: wasted,
470 }
471 })
472 .collect()
473}
474
475pub fn hash_file(path: &Path, algo: HashAlgorithm, buf_size: usize) -> FsearchResult<String> {
479 let file = fs::File::open(path).map_err(|e| FsearchError::Io {
480 path: path.display().to_string(),
481 source: e,
482 })?;
483 let mut reader = BufReader::with_capacity(buf_size, file);
484 let mut buf = vec![0u8; buf_size];
485
486 match algo {
487 HashAlgorithm::Md5 => {
488 let mut h = Md5::new();
489 loop {
490 let n = reader.read(&mut buf).map_err(|e| FsearchError::Io {
491 path: path.display().to_string(),
492 source: e,
493 })?;
494 if n == 0 {
495 break;
496 }
497 md5::Digest::update(&mut h, &buf[..n]);
498 }
499 Ok(format!("{:x}", md5::Digest::finalize(h)))
500 }
501 HashAlgorithm::Sha256 => {
502 let mut h = Sha256::new();
503 loop {
504 let n = reader.read(&mut buf).map_err(|e| FsearchError::Io {
505 path: path.display().to_string(),
506 source: e,
507 })?;
508 if n == 0 {
509 break;
510 }
511 sha2::Digest::update(&mut h, &buf[..n]);
512 }
513 Ok(format!("{:x}", sha2::Digest::finalize(h)))
514 }
515 }
516}
517
518fn is_excluded_dir_dup(name: &str, excludes: &[String]) -> bool {
521 excludes
522 .iter()
523 .any(|ex| Pattern::new(ex).map(|p| p.matches(name)).unwrap_or(false) || ex == name)
524}
525
526fn matches_include_dup(name: &str, patterns: &[String]) -> bool {
527 if patterns.is_empty() {
528 return true;
529 }
530 patterns.iter().any(|p| {
531 Pattern::new(p)
532 .map(|pat| pat.matches(name))
533 .unwrap_or(false)
534 })
535}
536
537pub fn human_bytes(bytes: u64) -> String {
539 const UNITS: &[&str] = &["B", "KiB", "MiB", "GiB", "TiB"];
540 if bytes == 0 {
541 return "0 B".into();
542 }
543 let exp = (bytes as f64).log(1024.0).floor() as usize;
544 let exp = exp.min(UNITS.len() - 1);
545 let val = bytes as f64 / 1024_f64.powi(exp as i32);
546 if exp == 0 {
547 format!("{} {}", bytes, UNITS[0])
548 } else {
549 format!("{:.1} {}", val, UNITS[exp])
550 }
551}
552
553#[cfg(test)]
554mod tests {
555 use super::*;
556 use std::io::Write;
557 use tempfile::TempDir;
558
559 fn make_file(dir: &Path, name: &str, content: &[u8]) -> PathBuf {
560 let p = dir.join(name);
561 let mut f = fs::File::create(&p).unwrap();
562 f.write_all(content).unwrap();
563 p
564 }
565
566 #[test]
567 fn human_bytes_formatting() {
568 assert_eq!(human_bytes(0), "0 B");
569 assert_eq!(human_bytes(512), "512 B");
570 assert_eq!(human_bytes(1024), "1.0 KiB");
571 assert_eq!(human_bytes(1024 * 1024), "1.0 MiB");
572 }
573
574 #[test]
575 fn detects_identical_files() {
576 let tmp = TempDir::new().unwrap();
577 make_file(tmp.path(), "a.txt", b"hello world");
578 make_file(tmp.path(), "b.txt", b"hello world");
579 make_file(tmp.path(), "c.txt", b"different content");
580
581 let opts = DuplicateOptions::builder(tmp.path()).max_depth(1).build();
582 let interrupted = Arc::new(AtomicBool::new(false));
583 let (groups, summary) = find_duplicates(&opts, interrupted).unwrap();
584
585 assert_eq!(groups.len(), 1, "one duplicate group expected");
586 assert_eq!(groups[0].paths.len(), 2);
587 assert_eq!(summary.duplicate_files, 1);
588 }
589
590 #[test]
591 fn no_false_positives_for_unique_files() {
592 let tmp = TempDir::new().unwrap();
593 make_file(tmp.path(), "x.txt", b"aaa");
594 make_file(tmp.path(), "y.txt", b"bbb");
595
596 let opts = DuplicateOptions::builder(tmp.path()).build();
597 let interrupted = Arc::new(AtomicBool::new(false));
598 let (groups, _) = find_duplicates(&opts, interrupted).unwrap();
599 assert!(groups.is_empty());
600 }
601
602 #[test]
603 fn name_mode_groups_by_filename() {
604 let tmp = TempDir::new().unwrap();
605 let sub = tmp.path().join("sub");
606 fs::create_dir_all(&sub).unwrap();
607 make_file(tmp.path(), "readme.txt", b"v1");
608 make_file(&sub, "readme.txt", b"v2");
609
610 let opts = DuplicateOptions::builder(tmp.path())
611 .max_depth(2)
612 .mode(DuplicateMode::Name)
613 .build();
614 let interrupted = Arc::new(AtomicBool::new(false));
615 let (groups, _) = find_duplicates(&opts, interrupted).unwrap();
616 assert!(!groups.is_empty(), "name duplicates expected");
617 }
618
619 #[test]
620 fn hash_file_md5_and_sha256() {
621 let tmp = TempDir::new().unwrap();
622 let p = make_file(tmp.path(), "f.txt", b"test content");
623 let md5 = hash_file(&p, HashAlgorithm::Md5, 4096).unwrap();
624 let sha256 = hash_file(&p, HashAlgorithm::Sha256, 4096).unwrap();
625 assert_eq!(md5.len(), 32, "md5 should be 32 hex chars");
626 assert_eq!(sha256.len(), 64, "sha256 should be 64 hex chars");
627 }
628
629 #[test]
630 fn size_filter_excludes_small_files() {
631 let tmp = TempDir::new().unwrap();
632 make_file(tmp.path(), "small1.txt", b"hi");
633 make_file(tmp.path(), "small2.txt", b"hi");
634
635 let opts = DuplicateOptions::builder(tmp.path())
636 .min_size(1000) .build();
638 let interrupted = Arc::new(AtomicBool::new(false));
639 let (groups, _) = find_duplicates(&opts, interrupted).unwrap();
640 assert!(groups.is_empty(), "small files should be filtered out");
641 }
642}