yadf/
lib.rs

1//! This is a binary crate. You _can_ use it as a library, but I wouldn't recommend it.
2//! If you do, remember to disable the default features which are used to build
3//! the binary.
4//!
5//! ```toml
6//! [dependencies]
7//! yadf = { version = "0.15.0", default-features = false }
8//! ```
9//!
10//! A collection of functions and structs to find duplicate files.
11//!
12//! # Example :
13//!
14//! Find and display all the duplicate files at the given paths :
15//!
16//! ```no_run
17//! # fn foo(paths: &[std::path::PathBuf]) {
18//! let counter = yadf::Yadf::builder()
19//!     .paths(paths)
20//!     .build()
21//!     .scan::<highway::HighwayHasher>();
22//! println!("{}", counter.duplicates().display::<yadf::Fdupes>());
23//! # }
24//! ```
25#![deny(unsafe_code)]
26#![warn(rust_2018_idioms)]
27
28mod bag;
29mod ext;
30mod fs;
31mod path;
32
33pub use bag::{Factor, Fdupes, Machine, TreeBag};
34pub use globset;
35pub use path::Path;
36pub use regex;
37use std::hash::Hasher;
38use std::rc::Rc;
39
/// Bag of scanned files: a [`TreeBag`] of [`Path`]s grouped under a `u64` key.
///
/// This is the type returned by [`Yadf::scan`].
// NOTE(review): the `u64` key is presumably a hash/checksum value produced by
// the `Hasher` given to `scan` — confirm in the `fs` module.
pub type FileCounter = TreeBag<u64, Path>;
/// Alias of `bag::Replicates` specialised to the key and value types of
/// [`FileCounter`].
// NOTE(review): presumably the value returned by `FileCounter::duplicates`
// — confirm in the `bag` module.
pub type FileReplicates<'a> = bag::Replicates<'a, u64, Path>;
42
43/// Search configuration.
44///
45/// # Example
46///
47/// ```no_run
48/// # fn foo(paths: &[std::path::PathBuf]) {
49/// let counter = yadf::Yadf::builder()
50///     .paths(paths) // required
51///     .minimum_file_size(64) // optional
52///     .maximum_file_size(1024 * 8) // optional
53///     .regex(None) // optional
54///     .glob(None) // optional
55///     .build()
56///     .scan::<seahash::SeaHasher>();
57/// # }
58/// ```
59///
60/// see the docs for the [`YadfBuilder`](YadfBuilder)
61#[derive(Debug, typed_builder::TypedBuilder)]
62#[builder(doc)]
63pub struct Yadf<P: AsRef<std::path::Path>> {
64    #[builder(setter(into, doc = "Paths that will be checked for duplicate files"))]
65    paths: Rc<[P]>,
66    #[builder(default, setter(into, doc = "Minimum file size"))]
67    minimum_file_size: Option<u64>,
68    #[builder(default, setter(into, doc = "Maximum file size"))]
69    maximum_file_size: Option<u64>,
70    #[builder(default, setter(into, doc = "Maximum recursion depth"))]
71    max_depth: Option<usize>,
72    #[builder(default, setter(into, doc = "File name must match this regex"))]
73    regex: Option<regex::Regex>,
74    #[builder(default, setter(into, doc = "File name must match this glob"))]
75    glob: Option<globset::Glob>,
76    #[cfg(unix)]
77    #[builder(default, setter(doc = "Treat hard links as duplicates"))]
78    hard_links: bool,
79}
80
81impl<P> Yadf<P>
82where
83    P: AsRef<std::path::Path>,
84{
85    /// This will attemps a complete scan according to its configuration.
86    pub fn scan<H>(self) -> FileCounter
87    where
88        H: Hasher + Default,
89    {
90        #[cfg(unix)]
91        let file_filter = fs::filter::FileFilter::new(
92            self.minimum_file_size,
93            self.maximum_file_size,
94            self.regex,
95            self.glob.map(|g| g.compile_matcher()),
96            self.hard_links,
97        );
98        #[cfg(not(unix))]
99        let file_filter = fs::filter::FileFilter::new(
100            self.minimum_file_size,
101            self.maximum_file_size,
102            self.regex,
103            self.glob.map(|g| g.compile_matcher()),
104        );
105        let bag = fs::find_dupes_partial::<H, _>(&self.paths, self.max_depth, file_filter);
106        if log::log_enabled!(log::Level::Info) {
107            log::info!(
108                "scanned {} files",
109                bag.as_inner().values().map(Vec::len).sum::<usize>()
110            );
111            log::info!(
112                "found {} possible duplicates after initial scan",
113                bag.duplicates().iter().map(Vec::len).sum::<usize>()
114            );
115            log::trace!("{:?}", bag);
116        }
117        let bag = fs::dedupe::<H>(bag);
118        if log::log_enabled!(log::Level::Info) {
119            log::info!(
120                "found {} duplicates in {} groups after checksumming",
121                bag.duplicates().iter().map(Vec::len).sum::<usize>(),
122                bag.duplicates().iter().count(),
123            );
124            log::trace!("{:?}", bag);
125        }
126        bag
127    }
128}