//! This is a binary crate. You _can_ use it as a library, but I wouldn't recommend it.
//!
//! A collection of functions and structs to find duplicate files.
//!
//! # Example
//!
//! Find and display all the duplicate files at the given path:
//!
//! ```no_run
//! let counter = yadf::Yadf::builder()
//!     .paths(&["path/to/somewhere", "another/path"]) // required
//!     .minimum_file_size(64) // optional
//!     .maximum_file_size(1024 * 8) // optional
//!     .regex(None) // optional
//!     .glob(None) // optional
//!     .build()
//!     .scan::<highway::HighwayHasher>();
//! println!("{}", counter.duplicates().display::<yadf::Fdupes>());
//! ```
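//!
//! The example above prints in the `Fdupes` format; here is a sketch assuming the
//! re-exported `Machine` marker is its machine-readable counterpart:
//!
//! ```no_run
//! let counter = yadf::Yadf::builder()
//!     .paths(&["path/to/somewhere", "another/path"])
//!     .build()
//!     .scan::<highway::HighwayHasher>();
//! println!("{}", counter.duplicates().display::<yadf::Machine>());
//! ```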

mod bag;
mod fs;
pub mod path;

pub use bag::{Factor, Fdupes, Machine, Replicates, TreeBag};
pub use globset;
pub use regex;
use std::hash::Hasher;
use std::path::Path;

/// Files grouped by a `u64` hash.
pub type FileCounter = TreeBag<u64, path::Path>;
/// View over the groups of a [`FileCounter`] that hold more than one file.
pub type FileReplicates<'a> = Replicates<'a, u64, path::Path>;

/// Search configuration
///
/// # Example
///
/// ```no_run
/// let counter = yadf::Yadf::builder()
///     .paths(&["path/to/somewhere", "another/path"]) // required
///     .minimum_file_size(64) // optional
///     .maximum_file_size(1024 * 8) // optional
///     .regex(None) // optional
///     .glob(None) // optional
///     .build()
///     .scan::<highway::HighwayHasher>();
/// ```
///
/// See the docs for the [`YadfBuilder`](YadfBuilder) for details on each builder option.
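///
/// Restricting the scan with the re-exported `regex` crate
/// (the pattern below is only illustrative):
///
/// ```no_run
/// let regex = yadf::regex::Regex::new(r"\.rs$").unwrap();
/// let counter = yadf::Yadf::builder()
///     .paths(&["path/to/somewhere"])
///     .regex(Some(regex))
///     .build()
///     .scan::<highway::HighwayHasher>();
/// ```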
#[derive(Debug, Default, typed_builder::TypedBuilder)]
#[builder(doc)]
pub struct Yadf<'a, P>
where
    P: AsRef<Path>,
{
    #[builder(setter(doc = "Paths that will be checked for duplicate files"))]
    paths: &'a [P],
    #[builder(default, setter(into, doc = "Minimum file size"))]
    minimum_file_size: Option<u64>,
    #[builder(default, setter(into, doc = "Maximum file size"))]
    maximum_file_size: Option<u64>,
    #[builder(default, setter(into, doc = "Maximum recursion depth"))]
    max_depth: Option<usize>,
    #[builder(default, setter(into, doc = "File name must match this regex"))]
    regex: Option<regex::Regex>,
    #[builder(default, setter(into, doc = "File name must match this glob"))]
    glob: Option<globset::Glob>,
}

impl<P> Yadf<'_, P>
where
    P: AsRef<Path>,
{
    /// Attempts a complete scan according to this configuration:
    /// a first pass groups files by a partial hash of their contents,
    /// then a second pass confirms the candidates by checksumming them in full.
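    ///
    /// # Example
    ///
    /// A minimal sketch of consuming the resulting [`FileCounter`]
    /// (the path and hasher are placeholders):
    ///
    /// ```no_run
    /// let counter = yadf::Yadf::builder()
    ///     .paths(&["path/to/somewhere"])
    ///     .build()
    ///     .scan::<highway::HighwayHasher>();
    /// for group in counter.duplicates().iter() {
    ///     println!("{} files share the same hash", group.len());
    /// }
    /// ```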
    pub fn scan<H>(self) -> FileCounter
    where
        H: Hasher + Default,
    {
        // first pass: group files by a partial hash of their contents
        let bag = fs::find_dupes_partial::<H, P>(
            self.paths,
            self.minimum_file_size,
            self.maximum_file_size,
            self.regex,
            self.glob.map(|g| g.compile_matcher()),
            self.max_depth,
        );
        if log::log_enabled!(log::Level::Info) {
            log::info!(
                "scanned {} files",
                bag.0.values().map(|b| b.len()).sum::<usize>()
            );
            log::info!(
                "found {} possible duplicates after initial scan",
                bag.duplicates().iter().map(|b| b.len()).sum::<usize>()
            );
            log::trace!("{:?}", bag);
        }
        // second pass: confirm candidates by checksumming their full contents
        let bag = fs::dedupe::<H>(bag);
        if log::log_enabled!(log::Level::Info) {
            log::info!(
                "found {} duplicates in {} groups after checksumming",
                bag.duplicates().iter().map(|b| b.len()).sum::<usize>(),
                bag.duplicates().iter().count(),
            );
            log::trace!("{:?}", bag);
        }
        bag
    }
}