#![deny(unsafe_code)]
#![warn(rust_2018_idioms)]
mod args;
use anyhow::Context;
use clap::{Parser, ValueEnum};
use clap_verbosity_flag::ErrorLevel;
use std::fs::File;
use std::io::{self, Write};
use std::path::PathBuf;
use std::str::FromStr;
use yadf::{Fdupes, Machine};
fn main() -> anyhow::Result<()> {
human_panic::setup_panic!();
let timer = std::time::Instant::now();
let args = Args::init_from_env();
log::debug!("{:?}", args);
let config = build_config(&args);
log::debug!("{:?}", config);
let bag = args.algorithm.run(config);
let rfactor = args.rfactor.unwrap_or_default();
let replicates = bag.replicates(rfactor.into());
match args.output {
Some(path) => {
let context = || format!("writing output to the file: {:?}", path.display());
let file = File::create(&path).with_context(context)?;
args.format.display(file, replicates)
}
None => args.format.display(io::stdout().lock(), replicates),
}
.context("writing output")?;
log::debug!("{:?} elapsed", timer.elapsed());
Ok(())
}
/// Builds the `yadf::Yadf` scan configuration from the parsed arguments.
///
/// Unix variant: in addition to the common settings it forwards the
/// `hard_links` flag, which only exists on Unix builds of `Args`.
#[cfg(unix)]
fn build_config(args: &Args) -> yadf::Yadf<PathBuf> {
    // `Args::min` / `Args::max` are defined outside this part of the file.
    let (smallest, largest) = (args.min(), args.max());
    let regex_filter = args.regex.clone();
    let glob_filter = args.pattern.clone();

    // `paths` is passed inline: the target of `as_ref()` is inferred from the setter.
    yadf::Yadf::builder()
        .paths(args.paths.as_ref())
        .minimum_file_size(smallest)
        .maximum_file_size(largest)
        .regex(regex_filter)
        .glob(glob_filter)
        .max_depth(args.max_depth)
        .hard_links(args.hard_links)
        .build()
}
/// Builds the `yadf::Yadf` scan configuration from the parsed arguments.
///
/// Non-Unix variant: identical to the Unix one except that no `hard_links`
/// setting is forwarded (the field does not exist on these targets).
#[cfg(not(unix))]
fn build_config(args: &Args) -> yadf::Yadf<PathBuf> {
    // `Args::min` / `Args::max` are defined outside this part of the file.
    let (smallest, largest) = (args.min(), args.max());
    let regex_filter = args.regex.clone();
    let glob_filter = args.pattern.clone();

    // `paths` is passed inline: the target of `as_ref()` is inferred from the setter.
    yadf::Yadf::builder()
        .paths(args.paths.as_ref())
        .minimum_file_size(smallest)
        .maximum_file_size(largest)
        .regex(regex_filter)
        .glob(glob_filter)
        .max_depth(args.max_depth)
        .build()
}
impl Algorithm {
    /// Runs the scan described by `config` with the hasher this variant selects.
    ///
    /// Logs the chosen algorithm at debug level, then calls `config.scan` with
    /// the matching hasher type and returns the resulting `yadf::FileCounter`.
    fn run<P: AsRef<std::path::Path>>(&self, config: yadf::Yadf<P>) -> yadf::FileCounter {
        log::debug!("using {:?} hashing", self);
        match *self {
            Self::AHash => config.scan::<ahash::AHasher>(),
            Self::Highway => config.scan::<highway::HighwayHasher>(),
            Self::MetroHash => config.scan::<metrohash::MetroHash>(),
            Self::SeaHash => config.scan::<seahash::SeaHasher>(),
            Self::XxHash => config.scan::<twox_hash::XxHash64>(),
        }
    }
}
impl Format {
    /// Writes `replicates` to `writer` in the representation selected by `self`.
    ///
    /// Output goes through a 64 KiB `BufWriter`. The buffer is only lent
    /// (`&mut`) to the CSV / line-delimited JSON helpers and is flushed
    /// explicitly before returning: dropping a `BufWriter` also flushes, but
    /// silently discards any error, which would hide a failed final write
    /// (full disk, closed pipe, ...).
    ///
    /// # Errors
    ///
    /// Returns an error if serialization fails or if writing to / flushing
    /// `writer` fails.
    fn display<W>(&self, writer: W, replicates: yadf::FileReplicates<'_>) -> anyhow::Result<()>
    where
        W: Write,
    {
        let mut writer = io::BufWriter::with_capacity(64 * 1024, writer);
        match self {
            Format::Json => {
                serde_json::to_writer(&mut writer, &replicates)?;
                writer.write_all(b"\n")?;
            }
            Format::JsonPretty => {
                serde_json::to_writer_pretty(&mut writer, &replicates)?;
                writer.write_all(b"\n")?;
            }
            // Lend the buffer instead of moving it so it can still be flushed below.
            Format::Csv => csv_to_writer(&mut writer, &replicates)?,
            Format::LdJson => ldjson_to_writer(&mut writer, &replicates)?,
            Format::Fdupes => writeln!(writer, "{}", replicates.display::<Fdupes>())?,
            Format::Machine => writeln!(writer, "{}", replicates.display::<Machine>())?,
        }
        // Surface errors from the last buffered bytes instead of losing them on drop.
        writer.flush()?;
        Ok(())
    }
}
// Command-line arguments, parsed through clap's derive API.
//
// NOTE: plain `//` comments are used here on purpose. clap's derive turns `///`
// doc comments into help text, so doc comments would change `--help` output.
#[derive(Parser, Debug)]
pub struct Args {
// Positional arguments; forwarded to the scan configuration via `.paths(..)`.
#[clap(value_parser)]
paths: Vec<PathBuf>,
// `-f` / `--format`: output representation, matched case-insensitively.
// Defaults to `Format::default()`.
#[clap(short, long, value_enum, default_value_t, ignore_case = true)]
format: Format,
// `-a` / `--algorithm`: hasher used for the scan, matched case-insensitively.
// Defaults to `Algorithm::default()`.
#[clap(short, long, value_enum, default_value_t, ignore_case = true)]
algorithm: Algorithm,
// `-n` / `--no-empty`. Not read in this part of the file.
// NOTE(review): presumably folded into the minimum size by `Args::min` — confirm.
#[clap(short, long)]
no_empty: bool,
// `--min <size>`: parsed through `Byte::from_str`; consumed via `Args::min()`
// (defined outside this view) for `.minimum_file_size(..)`.
#[clap(long, value_name = "size")]
min: Option<Byte>,
// `--max <size>`: parsed through `Byte::from_str`; consumed via `Args::max()`
// (defined outside this view) for `.maximum_file_size(..)`.
#[clap(long, value_name = "size")]
max: Option<Byte>,
// `-d` / `--depth <depth>`: forwarded to the configuration's `.max_depth(..)`.
#[clap(short = 'd', long = "depth", value_name = "depth")]
max_depth: Option<usize>,
// `-H` / `--hard-links`: only present on Unix builds; forwarded to `.hard_links(..)`.
#[cfg_attr(unix, clap(short = 'H', long))]
#[cfg(unix)]
hard_links: bool,
// `-R` / `--regex`: forwarded to the configuration's `.regex(..)`.
#[clap(short = 'R', long)]
regex: Option<regex::Regex>,
// `-p` / `--pattern <glob>`: forwarded to the configuration's `.glob(..)`.
#[clap(short, long, value_name = "glob")]
pattern: Option<globset::Glob>,
// Flattened verbosity flags from `clap_verbosity_flag`, parameterised with `ErrorLevel`.
// Not read in this part of the file.
// NOTE(review): presumably used by `Args::init_from_env` to set up logging — confirm.
#[clap(flatten)]
verbosity: clap_verbosity_flag::Verbosity<ErrorLevel>,
// `--rfactor`: replication factor; `main` falls back to the type's `Default`
// when absent and converts it with `.into()` for `bag.replicates(..)`.
#[clap(long)]
rfactor: Option<ReplicationFactor>,
// `-o` / `--output`: file to write the report to; `main` writes to stdout when absent.
#[clap(short, long)]
output: Option<PathBuf>,
}
// Output formats accepted by the `format` argument; `Format::display` dispatches on them.
//
// NOTE: plain `//` comments are used on purpose. clap's `ValueEnum` derive turns
// `///` doc comments on variants into help text for the possible values.
#[derive(ValueEnum, Debug, Clone, Default)]
enum Format {
// Written by `csv_to_writer`.
Csv,
// Default format; rendered through `replicates.display::<Fdupes>()`.
#[default]
Fdupes,
// Compact JSON via `serde_json::to_writer`, followed by a newline.
Json,
// Pretty-printed JSON via `serde_json::to_writer_pretty`, followed by a newline.
JsonPretty,
// One JSON value per line, written by `ldjson_to_writer`.
LdJson,
// Rendered through `replicates.display::<Machine>()`.
Machine,
}
// Hashing algorithms accepted by the `algorithm` argument; `Algorithm::run` maps each
// variant to a hasher type. `rename_all = "lower"` makes the CLI value names lowercase.
//
// NOTE: plain `//` comments are used on purpose. clap's `ValueEnum` derive turns
// `///` doc comments on variants into help text for the possible values.
#[derive(ValueEnum, Debug, Clone, Default)]
#[clap(rename_all = "lower")]
enum Algorithm {
// Default; scans with `ahash::AHasher`.
#[default]
AHash,
// Scans with `highway::HighwayHasher`.
Highway,
// Scans with `metrohash::MetroHash`.
MetroHash,
// Scans with `seahash::SeaHasher`.
SeaHash,
// Scans with `twox_hash::XxHash64`.
XxHash,
}
/// Newtype around `byte_unit::Byte`, used as the value type of the `min` / `max` arguments.
///
/// It carries its own `FromStr` implementation whose error type is `String`.
// NOTE(review): presumably needed so clap can parse and report size arguments — confirm.
#[derive(Debug, Clone)]
struct Byte(byte_unit::Byte);
impl FromStr for Byte {
type Err = String;
fn from_str(s: &str) -> Result<Self, Self::Err> {
byte_unit::Byte::from_str(s)
.map(Byte)
.map_err(|e| e.to_string())
}
}
/// Replication-factor selector, the value type of the `rfactor` argument.
///
/// `main` falls back to the type's `Default` when the argument is absent and converts
/// the value with `.into()` before handing it to `bag.replicates(..)`. The `Default`,
/// parsing and conversion impls are defined outside this part of the file.
// NOTE(review): the variants presumably select groups with fewer than / exactly /
// more than the given number of copies — confirm against the conversion impl.
#[derive(Debug, PartialEq, Clone)]
enum ReplicationFactor {
Under(usize),
Equal(usize),
Over(usize),
}
/// Writes `replicates` to `writer` as CSV.
///
/// The first record is the literal header `count,files`; every following record
/// holds the number of files in a group followed by the files themselves. The
/// writer is built as `flexible` because records have a varying number of fields,
/// and automatic header generation is turned off since the header is written by hand.
///
/// The CSV writer takes ownership of `writer`, so it is flushed explicitly before
/// returning: the flush performed when it is dropped ignores errors, which would
/// hide a failed final write.
///
/// # Errors
///
/// Returns an error if serializing a record or flushing the output fails.
fn csv_to_writer<W>(writer: W, replicates: &yadf::FileReplicates<'_>) -> csv::Result<()>
where
    W: Write,
{
    let mut writer = csv::WriterBuilder::new()
        .flexible(true)
        .has_headers(false)
        .from_writer(writer);
    writer.serialize(("count", "files"))?;
    for files in replicates {
        writer.serialize((files.len(), files))?;
    }
    // Report flush failures instead of letting `Drop` swallow them.
    writer.flush()?;
    Ok(())
}
/// Writes `replicates` to `writer` as line-delimited JSON: one JSON value per
/// group, each followed by a newline.
///
/// `writer` is taken by value, so the caller cannot flush it afterwards. It is
/// therefore flushed here before returning, so that a buffering writer reports
/// a failed final write instead of losing the error when it is dropped.
///
/// # Errors
///
/// Returns an error if serializing a group, writing to, or flushing `writer` fails.
fn ldjson_to_writer<W>(mut writer: W, replicates: &yadf::FileReplicates<'_>) -> anyhow::Result<()>
where
    W: Write,
{
    for files in replicates {
        serde_json::to_writer(&mut writer, &files)?;
        writeln!(writer)?;
    }
    // Last chance to observe write errors held back by a buffered sink.
    writer.flush()?;
    Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;
    use once_cell::sync::Lazy;

    /// Shared fixture: two groups of two paths each, keyed by 77 and 3.
    static BAG: Lazy<yadf::TreeBag<u64, yadf::Path>> = Lazy::new(|| {
        vec![
            (77, "hello".into()),
            (77, "world".into()),
            (3, "foo".into()),
            (3, "bar".into()),
        ]
        .into_iter()
        .collect()
    });

    /// CSV output starts with the `count,files` header; the key-3 group is
    /// expected before the key-77 group.
    #[test]
    fn csv() {
        let mut buffer = Vec::new();
        // Fail here, with the real cause, rather than on a confusing output mismatch.
        csv_to_writer(&mut buffer, &BAG.duplicates())
            .expect("writing CSV to an in-memory buffer should not fail");
        let result = String::from_utf8(buffer).unwrap();
        let expected = r#"count,files
2,foo,bar
2,hello,world
"#;
        assert_eq!(result, expected);
    }

    /// Line-delimited JSON output has one array per group, one group per line.
    #[test]
    fn ldjson() {
        let mut buffer = Vec::new();
        // Fail here, with the real cause, rather than on a confusing output mismatch.
        ldjson_to_writer(&mut buffer, &BAG.duplicates())
            .expect("writing line-delimited JSON to an in-memory buffer should not fail");
        let result = String::from_utf8(buffer).unwrap();
        let expected = r#"["foo","bar"]
["hello","world"]
"#;
        assert_eq!(result, expected);
    }
}