dsc 0.1.3

dsc is a CLI tool for finding and removing duplicate files on one or more file systems, while respecting your gitignore rules.
extern crate clap;
#[macro_use]
extern crate log;

use std::ffi::OsString;
use std::io::{BufRead, Write};
use std::path::{Path, PathBuf};
use std::{fs, io, process};

use anyhow::{anyhow, Context, Result};
use atty::Stream;
use indicatif::HumanBytes;

use crate::candidate_selection::candidate_selector::CandidateSelector;
use crate::duplicate_detection::DuplicateDetector;
use crate::file_descriptor::DeviceDescriptor;
use crate::options::{AnalysisMode, Format, LinkSelectionPreference, Options};
use crate::report::csv::CSVWriter;
use crate::report::json::JSONWriter;
use crate::report::report_writer::ReportWriter;
use crate::types::{Duplicate, FileDescriptorWithPaths};
use crate::ui::exit_codes::ExitCode;
use crate::ui::format::HumanInteger;
use rustc_hash::FxHashMap;

mod app;
mod candidate_selection;
mod concurrency;
mod duplicate_detection;
mod file_descriptor;
mod filesystem;
mod options;
mod report;
mod types;
mod ui;

/// Program entry point: initialises logging, runs the application and
/// terminates the process with the resulting exit code.
fn main() {
    pretty_env_logger::init();

    // Reduce both outcomes to an exit code so the process exits in one place.
    let exit_code = match run() {
        Ok(code) => code,
        Err(err) => {
            error!("[dsc error]: {:#}", err);
            ExitCode::GeneralError
        }
    };

    process::exit(exit_code.into());
}

/// The subcommand selected on the command line; `run` derives it from the
/// subcommand name and `scan` dispatches on it.
enum Subcommand {
    /// Selected by the `link` subcommand; handled by `link`.
    Link,
    /// Selected by the `cmp` subcommand; handled by `cmp`.
    Compare,
    /// Selected by the `report` subcommand; handled by `report`.
    Report,
}

fn run() -> Result<ExitCode> {
    let mut matches = app::build_app().get_matches();

    let subcommand = match matches.subcommand_name() {
        Some("report") => Subcommand::Report,
        Some("link") => Subcommand::Link,
        Some("cmp") => Subcommand::Compare,
        _ => {
            app::build_app().print_help()?;
            return Ok(ExitCode::Success);
        }
    };

    if let (_, Some(subcommand)) = matches.subcommand() {
        matches = subcommand.to_owned()
    }

    // Prefer stdin over arg provided paths
    let optional_paths = stdin_paths().or_else(|| {
        matches
            .values_of_os("path")
            .map(|os_values| os_values.map(OsString::from).collect())
    });

    let search_paths = if let Some(paths) = optional_paths {
        let mut directories = vec![];

        for path in paths {
            let path_buffer = PathBuf::from(path);
            if filesystem::is_valid_path(&path_buffer) {
                directories.push(path_buffer.canonicalize()?);
            } else {
                eprintln!(
                    "[dsc error]: {}",
                    format!(
                        "Search path '{}' is not a directory.",
                        path_buffer.to_string_lossy()
                    )
                );
            }
        }

        directories
    } else {
        let current_directory = Path::new(".");

        if !filesystem::is_valid_path(current_directory) {
            return Err(anyhow!(
                "Could not retrieve current directory (has it been deleted?)."
            ));
        }

        vec![current_directory.to_path_buf()]
    };

    if search_paths.is_empty() {
        return Err(anyhow!("No valid search paths given."));
    }

    let options = Options::from_matches(matches)?;

    debug!("options: {:?}", options);
    debug!("paths: {:?}", &search_paths);

    scan(subcommand, &search_paths, options)
}

/// Reads search paths from standard input, one path per line.
///
/// Returns `None` when stdin is a terminal, i.e. nothing was piped in.
/// Otherwise returns every line that could be read, even if that is none at
/// all: an empty piped list must not make the caller fall back to other path
/// sources.
///
/// Read failures no longer panic. A line that is not valid UTF-8 is reported
/// on stderr and skipped; any other I/O error is reported and ends reading,
/// keeping the paths collected so far.
fn stdin_paths() -> Option<Vec<OsString>> {
    if atty::is(Stream::Stdin) {
        return None;
    }

    let mut lines = vec![];
    let stdin = io::stdin();

    for line in stdin.lock().lines() {
        match line {
            Ok(line) => lines.push(OsString::from(line)),
            Err(err) if err.kind() == io::ErrorKind::InvalidData => {
                // The offending bytes are already consumed, so the following
                // lines can still be read.
                eprintln!("[dsc error]: Skipping search path from stdin: {}", err);
            }
            Err(err) => {
                // Any other failure may repeat on every read; stop here.
                eprintln!(
                    "[dsc error]: Could not read search paths from stdin: {}",
                    err
                );
                break;
            }
        }
    }

    Some(lines)
}

/// Collects duplicate candidates below `path_vec` and hands them to the
/// handler of the chosen `subcommand`.
///
/// In `AnalysisMode::Exact` the candidates are additionally passed through
/// the `DuplicateDetector` before being handed over; in any other mode they
/// are used as selected.
fn scan(subcommand: Subcommand, path_vec: &[PathBuf], options: Options) -> Result<ExitCode> {
    let selector = CandidateSelector::new(options);
    let candidates = selector.select_candidates(path_vec)?;

    let duplicates = if options.analysis_mode == AnalysisMode::Exact {
        let detector = DuplicateDetector::new(options);
        detector.check_duplicates(candidates)?
    } else {
        candidates
    };

    match subcommand {
        Subcommand::Report => report(duplicates, options),
        Subcommand::Link => link(duplicates, options),
        Subcommand::Compare => cmp(duplicates, options),
    }
}

/// Prints summary statistics about the duplicates that were found.
///
/// When duplicates exist and `options.error_on_duplicate` is set, nothing is
/// printed and `ExitCode::DuplicateFound` is returned instead.
///
/// For each duplicate, all locations but one count as redundant. A duplicate
/// without any location contributes nothing instead of underflowing.
fn cmp(duplicates: Vec<Duplicate>, options: Options) -> Result<ExitCode> {
    if !duplicates.is_empty() && options.error_on_duplicate {
        return Ok(ExitCode::DuplicateFound);
    }

    // Number of locations beyond the first one; never underflows.
    let redundant_copies =
        |duplicate: &Duplicate| (duplicate.locations.len() as u64).saturating_sub(1);

    let total_bytes: u64 = duplicates
        .iter()
        .map(|duplicate| duplicate.file_size * redundant_copies(duplicate))
        .sum();

    let duplicate_files: u64 = duplicates.iter().map(&redundant_copies).sum();

    println!("Duplicate data        : {}", HumanBytes(total_bytes));
    println!(
        "Total duplicates      : {}",
        HumanInteger(duplicates.len() as u64)
    );
    println!("Total duplicate files : {}", HumanInteger(duplicate_files));

    Ok(ExitCode::Success)
}

/// Replaces duplicate files with hard links to one retained copy.
///
/// Duplicates are regrouped per device first; groups with a single location
/// are dropped. Within each group the locations are ordered by file age and
/// either the oldest (`LinkSelectionPreference::Oldest`) or the last one in
/// that order is kept as link target. Every path of the remaining locations
/// is then replaced by a hard link to the target.
///
/// Unless `options.force` is set the user has to confirm on the terminal;
/// without `--force` a non-terminal stdin is rejected. With `options.dry_run`
/// the planned links are only printed, and the final summary carries the same
/// "(dryrun) " prefix as the confirmation prompt.
///
/// # Errors
///
/// Returns an error when confirmation cannot be read, when file metadata
/// cannot be read, when a target has no path or parent directory, or when
/// creating or moving a hard link fails. A failed link no longer gets masked
/// by a failing cleanup of the temporary file.
fn link(duplicates: Vec<Duplicate>, options: Options) -> Result<ExitCode> {
    if duplicates.is_empty() {
        println!("No duplicates, nothing to be done");
        return Ok(ExitCode::Success);
    }

    let assignments = group_by_device(duplicates);

    // Every assignment holds at least two locations, so `- 1` cannot underflow.
    let count: u64 = assignments
        .iter()
        .map(|assignment| assignment.locations.len() as u64 - 1)
        .sum();
    let data_size: u64 = assignments
        .iter()
        .map(|assignment| (assignment.locations.len() as u64 - 1) * assignment.file_size)
        .sum();

    let dry_run_prefix = if options.dry_run { "(dryrun) " } else { "" };

    if !options.force {
        if atty::isnt(Stream::Stdin) {
            return Err(anyhow!(
                "--force parameter needs to be provided to allow linking when reading from stdin"
            ));
        }

        let mut line = String::new();
        print!(
            "{}Are you sure you want to link {} files? [y/N]: ",
            dry_run_prefix,
            HumanInteger(count)
        );
        std::io::stdout().flush()?;
        std::io::stdin()
            .read_line(&mut line)
            .context("Could not read from stdin")?;

        if !line.trim().eq_ignore_ascii_case("y") {
            println!("Canceling link command");
            return Ok(ExitCode::Success);
        }
    }

    let uuid = uuid::Uuid::new_v4();
    for (i, assignment) in assignments.into_iter().enumerate() {
        let locations = sort_by_age(assignment.locations)?;

        let (target, sources) = match options.link_priority {
            LinkSelectionPreference::Oldest => locations.split_first(),
            _ => locations.split_last(),
        }
        .ok_or_else(|| anyhow!("Link assignment without any location"))?;

        let to_path = target
            .paths
            .iter()
            .next()
            .ok_or_else(|| anyhow!("Link target without any path"))?;

        // The temporary link is created next to the target and then moved
        // over each duplicate.
        let temp_path = to_path
            .parent()
            .ok_or_else(|| anyhow!("Link target {:?} has no parent directory", to_path))?
            .join(format!("dsc.{}.{}", i, uuid));

        for source in sources {
            for from_path in &source.paths {
                if options.dry_run {
                    println!("linking {:?} => {:?}", from_path, to_path)
                } else {
                    replace_with_hard_link(to_path, from_path, &temp_path)?;
                }
            }
        }
    }

    println!(
        "{}Done. Reclaimed {} of disk space.",
        dry_run_prefix,
        HumanBytes(data_size)
    );
    Ok(ExitCode::Success)
}

/// Splits every duplicate into per-device groups and keeps the groups that
/// contain more than one location.
fn group_by_device(duplicates: Vec<Duplicate>) -> Vec<LinkAssignment> {
    let mut assignments: Vec<LinkAssignment> = Vec::new();

    for duplicate in duplicates {
        let file_size = duplicate.file_size;
        let mut handles_by_device: FxHashMap<DeviceDescriptor, Vec<FileDescriptorWithPaths>> =
            FxHashMap::default();

        for location in duplicate.locations {
            let dev = location.file_descriptor.device_descriptor;
            handles_by_device.entry(dev).or_default().push(location);
        }

        assignments.extend(
            handles_by_device
                .into_iter()
                .map(|(_, locations)| locations)
                .filter(|locations| locations.len() > 1)
                .map(|locations| LinkAssignment {
                    file_size,
                    locations,
                }),
        );
    }

    assignments
}

/// Orders locations from oldest to newest, reading each timestamp exactly once.
fn sort_by_age(locations: Vec<FileDescriptorWithPaths>) -> Result<Vec<FileDescriptorWithPaths>> {
    let mut keyed = Vec::with_capacity(locations.len());
    for location in locations {
        let timestamp = location_timestamp(&location)?;
        keyed.push((timestamp, location));
    }

    // Stable sort; locations without any path (`None`) sort first.
    keyed.sort_by_key(|(timestamp, _)| *timestamp);

    Ok(keyed.into_iter().map(|(_, location)| location).collect())
}

/// Returns the creation time of the location's first path, falling back to
/// its modification time; `None` when the location has no path.
fn location_timestamp(
    location: &FileDescriptorWithPaths,
) -> Result<Option<std::time::SystemTime>> {
    let path: &PathBuf = match location.paths.iter().next() {
        Some(path) => path,
        None => return Ok(None),
    };

    let metadata = path
        .metadata()
        .with_context(|| format!("Could not read metadata of {:?}", path))?;
    let timestamp = metadata
        .created()
        .or_else(|_| metadata.modified())
        .with_context(|| format!("Could not read timestamps of {:?}", path))?;

    Ok(Some(timestamp))
}

/// Replaces `duplicate` with a hard link to `target`, going through `temp_path`.
fn replace_with_hard_link(target: &Path, duplicate: &Path, temp_path: &Path) -> Result<()> {
    // If this fails the temporary link was not created, so there is nothing
    // to clean up.
    fs::hard_link(target, temp_path)
        .with_context(|| format!("Unable to create hard link {:?} => {:?}", temp_path, target))?;

    if let Err(err) = fs::rename(temp_path, duplicate) {
        // Best-effort cleanup; the rename failure is the error worth reporting.
        let _ = fs::remove_file(temp_path);
        return Err(err).with_context(|| {
            format!(
                "Unable to replace {:?} with a hard link to {:?}",
                duplicate, target
            )
        });
    }

    Ok(())
}

/// A group of duplicate locations that share one device, as built by `link`;
/// all but one of them get replaced by hard links.
struct LinkAssignment {
    /// Size of a single file of the group, taken from `Duplicate::file_size`.
    file_size: u64,
    /// The duplicate locations of the group; `link` only creates groups with
    /// more than one location.
    locations: Vec<FileDescriptorWithPaths>,
}

/// Writes a report of the found duplicates to stdout, using the writer that
/// matches `options.format`.
///
/// # Errors
///
/// Propagates any error returned by the report writer.
fn report(mut duplicates: Vec<Duplicate>, options: Options) -> Result<ExitCode> {
    // Pick the writer implementation for the requested output format.
    let mut report_writer: Box<dyn ReportWriter> = match options.format {
        Format::JSON => Box::new(JSONWriter {}),
        Format::CSV => Box::new(CSVWriter {}),
    };

    let mut stdout = io::stdout();
    report_writer.write(&mut duplicates, &mut stdout)?;

    Ok(ExitCode::Success)
}