//! woopdedupe 0.1.5
//!
//! Aggressively deduplicate files in a directory.
use std::collections::HashMap;
use std::error::Error;
use std::fs;
use std::fs::File;
use std::io::{BufReader, Read, Write};
use std::path::{Path, PathBuf};

use clap::Parser;
use data_encoding::HEXUPPER;
use indicatif::ParallelProgressIterator;
use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
use ring::digest::{Context, Digest, SHA256};
use walkdir::WalkDir;
use wax::{Glob, Pattern};

/// Maximum number of hard links created from one source file before the
/// dedupe loop rotates to a fresh source — presumably to stay under
/// filesystem per-inode hard-link limits (NOTE(review): confirm headroom
/// against the target filesystems; many allow far more than 1000).
const MAX_LINKS: usize = 1000;


/// Streams `reader` through SHA-256 and returns the finished digest.
///
/// Reads in fixed-size chunks so arbitrarily large inputs are hashed in
/// constant memory; any read error is propagated to the caller.
pub fn sha256_digest<R: Read>(mut reader: R) -> Result<Digest, std::io::Error> {
    let mut ctx = Context::new(&SHA256);
    let mut chunk = [0u8; 1024];

    loop {
        match reader.read(&mut chunk)? {
            // A zero-length read signals end of stream.
            0 => break Ok(ctx.finish()),
            n => ctx.update(&chunk[..n]),
        }
    }
}

/// Collection of file paths gathered from the source tree.
type PathVec = Vec<PathBuf>;
/// Maps an uppercase-hex SHA-256 digest to every path whose content hashes to it.
type DedupeMap = HashMap<String, PathVec>;

fn hash_source_files(src_dir: &PathBuf, include: &Option<Vec<String>>) -> Result<(DedupeMap, PathVec), Box<dyn Error>> {
    let include_glob: Option<wax::Any> = match include {
        Some(include) => {
            let patterns: Vec<&str> = include.iter().map(|s| &s[..]).collect();
            Some(wax::any::<Glob, _>(patterns).unwrap())
        }
        _ => None
    };

    let files = WalkDir::new(src_dir)
        .into_iter()
        .filter_map(Result::ok)
        .filter(|e| !e.file_type().is_dir())
        .map(|entry| entry.path().to_path_buf())
        .collect::<Vec<_>>();

    // Separate out files that need to be hashed and those that don't
    let mut ignore = vec![false; files.len()];
    if let Some(glob) = include_glob {
        ignore = files.par_iter().progress_count(files.len() as u64)
            .map(|f_path| !glob.is_match(f_path.to_str().unwrap()))
            .collect();
    }

    let (ignore_list, files): (Vec<_>, Vec<_>) = files.iter().enumerate()
        .partition(|(i, _)| ignore[*i]);
    let ignore_list: PathVec = ignore_list.into_iter().map(|(_, f)| f.to_owned()).collect();
    let files: PathVec = files.into_iter().map(|(_, f)| f.to_owned()).collect();


    // Hash the files
    let file_hashes: Vec<String> = files.par_iter().progress_count(files.len() as u64)
        .map(|f_path| {
            let reader = BufReader::new(File::open(f_path).unwrap());
            let digest = sha256_digest(reader).unwrap();
            HEXUPPER.encode(digest.as_ref())
        }).collect();

    let mut hashes: DedupeMap = HashMap::new();
    files.into_iter().zip(file_hashes.into_iter())
        .for_each(|(f_path, hash)| {
            let entry = hashes.entry(hash).or_insert(Vec::new());
            entry.push(f_path);
        });

    Ok((hashes, ignore_list))
}

pub fn dedupe(src_dir: PathBuf, dst_dir: PathBuf, include: Option<Vec<String>>) -> Result<(), Box<dyn Error>> {
    println!("Indexing files in {:?}", src_dir);
    let (hashes, ignore_list) = hash_source_files(&src_dir, &include)?;

    println!("De-duping files with same data");

    let src_dir_str = &src_dir.to_str().unwrap();
    let dst_dir_str = &dst_dir.to_str().unwrap();

    // Link the files to the new location
    for (hash, paths) in hashes {
        let mut src_index = 0;
        let temp = &paths[src_index];
        let mut ln_source = std::path::Path::new(&temp);

        for (i, path) in paths.iter().enumerate() {
            if i >= MAX_LINKS {
                src_index += 1;
                let temp = &paths[src_index];
                ln_source = std::path::Path::new(temp);
            }

            let ln_target = path.to_string_lossy().replace(src_dir_str, dst_dir_str);
            let ln_target = PathBuf::from(ln_target);

            let parent_dir = ln_target.parent().unwrap();
            fs::create_dir_all(parent_dir).unwrap();

            println!("{}: {:?} -> {:?}", hash, ln_source, ln_target);
            fs::hard_link(ln_source, ln_target).unwrap();
        }
    }

    // Link the ignored files
    for path in &ignore_list {
        let ln_source = path;
        let ln_source = std::path::Path::new(&ln_source);

        let ln_target = path.to_string_lossy().replace(src_dir_str, dst_dir_str);
        let ln_target = PathBuf::from(ln_target);

        let parent_dir = ln_target.parent().unwrap();
        fs::create_dir_all(parent_dir).unwrap();

        println!("{:?} -> {:?}", ln_source, ln_target);
        fs::hard_link(ln_source, ln_target).unwrap();
    }

    Ok(())
}

/// Simple program to create a hard-linked copy of a directory, where all files with the same data
/// are hard-linked to the same file.
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
pub struct Args {
    /// Source directory
    #[arg(
        value_name = "SRC",
        help = "The source directory to de-dupe",
        value_parser = Args::validate_src_dir
    )]
    pub source: PathBuf,

    /// Destination directory
    #[arg(
        value_name = "DST",
        // Fixed doubled word: "where the the" -> "where the".
        help = "The destination directory where the de-duped copy will be located",
        value_parser = Args::validate_dst_dir,
    )]
    pub destination: PathBuf,

    /// No confirmation
    #[arg(
        short = 'y',
        long = "yes",
        help = "Don't ask for confirmation before creating the de-duped directory"
    )]
    pub yes: bool,

    /// Include filters
    #[arg(
        short = 'i',
        long = "include",
        // Fixed typo: "globs patterns" -> "glob patterns".
        help = "Optional glob patterns for files to include in the deduplication"
    )]
    pub include: Option<Vec<String>>,
}

impl Args {
    /// Custom clap value parser: the source must be an existing directory.
    fn validate_src_dir(src_dir: &str) -> Result<PathBuf, String> {
        let src_dir = PathBuf::from(src_dir);
        if !src_dir.exists() {
            Err(format!("Source directory {:?} doesn't exist", src_dir))
        } else if !src_dir.is_dir() {
            Err(format!("Source directory {:?} is not a directory", src_dir))
        } else {
            Ok(src_dir)
        }
    }

    /// Custom clap value parser: the destination must be an empty directory
    /// (created on the fly if it doesn't exist but its parent does).
    fn validate_dst_dir(dst_dir: &str) -> Result<PathBuf, String> {
        let dst_dir = PathBuf::from(dst_dir);

        // Destination exists: accept only an empty directory.
        if dst_dir.exists() {
            if !dst_dir.is_dir() {
                return Err(format!("Destination {:?} is not a directory.", dst_dir));
            }
            // Propagate read errors (e.g. permissions) instead of panicking.
            let mut entries = dst_dir
                .read_dir()
                .map_err(|e| format!("Cannot read destination {:?}: {}", dst_dir, e))?;
            if entries.next().is_some() {
                return Err(format!("Destination {:?} is not empty.", dst_dir));
            }
            // Destination directory exists and is empty.
            return Ok(dst_dir);
        }

        // Destination doesn't exist: its parent must. A bare relative name
        // like "newdir" yields an empty parent path, which means the current
        // directory — the old `parent().unwrap()` + exists() check wrongly
        // rejected that case (and panicked on a root path).
        let parent = match dst_dir.parent() {
            Some(p) if !p.as_os_str().is_empty() => p.to_path_buf(),
            _ => PathBuf::from("."),
        };
        if !parent.exists() {
            return Err(format!(
                "Destination {:?} must have an existing parent directory.",
                dst_dir
            ));
        }

        // Create the directory, reporting failure instead of panicking.
        fs::create_dir_all(&dst_dir)
            .map_err(|e| format!("Cannot create destination {:?}: {}", dst_dir, e))?;
        Ok(dst_dir)
    }

    /// Interactively confirms the de-dupe operation unless `--yes` was given.
    ///
    /// Exits the process with status 0 when the user declines or stdin is
    /// closed; returns `Ok(())` only on an explicit yes (or `--yes`).
    pub fn confirm_action(self: &Args) -> Result<(), Box<dyn Error>> {
        if self.yes {
            return Ok(());
        }

        println!(
            "This will create a hard-linked copy of {:?} in {:?}",
            self.source, self.destination
        );
        println!("All files with the same data will be hard-linked to the same file.");
        println!("This will not modify the source directory.");
        println!();

        let mut input = String::new();
        print!("Continue? [y/N] ");
        std::io::stdout().flush()?;

        loop {
            input.clear();
            // 0 bytes read means stdin was closed (EOF): take the safe
            // default and abort. The old while-let fell out of the loop on
            // EOF and silently *proceeded* with the operation.
            if std::io::stdin().read_line(&mut input)? == 0 {
                println!("Aborting");
                std::process::exit(0);
            }
            match input.chars().next() {
                Some('y') | Some('Y') => break,
                // '\n' (and Windows '\r') is a bare Enter: default is No.
                Some('n') | Some('N') | Some('\n') | Some('\r') | None => {
                    println!("Aborting");
                    std::process::exit(0);
                }
                _ => {
                    println!("Please enter y or n");
                }
            }
        }

        Ok(())
    }
}