use {
    anyhow::Result,
    crossbeam::channel,
    crate::{
        dup::Dup,
        ext,
        hash::FileHash,
    },
    fnv::FnvHashMap,
    rayon::{
        prelude::ParallelIterator,
        iter::ParallelBridge,
    },
    std::{
        fs,
        path::{Path, PathBuf},
    },
};
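// Not shown in this file: the `Dup`, `FileHash` and `ext` items pulled in
// above. From the way they're used below, they're assumed to look roughly
// like this (a sketch, not the actual definitions):
//
//     // crate::dup -- one group of files sharing the same hash
//     #[derive(Default)]
//     pub struct Dup {
//         pub paths: Vec<PathBuf>,
//     }
//
//     // crate::hash -- FileHash::new reads and hashes a file; the type is
//     // usable as a map key (Eq + Hash)
//     impl FileHash {
//         pub fn new(path: &Path) -> anyhow::Result<Self> { /* ... */ }
//     }
//
//     // crate::ext -- extension based filter
//     pub fn is_image(ext: &str) -> bool { /* ... */ }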

/// A map from file hash to the group of paths sharing that hash,
/// plus the number of files hashed so far.
#[derive(Default)]
pub struct DupMap {
    /// all hashed files, grouped by hash
    pub dups: FnvHashMap<FileHash, Dup>,
    /// number of files which were hashed
    pub seen: usize,
}

impl DupMap {
    /// Hash a single file and register its path under that hash.
    pub fn add_file(&mut self, path: &Path) -> Result<()> {
        let hash = FileHash::new(path)?;
        let e = self.dups.entry(hash).or_default();
        e.paths.push(path.to_path_buf());
        self.seen += 1;
        Ok(())
    }
    /// Keep only the entries with more than one path, i.e. the real duplicates.
    pub fn compile(&mut self) {
        self.dups.retain(|_, d| d.paths.len() > 1);
    }
    /// Number of groups of duplicates.
    pub fn len(&self) -> usize {
        self.dups.len()
    }
    /// Walk the tree from `root`, hash every image file in parallel,
    /// and keep only the hashes shared by more than one file.
    pub fn build(root: PathBuf) -> Result<Self> {
        let (s_matching_files, r_matching_files) = channel::unbounded();
        let (s_hashed_files, r_hashed_files) = channel::unbounded::<(PathBuf, FileHash)>();

        // walk the directory tree in a dedicated thread, sending the files
        // matching by extension into the first channel
        let file_generator = std::thread::spawn(move || {
            let mut dirs = vec![root];
            while let Some(dir) = dirs.pop() {
                if let Ok(entries) = fs::read_dir(&dir) {
                    for e in entries.flatten() {
                        if let Ok(md) = e.metadata() {
                            let path = e.path();
                            let name = match path.file_name().and_then(|s| s.to_str()) {
                                Some(s) => s,
                                None => { continue; },
                            };
                            if md.is_dir() {
                                // Until I implement gitignore, I'll just avoid
                                // my ~/dev directory
                                if name == "dev" {
                                    continue;
                                }
                                // we push the directory onto the stack of dirs
                                // still needing processing
                                dirs.push(path);
                                continue;
                            }
                            let ext = match path.extension().and_then(|s| s.to_str()) {
                                Some(s) => s,
                                None => { continue; },
                            };
                            if !ext::is_image(ext) {
                                continue;
                            }
                            s_matching_files.send(path).unwrap();
                        }
                    }
                }
            }
        });

        // parallel computation of the hashes: the receiver is bridged into a
        // rayon parallel iterator, and each (path, hash) pair is sent into
        // the second channel
        r_matching_files.into_iter().par_bridge()
            .for_each_with(s_hashed_files, |s, path| {
                if let Ok(hash) = FileHash::new(&path) {
                    s.send((path, hash)).unwrap();
                }
            });

        // single threaded collection of the results into the map
        let mut dups: FnvHashMap<FileHash, Dup> = FnvHashMap::default();
        let mut seen = 0;
        for (path, hash) in r_hashed_files {
            let e = dups.entry(hash).or_default();
            e.paths.push(path); // path is already a PathBuf, no clone needed
            seen += 1;
        }

        // the walking thread has necessarily finished at this point; join it
        // so that a panic in it wouldn't be silently ignored
        file_generator.join().unwrap();

        // we only keep the hashes for which there's more than one file
        dups.retain(|_, d| d.paths.len() > 1);

        Ok(Self { dups, seen })
    }
}
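
// A minimal usage sketch (hypothetical, not part of the original module):
// building the map for a root directory and listing the duplicate groups.
// The function name and the printed format are illustrative only.
#[allow(dead_code)]
fn print_dups(root: PathBuf) -> Result<()> {
    let map = DupMap::build(root)?;
    println!("{} files hashed, {} groups of duplicates", map.seen, map.len());
    for dup in map.dups.values() {
        println!("{:#?}", dup.paths);
    }
    Ok(())
}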