1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
//! Duplicate file checking functions for Rust.
#![deny(missing_docs)]

extern crate sha2;

#[macro_use]
mod macros;
mod utilities;

use std::error::Error;
use std::io;
use std::path::PathBuf;
use utilities::PathUtilities;

/// A SHA-256 hash paired with every file path recorded under it.
#[derive(Debug)]
pub struct FileHash {
    /// The SHA-256 hash shared by all of `files`.
    hash: String,
    /// Paths of the files recorded under `hash`.
    files: Vec<PathBuf>,
}

impl FileHash {
    /// Returns an owned copy of the SHA-256 hash.
    pub fn get_hash(&self) -> String {
        self.hash.to_owned()
    }

    /// Borrows the list of files recorded under this hash.
    pub fn get_files(&self) -> &Vec<PathBuf> {
        let files = &self.files;
        files
    }

    /// Returns an owned copy of the hash together with a borrow of the files.
    pub fn get_hash_and_files(&self) -> (String, &Vec<PathBuf>) {
        (self.get_hash(), self.get_files())
    }

    /// Appends `file` to the list of files recorded under this hash.
    pub fn add_file(&mut self, file: PathBuf) {
        self.files.push(file);
    }

    /// Returns how many files are recorded under this hash.
    pub fn total_files(&self) -> usize {
        self.get_files().len()
    }
}

/// Checks for duplicates of specified files.
///
/// If directories are specified, they will be checked; otherwise, a file's
/// parent directory will be checked.  A file given as a bare name (for
/// example `foo.txt`) is looked for in the current directory (`.`).
///
/// Every candidate path is handed to `duplicate_files` at most once, even if
/// several of the `files` share a parent directory or the same directory is
/// listed more than once, so a file is not reported as a duplicate of itself
/// merely because it was collected twice.
///
/// # Errors
///
/// Returns an error if any `files` are not files, any paths within `dirs_opt`
/// are not directories, a file has no parent directory, or if there are I/O
/// errors while trying to read files or directories.
///
/// # Examples
///
/// Check a file for duplicates within its parent directory:
///
/// ```
/// use std::path::PathBuf;
///
/// let files = vec![PathBuf::from("foo.txt")];
/// let dup_result = dupcheck::duplicates_of(&files, None);
/// ```
///
/// Check a file for duplicates within some other directory:
///
/// ```
/// use std::path::PathBuf;
///
/// let files = vec![PathBuf::from("foo.txt")];
/// let dirs = vec![PathBuf::from("bar")];
/// let dup_result = dupcheck::duplicates_of(&files, Some(&dirs));
/// ```
pub fn duplicates_of(files: &[PathBuf], dirs_opt: Option<&[PathBuf]>)
    -> io::Result<Vec<FileHash>>
{
    use std::collections::HashSet;

    // Make sure the files are files.
    if let Some(path) = files.iter().find(|p| !p.is_file()) {
        return Err(io::Error::new(
            io::ErrorKind::InvalidInput,
            format!("{} is not a file", path.display())
        ));
    }

    let mut check_files: Vec<PathBuf> = vec![];

    if let Some(dirs) = dirs_opt {

        // Check all directories for all filesizes.
        // ...but first, these are all directories, right?
        if let Some(path) = dirs.iter().find(|p| !p.is_dir()) {
            return Err(io::Error::new(
                io::ErrorKind::InvalidInput,
                format!("{} is not a directory", path.display())
            ));
        }

        let mut sizes = vec![];

        for file in files {
            let metadata = try_with_path!(file.metadata(), file);
            sizes.push(metadata.len());
        }

        for dir in dirs {
            let dir_files = try_with_path!(
                dir.files_within(Some(&sizes)),
                dir
            );

            check_files.extend(dir_files);
        }

        // If the directories aren't ancestors of the files being checked, the
        // files won't be in the check list, so we need to add them.  Any that
        // are already present are dropped by the de-duplication pass below.
        check_files.extend(files.iter().cloned());
    } else {

        // Check only a file's parent directory for other files of its size.
        for file in files {
            let parent = match file.parent() {
                Some(p) if !p.as_os_str().is_empty() => p.to_path_buf(),
                // `Path::parent` yields the empty path for a bare file name;
                // such a file lives in the current directory.
                Some(_) => PathBuf::from("."),
                None => return Err(io::Error::new(
                    io::ErrorKind::InvalidInput,
                    format!("{} has no parent directory", file.display())
                ))
            };

            let metadata = try_with_path!(file.metadata(), file);
            let sizes = vec![metadata.len()];

            let dir_files = try_with_path!(
                parent.files_within(Some(&sizes)),
                parent
            );

            check_files.extend(dir_files);
        }
    }

    // Keep only the first occurrence of each path.  Without this, two input
    // files sharing a parent directory and size (or a directory listed twice)
    // would put the same path in the list repeatedly and it would be reported
    // as a duplicate of itself.
    // NOTE(review): this compares paths as written; different spellings of
    // the same file (e.g. `a.txt` vs `./a.txt`) are still treated as distinct.
    let mut seen: HashSet<PathBuf> = HashSet::new();
    check_files.retain(|p| seen.insert(p.clone()));

    duplicate_files(&check_files)
}

/// Checks for any duplicate files within the specified directories.
///
/// This checks for any duplicates amongst all files within all specified
/// directories.  If multiple directories need to be checked separately, this
/// function will need to be called for each directory individually.
///
/// A path collected more than once (for instance because the same directory
/// appears twice in `dirs`) is only checked once, so it is not reported as a
/// duplicate of itself.
///
/// # Errors
///
/// Returns an error if any paths within `dirs` are not directories or if there
/// are I/O errors while trying to read files or directories.
///
/// # Examples
///
/// ```
/// use std::path::PathBuf;
///
/// let dirs = vec![
///     PathBuf::from("foo"),
///     PathBuf::from("bar")
/// ];
///
/// let dup_result = dupcheck::duplicates_within(&dirs);
/// ```
pub fn duplicates_within(dirs: &[PathBuf]) -> io::Result<Vec<FileHash>> {
    use std::collections::HashSet;

    // Reject the whole request as soon as one path is not a directory.
    if let Some(path) = dirs.iter().find(|p| !p.is_dir()) {
        return Err(io::Error::new(
            io::ErrorKind::InvalidInput,
            format!("{} is not a directory", path.display())
        ));
    }

    let mut check_files: Vec<PathBuf> = vec![];

    for dir in dirs {
        let dir_files = try_with_path!(dir.files_within(None), dir);

        check_files.extend(dir_files);
    }

    // Keep only the first occurrence of each path so that a directory listed
    // repeatedly cannot make a file look like a duplicate of itself.
    // NOTE(review): paths are compared as written, not canonicalised.
    let mut seen: HashSet<PathBuf> = HashSet::new();
    check_files.retain(|p| seen.insert(p.clone()));

    duplicate_files(&check_files)
}

/// Checks `files` for any duplicate files.
///
/// Returns the SHA-256 hashes, and the paths associated, of those found to be
/// duplicates.  Each hash/files group is represented by a `FileHash`, and only
/// groups containing at least two files are returned.
///
/// A path that appears more than once in `files` is only considered once, so
/// a file is never reported as a duplicate of itself.  Paths are compared as
/// written; they are not canonicalised.
///
/// # Errors
///
/// Returns an error if any `files` are not files or if there are I/O errors
/// while trying to read files.
///
/// # Examples
///
/// ```
/// use std::path::PathBuf;
///
/// let files = vec![
///     PathBuf::from("foo.txt"),
///     PathBuf::from("bar.txt")
/// ];
///
/// let dup_result = dupcheck::duplicate_files(&files);
/// ```
pub fn duplicate_files(files: &[PathBuf]) -> io::Result<Vec<FileHash>> {
    use std::collections::{HashMap, HashSet};

    // Make sure we're dealing with files.
    if let Some(path) = files.iter().find(|p| !p.is_file()) {
        return Err(io::Error::new(
            io::ErrorKind::InvalidInput,
            format!("{} is not a file", path.display())
        ));
    }

    // Organise the files according to their size.  Any files with a unique
    // size within the check list can't be duplicates, so this will make sure
    // we don't waste time on hash checks of those files later.
    //
    // Groups live in a `Vec` so they keep first-seen order; the map only
    // remembers which slot belongs to which size.
    let mut seen: HashSet<&PathBuf> = HashSet::new();
    let mut size_groups: Vec<Vec<&PathBuf>> = vec![];
    let mut slot_of_size: HashMap<u64, usize> = HashMap::new();

    for file in files {
        // A repeated path would otherwise be grouped with itself.
        if !seen.insert(file) {
            continue;
        }

        let metadata = try_with_path!(file.metadata(), file);
        let size = metadata.len();

        match slot_of_size.get(&size).cloned() {
            Some(slot) => size_groups[slot].push(file),
            None => {
                slot_of_size.insert(size, size_groups.len());
                size_groups.push(vec![file]);
            }
        }
    }

    // Check hashes of files where more than one file of its size was found.
    let mut hash_list: Vec<FileHash> = vec![];
    let mut slot_of_hash: HashMap<String, usize> = HashMap::new();

    for group in size_groups.iter().filter(|g| g.len() > 1) {

        for &file in group {
            let hash = try_with_path!(file.sha256(), file);

            match slot_of_hash.get(&hash).cloned() {
                Some(slot) => hash_list[slot].add_file(file.clone()),
                None => {
                    slot_of_hash.insert(hash.clone(), hash_list.len());
                    hash_list.push(FileHash {
                        hash: hash,
                        files: vec![file.clone()]
                    });
                }
            }
        }
    }

    // Remove hashes with only one file associated.
    hash_list.retain(|h| h.total_files() > 1);

    Ok(hash_list)
}