//! dupe_krill/hasher.rs — chunk hashing and incremental file comparison.

1use crate::lazyfile::LazyFile;
2use smallvec::SmallVec;
3use std::cmp::{min, Ordering};
4use std::convert::TryInto;
5use std::io;
6use std::io::{Read, Seek, SeekFrom};
7use std::path::Path;
8
/// A hashed chunk of data of arbitrary size. Files are compared a bit by bit.
///
/// NOTE: the derived `Ord`/`PartialOrd` compare by field declaration order —
/// `size` first, then `hash` — which `Hasher::compare` relies on. Do not
/// reorder the fields.
#[derive(Debug, PartialOrd, Eq, PartialEq, Ord)]
struct HashedRange {
    // Requested chunk length in bytes. On a short read (EOF before `size`
    // bytes) the hash covers fewer bytes but this keeps the requested value.
    size: u64,
    // Truncated blake3 digest: first 20 of the 32 output bytes.
    hash: [u8; 20],
}
15
16impl HashedRange {
17    pub fn from_file(file: &mut LazyFile<'_>, start: u64, size: u64) -> Result<Self, io::Error> {
18        let fd = file.fd()?;
19        fd.seek(SeekFrom::Start(start))?;
20        let mut hasher = blake3::Hasher::new();
21        let mut to_read = size as usize;
22        let mut data = vec![0; to_read];
23        loop {
24            match fd.read(&mut data[0..to_read]) {
25                Ok(0) => break,
26                Ok(n) => {
27                    debug_assert!(n <= to_read);
28                    hasher.update(&data[0..n]);
29
30                    to_read -= n;
31                    if to_read == 0 {
32                        break;
33                    }
34                },
35                Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {},
36                Err(e) => return Err(e),
37            }
38        }
39        Ok(Self {
40            hash: hasher.finalize().as_bytes()[0..20].try_into().unwrap(),
41            size,
42        })
43    }
44}
45
/// Accumulated per-chunk hashes for one file, built up lazily as the file is
/// compared against others.
#[derive(Debug)]
pub struct Hasher {
    // One entry per hashed chunk, in file order. `None` records a chunk whose
    // hashing failed with an I/O error (see `Hasher::push`). The inline
    // capacity of 1 avoids a heap allocation when a single chunk suffices.
    ranges: SmallVec<[Option<HashedRange>; 1]>,
}
50
/// Compares two files using hashes by hashing incrementally until the first difference is found
struct HashIter<'a> {
    // Index of the next chunk to compare (position into each `Hasher`'s `ranges`).
    pub index: usize,
    // Byte offset in both files where the next chunk starts.
    pub start_offset: u64,
    // Total size of the compared range; iteration ends when `start_offset` reaches it.
    pub end_offset: u64,
    // Chunk size to use for the next freshly hashed chunk; grows exponentially
    // (see `next`), starting small so differing files are told apart cheaply.
    next_buffer_size: u64,
    // Both files are opened lazily — no fd is consumed until a chunk must be hashed.
    a_file: LazyFile<'a>,
    b_file: LazyFile<'a>,
}
60
impl<'h> HashIter<'h> {
    /// Start an incremental comparison of two files over bytes `0..size`.
    ///
    /// The files are wrapped in `LazyFile`, so nothing is opened until the
    /// first chunk actually needs hashing.
    pub fn new(size: u64, a_path: &'h Path, b_path: &'h Path) -> Self {
        HashIter {
            index: 0,
            start_offset: 0,
            end_offset: size,
            // Small first chunk: differing files are usually told apart early.
            next_buffer_size: 2048,
            a_file: LazyFile::new(a_path),
            b_file: LazyFile::new(b_path),
        }
    }

    /// Compare (and compute if needed) the next two hashes
    ///
    /// Returns `Ok(None)` once the whole `0..end_offset` range is consumed,
    /// `Ok(Some((a, b)))` with the two hashes for the current chunk, or `Err`
    /// if either side's chunk failed to hash — now or in a previous run
    /// (recorded as `None` in its `ranges`).
    pub fn next<'a,'b>(&mut self, a_hash: &'a mut Hasher, b_hash: &'b mut Hasher) -> Result<Option<(&'a HashedRange, &'b HashedRange)>, io::Error> {
        if self.start_offset >= self.end_offset {
            return Ok(None);
        }

        let i = self.index;
        // Borrow the hashers immutably in this scope only, so they can be
        // mutably borrowed below to push freshly computed chunks.
        let (a_none, b_none, size) = {
            let a = a_hash.ranges.get(i);
            let b = b_hash.ranges.get(i);

            // `Some(None)` means this chunk was attempted before and failed with I/O error.
            let failed = a.map_or(false, |a| a.is_none()) || b.map_or(false, |b| b.is_none());
            if failed {
                return Err(io::Error::other("cmp i/o"));
            }

            // If there is an existing hashed chunk, the chunk size used for comparison must obviously be it.
            let size = a
                .and_then(|a| a.as_ref().map(|a| a.size))
                .or(b.and_then(|b| b.as_ref().map(|b| b.size)))
                .unwrap_or(min(self.end_offset - self.start_offset, self.next_buffer_size));
            // `is_none` here means "never computed", as the failed case returned above.
            (a.is_none(), b.is_none(), size)
        };

        // If any of the ranges is missing, compute it
        if a_none {
            a_hash.push(HashedRange::from_file(&mut self.a_file, self.start_offset, size));
        }
        if b_none {
            b_hash.push(HashedRange::from_file(&mut self.b_file, self.start_offset, size));
        }

        self.index += 1;
        self.start_offset += size;
        // The buffer size is a trade-off between finding a difference quickly
        // and reading files one by one without trashing.
        // Exponential increase is meant to be a compromise that allows finding
        // the difference in the first few KB, but grow quickly to read identical files faster.
        self.next_buffer_size = min(size * 16, 128 * 1024 * 1024);

        // A failed `from_file` above was pushed as `None`; surface it as an error.
        match (a_hash.ranges.get(i), b_hash.ranges.get(i)) {
            (Some(Some(a)), Some(Some(b))) => Ok(Some((a, b))),
            _ => Err(io::Error::other("cmp i/o")),
        }
    }
}
119
120impl Hasher {
121    #[inline]
122    pub fn new() -> Self {
123        Self { ranges: SmallVec::new() }
124    }
125
126    #[inline]
127    fn push(&mut self, range: Result<HashedRange, io::Error>) {
128        let r = match range {
129            Ok(r) => Some(r),
130            Err(err) => {
131                eprintln!("Can't compare files: {err}");
132                None
133            },
134        };
135        self.ranges.push(r);
136    }
137
138    /// Incremental comparison reading files lazily
139    #[inline]
140    pub fn compare(&mut self, other: &mut Self, size: u64, self_path: &Path, other_path: &Path) -> Result<Ordering, io::Error> {
141        let mut iter = HashIter::new(size, self_path, other_path);
142
143        while let Some((a, b)) = iter.next(self, other)? {
144            let ord = a.cmp(b);
145            if ord != Ordering::Equal {
146                return Ok(ord);
147            }
148        }
149        Ok(Ordering::Equal)
150    }
151}
152
#[cfg(test)]
mod test {
    use super::*;
    use std::fs;

    /// Hashing a whole file and a sub-range of the same open file must
    /// report the requested sizes and the known truncated blake3 digest.
    #[test]
    fn range_hash() {
        let tmp = tempdir::TempDir::new("hashtest").expect("tmp");
        let file_path = &tmp.path().join("a");
        fs::write(file_path, "aaa\n").expect("write");
        let mut lazy = LazyFile::new(file_path);

        // Whole file: 4 bytes with a known digest prefix.
        let whole = HashedRange::from_file(&mut lazy, 0, 4).expect("hash");
        assert_eq!(4, whole.size);
        assert_eq!(
            [22, 179, 164, 66, 194, 34, 185, 88, 69, 62, 115, 203, 129, 138, 81, 160, 96, 190, 209, 11],
            whole.hash
        );

        // A second range on the same `LazyFile` reuses the open descriptor.
        let middle = HashedRange::from_file(&mut lazy, 1, 2).expect("hash2");
        assert_eq!(2, middle.size);
    }
}