rdiff/
hashing.rs

1use super::{BlockHashes, Diff, Window};
2use std::io::{Read, Write, Result};
3use std::collections::HashMap;
4use crypto::md5::Md5;
5use crypto::digest::Digest;
6use byteorder::{NetworkEndian, ByteOrder};
7
/// Implements a weak, but easy to calculate, rolling hash for a block of bytes.
///
/// The 32-bit hash packs two 16-bit counters: the low half (`a`) is the
/// wrapping sum of the bytes in the block, and the high half (`b`) is the
/// wrapping sum of the running values of `a` (so each byte is weighted by
/// its distance from the end of the block).  This is the Adler/Fletcher-style
/// checksum used by rsync, chosen because it can be "rolled" forward one
/// byte at a time without rehashing the whole block.
struct RollingHash {
    // Wrapping sum of the bytes currently covered.
    a: u16,
    // Wrapping sum of the running values of `a`.
    b: u16,
    // Number of bytes currently covered by the hash.
    block_size: u16
}

impl RollingHash {

    /// Creates a new rolling hash over the bytes in `initial_data`.
    /// The block size is taken to be the number of bytes yielded.
    pub fn new<'a, I: Iterator<Item=&'a u8>>(initial_data: I) -> RollingHash {
        let mut a: u16 = 0;
        let mut b: u16 = 0;
        let mut block_size: u16 = 0;
        for byte in initial_data {
            a = a.wrapping_add(*byte as u16);
            b = b.wrapping_add(a);
            block_size += 1;
        }
        RollingHash {
            a: a,
            b: b,
            block_size: block_size
        }
    }

    /// Gets the hash as it currently stands: `b` in the high 16 bits,
    /// `a` in the low 16 bits.
    pub fn get_hash(&self) -> u32 {
        (self.b as u32) << 16 | self.a as u32
    }

    /// Rolls the hash forward one byte.  This removes `old_byte` from the
    /// calculation and adds `new_byte` if it exists; passing `None` shrinks
    /// the window by one (used when running off the end of the data).
    /// To get the hash afterwards, use `get_hash()`.
    pub fn roll_hash(&mut self, new_byte: Option<u8>, old_byte: u8) {
        // `old_byte` contributed once to `a` and `block_size` times to `b`.
        self.a = self.a.wrapping_sub(old_byte as u16);
        self.b = self.b.wrapping_sub((old_byte as u16).wrapping_mul(self.block_size));
        if let Some(new_byte) = new_byte {
            self.a = self.a.wrapping_add(new_byte as u16);
            self.b = self.b.wrapping_add(self.a);
        } else {
            // No incoming byte: the window is shrinking toward the end of the data.
            self.block_size -= 1
        }
    }

    /// Calculates the hash of a complete buffer in one shot, without
    /// constructing a `RollingHash`.
    pub fn hash_buffer(buffer: &[u8]) -> u32 {
        let mut a: u16 = 0;
        let mut b: u16 = 0;
        for byte in buffer {
            a = a.wrapping_add(*byte as u16);
            b = b.wrapping_add(a);
        }
        (b as u32) << 16 | a as u32
    }
}
70
71
72impl BlockHashes {
73
74    /// Create a new BlockHash based on the data in data_source.  This method
75    /// will create a hash for every `block_size` set of bytes in `data_source`.
76    ///
77    /// To see the difference after `data_source` has been updated, use `diff_and_update()`
78    ///
79    /// This method returns an error when there is a problem reading from `data_source`.
80    pub fn new<R: Read>(mut data_source: R, block_size: usize) -> Result<BlockHashes> {
81        let mut block = vec![0;block_size];
82        let mut hashes = HashMap::new();
83        let mut block_index = 0;
84        let mut strong_hasher = Md5::new();
85        let mut total_size = 0;
86
87        let mut read_size = try!(data_source.read(&mut block));
88        while read_size > 0 {
89            let weak_hash = RollingHash::hash_buffer(&block[..read_size]);
90
91            let mut strong_hash:[u8;16] = [0;16];
92            strong_hasher.reset();
93            strong_hasher.input(&block[..read_size]);
94            strong_hasher.result(&mut strong_hash);
95
96            hashes.entry(weak_hash).or_insert(Vec::new()).push((block_index, strong_hash));
97
98            block_index += 1;
99            total_size += read_size;
100            read_size = try!(data_source.read(&mut block));
101        }
102        Ok(BlockHashes {
103            hashes: hashes,
104            block_size: block_size,
105            file_size: total_size
106        })
107    }
108
109    /// Construct a new block hash for a file that was just created
110    pub fn empty(block_size: usize) -> BlockHashes {
111        BlockHashes {
112            hashes: HashMap::new(),
113            block_size: block_size,
114            file_size: 0
115        }
116    }
117
    /// Compare the data in `new_data` with the hashes computed from either
    /// the most recent call to `diff_and_update()` or when this `BlockHashes` was created.
    /// Returns the changes as a `Diff` and replaces the stored hashes with hashes
    /// of `new_data`, so the next call diffs against the new content.
    ///
    /// # Example
    ///
    /// ```
    /// use rdiff::BlockHashes;
    /// use std::io::Cursor;
    /// let mut hashes = BlockHashes::new(Cursor::new("It was the best of times"), 6).unwrap();
    /// let diff = hashes.diff_and_update(Cursor::new("It was not the best of things")).unwrap();
    /// // prints (6, ' not') and (22, ' things'))
    /// for insert in diff.inserts() {
    ///     println!("{:?}", insert);
    /// }
    /// // prints (29, 6)
    /// for delete in diff.deletes() {
    ///     println!("{:?}", delete);
    /// }
    /// assert_eq!("It was not the best of things",
    ///             diff.apply_to_string("It was the best of times").unwrap());
    /// ```
    pub fn diff_and_update<R: Read>(&mut self, new_data: R) -> Result<Diff> {
        use std::mem;
        let mut diffs = Diff::new();
        // The window holds one block's worth of `new_data` and slides forward a byte at a time.
        let mut window = try!(Window::new(new_data, self.block_size));
        let mut weak_hasher = RollingHash::new(window.frame().0.iter());
        let mut strong_hasher = Md5::new();
        // -1 means "no old block matched yet"; matches may only move forward through the old file.
        let mut last_matching_block_index = -1;
        // Bytes that matched no old block; flushed as an insert when the next match is found.
        let mut insert_buffer = Vec::new();
        // Hashes of `new_data`, built as we go; these replace `self.hashes` at the end.
        let mut new_hashes = HashMap::new();
        let mut current_block_index = 0;
        while window.frame_size() > 0 {

            if let Some(other_block_index) = self.check_match(&weak_hasher, &mut strong_hasher, &mut window, &mut last_matching_block_index) {
                //create an insert if the insert buffer has anything in it
                if insert_buffer.len() > 0 {
                    // XXX with some work here, we could probably track the insert buffer as a piece of the window, which is then
                    // moved into the diff list.
                    diffs.add_insert(window.get_bytes_read() - insert_buffer.len(), mem::replace(&mut insert_buffer, Vec::new()));
                }
                //create a delete if the index is more than it should be
                // (old blocks between the last match and this one were skipped over, i.e. deleted)
                if other_block_index as i32 > last_matching_block_index + 1 {
                    diffs.add_delete(window.get_bytes_read(), self.block_size * (other_block_index as i32 - last_matching_block_index - 1) as usize)
                }
                last_matching_block_index = other_block_index as i32;
                //advance forward an entire block's worth
                for i in 0..self.block_size {
                    if window.on_boundry() {
                        // This might iterate past the end of the data.  If so, bail out
                        if window.frame_size() == 0 {
                            break;
                        }
                        let mut strong_hash:[u8;16] = [0;16];
                        // If the boundry happened where we saw a match, we can skip the
                        // strong hashing, because it was already done during the
                        // match checking
                        if i != 0 {
                            let (front, back) = window.frame();
                            strong_hasher.reset();
                            strong_hasher.input(front);
                            strong_hasher.input(back);
                        }
                        strong_hasher.result(&mut strong_hash);

                        new_hashes.entry(weak_hasher.get_hash()).or_insert(Vec::new()).push((current_block_index, strong_hash));
                        current_block_index += 1;
                    }
                    let (tail, head) = try!(window.advance());
                    if let Some(tail) = tail {
                        weak_hasher.roll_hash(head, tail);
                    } else {
                        // No byte left the window: we ran out of data entirely.
                        break;
                    }
                }
            } else {
                //advance forward one byte
                if window.on_boundry() {
                    // XXX There is a slight optimization possible here, where
                    // when the weak checksum matches, but the strong one doesn't
                    // we are re-computing the strong checksum here.
                    let mut strong_hash:[u8;16] = [0;16];
                    let (front, back) = window.frame();
                    strong_hasher.reset();
                    strong_hasher.input(front);
                    strong_hasher.input(back);
                    strong_hasher.result(&mut strong_hash);

                    new_hashes.entry(weak_hasher.get_hash()).or_insert(Vec::new()).push((current_block_index, strong_hash));
                    current_block_index += 1;
                }
                // The byte falling out of the window is unmatched new data.
                let (tail, head) = try!(window.advance());
                weak_hasher.roll_hash(head, tail.unwrap());
                insert_buffer.push(tail.unwrap());
            }
        }
        // Flush any unmatched bytes remaining at the end of the data.
        if insert_buffer.len() > 0 {
            diffs.add_insert(window.get_bytes_read() - insert_buffer.len(), insert_buffer);
        }
        // If old blocks remain beyond the last match, the tail of the old file was deleted.
        let old_block_count = (self.file_size + self.block_size - 1) as i32 / self.block_size as i32;
        if last_matching_block_index + 1 < old_block_count {
            diffs.add_delete(window.get_bytes_read(), (self.file_size as i32 - (last_matching_block_index + 1) * self.block_size as i32) as usize);
        }
        // Swap in the fresh hashes so the next diff is relative to `new_data`.
        self.hashes = new_hashes;
        self.file_size = window.get_bytes_read();
        Ok(diffs)
    }
224
225    /// Checks if `data_source` has changed since the last time the hashes were updated.
226    ///
227    /// Returns true if `data_source` is identical to what it was when the hashes were generated, false otherwise
228    pub fn verify_unchanged<R: Read>(&self, data_source: &mut R) -> Result<bool> {
229        let mut block = vec![0;self.block_size];
230        let mut block_index = 0;
231        let mut strong_hasher = Md5::new();
232        let mut total_size = 0;
233
234        let mut read_size = try!(data_source.read(&mut block));
235        while read_size > 0 {
236            let weak_hash = RollingHash::hash_buffer(&block[..read_size]);
237            if let Some(entry) = self.hashes.get(&weak_hash) {
238                let mut strong_hash:[u8;16] = [0;16];
239                strong_hasher.reset();
240                strong_hasher.input(&block[..read_size]);
241                strong_hasher.result(&mut strong_hash);
242                if !entry.contains(&(block_index, strong_hash)) {
243                    return Ok(false);
244                }
245            }
246
247
248            block_index += 1;
249            total_size += read_size;
250            read_size = try!(data_source.read(&mut block));
251        }
252        Ok(total_size == self.file_size)
253    }
254
255    /// Compress these Hashes and write to `writer`.  The output can then be expanded
256    /// back into an equivilent Hash collection using `expand_from()`
257    pub fn compress_to<W: Write>(&self, writer: &mut W) -> Result<()> {
258
259        let mut int_buf = [0;4];
260        NetworkEndian::write_u32(&mut int_buf, self.file_size as u32);
261        try!(writer.write(&int_buf));
262        NetworkEndian::write_u32(&mut int_buf, self.block_size as u32);
263        try!(writer.write(&int_buf));
264        let block_count = (self.file_size + self.block_size - 1) / self.block_size;
265        let dummy_hash = [0u8;16];
266        let mut sequential_hashes = Vec::with_capacity(block_count);
267        sequential_hashes.resize(block_count, (0, &dummy_hash));
268        for (weak_hash, entry) in self.hashes.iter() {
269            for &(index, ref strong_hash) in entry.iter() {
270                sequential_hashes[index] = (*weak_hash, strong_hash);
271            }
272        }
273        for (weak, strong) in sequential_hashes {
274            NetworkEndian::write_u32(&mut int_buf, weak);
275            try!(writer.write(&int_buf));
276            try!(writer.write(strong));
277        }
278        Ok(())
279    }
280
281    /// Expand these hashes from previously compressed data in `reader`.  The data in reader
282    /// should have been written using `compress_to()`
283    pub fn expand_from<R: Read>(reader: &mut R) -> Result<BlockHashes> {
284        let mut int_buf = [0;4];
285        let mut strong_hash = [0u8;16];
286        try!(reader.read(&mut int_buf));
287        let file_size = NetworkEndian::read_u32(&mut int_buf) as usize;
288        try!(reader.read(&mut int_buf));
289        let block_size = NetworkEndian::read_u32(&mut int_buf) as usize;
290        let block_count = (file_size + block_size - 1) / block_size;
291        // Might be an overestimate, but not by more than a few
292        let mut hashes = HashMap::with_capacity(block_count);
293
294        for block_index in 0..block_count {
295            try!(reader.read(&mut int_buf));
296            let weak_hash = NetworkEndian::read_u32(&mut int_buf);
297            try!(reader.read(&mut strong_hash));
298            hashes.entry(weak_hash).or_insert(Vec::new()).push((block_index, strong_hash));
299        }
300        Ok(BlockHashes {
301            file_size: file_size,
302            block_size: block_size,
303            hashes: hashes
304        })
305    }
306
307    /// Checks if the current window frame matches any existing block with an index greater than the previously matched block.
308    ///
309    /// Returns the index of the matching block if it does
310    fn check_match<R: Read>(&self, weak_hasher: &RollingHash, mut strong_hasher: &mut Md5, mut window: &Window<R>, last_matching_block_index: &mut i32) -> Option<usize> {
311        if let Some(other_block_index) = self.hash_match(&weak_hasher, &mut strong_hasher, &mut window) {
312            if other_block_index as i32 > *last_matching_block_index {
313                return Some(other_block_index);
314            }
315        }
316        None
317    }
318
319    /// Checks to see if the hash of the current window frame matches an existing hash.
320    ///
321    /// If so, returns the index of the matching block
322    fn hash_match<R: Read>(&self, weak_hasher: &RollingHash,  strong_hasher: &mut Md5, window: &Window<R>) -> Option<usize> {
323        let mut new_result = [0;16];
324        if let Some(matches) = self.hashes.get(&weak_hasher.get_hash()) {
325            for &(index, strong_hash) in matches.iter() {
326                strong_hasher.reset();
327                let (front, back) = window.frame();
328                strong_hasher.input(front);
329                strong_hasher.input(back);
330                strong_hasher.result(&mut new_result);
331                if new_result == strong_hash {
332                    return Some(index)
333                }
334            }
335        }
336        return None
337    }
338}
339
#[cfg(test)]
mod test {
    use super::super::{BlockHashes, Diff, Insert, Delete};
    use super::{RollingHash};
    use std::io::{Cursor};
    use std::collections::HashMap;

    // Asserts that diffing `$start` against `$new` (with the given block size)
    // produces exactly the listed inserts and deletes, and that the stored
    // hashes afterwards equal a fresh hash of `$new`.
    macro_rules! check_diff {
        ($start: tt | $block_size: tt | $new: tt | $(($insert_pos : tt, $insert_value: tt)),* | $(($delete_pos: tt, $delete_len: tt)),*) => {
            {
                check_diff_workaround!($start; $block_size; $new; $(($insert_pos, $insert_value)),*; $(($delete_pos, $delete_len)),*)
            }
        };
    }

    // Caused by a bug in the implementation of the tt macro type.  It currently has to be passed as an expr into another macro
    // or it throws a fit for no reason.  See https://github.com/rust-lang/rust/issues/5846
    macro_rules! check_diff_workaround {
        ($start: expr ; $block_size: expr ; $new: expr ; $(($insert_pos : tt, $insert_value: tt)),* ; $(($delete_pos: tt, $delete_len: tt)),*) => {
            {
                let mut hashes = BlockHashes::new(Cursor::new($start), $block_size).unwrap();
                let diff = hashes.diff_and_update(Cursor::new($new)).unwrap();
                assert_eq!(Diff {
                    inserts: vec![$(Insert{position: $insert_pos, data: $insert_value.bytes().collect()}),*],
                    deletes: vec![$(Delete{position: $delete_pos, len: $delete_len}),*]
                }, diff);
                check_hashes(&hashes, $new);
            }
        };
    }

    // After `diff_and_update`, the stored hashes must match what `new()` would
    // compute directly from the new content.
    fn check_hashes(hashes: &BlockHashes, starting_data: &'static str) {
        let expected_hashes = BlockHashes::new(Cursor::new(starting_data), hashes.block_size).unwrap();
        assert_eq!(hashes, &expected_hashes);
    }

    // Expected values: a = wrapping byte sum (low 16 bits), b = sum of running
    // sums (high 16 bits); see the comment after each assertion.
    #[test]
    fn rolling_hash_small() {
        let mut hash = RollingHash::new(vec![7, 2, 9, 1, 7, 8].iter());
        assert_eq!(hash.get_hash(), 0x710022); // a: 34 b: 113
        hash.roll_hash(Some(12), 7); // [2, 9, 1, 7, 8, 12]
        assert_eq!(hash.get_hash(), 0x6E0027); // a: 39 b:110
        hash.roll_hash(Some(1), 2); // [9, 1, 7, 8, 12, 1]
        assert_eq!(hash.get_hash(), 0x880026); // a: 38 b:136
        hash.roll_hash(None, 9); // [1, 7, 8, 12, 1]
        assert_eq!(hash.get_hash(), 0x52001D); // a: 29 b:82
        hash.roll_hash(None, 1); // [7, 8, 12, 1]
        assert_eq!(hash.get_hash(), 0x4D001C); // a: 28 b: 77
        hash.roll_hash(None, 7); // [8, 12, 1]
        assert_eq!(hash.get_hash(), 0x310015); // a: 21 b: 49
        hash.roll_hash(None, 8); // [12, 1]
        assert_eq!(hash.get_hash(), 0x19000D); // a: 13 b: 25
        hash.roll_hash(None, 12); // [1]
        assert_eq!(hash.get_hash(), 0x10001); // a: 1 b: 1
        hash.roll_hash(None, 1); // []
        assert_eq!(hash.get_hash(), 0x0); // a: 0 b: 0
    }
    // Exercises the wrapping arithmetic: 4000 bytes overflow both u16 counters.
    #[test]
    fn rolling_hash_big() {
        let mut numbers = Vec::new();
        for i in 0..4000 {
            numbers.push((200 + i * i) as u8);
        }
        let mut hash = RollingHash::new(numbers.iter());
        assert_eq!(hash.get_hash(), 0x1880A9F0); // a: A9f0 b: 1880
        hash.roll_hash(Some(237), 200);
        assert_eq!(hash.get_hash(), 0x8D95AA15); // a: AA15 b: 8D95
        hash.roll_hash(None, 201);
        assert_eq!(hash.get_hash(), 0x48F5A94C) // a: A94C b: 48F5

    }

    #[test]
    fn hash_blocks_init() {
        let test_string = "It was the best of times, it was the worst of times";
        // Blocks:
        // It was t : 202900156 - ad721d63c3dabb32cc9096824071a919
        // he best  : 211944123 - 2712A22DDA5585758AEBC4D298142F8B
        // of times : 225313559 - 3160523454fa59e4c14badf9435d6212
        // , it was : 169083540 - 5fa8fa659adc38997bb365f17648ea8a
        //  the wor : 197788377 - d7aad88e1f5098bdae1da2e564749322
        // st of ti : 217580249 - 1c64811671e43ea5f82da6ffc4a5bbee
        // mes      : 42205509  - d2db8a610f8c7c0785d2d92a6e8c450e
        let hashes = BlockHashes::new(Cursor::new(test_string), 8).unwrap();

        let mut expected_hashes:HashMap<u32, Vec<(usize, [u8;16])>> = HashMap::new();
        expected_hashes.insert(202900156, vec![(0, [0xad, 0x72, 0x1d, 0x63, 0xc3, 0xda, 0xbb, 0x32, 0xcc, 0x90, 0x96, 0x82, 0x40, 0x71, 0xa9, 0x19])]);
        expected_hashes.insert(211944123, vec![(1, [0x27, 0x12, 0xA2, 0x2D, 0xDA, 0x55, 0x85, 0x75, 0x8A, 0xEB, 0xC4, 0xD2, 0x98, 0x14, 0x2F, 0x8B])]);
        expected_hashes.insert(225313559, vec![(2, [0x31, 0x60, 0x52, 0x34, 0x54, 0xfa, 0x59, 0xe4, 0xc1, 0x4b, 0xad, 0xf9, 0x43, 0x5d, 0x62, 0x12])]);
        expected_hashes.insert(169083540, vec![(3, [0x5f, 0xa8, 0xfa, 0x65, 0x9a, 0xdc, 0x38, 0x99, 0x7b, 0xb3, 0x65, 0xf1, 0x76, 0x48, 0xea, 0x8a])]);
        // NOTE(review): the block list above gives d7aad88e... as the MD5 for
        // " the wor" (197788377), but the literal below starts 6BF29B2C... —
        // one of the two is stale; confirm which against an MD5 of " the wor".
        expected_hashes.insert(197788377, vec![(4, [0x6B, 0xF2, 0x9B, 0x2C, 0xD5, 0x03, 0x3E, 0xFC, 0x07, 0x9C, 0x2E, 0xA1, 0x27, 0xFD, 0x7B, 0x13])]);
        expected_hashes.insert(217580249, vec![(5, [0x1c, 0x64, 0x81, 0x16, 0x71, 0xe4, 0x3e, 0xa5, 0xf8, 0x2d, 0xa6, 0xff, 0xc4, 0xa5, 0xbb, 0xee])]);
        expected_hashes.insert(42205509,  vec![(6, [0xd2, 0xdb, 0x8a, 0x61, 0x0f, 0x8c, 0x7c, 0x07, 0x85, 0xd2, 0xd9, 0x2a, 0x6e, 0x8c, 0x45, 0x0e])]);

        assert_eq!(hashes, BlockHashes {
            hashes: expected_hashes,
            block_size: 8,
            file_size: 51
        });
    }


    // Diffing against an empty original: everything is one insert.
    #[test]
    fn empty_hashes() {
        check_diff!("" |
                    16 |
                    "The New Data" |
                    (0, "The New Data") |

                );
    }

    // Identical content: no inserts, no deletes.
    #[test]
    fn no_change() {
        check_diff!("Same Data" |
                    8 |
                    "Same Data" |
                    |

                );
    }

    // Successive full rewrites; each diff replaces the previous content wholesale.
    #[test]
    fn multiple_overwrites() {
        check_diff!("" |
                    8 |
                    "New Data" |
                    (0, "New Data")|

                );
        check_diff!("New Data" |
                    8 |
                    "Other Stuff" |
                    (0, "Other Stuff")|
                    (11, 8)
                );
        check_diff!("Other Stuff" |
                    8 |
                    "More Things" |
                    (0, "More Things")|
                    (11, 11)
                );
    }

    // Pure insertions at the start, middle, end, and all three at once.
    #[test]
    fn insertions() {
        check_diff!("Starting data is a long sentence" |
                    8 |
                    "Starting data is now a long sentence" |
                    (16, " now") |

                );
        check_diff!("Starting data is a long sentence" |
                    8 |
                    "This Starting data is a long sentence" |
                    (0, "This ") |

                );
        check_diff!("Starting data is a long sentence" |
                    8 |
                    "Starting data is a long sentence. With more" |
                    (32, ". With more") |

                );
        check_diff!("Starting data is a long sentence" |
                    8 |
                    "This Starting data is now a long sentence. With more" |
                    (0, "This "),
                    (21, " now"),
                    (41, ". With more") |

                );
    }

    // Deletion that falls exactly on a block boundary.
    #[test]
    fn delete_on_boundry() {
        check_diff!("13 chars long, no longer" |
                    13 |
                    "13 chars long" |
                    |
                    (13, 11)
                );
    }

    // Pure deletions from the middle, end, start, and multiple at once.
    #[test]
    fn deletions() {
        check_diff!("Starting data is a long sentence" |
                    8 |
                    "Starting a long sentence" |
                    |
                    (8, 8)
                );
        check_diff!("Starting data is a long sentence" |
                    8 |
                    "Starting data is a long " |
                    |
                    (24, 8)
                );
        check_diff!("Starting data is a long sentence" |
                    8 |
                    " data is a long sentence" |
                    |
                    (0, 8)
                );
        check_diff!("Starting data is a long sentence" |
                    8 |
                    " a long " |
                    |
                    (0, 16), (8, 8)
                );

    }

    // Mixed edits: inserts and deletes produced by the same diff.
    #[test]
    fn insertions_and_deletions() {
        check_diff!("Starting data is a long sentence" |
                    8 |
                    "Starting data a long sentence" |
                    (8, " data") |
                    (13, 8)
                );
        check_diff!("Starting data is a long sentence" |
                    8 |
                    "Starting data is a long sentenc" |
                    (24, "sentenc")|
                    (31, 8)
                );
        check_diff!("Starting data is a long sentence" |
                    8 |
                    "This Starting data a very long sentence" |
                    (0, "This "), (13, " data a very long ") |
                    (31, 16)
                );

    }
}
575}