1use super::{BlockHashes, Diff, Window};
2use std::io::{Read, Write, Result};
3use std::collections::HashMap;
4use crypto::md5::Md5;
5use crypto::digest::Digest;
6use byteorder::{NetworkEndian, ByteOrder};
7
8struct RollingHash {
13 a: u16,
14 b: u16,
15 block_size: u16
16}
17
18impl RollingHash {
19
20 pub fn new<'a, I: Iterator<Item=&'a u8>>(initial_data: I) -> RollingHash {
23
24 let mut a:u16 = 0;
25 let mut b:u16 = 0;
26 let mut block_size: u16 = 0;
27 for byte in initial_data {
28 a = a.wrapping_add(*byte as u16);
29 b = b.wrapping_add(a);
30 block_size += 1;
31 }
32 RollingHash {
33 a: a,
34 b: b,
35 block_size: block_size
36 }
37 }
38
39 pub fn get_hash(&self) -> u32 {
41 return (self.b as u32) << 16 | self.a as u32;
42 }
43
44 pub fn roll_hash(&mut self, new_byte: Option<u8>, old_byte: u8) {
48 self.a = self.a.wrapping_sub(old_byte as u16);
49 self.b = self.b.wrapping_sub(((old_byte as u16).wrapping_mul(self.block_size as u16)) as u16);
50 if let Some(new_byte) = new_byte {
51 self.a = self.a.wrapping_add(new_byte as u16);
52 self.b = self.b.wrapping_add(self.a);
53 } else {
54 self.block_size -= 1
55 }
56 }
57
58 pub fn hash_buffer(buffer: &[u8]) -> u32 {
60 let mut a:u16 = 0;
61 let mut b:u16 = 0;
62 for byte in buffer {
63 a = a.wrapping_add(*byte as u16);
64 b = b.wrapping_add(a);
65
66 }
67 (b as u32) << 16 | a as u32
68 }
69}
70
71
72impl BlockHashes {
73
74 pub fn new<R: Read>(mut data_source: R, block_size: usize) -> Result<BlockHashes> {
81 let mut block = vec![0;block_size];
82 let mut hashes = HashMap::new();
83 let mut block_index = 0;
84 let mut strong_hasher = Md5::new();
85 let mut total_size = 0;
86
87 let mut read_size = try!(data_source.read(&mut block));
88 while read_size > 0 {
89 let weak_hash = RollingHash::hash_buffer(&block[..read_size]);
90
91 let mut strong_hash:[u8;16] = [0;16];
92 strong_hasher.reset();
93 strong_hasher.input(&block[..read_size]);
94 strong_hasher.result(&mut strong_hash);
95
96 hashes.entry(weak_hash).or_insert(Vec::new()).push((block_index, strong_hash));
97
98 block_index += 1;
99 total_size += read_size;
100 read_size = try!(data_source.read(&mut block));
101 }
102 Ok(BlockHashes {
103 hashes: hashes,
104 block_size: block_size,
105 file_size: total_size
106 })
107 }
108
109 pub fn empty(block_size: usize) -> BlockHashes {
111 BlockHashes {
112 hashes: HashMap::new(),
113 block_size: block_size,
114 file_size: 0
115 }
116 }
117
118 pub fn diff_and_update<R: Read>(&mut self, new_data: R) -> Result<Diff> {
140 use std::mem;
141 let mut diffs = Diff::new();
142 let mut window = try!(Window::new(new_data, self.block_size));
143 let mut weak_hasher = RollingHash::new(window.frame().0.iter());
144 let mut strong_hasher = Md5::new();
145 let mut last_matching_block_index = -1;
146 let mut insert_buffer = Vec::new();
147 let mut new_hashes = HashMap::new();
148 let mut current_block_index = 0;
149 while window.frame_size() > 0 {
150
151 if let Some(other_block_index) = self.check_match(&weak_hasher, &mut strong_hasher, &mut window, &mut last_matching_block_index) {
152 if insert_buffer.len() > 0 {
154 diffs.add_insert(window.get_bytes_read() - insert_buffer.len(), mem::replace(&mut insert_buffer, Vec::new()));
157 }
158 if other_block_index as i32 > last_matching_block_index + 1 {
160 diffs.add_delete(window.get_bytes_read(), self.block_size * (other_block_index as i32 - last_matching_block_index - 1) as usize)
161 }
162 last_matching_block_index = other_block_index as i32;
163 for i in 0..self.block_size {
165 if window.on_boundry() {
166 if window.frame_size() == 0 {
168 break;
169 }
170 let mut strong_hash:[u8;16] = [0;16];
171 if i != 0 {
175 let (front, back) = window.frame();
176 strong_hasher.reset();
177 strong_hasher.input(front);
178 strong_hasher.input(back);
179 }
180 strong_hasher.result(&mut strong_hash);
181
182 new_hashes.entry(weak_hasher.get_hash()).or_insert(Vec::new()).push((current_block_index, strong_hash));
183 current_block_index += 1;
184 }
185 let (tail, head) = try!(window.advance());
186 if let Some(tail) = tail {
187 weak_hasher.roll_hash(head, tail);
188 } else {
189 break;
190 }
191 }
192 } else {
193 if window.on_boundry() {
195 let mut strong_hash:[u8;16] = [0;16];
199 let (front, back) = window.frame();
200 strong_hasher.reset();
201 strong_hasher.input(front);
202 strong_hasher.input(back);
203 strong_hasher.result(&mut strong_hash);
204
205 new_hashes.entry(weak_hasher.get_hash()).or_insert(Vec::new()).push((current_block_index, strong_hash));
206 current_block_index += 1;
207 }
208 let (tail, head) = try!(window.advance());
209 weak_hasher.roll_hash(head, tail.unwrap());
210 insert_buffer.push(tail.unwrap());
211 }
212 }
213 if insert_buffer.len() > 0 {
214 diffs.add_insert(window.get_bytes_read() - insert_buffer.len(), insert_buffer);
215 }
216 let old_block_count = (self.file_size + self.block_size - 1) as i32 / self.block_size as i32;
217 if last_matching_block_index + 1 < old_block_count {
218 diffs.add_delete(window.get_bytes_read(), (self.file_size as i32 - (last_matching_block_index + 1) * self.block_size as i32) as usize);
219 }
220 self.hashes = new_hashes;
221 self.file_size = window.get_bytes_read();
222 Ok(diffs)
223 }
224
225 pub fn verify_unchanged<R: Read>(&self, data_source: &mut R) -> Result<bool> {
229 let mut block = vec![0;self.block_size];
230 let mut block_index = 0;
231 let mut strong_hasher = Md5::new();
232 let mut total_size = 0;
233
234 let mut read_size = try!(data_source.read(&mut block));
235 while read_size > 0 {
236 let weak_hash = RollingHash::hash_buffer(&block[..read_size]);
237 if let Some(entry) = self.hashes.get(&weak_hash) {
238 let mut strong_hash:[u8;16] = [0;16];
239 strong_hasher.reset();
240 strong_hasher.input(&block[..read_size]);
241 strong_hasher.result(&mut strong_hash);
242 if !entry.contains(&(block_index, strong_hash)) {
243 return Ok(false);
244 }
245 }
246
247
248 block_index += 1;
249 total_size += read_size;
250 read_size = try!(data_source.read(&mut block));
251 }
252 Ok(total_size == self.file_size)
253 }
254
255 pub fn compress_to<W: Write>(&self, writer: &mut W) -> Result<()> {
258
259 let mut int_buf = [0;4];
260 NetworkEndian::write_u32(&mut int_buf, self.file_size as u32);
261 try!(writer.write(&int_buf));
262 NetworkEndian::write_u32(&mut int_buf, self.block_size as u32);
263 try!(writer.write(&int_buf));
264 let block_count = (self.file_size + self.block_size - 1) / self.block_size;
265 let dummy_hash = [0u8;16];
266 let mut sequential_hashes = Vec::with_capacity(block_count);
267 sequential_hashes.resize(block_count, (0, &dummy_hash));
268 for (weak_hash, entry) in self.hashes.iter() {
269 for &(index, ref strong_hash) in entry.iter() {
270 sequential_hashes[index] = (*weak_hash, strong_hash);
271 }
272 }
273 for (weak, strong) in sequential_hashes {
274 NetworkEndian::write_u32(&mut int_buf, weak);
275 try!(writer.write(&int_buf));
276 try!(writer.write(strong));
277 }
278 Ok(())
279 }
280
281 pub fn expand_from<R: Read>(reader: &mut R) -> Result<BlockHashes> {
284 let mut int_buf = [0;4];
285 let mut strong_hash = [0u8;16];
286 try!(reader.read(&mut int_buf));
287 let file_size = NetworkEndian::read_u32(&mut int_buf) as usize;
288 try!(reader.read(&mut int_buf));
289 let block_size = NetworkEndian::read_u32(&mut int_buf) as usize;
290 let block_count = (file_size + block_size - 1) / block_size;
291 let mut hashes = HashMap::with_capacity(block_count);
293
294 for block_index in 0..block_count {
295 try!(reader.read(&mut int_buf));
296 let weak_hash = NetworkEndian::read_u32(&mut int_buf);
297 try!(reader.read(&mut strong_hash));
298 hashes.entry(weak_hash).or_insert(Vec::new()).push((block_index, strong_hash));
299 }
300 Ok(BlockHashes {
301 file_size: file_size,
302 block_size: block_size,
303 hashes: hashes
304 })
305 }
306
307 fn check_match<R: Read>(&self, weak_hasher: &RollingHash, mut strong_hasher: &mut Md5, mut window: &Window<R>, last_matching_block_index: &mut i32) -> Option<usize> {
311 if let Some(other_block_index) = self.hash_match(&weak_hasher, &mut strong_hasher, &mut window) {
312 if other_block_index as i32 > *last_matching_block_index {
313 return Some(other_block_index);
314 }
315 }
316 None
317 }
318
319 fn hash_match<R: Read>(&self, weak_hasher: &RollingHash, strong_hasher: &mut Md5, window: &Window<R>) -> Option<usize> {
323 let mut new_result = [0;16];
324 if let Some(matches) = self.hashes.get(&weak_hasher.get_hash()) {
325 for &(index, strong_hash) in matches.iter() {
326 strong_hasher.reset();
327 let (front, back) = window.frame();
328 strong_hasher.input(front);
329 strong_hasher.input(back);
330 strong_hasher.result(&mut new_result);
331 if new_result == strong_hash {
332 return Some(index)
333 }
334 }
335 }
336 return None
337 }
338}
339
340#[cfg(test)]
341mod test {
342 use super::super::{BlockHashes, Diff, Insert, Delete};
343 use super::{RollingHash};
344 use std::io::{Cursor};
345 use std::collections::HashMap;
346
347 macro_rules! check_diff {
348 ($start: tt | $block_size: tt | $new: tt | $(($insert_pos : tt, $insert_value: tt)),* | $(($delete_pos: tt, $delete_len: tt)),*) => {
349 {
350 check_diff_workaround!($start; $block_size; $new; $(($insert_pos, $insert_value)),*; $(($delete_pos, $delete_len)),*)
351 }
352 };
353 }
354
355 macro_rules! check_diff_workaround {
358 ($start: expr ; $block_size: expr ; $new: expr ; $(($insert_pos : tt, $insert_value: tt)),* ; $(($delete_pos: tt, $delete_len: tt)),*) => {
359 {
360 let mut hashes = BlockHashes::new(Cursor::new($start), $block_size).unwrap();
361 let diff = hashes.diff_and_update(Cursor::new($new)).unwrap();
362 assert_eq!(Diff {
363 inserts: vec![$(Insert{position: $insert_pos, data: $insert_value.bytes().collect()}),*],
364 deletes: vec![$(Delete{position: $delete_pos, len: $delete_len}),*]
365 }, diff);
366 check_hashes(&hashes, $new);
367 }
368 };
369 }
370
371 fn check_hashes(hashes: &BlockHashes, starting_data: &'static str) {
372 let expected_hashes = BlockHashes::new(Cursor::new(starting_data), hashes.block_size).unwrap();
373 assert_eq!(hashes, &expected_hashes);
374 }
375
376 #[test]
377 fn rolling_hash_small() {
378 let mut hash = RollingHash::new(vec![7, 2, 9, 1, 7, 8].iter());
379 assert_eq!(hash.get_hash(), 0x710022); hash.roll_hash(Some(12), 7); assert_eq!(hash.get_hash(), 0x6E0027); hash.roll_hash(Some(1), 2); assert_eq!(hash.get_hash(), 0x880026); hash.roll_hash(None, 9); assert_eq!(hash.get_hash(), 0x52001D); hash.roll_hash(None, 1); assert_eq!(hash.get_hash(), 0x4D001C); hash.roll_hash(None, 7); assert_eq!(hash.get_hash(), 0x310015); hash.roll_hash(None, 8); assert_eq!(hash.get_hash(), 0x19000D); hash.roll_hash(None, 12); assert_eq!(hash.get_hash(), 0x10001); hash.roll_hash(None, 1); assert_eq!(hash.get_hash(), 0x0); }
397 #[test]
398 fn rolling_hash_big() {
399 let mut numbers = Vec::new();
400 for i in 0..4000 {
401 numbers.push((200 + i * i) as u8);
402 }
403 let mut hash = RollingHash::new(numbers.iter());
404 assert_eq!(hash.get_hash(), 0x1880A9F0); hash.roll_hash(Some(237), 200);
406 assert_eq!(hash.get_hash(), 0x8D95AA15); hash.roll_hash(None, 201);
408 assert_eq!(hash.get_hash(), 0x48F5A94C) }
411
412 #[test]
413 fn hash_blocks_init() {
414 let test_string = "It was the best of times, it was the worst of times";
415 let hashes = BlockHashes::new(Cursor::new(test_string), 8).unwrap();
424
425 let mut expected_hashes:HashMap<u32, Vec<(usize, [u8;16])>> = HashMap::new();
426 expected_hashes.insert(202900156, vec![(0, [0xad, 0x72, 0x1d, 0x63, 0xc3, 0xda, 0xbb, 0x32, 0xcc, 0x90, 0x96, 0x82, 0x40, 0x71, 0xa9, 0x19])]);
427 expected_hashes.insert(211944123, vec![(1, [0x27, 0x12, 0xA2, 0x2D, 0xDA, 0x55, 0x85, 0x75, 0x8A, 0xEB, 0xC4, 0xD2, 0x98, 0x14, 0x2F, 0x8B])]);
428 expected_hashes.insert(225313559, vec![(2, [0x31, 0x60, 0x52, 0x34, 0x54, 0xfa, 0x59, 0xe4, 0xc1, 0x4b, 0xad, 0xf9, 0x43, 0x5d, 0x62, 0x12])]);
429 expected_hashes.insert(169083540, vec![(3, [0x5f, 0xa8, 0xfa, 0x65, 0x9a, 0xdc, 0x38, 0x99, 0x7b, 0xb3, 0x65, 0xf1, 0x76, 0x48, 0xea, 0x8a])]);
430 expected_hashes.insert(197788377, vec![(4, [0x6B, 0xF2, 0x9B, 0x2C, 0xD5, 0x03, 0x3E, 0xFC, 0x07, 0x9C, 0x2E, 0xA1, 0x27, 0xFD, 0x7B, 0x13])]);
431 expected_hashes.insert(217580249, vec![(5, [0x1c, 0x64, 0x81, 0x16, 0x71, 0xe4, 0x3e, 0xa5, 0xf8, 0x2d, 0xa6, 0xff, 0xc4, 0xa5, 0xbb, 0xee])]);
432 expected_hashes.insert(42205509, vec![(6, [0xd2, 0xdb, 0x8a, 0x61, 0x0f, 0x8c, 0x7c, 0x07, 0x85, 0xd2, 0xd9, 0x2a, 0x6e, 0x8c, 0x45, 0x0e])]);
433
434 assert_eq!(hashes, BlockHashes {
435 hashes: expected_hashes,
436 block_size: 8,
437 file_size: 51
438 });
439 }
440
441
442 #[test]
443 fn empty_hashes() {
444 check_diff!("" |
445 16 |
446 "The New Data" |
447 (0, "The New Data") |
448
449 );
450 }
451
452 #[test]
453 fn no_change() {
454 check_diff!("Same Data" |
455 8 |
456 "Same Data" |
457 |
458
459 );
460 }
461
462 #[test]
463 fn multiple_overwrites() {
464 check_diff!("" |
465 8 |
466 "New Data" |
467 (0, "New Data")|
468
469 );
470 check_diff!("New Data" |
471 8 |
472 "Other Stuff" |
473 (0, "Other Stuff")|
474 (11, 8)
475 );
476 check_diff!("Other Stuff" |
477 8 |
478 "More Things" |
479 (0, "More Things")|
480 (11, 11)
481 );
482 }
483
484 #[test]
485 fn insertions() {
486 check_diff!("Starting data is a long sentence" |
487 8 |
488 "Starting data is now a long sentence" |
489 (16, " now") |
490
491 );
492 check_diff!("Starting data is a long sentence" |
493 8 |
494 "This Starting data is a long sentence" |
495 (0, "This ") |
496
497 );
498 check_diff!("Starting data is a long sentence" |
499 8 |
500 "Starting data is a long sentence. With more" |
501 (32, ". With more") |
502
503 );
504 check_diff!("Starting data is a long sentence" |
505 8 |
506 "This Starting data is now a long sentence. With more" |
507 (0, "This "),
508 (21, " now"),
509 (41, ". With more") |
510
511 );
512 }
513
514 #[test]
515 fn delete_on_boundry() {
516 check_diff!("13 chars long, no longer" |
517 13 |
518 "13 chars long" |
519 |
520 (13, 11)
521 );
522 }
523
524 #[test]
525 fn deletions() {
526 check_diff!("Starting data is a long sentence" |
527 8 |
528 "Starting a long sentence" |
529 |
530 (8, 8)
531 );
532 check_diff!("Starting data is a long sentence" |
533 8 |
534 "Starting data is a long " |
535 |
536 (24, 8)
537 );
538 check_diff!("Starting data is a long sentence" |
539 8 |
540 " data is a long sentence" |
541 |
542 (0, 8)
543 );
544 check_diff!("Starting data is a long sentence" |
545 8 |
546 " a long " |
547 |
548 (0, 16), (8, 8)
549 );
550
551 }
552
553 #[test]
554 fn insertions_and_deletions() {
555 check_diff!("Starting data is a long sentence" |
556 8 |
557 "Starting data a long sentence" |
558 (8, " data") |
559 (13, 8)
560 );
561 check_diff!("Starting data is a long sentence" |
562 8 |
563 "Starting data is a long sentenc" |
564 (24, "sentenc")|
565 (31, 8)
566 );
567 check_diff!("Starting data is a long sentence" |
568 8 |
569 "This Starting data a very long sentence" |
570 (0, "This "), (13, " data a very long ") |
571 (31, 16)
572 );
573
574 }
575}