1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
#![allow(exceeding_bitshifts)]

use std::fs::{
  File
};

use std::io::{
  Read,
  Result
};

/// How big the fingerprint should be.
pub type Fingerprint = u32;

/// Hold the address into the file and the fingerprint identifier.
#[derive(Clone)]
pub struct AddressAndFingerprint {
  address: Option<usize>,
  fingerprint: Fingerprint
}

pub type Fingerprints     = Vec<AddressAndFingerprint>;
pub type FingerprintsLite = Vec<Fingerprint>;
pub type LargestIntType   = u64;

pub const DEFAULT_PRIME: LargestIntType        = 982451653;
pub const DEFAULT_WINDOW_SIZE: usize           = 50;
pub const DEFAULT_SELECTION_BITS_LENGTH: usize = 8;

pub trait ManberFingerprints: Read {
  /// To understand what these arguments imply, please read Udi Manber's 1994 Usenix paper.
  fn manber_fingerprints(&mut self, opt_prime: Option<LargestIntType>, opt_window_size: Option<usize>, opt_selection_bits_length: Option<usize>) -> Result<Fingerprints> {
    let mut fingerprints      = Fingerprints::new();
    let prime                 = opt_prime.or(Some(DEFAULT_PRIME)).unwrap();
    let window_size           = opt_window_size.or(Some(DEFAULT_WINDOW_SIZE)).unwrap();
    let selection_bits_length = opt_selection_bits_length.or(Some(DEFAULT_SELECTION_BITS_LENGTH)).unwrap();

    let fingerprint_table:Vec<LargestIntType> = (0..256).map(|e| e as LargestIntType * prime).collect();

    let mut file_bytes:Vec<u8> = Vec::new();
    self.read_to_end(&mut file_bytes)?;

    let window = &file_bytes[0..window_size];
    fingerprints.push(
      AddressAndFingerprint {
        address: Some(0),
        fingerprint: Self::calculate_fingerprint(&window, &fingerprint_table)
      }
    );
    Self::calculate_fingerprints(&file_bytes, &mut fingerprints, prime, window_size, selection_bits_length);

    Ok(fingerprints)
  }

  /// Calculate a list of fingerprints. `fingerprints` must be initialized with one fingerprint.
  /// You can use `calculate_fingerprint` for that.
  fn calculate_fingerprints(
    file_bytes: &[u8],
    fingerprints: &mut Fingerprints,
    prime: LargestIntType,
    window_size: usize,
    selection_bits_length: usize
  ) -> () {
    let mut current_byte_index = 1;
    let mut current_fingerprint = 0;

    let file_bytes_length:usize = file_bytes.len();

    while current_byte_index < file_bytes_length {
      let head_byte = file_bytes[current_byte_index - window_size] as LargestIntType;
      let tail_byte = file_bytes[current_byte_index] as LargestIntType;
      let new_part = (fingerprints[current_fingerprint].clone().fingerprint as LargestIntType)
        + (prime * tail_byte);
      let old_part = head_byte * prime;

      let fingerprint = ((new_part - old_part) % (
          LargestIntType::pow(2, (::std::mem::size_of::<Fingerprint>() * 8) as u32)
        )
      ) as Fingerprint;

      // If all upper selection_bits are zero, we add the fingerprint to our list.
      let selection_bits = fingerprint >> 
        ((::std::mem::size_of::<Fingerprint>() * 8) - selection_bits_length) as LargestIntType;

      if selection_bits == 0 || selection_bits_length == 0 {
        fingerprints.push(
          AddressAndFingerprint {
            address: Some(current_byte_index),
            fingerprint
          }
        );
        current_fingerprint = current_fingerprint + 1;
      }

      current_byte_index  = current_byte_index + 1;
    }
  }

  /// Calculate a single fingerprint based on a "window" of data.
  /// fingerprint_table is a pre-computed table of byte -> unique number mappings.
  fn calculate_fingerprint(window: &[u8], fingerprint_table: &[LargestIntType]) -> Fingerprint {
    (window.iter().map(|&byte| fingerprint_table[byte as usize]).sum::<u64>() % (
      LargestIntType::pow(2, (::std::mem::size_of::<Fingerprint>() * 8) as u32)
    )) as Fingerprint
  }

}

impl ManberFingerprints for File { }