bytecmp/
hashmatch.rs

1//! HashMatch is a binary matching algorithm based on a `HashMap` to retrieve the begining of 
2//! matching strings.
3//!
4//! It relies on using a [`HashMatchKey`](trait.HashMatchKey.html) long enough to
5//! weed out "random" matches to obtain linear time performances. This 
6//! [`HashMatchKey`](trait.HashMatchKey.html) offers a tradeoff between the speed and the minimal 
7//! matching length.
8
9use std::collections::HashMap;
10use std::hash::Hash;
11use std::io::Cursor;
12use std::mem::size_of;
13
14use bytepack::{Packed, Unpacker};
15
16use Match;
17
18/// Trait marking types which can be used as a matching key in the `HashMap`.
19///
20/// The larger the `HashMatchKey` is, the faster the implementation will be but the minimal matching 
21/// length will proportionally increase. For example a `u32` `HashMatchKey` allows to find common 
22/// substring equal or longer than 4 bytes while a `u64` `HashMatchKey` will be significantly faster 
23/// but only allows to find common substring equal or longer than 8 bytes.
24pub trait HashMatchKey: Packed + Hash + Eq + Copy {}
25
26impl HashMatchKey for u8      {}
27impl HashMatchKey for [u8;2]  {}
28impl HashMatchKey for [u8;3]  {}
29impl HashMatchKey for [u8;4]  {}
30impl HashMatchKey for [u8;5]  {}
31impl HashMatchKey for [u8;6]  {}
32impl HashMatchKey for [u8;7]  {}
33impl HashMatchKey for [u8;8]  {}
34impl HashMatchKey for u16     {}
35impl HashMatchKey for [u16;2] {}
36impl HashMatchKey for [u16;3] {}
37impl HashMatchKey for [u16;4] {}
38impl HashMatchKey for [u16;5] {}
39impl HashMatchKey for [u16;6] {}
40impl HashMatchKey for [u16;7] {}
41impl HashMatchKey for [u16;8] {}
42impl HashMatchKey for u32     {}
43impl HashMatchKey for [u32;2] {}
44impl HashMatchKey for [u32;3] {}
45impl HashMatchKey for [u32;4] {}
46impl HashMatchKey for [u32;5] {}
47impl HashMatchKey for [u32;6] {}
48impl HashMatchKey for [u32;7] {}
49impl HashMatchKey for [u32;8] {}
50impl HashMatchKey for u64     {}
51impl HashMatchKey for [u64;2] {}
52impl HashMatchKey for [u64;3] {}
53impl HashMatchKey for [u64;4] {}
54impl HashMatchKey for [u64;5] {}
55impl HashMatchKey for [u64;6] {}
56impl HashMatchKey for [u64;7] {}
57impl HashMatchKey for [u64;8] {}
58
59fn build_map<T: HashMatchKey>(c: &mut Cursor<&[u8]>) -> HashMap<T,Vec<usize>> {
60    let size = c.get_ref().len() - size_of::<T>() + 1;
61    let mut map = HashMap::<T, Vec<usize>>::with_capacity(size);
62    for i in 0..size {
63        c.set_position(i as u64);
64        let v = c.unpack::<T>().unwrap();
65        if !map.contains_key(&v) {
66            map.insert(v, Vec::<usize>::new());
67        }
68        map.get_mut(&v).unwrap().push(i);
69    }
70    return map;
71}
72
73/// An iterator over all the [`Match`](../struct.Match.html) bewteen two pieces of data.
74///
75/// # Examples
76/// 
77/// ```
78/// use bytecmp::hashmatch::HashMatchIterator;
79///
80/// let a = "abcdefg";
81/// let b = "012abc34cdef56efg78abcdefg";
82/// let match_iter = HashMatchIterator::<u16>::new(a.as_bytes(), b.as_bytes());
83/// for m in match_iter {
84///     println!("Match: {:}", &a[m.first_pos..m.first_end()]);
85/// }
86/// ```
87pub struct HashMatchIterator<'a, T: HashMatchKey> {
88    first: Cursor<&'a [u8]>,
89    second: Cursor<&'a [u8]>,
90    second_len: usize,
91    i: usize,
92    j: usize,
93    map: HashMap<T,Vec<usize>>,
94    matched: HashMap<isize, usize>
95}
96
97impl<'a, T: HashMatchKey> HashMatchIterator<'a, T> {
98    /// Allocate a new iterator over the matches between two byte slices
99    pub fn new(first: &'a [u8], second: &'a [u8]) -> HashMatchIterator<'a, T> {
100        let second_len = second.len() - size_of::<T>() + 1;
101        let mut first_cursor = Cursor::new(first);
102        let second_cursor = Cursor::new(second);
103        let map = build_map(&mut first_cursor);
104        HashMatchIterator {
105            first: first_cursor,
106            second: second_cursor,
107            second_len: second_len,
108            i: 0,
109            j: 0,
110            map: map,
111            matched: HashMap::new()
112        }
113    }
114    /// Reset the iterator to its start. This allows to iterate multiple times over the matches 
115    /// without wasting time regenerating the `HashMap`.
116    pub fn reset(&mut self) {
117        self.i = 0;
118        self.j = 0;
119        self.matched.clear();
120    }
121}
122
123impl<'a, T: HashMatchKey> Iterator for HashMatchIterator<'a, T> {
124    type Item = Match;
125    fn next(&mut self) -> Option<Match> {
126        while self.j < self.second_len {
127            self.second.set_position(self.j as u64);
128            let v = self.second.unpack::<T>().unwrap();
129            if let Some(positions) = self.map.get(&v) {
130                while self.i < positions.len() {
131                    let first_pos = positions[self.i];
132                    self.i += 1;
133                    // Check if this is a not part of a match already returned
134                    let delta = first_pos as isize - self.j as isize;
135                    if !(self.matched.contains_key(&delta) && self.matched.get(&delta).unwrap() >= &self.j) {
136                        let first_data = self.first.get_ref();
137                        let second_data = self.second.get_ref();
138                        // Compute match length
139                        let mut idx = 0;
140                        while (first_pos + idx) < first_data.len() && 
141                              (self.j + idx) < second_data.len() &&
142                              first_data[first_pos + idx] == second_data[self.j + idx] {
143                            idx += 1;
144                        }
145                        // Update matched
146                        self.matched.insert(delta, self.j + idx);
147                        return Some(Match::new(first_pos, self.j, idx));
148                    }
149                }
150            }
151            self.j += 1;
152            self.i = 0;
153        }
154        return None;
155    }
156}
bytecmp/hashmatch.rs

bytecmp/
hashmatch.rs