1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
//! TreeMatch is a binary matching algorithm based on a suffix tree to retrieve matching strings. 
//!
//! The suffix tree is built in linear time using Ukkonen's algorithm.

use std::collections::HashMap;
use std::iter::Iterator;
use std::usize;

use Match;

/// A node in the [`SuffixTree`](struct.SuffixTree.html)
pub struct Node {
    /// The index in the data where the edge leading to this node starts.
    pub start: usize,
    /// The index in the data where the edge leading to this node ends.
    pub end: usize,
    /// The potential sub nodes under this one. Each index in the array represent on of the 
    /// possible byte value. The index `256` is reserved for the end of data. Each element value is 
    /// an index in the `SuffixTree::nodes`(struct.SuffixTree.html#nodes.v) vector.
    pub edges: [Option<usize>; 257],
    /// Suffix link (see Ukkonen's algorithm).
    pub suffix_link: Option<usize>,
}

/// A suffix tree.
pub struct SuffixTree {
    /// A vector of [`Node`](struct.Node.html) composing this tree. The first element is the root 
    /// node.
    pub nodes: Vec<Node>,
}

impl Node {
    /// Allocate a new node with a leading edge `[start..end]`.
    pub fn new(start: usize, end: usize) -> Node {
        Node {
            start: start,
            end: end,
            edges: [None; 257],
            suffix_link: None,
        }
    }
    /// Returns this node leading edge length.
    pub fn edge_length(&self) -> usize {
        self.end - self.start
    }
}

impl SuffixTree {
    /// Build a new suffix tree for `data` using Ukkonen's algorithm.
    pub fn new(data: &[u8]) -> SuffixTree {
        let mut nodes = Vec::<Node>::new();
        nodes.push(Node::new(0, 0));
        let mut tree = SuffixTree {
            nodes: nodes,
        };
        tree.extend_tree(data);
        return tree;
    }

    #[allow(unused_assignments)]
    fn extend_tree(&mut self, data: &[u8]) {
        let mut last_new_node: Option<usize> = None;
        let mut active_node: usize = 0;
        let mut active_length: usize = 0;
        let mut active_edge: usize = data[0] as usize;
        let mut remaining_suffix: usize = 0;
        for i in 0..data.len() {
            last_new_node = None;
            remaining_suffix += 1;
            while remaining_suffix > 0 {
                if active_length == 0 {
                    active_edge = data[i] as usize;
                }
                if let Some(next_node) = self.nodes[active_node].edges[active_edge] {
                    // If the active length is longer than the current edge, we walk down the edge 
                    // to the next node.
                    if active_length >= self.nodes[next_node].edge_length() {
                        active_node = next_node;
                        active_length -= self.nodes[next_node].edge_length();
                        active_edge = data[i - active_length] as usize;
                        continue;
                    }
                    // Rule 3: the current character is on the edge
                    else if data[self.nodes[next_node].start + active_length] == data[i] {
                        // Make a suffix link to the active node if there is a node waiting and if 
                        // the active node is not the root node
                        if last_new_node.is_some() && active_node > 0 {
                            self.nodes[last_new_node.unwrap()].suffix_link = Some(active_node);
                            last_new_node = None;
                        }
                        active_length += 1;
                        break;
                    }
                    // We need to split the edge at the current character
                    else {
                        let start = self.nodes[next_node].start;
                        let split_pos = self.nodes[next_node].start + active_length;
                        self.nodes.push(Node::new(start, split_pos));
                        let split = self.nodes.len() - 1;
                        self.nodes[next_node].start = split_pos;
                        self.nodes[active_node].edges[data[start] as usize] = Some(split);
                        self.nodes[split].edges[data[split_pos] as usize] = Some(next_node);
                        self.nodes.push(Node::new(i, data.len()));
                        let leaf = self.nodes.len() - 1;
                        self.nodes[split].edges[data[i] as usize] = Some(leaf);
                        // Make a suffix link to our next node
                        if last_new_node.is_some() {
                            self.nodes[last_new_node.unwrap()].suffix_link = Some(split);
                        }
                        last_new_node = Some(split);
                    }
                }
                else {
                    // Rule 2: we create a new leaf edge
                    self.nodes.push(Node::new(i, data.len()));
                    let leaf = self.nodes.len() - 1;
                    self.nodes[active_node].edges[active_edge] = Some(leaf);
                    // Make a suffix link if there is a node waiting
                    if last_new_node.is_some() {
                        self.nodes[last_new_node.unwrap()].suffix_link = Some(active_node);
                    }
                    last_new_node = Some(active_node);
                }
                
                remaining_suffix -= 1;
                if active_node == 0 && active_length > 0 {
                    active_length -= 1;
                    active_edge = data[i - remaining_suffix + 1] as usize;
                }
                else if active_node != 0 {
                    active_node = match self.nodes[active_node].suffix_link {
                        Some(linked) => linked,
                        None => 0
                    };
                }
            }
        }
        // Simulate end character by doing another step with false character 256
        last_new_node = None;
        remaining_suffix += 1;
        while remaining_suffix > 0 {
            // Active length is zero, so the current character is *i* and no walk down is needed.
            if active_length == 0 {
                // Special end character
                active_edge = 256;
            }
            if let Some(next_node) = self.nodes[active_node].edges[active_edge] {
                // If the active length is longer than the current edge, we walk down the edge
                if active_length >= self.nodes[next_node].edge_length() {
                    active_length -= self.nodes[next_node].edge_length();
                    active_node = next_node;
                    active_edge = match active_length {
                       0 => 256,
                       _ => data[data.len() - active_length] as usize
                    };
                    continue;
                }
                else if self.nodes[next_node].start + active_length == data.len() {
                    // Make a suffix link to the active node if there is a node waiting and if 
                    // the active node is not the root node
                    if last_new_node.is_some() && active_node > 0 {
                        self.nodes[last_new_node.unwrap()].suffix_link = Some(active_node);
                        last_new_node = None;
                    }
                    active_length += 1;
                    break;
                }
                // We need to split the edge at the current character
                else {
                    let start = self.nodes[next_node].start;
                    let split_pos = self.nodes[next_node].start + active_length;
                    self.nodes.push(Node::new(start, split_pos));
                    let split = self.nodes.len() - 1;
                    self.nodes[next_node].start = split_pos;
                    self.nodes[active_node].edges[data[start] as usize] = Some(split);
                    self.nodes[split].edges[data[split_pos] as usize] = Some(next_node);
                    self.nodes.push(Node::new(data.len(), data.len()));
                    let leaf = self.nodes.len() - 1;
                    self.nodes[split].edges[256] = Some(leaf);
                    // Make a suffix link to our next node
                    if last_new_node.is_some() {
                        self.nodes[last_new_node.unwrap()].suffix_link = Some(split);
                    }
                    last_new_node = Some(split);
                }
            }
            else {
                // Rule 2: we create a new leaf edge
                self.nodes.push(Node::new(data.len(), data.len()));
                let leaf = self.nodes.len() - 1;
                self.nodes[active_node].edges[active_edge] = Some(leaf);
                // Make a suffix link if there is a node waiting
                if last_new_node.is_some() {
                    self.nodes[last_new_node.unwrap()].suffix_link = Some(active_node);
                }
                last_new_node = Some(active_node);
            }
            
            remaining_suffix -= 1;
            if active_node == 0 && active_length > 0 {
                active_length -= 1;
                if remaining_suffix < 2 {
                    active_edge = 256;
                }
                else {
                    active_edge = data[data.len() - remaining_suffix + 1] as usize;
                }
            }
            else if active_node != 0 {
                active_node = match self.nodes[active_node].suffix_link {
                    Some(linked) => linked,
                    None => 0
                };
            }
        }
    }

    pub fn to_graphviz(&self, data: &[u8]) -> String {
        let mut graphviz = String::new();
        graphviz.push_str("digraph {\n");
        for i in 0..self.nodes.len() {
            graphviz.push_str(&format!("    NODE_{};\n", i));
        }
        for i in 0..self.nodes.len() {
            for j in 0..self.nodes[i].edges.len() {
                if let Some(edge) = self.nodes[i].edges[j] {
                    let start = self.nodes[edge].start;
                    let end = self.nodes[edge].end;
                    if let Ok(s) = String::from_utf8(data[start..end].to_owned()) {
                        graphviz.push_str(&format!("    NODE_{} -> NODE_{} [ label = \"{}\" ];\n", i, edge, &s));
                    }
                    else {
                        graphviz.push_str(&format!("    NODE_{} -> NODE_{} [ label = \"{:?}\" ];\n", i, edge, &data[start..end]));
                    }
                }
            }
            if let Some(linked) = self.nodes[i].suffix_link {
                graphviz.push_str(&format!("    NODE_{} -> NODE_{} [ style = \"dashed\" ];\n", i, linked));
            }
        }
        graphviz.push_str("}");
        return graphviz;
    }
}

/// An iterator over all the [`Match`](../struct.Match.html) bewteen two pieces of data.
///
/// # Examples
/// 
/// ```
/// use bcmp::treematch::TreeMatchIterator;
///
/// let a = "abcdefg";
/// let b = "012abc34cdef56efg78abcdefg";
/// let match_iter = TreeMatchIterator::new(a.as_bytes(), b.as_bytes(), 2);
/// for m in match_iter {
///     println!("Match: {:}", &a[m.first_pos..m.first_end()]);
/// }
/// ```
pub struct TreeMatchIterator<'a> {
    first: &'a [u8],
    second: &'a [u8],
    tree: SuffixTree,
    minimal_length: usize,
    i: usize,
    backtrace: Vec<(usize,usize)>,
    match_length: usize,
    depth: usize,
    matched: HashMap<isize, usize>
}

impl<'a> TreeMatchIterator<'a> {
    /// Allocate a new iterator over the matches between two byte slices with a minimal matching length.
    pub fn new(first: &'a[u8], second: &'a [u8], minimal_length: usize) -> TreeMatchIterator<'a> {
        let tree = SuffixTree::new(first);
        TreeMatchIterator {
            first: first,
            second: second,
            tree: tree,
            minimal_length: minimal_length,
            i: 0,
            backtrace: Vec::new(),
            match_length: 0,
            depth: 0,
            matched: HashMap::new()
        }
    }
    /// Reset the iterator to its start. This allows to iterate multiple times over the matches 
    /// without wasting time regenerating the `HashMap`.
    pub fn reset(&mut self) {
        self.i = 0;
        self.backtrace.clear();
        self.matched.clear();
    }
}

impl<'a> Iterator for TreeMatchIterator<'a> {
    type Item = Match;
    fn next(&mut self) -> Option<Match> {
        while self.i < self.second.len() {
            // Starting a backtrace at position i
            if self.backtrace.is_empty() {
                self.match_length = 0;
                self.depth = 0;
                let mut cur = 0;
                // Dive of at least minimal length
                while self.match_length == self.depth && self.match_length < self.minimal_length {
                    let second_idx = self.i + self.depth;
                    if second_idx >= self.second.len() {
                        break;
                    }
                    if let Some(next) = self.tree.nodes[cur].edges[self.second[second_idx] as usize] {
                        for j in 0..self.tree.nodes[next].edge_length() {
                            let first_idx = self.tree.nodes[next].start + j;
                            let second_idx = self.i + self.depth + j;
                            if second_idx < self.second.len() && self.first[first_idx] == self.second[second_idx] {
                                self.match_length += 1;
                            }
                            else {
                                break;
                            }
                        }
                        self.depth += self.tree.nodes[next].edge_length();
                        cur = next;
                    }
                    else {
                        break;
                    }
                }
                // Was the dive successful? If not, go to the next index in second
                if self.match_length < self.minimal_length {
                    self.i += 1;
                    continue;
                }
                // Mark this node as the start
                self.backtrace.push((cur,0));
            }
            while self.backtrace.len() > 0 {
                let (cur, mut idx) = self.backtrace.last().unwrap().clone();
                while idx < 257 {
                    if let Some(next) = self.tree.nodes[cur].edges[idx] {
                        // Are we still matching? or just enumerating the terminating leaf?
                        if self.match_length == self.depth {
                            for j in 0..self.tree.nodes[next].edge_length() {
                                let first_idx = self.tree.nodes[next].start + j;
                                let second_idx = self.i + self.depth + j;
                                if second_idx < self.second.len() && self.first[first_idx] == self.second[second_idx] {
                                    self.match_length += 1;
                                }
                                else {
                                    break;
                                }
                            }
                        }
                        // Update the idx
                        self.backtrace.last_mut().unwrap().1 = idx + 1;
                        // Go down
                        self.depth += self.tree.nodes[next].edge_length();
                        self.backtrace.push((next,0));
                        break;
                    }
                    idx += 1;
                }
                // If we are still on the same node
                if cur == self.backtrace.last().unwrap().0 {
                    // If we went over all the possible edges without finding a node, we were on a leaf
                    if self.backtrace.last().unwrap().1 == 0 {
                        // Update the idx
                        self.backtrace.last_mut().unwrap().1 = idx + 1;
                        // Handle the match
                        let m = Match::new(self.tree.nodes[cur].end - self.depth, self.i, self.match_length);
                        let delta = m.first_pos as isize - m.second_pos as isize;
                        if !(self.matched.contains_key(&delta) && self.matched.get(&delta).unwrap() >= &m.second_pos) {
                            self.matched.insert(delta, m.second_pos + m.length);
                            return Some(m);
                        }
                    }
                    // Else we just backtrack
                    else {
                        // Go up one level
                        self.depth -= self.tree.nodes[cur].edge_length();
                        // Update the match length
                        if self.depth < self.match_length {
                            self.match_length = self.depth;
                        }
                        self.backtrace.pop();
                    }
                }
            }
            self.i += 1;
        }
        return None;
    }
}