seqalign/
dynprog.rs

1use std::collections::{HashSet, VecDeque};
2
3use crate::op::{Backtrack, BestCost, IndexedOperation, Operation};
4use crate::{Measure, SeqPair};
5
6/// Trait enabling alignment of all `Measure`s.
7///
8/// This trait is used to implement alignment using dynamic programming
9/// for every type that implements the `Measure` trait.
10pub trait Align<'a, M, T>
11where
12    M: Measure<T>,
13    T: Eq,
14{
15    /// Align two sequences.
16    ///
17    /// This function aligns two sequences and returns the alignment.
18    fn align(&'a self, source: &'a [T], target: &'a [T]) -> Alignment<'a, M, T>;
19}
20
21impl<'a, M, T> Align<'a, M, T> for M
22where
23    M: Measure<T>,
24    T: Eq,
25{
26    fn align(&'a self, source: &'a [T], target: &'a [T]) -> Alignment<'a, M, T> {
27        let pair = SeqPair { source, target };
28
29        let source_len = pair.source.len() + 1;
30        let target_len = pair.target.len() + 1;
31
32        let mut cost_matrix = vec![vec![0; target_len]; source_len];
33
34        // Fill first row. This is separated from the rest of the matrix fill
35        // because we do not want to fill cell [0][0].
36        for target_idx in 1..target_len {
37            cost_matrix[0][target_idx] = self
38                .best_cost(&pair, &cost_matrix, 0, target_idx)
39                .expect("No applicable operation");
40        }
41
42        // Fill the matrix
43        for source_idx in 1..source_len {
44            for target_idx in 0..target_len {
45                cost_matrix[source_idx][target_idx] = self
46                    .best_cost(&pair, &cost_matrix, source_idx, target_idx)
47                    .expect("No applicable operation");
48            }
49        }
50
51        Alignment {
52            measure: self,
53            pair,
54            cost_matrix,
55        }
56    }
57}
58
59/// Edit distance cost matrix.
60pub struct Alignment<'a, M, T>
61where
62    M: Measure<T>,
63    T: Eq,
64{
65    measure: &'a M,
66    pair: SeqPair<'a, T>,
67    cost_matrix: Vec<Vec<usize>>,
68}
69
70impl<'a, M, T> Alignment<'a, M, T>
71where
72    M: Measure<T>,
73    T: Eq,
74{
75    /// Get the edit distance.
76    pub fn distance(&self) -> usize {
77        self.cost_matrix[self.cost_matrix.len() - 1][self.cost_matrix[0].len() - 1]
78    }
79
80    /// Return the script of edit operations to rewrite the source sequence
81    /// to the target sequence. If there are multiple possible edit scripts,
82    /// this method will return one of the possible edit scripts. If you want
83    /// to retrieve all possible edit scripts, use the `edit_scripts` method.
84    pub fn edit_script(&self) -> Vec<IndexedOperation<M::Operation>> {
85        let mut source_idx = self.pair.source.len();
86        let mut target_idx = self.pair.target.len();
87        let mut script = Vec::new();
88
89        while let Some(op) =
90            self.measure
91                .backtrack(&self.pair, &self.cost_matrix, source_idx, target_idx)
92        {
93            let (new_source_idx, new_target_idx) = op
94                .backtrack(&self.pair, source_idx, target_idx)
95                .expect("Cannot backtrack");
96            source_idx = new_source_idx;
97            target_idx = new_target_idx;
98
99            script.push(IndexedOperation::new(op, source_idx, target_idx));
100
101            if source_idx == 0 && target_idx == 0 {
102                break;
103            }
104        }
105
106        assert_eq!(source_idx, 0, "Cannot backtrack to cell 0, 0");
107        assert_eq!(target_idx, 0, "Cannot backtrack to cell 0, 0");
108
109        script.reverse();
110
111        script
112    }
113
114    /// Return all the edit scripts to rewrite the source sequence to the
115    /// target sequence. If you want just one edit script, use the
116    /// `edit_script` method instead.
117    pub fn edit_scripts(&self) -> HashSet<Vec<IndexedOperation<M::Operation>>> {
118        // Find all scripts that lead to the lowest edit distance using
119        // breadth-first search.
120
121        // Start the search in the lower-right corner with the final cost.
122        let mut q: VecDeque<BacktrackState<M, T>> = VecDeque::new();
123        q.push_back(BacktrackState {
124            source_idx: self.pair.source.len(),
125            target_idx: self.pair.target.len(),
126            script: Vec::new(),
127        });
128
129        let mut scripts = HashSet::new();
130        while let Some(BacktrackState {
131            source_idx,
132            target_idx,
133            script,
134        }) = q.pop_front()
135        {
136            // Process all operations/origins that can lead to the current cell's
137            // cost.
138            for op in self
139                .measure
140                .backtracks(&self.pair, &self.cost_matrix, source_idx, target_idx)
141            {
142                let (new_source_idx, new_target_idx) = op
143                    .backtrack(&self.pair, source_idx, target_idx)
144                    .expect("Cannot backtrack");
145                let mut new_script = script.clone();
146
147                new_script.push(IndexedOperation::new(op, new_source_idx, new_target_idx));
148
149                if new_source_idx == 0 && new_target_idx == 0 {
150                    // If we are in the upper-left cell, we have a complete script.
151                    new_script.reverse();
152                    scripts.insert(new_script);
153                } else {
154                    // Otherwise, add the state to the queue to explore later.
155                    q.push_back(BacktrackState {
156                        source_idx: new_source_idx,
157                        target_idx: new_target_idx,
158                        script: new_script,
159                    })
160                }
161            }
162        }
163
164        assert!(!scripts.is_empty(), "Cannot backtrack to cell 0, 0");
165
166        scripts
167    }
168
169    /// Get the cost matrix.
170    pub fn cost_matrix(&self) -> &Vec<Vec<usize>> {
171        &self.cost_matrix
172    }
173
174    /// Get the sequence pair associated with this cost matrix.
175    pub fn seq_pair(&self) -> &SeqPair<T> {
176        &self.pair
177    }
178}
179
180struct BacktrackState<M, T>
181where
182    M: Measure<T>,
183{
184    source_idx: usize,
185    target_idx: usize,
186    script: Vec<IndexedOperation<M::Operation>>,
187}
188
189#[cfg(test)]
190mod tests {
191    use crate::measures::Levenshtein;
192    use crate::measures::LevenshteinOp::*;
193    use crate::op::IndexedOperation;
194
195    use super::Align;
196
197    #[test]
198    fn distance_test() {
199        let applet: Vec<char> = "applet".chars().collect();
200        let pineapple: Vec<char> = "pineapple".chars().collect();
201        let pen: Vec<char> = "pen".chars().collect();
202
203        let levenshtein = Levenshtein::new(1, 1, 1);
204
205        assert_eq!(levenshtein.align(&pineapple, &pen).distance(), 7);
206        assert_eq!(levenshtein.align(&pen, &pineapple).distance(), 7);
207        assert_eq!(levenshtein.align(&pineapple, &applet).distance(), 5);
208        assert_eq!(levenshtein.align(&applet, &pen).distance(), 4);
209    }
210
211    #[test]
212    fn edit_script_test() {
213        let applet: Vec<char> = "applet".chars().collect();
214        let pineapple: Vec<char> = "pineapple".chars().collect();
215        let pen: Vec<char> = "pen".chars().collect();
216
217        let levenshtein = Levenshtein::new(1, 1, 1);
218
219        assert_eq!(
220            vec![
221                IndexedOperation::new(Match, 0, 0),
222                IndexedOperation::new(Substitute(1), 1, 1),
223                IndexedOperation::new(Match, 2, 2),
224                IndexedOperation::new(Delete(1), 3, 3),
225                IndexedOperation::new(Delete(1), 4, 3),
226                IndexedOperation::new(Delete(1), 5, 3),
227                IndexedOperation::new(Delete(1), 6, 3),
228                IndexedOperation::new(Delete(1), 7, 3),
229                IndexedOperation::new(Delete(1), 8, 3),
230            ],
231            levenshtein.align(&pineapple, &pen).edit_script()
232        );
233
234        assert_eq!(
235            vec![
236                IndexedOperation::new(Match, 0, 0),
237                IndexedOperation::new(Substitute(1), 1, 1),
238                IndexedOperation::new(Match, 2, 2),
239                IndexedOperation::new(Insert(1), 3, 3),
240                IndexedOperation::new(Insert(1), 3, 4),
241                IndexedOperation::new(Insert(1), 3, 5),
242                IndexedOperation::new(Insert(1), 3, 6),
243                IndexedOperation::new(Insert(1), 3, 7),
244                IndexedOperation::new(Insert(1), 3, 8),
245            ],
246            levenshtein.align(&pen, &pineapple).edit_script()
247        );
248
249        assert_eq!(
250            vec![
251                IndexedOperation::new(Delete(1), 0, 0),
252                IndexedOperation::new(Delete(1), 1, 0),
253                IndexedOperation::new(Delete(1), 2, 0),
254                IndexedOperation::new(Delete(1), 3, 0),
255                IndexedOperation::new(Match, 4, 0),
256                IndexedOperation::new(Match, 5, 1),
257                IndexedOperation::new(Match, 6, 2),
258                IndexedOperation::new(Match, 7, 3),
259                IndexedOperation::new(Match, 8, 4),
260                IndexedOperation::new(Insert(1), 9, 5),
261            ],
262            levenshtein.align(&pineapple, &applet).edit_script()
263        );
264    }
265
266    #[test]
267    fn edit_script_tests() {
268        let applet: Vec<char> = "applet".chars().collect();
269        let pineapple: Vec<char> = "pineapple".chars().collect();
270        let aplpet: Vec<char> = "aplpet".chars().collect();
271
272        let levenshtein = Levenshtein::new(1, 1, 1);
273        assert_eq!(
274            hashset![vec![
275                IndexedOperation::new(Delete(1), 0, 0),
276                IndexedOperation::new(Delete(1), 1, 0),
277                IndexedOperation::new(Delete(1), 2, 0),
278                IndexedOperation::new(Delete(1), 3, 0),
279                IndexedOperation::new(Match, 4, 0),
280                IndexedOperation::new(Match, 5, 1),
281                IndexedOperation::new(Match, 6, 2),
282                IndexedOperation::new(Match, 7, 3),
283                IndexedOperation::new(Match, 8, 4),
284                IndexedOperation::new(Insert(1), 9, 5),
285            ],],
286            levenshtein.align(&pineapple, &applet).edit_scripts()
287        );
288
289        assert_eq!(
290            hashset![
291                vec![
292                    IndexedOperation::new(Match, 0, 0),
293                    IndexedOperation::new(Match, 1, 1),
294                    IndexedOperation::new(Substitute(1), 2, 2),
295                    IndexedOperation::new(Substitute(1), 3, 3),
296                    IndexedOperation::new(Match, 4, 4),
297                    IndexedOperation::new(Match, 5, 5),
298                ],
299                vec![
300                    IndexedOperation::new(Match, 0, 0),
301                    IndexedOperation::new(Match, 1, 1),
302                    IndexedOperation::new(Delete(1), 2, 2),
303                    IndexedOperation::new(Match, 3, 2),
304                    IndexedOperation::new(Insert(1), 4, 3),
305                    IndexedOperation::new(Match, 4, 4),
306                    IndexedOperation::new(Match, 5, 5),
307                ],
308                vec![
309                    IndexedOperation::new(Match, 0, 0),
310                    IndexedOperation::new(Delete(1), 1, 1),
311                    IndexedOperation::new(Match, 2, 1),
312                    IndexedOperation::new(Match, 3, 2),
313                    IndexedOperation::new(Insert(1), 4, 3),
314                    IndexedOperation::new(Match, 4, 4),
315                    IndexedOperation::new(Match, 5, 5),
316                ],
317                vec![
318                    IndexedOperation::new(Match, 0, 0),
319                    IndexedOperation::new(Match, 1, 1),
320                    IndexedOperation::new(Insert(1), 2, 2),
321                    IndexedOperation::new(Match, 2, 3),
322                    IndexedOperation::new(Delete(1), 3, 4),
323                    IndexedOperation::new(Match, 4, 4),
324                    IndexedOperation::new(Match, 5, 5),
325                ],
326            ],
327            levenshtein.align(&applet, &aplpet).edit_scripts()
328        );
329
330        assert_eq!(
331            hashset![vec![
332                IndexedOperation::new(Match, 0, 0),
333                IndexedOperation::new(Match, 1, 1),
334                IndexedOperation::new(Match, 2, 2),
335                IndexedOperation::new(Match, 3, 3),
336                IndexedOperation::new(Match, 4, 4),
337                IndexedOperation::new(Match, 5, 5),
338            ]],
339            levenshtein.align(&applet, &applet).edit_scripts()
340        );
341    }
342
343    #[test]
344    fn align_empty_test() {
345        let empty: &[char] = &[];
346        let non_empty: Vec<char> = "hello".chars().collect();
347
348        let levenshtein = Levenshtein::new(1, 1, 1);
349
350        assert_eq!(levenshtein.align(empty, empty).distance(), 0);
351        assert_eq!(levenshtein.align(non_empty.as_slice(), empty).distance(), 5);
352        assert_eq!(levenshtein.align(empty, non_empty.as_slice()).distance(), 5);
353    }
354}