edit_tree/
edit_tree.rs

1use std::cmp::Eq;
2use std::cmp::Ordering;
3use std::fmt;
4use std::fmt::Debug;
5
6use lazy_static::lazy_static;
7use seqalign::measures::LCSOp;
8use seqalign::measures::LCS;
9use seqalign::op::IndexedOperation;
10use seqalign::Align;
11use serde::{Deserialize, Serialize};
12
13lazy_static! {
14    static ref MEASURE: LCS = LCS::new(1, 1);
15}
16
17/// Enum representing either a [EditTree::MatchNode] with a prefix and suffix
18/// length or a [EditTree::ReplaceNode] describing a replace operation.
19#[derive(Debug, PartialEq, Hash, Eq, Clone, Serialize, Deserialize)]
20pub enum EditTree<T> {
21    MatchNode {
22        pre: usize,
23        suf: usize,
24        left: Option<Box<EditTree<T>>>,
25        right: Option<Box<EditTree<T>>>,
26    },
27
28    ReplaceNode {
29        replacee: Vec<T>,
30        replacement: Vec<T>,
31    },
32}
33
34impl<T> EditTree<T>
35where
36    T: PartialEq + Eq + Clone,
37{
38    /// Returns a edit tree specifying how to derive `b` from `a`.
39    ///
40    /// **Caution:** when using with stringy types. UTF-8 multi byte
41    /// chars will not be treated well. Consider passing in &[char]
42    /// instead.
43    pub fn create_tree(a: &[T], b: &[T]) -> Option<Self> {
44        build_tree(a, b).map(|tree| *tree)
45    }
46
47    /// Returns a s-String representation of the EditTree.
48    ///
49    /// `format_vec` defines how to transform the `Vec<T>` of a `ReplaceNode` into a
50    /// `String`. This is useful when implementing `Display` for your own types where
51    /// conversion to `String` may not be straight forward.
52    pub fn pretty_print(&self, format_vec: impl (Fn(&[T]) -> String) + Copy) -> String {
53        match self {
54            EditTree::MatchNode {
55                pre,
56                suf,
57                left,
58                right,
59            } => {
60                let left_str = left
61                    .as_ref()
62                    .map(|left| left.pretty_print(format_vec))
63                    .unwrap_or_else(|| "()".to_string());
64                let right_str = right
65                    .as_ref()
66                    .map(|right| right.pretty_print(format_vec))
67                    .unwrap_or_else(|| "()".to_string());
68
69                format!("(match {} {} {} {})", pre, suf, left_str, right_str)
70            }
71            EditTree::ReplaceNode {
72                replacee,
73                replacement,
74            } => format!(
75                "(replace \"{}\" \"{}\")",
76                format_vec(replacee),
77                format_vec(replacement),
78            ),
79        }
80    }
81}
82
83/// Struct representing a continuous match between two sequences.
84#[derive(Debug, PartialEq, Eq, Hash)]
85pub struct LCSMatch {
86    start_src: usize,
87    start_targ: usize,
88    length: usize,
89}
90
91impl LCSMatch {
92    pub fn new(start_src: usize, start_targ: usize, length: usize) -> Self {
93        LCSMatch {
94            start_src,
95            start_targ,
96            length,
97        }
98    }
99    fn empty() -> Self {
100        LCSMatch::new(0, 0, 0)
101    }
102}
103
104impl PartialOrd for LCSMatch {
105    fn partial_cmp(&self, other: &LCSMatch) -> Option<Ordering> {
106        Some(self.length.cmp(&other.length))
107    }
108}
109
110// Returns the start and end index of the longest match. Returns none if no match is found.
111fn longest_match(script: &[IndexedOperation<LCSOp>]) -> Option<LCSMatch> {
112    let mut longest = LCSMatch::empty();
113
114    let mut script_slice = script;
115    while !script_slice.is_empty() {
116        let op = &script_slice[0];
117
118        match op.operation() {
119            LCSOp::Match => {
120                let in_start = op.source_idx();
121                let o_start = op.target_idx();
122                let end = match script_slice
123                    .iter()
124                    .position(|x| !matches!(x.operation(), LCSOp::Match))
125                {
126                    Some(idx) => idx,
127                    None => script_slice.len(),
128                };
129                if end > longest.length {
130                    longest = LCSMatch::new(in_start, o_start, end);
131                };
132
133                script_slice = &script_slice[end..];
134            }
135            _ => {
136                script_slice = &script_slice[1..];
137            }
138        }
139    }
140
141    if longest.length != 0 {
142        Some(longest)
143    } else {
144        None
145    }
146}
147
148/// Recursively builds an edit tree by applying itself to pre and suffix of the longest common substring.
149fn build_tree<T: PartialEq + Eq + Clone>(form_ch: &[T], lem_ch: &[T]) -> Option<Box<EditTree<T>>> {
150    if form_ch.is_empty() && lem_ch.is_empty() {
151        return None;
152    }
153
154    let alignment = MEASURE.align(form_ch, lem_ch);
155    let root = match longest_match(&alignment.edit_script()[..]) {
156        Some(m) => EditTree::MatchNode {
157            pre: m.start_src,
158            suf: (form_ch.len() - m.start_src) - m.length,
159            left: build_tree(&form_ch[..m.start_src], &lem_ch[..m.start_targ]),
160            right: build_tree(
161                &form_ch[m.start_src + m.length..],
162                &lem_ch[m.start_targ + m.length..],
163            ),
164        },
165        None => EditTree::ReplaceNode {
166            replacee: form_ch.to_vec(),
167            replacement: lem_ch.to_vec(),
168        },
169    };
170    Some(Box::new(root))
171}
172
173/// Trait to apply an edit tree to a given form.
174pub trait Apply<T: PartialEq> {
175    fn apply(&self, form: &[T]) -> Option<Vec<T>>;
176}
177
178impl<T: PartialEq + Eq + Clone + Debug> Apply<T> for EditTree<T> {
179    /// Recursively applies the nodes stored in the edit tree. Returns `None` if the tree is not applicable to
180    /// `form`.
181    fn apply(&self, form: &[T]) -> Option<Vec<T>> {
182        let form_len = form.len();
183        match self {
184            EditTree::MatchNode {
185                pre,
186                suf,
187                left,
188                right,
189            } => {
190                if pre + suf >= form_len {
191                    return None;
192                }
193
194                let mut left = match left {
195                    Some(left) => left.apply(&form[..*pre])?,
196                    None => vec![],
197                };
198
199                left.extend(form[*pre..form_len - *suf].iter().cloned());
200
201                if let Some(right) = right {
202                    left.extend(right.apply(&form[form_len - *suf..])?)
203                }
204
205                Some(left)
206            }
207
208            EditTree::ReplaceNode {
209                ref replacee,
210                ref replacement,
211            } => {
212                if form == &replacee[..] || replacee.is_empty() {
213                    Some(replacement.clone())
214                } else {
215                    None
216                }
217            }
218        }
219    }
220}
221
222impl fmt::Display for EditTree<char> {
223    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
224        write!(
225            f,
226            "{}",
227            self.pretty_print(|chars: &[char]| chars.iter().collect::<String>())
228        )
229    }
230}
231
232#[cfg(test)]
233mod tests {
234    use super::{Apply, EditTree};
235    use crate::ToLowerCharVec;
236    use std::collections::HashSet;
237
238    #[test]
239    fn test_graph_equality_outcome() {
240        let a = "hates".to_lower_char_vec();
241        let b = "hate".to_lower_char_vec();
242        let g = EditTree::create_tree(a.as_slice(), b.as_slice()).unwrap();
243
244        let a = "loves".to_lower_char_vec();
245        let b = "love".to_lower_char_vec();
246        let g1 = EditTree::create_tree(a.as_slice(), b.as_slice()).unwrap();
247
248        let f = "loves".to_lower_char_vec();
249        let f1 = "hates".to_lower_char_vec();
250        let exp = "love".to_lower_char_vec();
251        let exp1 = "hate".to_lower_char_vec();
252
253        assert_eq!(g.apply(&f1).unwrap(), exp1);
254        assert_eq!(g1.apply(&f).unwrap(), exp);
255        assert_eq!(g, g1);
256    }
257
258    #[test]
259    fn test_graph_equality_outcome_2() {
260        let g = EditTree::create_tree(
261            &"machen".to_lower_char_vec(),
262            &"gemacht".to_lower_char_vec(),
263        )
264        .unwrap();
265        let g1 = EditTree::create_tree(
266            &"lachen".to_lower_char_vec(),
267            &"gelacht".to_lower_char_vec(),
268        )
269        .unwrap();
270
271        let f = "machen".to_lower_char_vec();
272        let f1 = "lachen".to_lower_char_vec();
273        let exp = "gemacht".to_lower_char_vec();
274        let exp1 = "gelacht".to_lower_char_vec();
275
276        assert_eq!(g.apply(&f1).unwrap(), exp1);
277        assert_eq!(g1.apply(&f).unwrap(), exp);
278        assert_eq!(g, g1);
279    }
280
281    #[test]
282    fn test_graph_equality_outcome_3() {
283        let a = "aaaaaaaaen".to_lower_char_vec();
284        let b = "geaaaaaaaat".to_lower_char_vec();
285        let g = EditTree::create_tree(&a, &b).unwrap();
286
287        let a = "lachen".to_lower_char_vec();
288        let b = "gelacht".to_lower_char_vec();
289        let g1 = EditTree::create_tree(&a, &b).unwrap();
290
291        let f = "lachen".to_lower_char_vec();
292        let f1 = "aaaaaaaaen".to_lower_char_vec();
293        let exp = "gelacht".to_lower_char_vec();
294        let exp1 = "geaaaaaaaat".to_lower_char_vec();
295
296        assert_eq!(g.apply(&f).unwrap(), exp);
297        assert_eq!(g1.apply(&f1).unwrap(), exp1);
298        assert_eq!(g, g1);
299    }
300
301    #[test]
302    fn test_graph_equality_and_applicability() {
303        let mut set: HashSet<EditTree<char>> = HashSet::default();
304        let a = "abc".to_lower_char_vec();
305        let b = "ab".to_lower_char_vec();
306        let g1 = EditTree::create_tree(&a, &b).unwrap();
307
308        let a = "aaa".to_lower_char_vec();
309        let b = "aa".to_lower_char_vec();
310        let g2 = EditTree::create_tree(&a, &b).unwrap();
311
312        let a = "cba".to_lower_char_vec();
313        let b = "ba".to_lower_char_vec();
314        let g3 = EditTree::create_tree(&a, &b).unwrap();
315        let g4 = EditTree::create_tree(&a, &b).unwrap();
316
317        let a = "aaa".to_lower_char_vec();
318        let b = "aac".to_lower_char_vec();
319        let g5 = EditTree::create_tree(&a, &b).unwrap();
320
321        let a = "dec".to_lower_char_vec();
322        let b = "decc".to_lower_char_vec();
323        let g6 = EditTree::create_tree(&a, &a).unwrap();
324        let g7 = EditTree::create_tree(&a, &b).unwrap();
325
326        set.insert(g1);
327        assert_eq!(set.len(), 1);
328        set.insert(g2);
329        assert_eq!(set.len(), 2);
330        set.insert(g3);
331        assert_eq!(set.len(), 3);
332        set.insert(g4);
333        assert_eq!(set.len(), 3);
334        set.insert(g5);
335        assert_eq!(set.len(), 4);
336        set.insert(g6);
337        set.insert(g7);
338        assert_eq!(set.len(), 6);
339
340        let v = "yyyy".to_lower_char_vec();
341        let res: HashSet<String> = set
342            .iter()
343            .map(|x| x.apply(&v))
344            .filter(|x| x.is_some())
345            .map(|x| x.unwrap().iter().collect::<String>())
346            .collect();
347
348        assert_eq!(res.len(), 2);
349
350        let v = "yyy".to_lower_char_vec();
351        let res: HashSet<String> = set
352            .iter()
353            .map(|x| x.apply(&v))
354            .filter(|x| x.is_some())
355            .map(|x| x.unwrap().iter().collect::<String>())
356            .collect();
357        assert!(res.contains("yyyc"));
358        assert!(res.contains("yyy"));
359        assert_eq!(res.len(), 2);
360
361        let v = "bba".to_lower_char_vec();
362        let res: HashSet<String> = set
363            .iter()
364            .map(|x| x.apply(&v))
365            .filter(|x| x.is_some())
366            .map(|x| x.unwrap().iter().collect::<String>())
367            .collect();
368
369        assert!(res.contains("bbac"));
370        assert!(res.contains("bba"));
371        assert!(res.contains("bb"));
372        assert!(res.contains("bbc"));
373        assert_eq!(res.len(), 4);
374
375        let v = a.as_slice();
376        let res: HashSet<String> = set
377            .iter()
378            .map(|x| x.apply(&v))
379            .filter(|x| x.is_some())
380            .map(|x| x.unwrap().iter().collect::<String>())
381            .collect();
382        assert!(res.contains("dec"));
383        assert!(res.contains("decc"));
384        assert!(res.contains("de"));
385        assert_eq!(res.len(), 3);
386
387        let a = "die".to_lower_char_vec();
388        let b = "das".to_lower_char_vec();
389        let c = "die".to_lower_char_vec();
390        let g = EditTree::create_tree(a.as_slice(), b.as_slice()).unwrap();
391        assert!(g.apply(&c).is_some());
392    }
393    #[test]
394    fn test_graphs_inapplicable() {
395        let g = EditTree::create_tree(
396            "abcdefg".to_lower_char_vec().as_slice(),
397            "abc".to_lower_char_vec().as_slice(),
398        )
399        .unwrap();
400        assert!(g.apply(&"abc".to_lower_char_vec().as_slice()).is_none());
401
402        let g = EditTree::create_tree(
403            "abcdefg".to_lower_char_vec().as_slice(),
404            "efg".to_lower_char_vec().as_slice(),
405        )
406        .unwrap();
407        assert!(g.apply(&"efg".to_lower_char_vec().as_slice()).is_none());
408    }
409
410    #[test]
411    fn display_edit_tree() {
412        let tree =
413            EditTree::create_tree(&['l', 'o', 'o', 'p', 't'], &['l', 'o', 'p', 'e', 'n']).unwrap();
414
415        assert_eq!(
416            tree.to_string(),
417            "(match 0 3 () (match 1 1 (replace \"o\" \"\") (replace \"t\" \"en\")))"
418        );
419    }
420}