patience_diff_rs/
lib.rs

1// Sources:
2// * https://blog.jcoglan.com/2017/09/19/the-patience-diff-algorithm/
3// * https://bramcohen.livejournal.com/73318.html
4// * https://alfedenzo.livejournal.com/170301.html
5
6use std::collections::hash_map::Entry;
7use std::collections::HashMap;
8
9enum UniqueCheck {
10    Line(usize),
11    Duplicated,
12}
13
14fn longest_increasing_subsequence<T: Ord + Copy>(v: &Vec<T>) -> Vec<T> {
15    if v.len() < 2 {
16        return v.clone();
17    }
18    // start with one element in list so we can skip empty-list check
19    // in binary_search_by below.
20    let mut piles: Vec<Vec<T>> = vec![vec![v[0]]];
21    let mut out: Vec<T> = Vec::new();
22
23    // For the backpointer we use the fact that we only ever append on the right, i.e. once a
24    // column c exists the previous index will always be (c - 1), so we just need to store
25    // (c - 1).len() at the time of insertion.
26    // Note that we don't keep backpointers for the first heap (see index > 0 check below)
27    let mut backpointer: Vec<Vec<usize>> = vec![];
28
29    for x in v.iter().skip(1) {
30        let index = match piles.binary_search_by(|probe| probe.last().unwrap().cmp(x)) {
31            Ok(index) => index,
32            Err(index) => index,
33        };
34        if piles.len() <= index {
35            piles.push(vec![*x]);
36            backpointer.push(vec![piles[index - 1].len() - 1]);
37        } else {
38            piles[index].push(*x);
39            if index > 0 {
40                backpointer[index - 1].push(piles[index - 1].len() - 1);
41            }
42        }
43    }
44    // Pick _a_ longest increasing subsequence, not necessarily unique
45    let mut i = piles.len() - 1;
46    let mut j = 0;
47    while i > 0 {
48        out.push(piles[i][j]);
49        j = backpointer[i - 1][j];
50        i = i - 1;
51    }
52    out.push(piles[i][j]);
53    out.reverse();
54    out
55}
56
57fn unique_check<T, I>(iter: I, offset: usize) -> HashMap<T, UniqueCheck>
58where
59    T: Eq + Copy + std::hash::Hash,
60    I: Iterator<Item = T>,
61{
62    let mut unique_map: HashMap<T, UniqueCheck> = HashMap::new();
63    for (ix, x) in iter.enumerate() {
64        match unique_map.entry(x) {
65            Entry::Vacant(xe) => {
66                xe.insert(UniqueCheck::Line(ix + offset));
67            }
68            Entry::Occupied(mut xe) => {
69                let xem = xe.get_mut();
70                if let UniqueCheck::Line(_) = xem {
71                    *xem = UniqueCheck::Duplicated
72                }
73            }
74        }
75    }
76    unique_map
77}
78
79#[derive(Debug, PartialEq, PartialOrd, Eq, Ord, Clone, Copy)]
80pub struct Range {
81    pub start: usize,
82    pub end: usize, // exclusive
83}
84
85#[derive(Debug, PartialEq, PartialOrd, Eq, Ord, Clone, Copy)]
86pub struct Hunk {
87    pub remove: Range,
88    pub insert: Range,
89}
90
91pub fn patience_diff<T: Ord + Eq + Clone + std::hash::Hash + std::fmt::Debug>(
92    a: &Vec<T>,
93    b: &Vec<T>,
94) -> Vec<Hunk> {
95    let an = a.len();
96    let bn = b.len();
97    let mut queue: Vec<(usize, usize, usize, usize)> = vec![(0, 0, an, bn)];
98    let mut out: Vec<Hunk> = Vec::new();
99
100    while let Some((mut a0, mut b0, mut a1, mut b1)) = queue.pop() {
101        // 1. Walk from start and end until mismatch. This removes
102        //    code common to both a and b.
103        while a0 < a1 && b0 < b1 && a[a0] == b[b0] {
104            a0 += 1;
105            b0 += 1;
106        }
107        while a1 > a0 && b1 > b0 && a[a1 - 1] == b[b1 - 1] {
108            a1 -= 1;
109            b1 -= 1;
110        }
111
112        // 2. find matching uniques, keeping line numbers. We're using an enum
113        // to keep track of the line number, and set it to Duplicated if already
114        // in the map.
115        let a_map = unique_check(a[a0..a1].iter(), a0);
116        let b_map = unique_check(b[b0..b1].iter(), b0);
117
118        let mut rhs: Vec<(usize, usize)> = Vec::new();
119        for (ix, x) in a[a0..a1].iter().enumerate() {
120            if a_map.contains_key(x) && b_map.contains_key(x) {
121                match b_map.get(x) {
122                    Some(UniqueCheck::Line(z)) => {
123                        // somewhat unintuitive: We use tuples of (right-side, left-size), that
124                        // way the Ord trait works correctly in longest_increasing_subsequence later.
125                        rhs.push((*z, a0 + ix));
126                    }
127                    _ => {}
128                }
129            }
130        }
131        let rhs2 = longest_increasing_subsequence(&rhs);
132        if rhs2.is_empty() {
133            // only push if either a or b make an interesting contribution to
134            // the diff.
135            if a1 - a0 > 0 || b1 - b0 > 0 {
136                out.push(Hunk {
137                    remove: Range { start: a0, end: a1 },
138                    insert: Range { start: b0, end: b1 },
139                });
140            }
141        } else {
142            let start = vec![(b0, a0)];
143            let end = vec![(b1, a1)];
144            let together = start.iter().chain(rhs2.iter()).chain(end.iter());
145
146            // note that a and b are flipped because of the reversed tuple used in partience_argsort.
147            for ((b_start, a_start), (b_end, a_end)) in together.clone().zip(together.skip(1)) {
148                queue.push((*a_start, *b_start, *a_end, *b_end));
149            }
150        }
151    }
152    out.sort(); // TODO - do we need to pay down the sort here?
153    out
154}
155
156#[cfg(test)]
157mod tests {
158    use super::*;
159    // Bring the macros and other important things into scope.
160    use proptest::prelude::*;
161
162    #[test]
163    fn check_diff() {
164        let before = vec!["x", "y", "c", "z", "0"];
165        let after = vec!["x", "b", "y", "z", "1"];
166        assert_eq!(
167            patience_diff(&before, &after),
168            [
169                Hunk {
170                    remove: Range { start: 1, end: 1 },
171                    insert: Range { start: 1, end: 2 }
172                },
173                Hunk {
174                    remove: Range { start: 2, end: 3 },
175                    insert: Range { start: 3, end: 3 }
176                },
177                Hunk {
178                    remove: Range { start: 4, end: 5 },
179                    insert: Range { start: 4, end: 5 }
180                }
181            ]
182        )
183    }
184    #[test]
185    fn check_file_example() {
186        let before = include_str!("testdata/before.c").lines().collect();
187        let after = include_str!("testdata/after.c").lines().collect();
188        assert_eq!(
189            patience_diff(&before, &after),
190            [Hunk {
191                remove: Range { start: 4, end: 4 },
192                insert: Range { start: 4, end: 8 }
193            }]
194        );
195    }
196    #[test]
197    fn check_argsort() {
198        let v = vec![9, 13, 7, 12, 2, 1, 4, 6, 5, 8, 3, 11, 10];
199        assert_eq!(longest_increasing_subsequence(&v), vec![1, 4, 5, 8, 11]);
200    }
201
202    #[test]
203    fn check_lis() {
204        let v = vec!["a", "b", "f", "e", "c"];
205        assert_eq!(longest_increasing_subsequence(&v), vec!["a", "b", "f"]);
206    }
207
208    proptest! {
209        #[test]
210        fn propcheck_lis(v in prop::collection::vec(0u32..1_000, 0..10)) {
211            longest_increasing_subsequence(&v);
212        }
213
214
215        #[test]
216        fn propcheck_smoketest_diff(
217            v1 in prop::collection::vec("[abcdef]", 0..30),
218            v2 in prop::collection::vec("[abcdef]", 0..30)
219        ) {
220            patience_diff(&v1, &v2);
221        }
222    }
223}