ojo_diff/
lib.rs

1// Copyright 2018-2019 Joe Neeman.
2//
3// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6// option. This file may not be copied, modified, or distributed
7// except according to those terms.
8//
9// See the LICENSE-APACHE or LICENSE-MIT files at the top-level directory
10// of this distribution.
11
12#[cfg(test)]
13#[macro_use]
14extern crate proptest;
15
16use std::collections::hash_map::Entry;
17use std::collections::HashMap;
18use std::hash::{Hash, Hasher};
19
20mod lis;
21
22#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
23pub enum LineDiff {
24    /// This line was introduced in the second file, and the `usize` is the line number in the
25    /// second file.
26    New(usize),
27    /// This line was in the first file, but got deleted.
28    Delete(usize),
29    /// This line was present in both files; the first `usize` is the line number in the first
30    /// file, and the second is the line number in the second file.
31    Keep(usize, usize),
32}
33
34// This is a little trick for associating an element with its line number in a file. The point is
35// that our implementation of Hash and Eq will ignore the index, so we can put `WithIndex` in
36// hash maps and the index will just be transparently carried along.
37#[derive(Eq, PartialOrd)]
38struct WithIndex<T> {
39    idx: usize,
40    elem: T,
41}
42
43impl<T: PartialEq> PartialEq for WithIndex<T> {
44    fn eq(&self, other: &WithIndex<T>) -> bool {
45        self.elem.eq(&other.elem)
46    }
47}
48
49impl<T: Hash> Hash for WithIndex<T> {
50    fn hash<H: Hasher>(&self, state: &mut H) {
51        self.elem.hash(state);
52    }
53}
54
55// Returns a map pointing from lines to the number of times they appeared in the file. Each line is
56// also associated with the index where it first appeared.
57fn line_counts<T: Hash + Eq>(lines: &[T]) -> HashMap<WithIndex<&T>, usize> {
58    let mut line_counts = HashMap::new();
59
60    for (line_idx, line) in lines.iter().enumerate() {
61        let elem = WithIndex {
62            elem: line,
63            idx: line_idx,
64        };
65        *line_counts.entry(elem).or_insert(0) += 1;
66    }
67    line_counts
68}
69
70// The returned value `ret` is the largest number such that the first `ret` elements of `a` are the
71// same as the first `ret` elements of `b`.
72fn prefix_len<T: Eq>(a: &[T], b: &[T]) -> usize {
73    a.iter().zip(b.iter()).take_while(|(x, y)| x == y).count()
74}
75
76// The returned value `ret` is the largest number such that the last `ret` elements of `a` are the
77// same as the last `ret` elements of `b`.
78fn suffix_len<T: Eq>(a: &[T], b: &[T]) -> usize {
79    a.iter()
80        .rev()
81        .zip(b.iter().rev())
82        .take_while(|(x, y)| x == y)
83        .count()
84}
85
86// Returns the tuple (pref_len, a_mid, b_mid, suff_len), where `pref_len` is the length of the
87// common prefix of `a` and `b`, `suff_len` is the length of the common suffix of `a` and `b`, and
88// `a_mid` and `b_mid` are the parts of `a` and `b` that are left after the prefixes and suffixes
89// are removed.
90fn match_ends<'a, T: Eq>(a: &'a [T], b: &'a [T]) -> (usize, &'a [T], &'a [T], usize) {
91    let pref_len = prefix_len(a, b);
92    let suff_len = suffix_len(&a[pref_len..], &b[pref_len..]);
93    let a_mid = &a[pref_len..(a.len() - suff_len)];
94    let b_mid = &b[pref_len..(b.len() - suff_len)];
95    (pref_len, a_mid, b_mid, suff_len)
96}
97
98// This function calculates an extremely simple kind of diff: find the longest common prefix and
99// suffix of the two files. Everything that isn't part of the prefix and suffix gets marked as
100// changed.
101//
102// We also support adding an offset to the line numbers of the two files, since they might actually
103// refer just to smaller parts of larger files.
104fn diff_ends<T: Eq>(a: &[T], a_offset: usize, b: &[T], b_offset: usize, diff: &mut Vec<LineDiff>) {
105    let (pref_len, a_mid, b_mid, suff_len) = match_ends(a, b);
106    for i in 0..pref_len {
107        diff.push(LineDiff::Keep(a_offset + i, b_offset + i));
108    }
109    for i in 0..a_mid.len() {
110        diff.push(LineDiff::Delete(a_offset + pref_len + i));
111    }
112    for i in 0..b_mid.len() {
113        diff.push(LineDiff::New(b_offset + pref_len + i));
114    }
115    for i in 0..suff_len {
116        diff.push(LineDiff::Keep(
117            a_offset + pref_len + a_mid.len() + i,
118            b_offset + pref_len + b_mid.len() + i,
119        ));
120    }
121}
122
123pub fn diff<T: Hash + Eq>(a: &[T], b: &[T]) -> Vec<LineDiff> {
124    let (pref_len, a_mid, b_mid, suff_len) = match_ends(a, b);
125    let a_line_counts = line_counts(a_mid);
126    let mut b_line_counts = line_counts(b_mid);
127    let a_unique = a_line_counts
128        .into_iter()
129        .filter(|(_, count)| *count == 1)
130        .map(|(line, _)| line);
131
132    // `both_unique` is a Vec of (usize, usize) pairs corresponding to lines that are unique in
133    // both files. The first usize is the index *in file b* and the second is the index in file a,
134    // and `both_unique` will be sorted according to the index in file a. The order of the indices
135    // seems backwards, but the point is that we'll look for a longest increasing subsequence and
136    // we want "increasing" here to mean according to appearance in file b.
137    let mut both_unique = a_unique
138        .filter_map(|a_line| {
139            // TODO: This is a bit awkward, but it can get better if HashMap::get_key_value is
140            // stabilized.
141            let a_idx = a_line.idx;
142            if let Entry::Occupied(entry) = b_line_counts.entry(a_line) {
143                if entry.get() == &1 {
144                    return Some((entry.key().idx, a_idx));
145                }
146            }
147            None
148        })
149        .collect::<Vec<(usize, usize)>>();
150    both_unique.sort_unstable_by_key(|(_b_idx, a_idx)| *a_idx);
151
152    let mut ret = Vec::with_capacity(a.len().max(b.len()));
153    for i in 0..pref_len {
154        ret.push(LineDiff::Keep(i, i));
155    }
156
157    let lis = lis::longest_increasing_subsequence(&both_unique);
158    let mut prev_b_idx = 0;
159    let mut prev_a_idx = 0;
160    for i in lis {
161        let (next_b_idx, next_a_idx) = both_unique[i];
162        let a_chunk = &a_mid[prev_a_idx..next_a_idx];
163        let b_chunk = &b_mid[prev_b_idx..next_b_idx];
164        diff_ends(
165            a_chunk,
166            pref_len + prev_a_idx,
167            b_chunk,
168            pref_len + prev_b_idx,
169            &mut ret,
170        );
171        prev_b_idx = next_b_idx;
172        prev_a_idx = next_a_idx;
173    }
174
175    let a_chunk = &a_mid[prev_a_idx..];
176    let b_chunk = &b_mid[prev_b_idx..];
177    diff_ends(
178        a_chunk,
179        pref_len + prev_a_idx,
180        b_chunk,
181        pref_len + prev_b_idx,
182        &mut ret,
183    );
184
185    for i in 0..suff_len {
186        ret.push(LineDiff::Keep(
187            a.len() - suff_len + i,
188            b.len() - suff_len + i,
189        ));
190    }
191
192    ret
193}
194
195#[cfg(test)]
196mod tests {
197    use proptest::prelude::*;
198    use std::fmt::Debug;
199
200    use super::LineDiff::*;
201    use super::*;
202
203    macro_rules! test_diff_ends {
204        ($name:ident, $a:expr, $b:expr, $expected:expr) => {
205            #[test]
206            fn $name() {
207                let a: &[_] = &$a[..];
208                let b: &[_] = &$b[..];
209                let expected: &[_] = &$expected[..];
210                let mut diff = Vec::new();
211                diff_ends(a, 0, b, 0, &mut diff);
212                assert_eq!(diff.as_slice(), expected);
213            }
214        };
215    }
216
217    test_diff_ends!(
218        diff_ends_all,
219        [1, 2, 3],
220        [1, 2, 3],
221        [Keep(0, 0), Keep(1, 1), Keep(2, 2),]
222    );
223    test_diff_ends!(
224        diff_ends_shorter_first,
225        [1, 1],
226        [1, 1, 1],
227        [Keep(0, 0), Keep(1, 1), New(2),]
228    );
229    test_diff_ends!(
230        diff_ends_longer_first,
231        [1, 1, 1],
232        [1, 1],
233        [Keep(0, 0), Keep(1, 1), Delete(2),]
234    );
235
236    // A diff between two files is valid if and only if
237    // - every input index appears exactly once in the diff, in increasing order
238    // - every output index appears exactly once in the diff, in increasing order
239    // - for every Keep line in the diff, the input and output lines are the same.
240    fn assert_valid<T: Debug + Eq>(a: &[T], b: &[T], diff: &[LineDiff]) {
241        let input_indices = diff
242            .iter()
243            .filter_map(|line| match *line {
244                New(_) => None,
245                Keep(i, _) => Some(i),
246                Delete(i) => Some(i),
247            })
248            .collect::<Vec<_>>();
249        assert_eq!(input_indices, (0..a.len()).into_iter().collect::<Vec<_>>());
250
251        let output_indices = diff
252            .iter()
253            .filter_map(|line| match *line {
254                New(i) => Some(i),
255                Keep(_, i) => Some(i),
256                Delete(_) => None,
257            })
258            .collect::<Vec<_>>();
259        assert_eq!(output_indices, (0..b.len()).into_iter().collect::<Vec<_>>());
260
261        for line in diff {
262            if let &Keep(i, j) = line {
263                assert_eq!(a[i], b[j]);
264            }
265        }
266    }
267
268    // We generate files by mostly generating common numbers (up to 10), and occasionally
269    // rare numbers (up to 1000). The common numbers are to make diff's job harder, and the rare
270    // numbers are to ensure that there are some unique lines.
271    fn file() -> BoxedStrategy<Vec<i32>> {
272        prop::collection::vec(
273            prop::strategy::Union::new_weighted(vec![(10, 0..10), (1, 0..1000)]),
274            1..100,
275        )
276        .boxed()
277    }
278
279    // Generates two files for diffing by first generating one, and then making another by changing
280    // the first one a bit.
281    fn two_files() -> BoxedStrategy<(Vec<i32>, Vec<i32>)> {
282        file()
283            .prop_perturb(|f, mut rng| {
284                let mut g = f.clone();
285                // Make between 0 and 19 random changes.
286                for _ in 0..rng.gen_range(0, 20) {
287                    let g_len = g.len();
288                    match rng.choose(&[1, 2, 3]).unwrap() {
289                        1 => {
290                            // delete a line
291                            if !g.is_empty() {
292                                g.remove(rng.gen_range(0, g_len));
293                            }
294                        }
295                        2 => {
296                            // insert a line
297                            g.insert(rng.gen_range(0, g_len + 1), rng.gen_range(0, 10));
298                        }
299                        3 => {
300                            // swap two lines
301                            if !g.is_empty() {
302                                g.swap(rng.gen_range(0, g_len), rng.gen_range(0, g_len));
303                            }
304                        }
305                        _ => unreachable!(),
306                    }
307                }
308                (f, g)
309            })
310            .boxed()
311    }
312
313    proptest! {
314        #[test]
315        fn test_valid_diff((f, g) in two_files()) {
316            let d = diff(&f, &g);
317            assert_valid(&f, &g, &d);
318        }
319    }
320}