patience_diff/
lib.rs

1//! This crate is for computing *patience diffs*, which require more effort to compute than normal
2//! diffs but are usually more human-readable.
3//!
4//! A diff describes the difference between two lists `a` and `b`; namely, the diff between `a` and
5//! `b` describes how to go from `a` to `b` by inserting, deleting, or keeping elements.
6//!
7//! # Why use a patience diff?
8//!
9//! Patience diffs are often more readable than ordinary, [longest common
10//! subsequence][wiki-lcs]-based diffs. For example, if you go from:
11//!
12//! ```c
13//! int func_1() {
14//!     return 1;
15//! }
16//!
17//! int func_2() {
18//!     return 2;
19//! }
20//! ```
21//!
22//! To:
23//!
24//! ```c
25//! int func_1() {
26//!     return 1;
27//! }
28//!
29//! int func_new() {
30//!     return 0;
31//! }
32//!
33//! int func_2() {
34//!     return 2;
35//! }
36//! ```
37//!
38//! The LCS diff between these two sequences of lines is:
39//!
40//! ```c
41//!   int func_1() {
42//!       return 1;
43//! + }
44//! +
45//! + int func_new() {
46//! +     return 0;
47//!   }
48//!
49//!   int func_2() {
50//!       return 2;
51//!   }
52//! ```
53//!
54//! Their patience diff, on the other hand, is:
55//!
56//! ```c
57//!   int func_1() {
58//!       return 1;
59//!   }
60//!
61//! + int func_new() {
62//! +     return 0;
63//! + }
64//! +
65//!   int func_2() {
66//!       return 2;
67//!   }
68//! ```
69//!
70//! ## How a patience diff is computed
71//!
72//! An "ordinary" diff is based on a longest common subsequence between `a` and `b`. A patience
73//! diff is very similar, but first finds the longest common subsequence between the *unique*
74//! elements of `a` and `b` to find "unambiguous" matches. Then, a patience diff is recursively
75//! computed for ranges between matched elements.
76//!
77//! You can read Bram Cohen, "discoverer" of patience diff, describe patience diff in his own words
78//! [here][bram-blog].
79//!
80//! [wiki-lcs]: https://en.wikipedia.org/wiki/Longest_common_subsequence_problem
81//! [bram-blog]: http://bramcohen.livejournal.com/73318.html
82
83extern crate lcs;
84
85use std::collections::hash_map::{HashMap, Entry};
86use std::hash::{Hash, Hasher};
87
88#[derive(Debug, Eq)]
89struct Indexed<T> {
90    index: usize,
91    value: T
92}
93
94impl<T> PartialEq for Indexed<T> where T: PartialEq {
95    fn eq(&self, other: &Indexed<T>) -> bool {
96        self.value == other.value
97    }
98}
99
100impl<T> Hash for Indexed<T> where T: Hash {
101    fn hash<H>(&self, state: &mut H) where H: Hasher {
102        self.value.hash(state);
103    }
104}
105
106#[derive(Clone, Debug, PartialEq, Eq)]
107pub enum DiffComponent<T> {
108    Insertion(T),
109    Unchanged(T, T),
110    Deletion(T)
111}
112
113/// Computes the patience diff betwen `a` and `b`. The `DiffComponent`s hold references to the
114/// elements in `a` and `b` they correspond to.
115///
116/// ```
117/// use patience_diff::DiffComponent;
118///
119/// let a: Vec<_> = "AaaxZ".chars().collect();
120/// let b: Vec<_> = "AxaaZ".chars().collect();
121///
122/// let diff = patience_diff::patience_diff(&a, &b);
123/// assert_eq!(diff, vec![
124///     DiffComponent::Unchanged(&'A', &'A'),
125///     DiffComponent::Deletion(&'a'),
126///     DiffComponent::Deletion(&'a'),
127///     DiffComponent::Unchanged(&'x', &'x'),
128///     DiffComponent::Insertion(&'a'),
129///     DiffComponent::Insertion(&'a'),
130///     DiffComponent::Unchanged(&'Z', &'Z')
131/// ]);
132/// ```
133pub fn patience_diff<'a, T>(a: &'a [T], b: &'a [T]) -> Vec<DiffComponent<&'a T>>
134        where T: Eq + Hash {
135    if a.len() == 0 && b.len() == 0 {
136        return vec![];
137    }
138
139    if a.len() == 0 {
140        return b.iter().map(DiffComponent::Insertion).collect();
141    }
142
143    if b.len() == 0 {
144        return a.iter().map(DiffComponent::Deletion).collect();
145    }
146
147    let mut common_prefix = common_prefix(a.iter(), b.iter());
148    if !common_prefix.is_empty() {
149        let rest_a = &a[common_prefix.len()..];
150        let rest_b = &b[common_prefix.len()..];
151        common_prefix.extend(patience_diff(rest_a, rest_b));
152        return common_prefix;
153    }
154
155    let common_suffix = common_suffix(a.iter(), b.iter());
156    if !common_suffix.is_empty() {
157        let prev_a = &a[..a.len() - common_suffix.len()];
158        let prev_b = &b[..b.len() - common_suffix.len()];
159        let mut prev_diff = patience_diff(prev_a, prev_b);
160        prev_diff.extend(common_suffix);
161
162        return prev_diff;
163    }
164
165    let indexed_a: Vec<_> = a.iter()
166        .enumerate()
167        .map(|(i, val)| Indexed { index: i, value: val })
168        .collect();
169    let indexed_b: Vec<_> = b.iter()
170        .enumerate()
171        .map(|(i, val)| Indexed { index: i, value: val })
172        .collect();
173
174    let uniq_a = unique_elements(&indexed_a);
175    let uniq_b = unique_elements(&indexed_b);
176
177    let table = lcs::LcsTable::new(&uniq_a, &uniq_b);
178    let lcs = table.longest_common_subsequence();
179
180    if lcs.is_empty() {
181        let table = lcs::LcsTable::new(&indexed_a, &indexed_b);
182        return table.diff().into_iter().map(|c| {
183            match c {
184                lcs::DiffComponent::Insertion(elem_b) => {
185                    DiffComponent::Insertion(&b[elem_b.index])
186                },
187                lcs::DiffComponent::Unchanged(elem_a, elem_b) => {
188                    DiffComponent::Unchanged(&a[elem_a.index], &b[elem_b.index])
189                },
190                lcs::DiffComponent::Deletion(elem_a) => {
191                    DiffComponent::Deletion(&a[elem_a.index])
192                }
193            }
194        }).collect();
195    }
196
197    let mut ret = Vec::new();
198    let mut last_index_a = 0;
199    let mut last_index_b = 0;
200
201    for (match_a, match_b) in lcs {
202        let subset_a = &a[last_index_a..match_a.index];
203        let subset_b = &b[last_index_b..match_b.index];
204
205        ret.extend(patience_diff(subset_a, subset_b));
206
207        ret.push(DiffComponent::Unchanged(match_a.value, match_b.value));
208
209        last_index_a = match_a.index + 1;
210        last_index_b = match_b.index + 1;
211    }
212
213    let subset_a = &a[last_index_a..a.len()];
214    let subset_b = &b[last_index_b..b.len()];
215    ret.extend(patience_diff(subset_a, subset_b));
216
217    ret
218}
219
220fn common_prefix<'a, T, I>(a: I, b: I) -> Vec<DiffComponent<I::Item>>
221        where I: Iterator<Item = &'a T>, T: Eq {
222    a.zip(b)
223        .take_while(|&(elem_a, elem_b)| elem_a == elem_b)
224        .map(|(elem_a, elem_b)| DiffComponent::Unchanged(elem_a, elem_b))
225        .collect()
226}
227
228fn common_suffix<'a, T, I>(a: I, b: I) -> Vec<DiffComponent<I::Item>>
229        where I: DoubleEndedIterator<Item = &'a T>, T: Eq {
230    common_prefix(a.rev(), b.rev())
231}
232
233fn unique_elements<'a, T: Eq + Hash>(elems: &'a [T]) -> Vec<&'a T> {
234    let mut counts: HashMap<&T, usize> = HashMap::new();
235
236    for elem in elems {
237        match counts.entry(elem) {
238            Entry::Occupied(mut e) => {
239                *e.get_mut() = e.get() + 1;
240            },
241            Entry::Vacant(e) => {
242                e.insert(1);
243            }
244        }
245    }
246
247    elems.iter()
248        .filter(|elem| counts.get(elem) == Some(&1))
249        .collect()
250}
251
252#[test]
253fn test_patience_diff() {
254    let a: Vec<_> = "aax".chars().collect();
255    let b: Vec<_> = "xaa".chars().collect();
256
257    let diff = patience_diff(&a, &b);
258    assert_eq!(diff, vec![
259        DiffComponent::Deletion(&'a'),
260        DiffComponent::Deletion(&'a'),
261        DiffComponent::Unchanged(&'x', &'x'),
262        DiffComponent::Insertion(&'a'),
263        DiffComponent::Insertion(&'a'),
264    ]);
265
266    let a = vec![1, 10, 11, 4];
267    let b = vec![1, 10, 11, 2, 3, 10, 11, 4];
268
269    let diff = patience_diff(&a, &b);
270    assert_eq!(diff, vec![
271        DiffComponent::Unchanged(&1, &1),
272        DiffComponent::Unchanged(&10, &10),
273        DiffComponent::Unchanged(&11, &11),
274        DiffComponent::Insertion(&2),
275        DiffComponent::Insertion(&3),
276        DiffComponent::Insertion(&10),
277        DiffComponent::Insertion(&11),
278        DiffComponent::Unchanged(&4, &4)
279    ]);
280}
281
282#[test]
283fn test_unique_elements() {
284    assert_eq!(vec![&2, &4, &5], unique_elements(&[1, 2, 3, 3, 4, 5, 1]));
285}