automerge/text_diff/
myers.rs

1// This file was copied from the similar crate at
2// https://github.com/mitsuhiko/similar/blob/2b31f65445df9093ba007ca5a5ae6a71b899d491/src/algorithms/myers.rs
3// The original license is in the LICENSE file in the same directory as this file
4//
5// This file was modified to use a Diff trait defined in this file rather than the DiffHook trait
6// defined in `similar` and to remote the deadline parameter to the `diff` function.
7//! Myers' diff algorithm.
8//!
9//! * time: `O((N+M)D)`
10//! * space `O(N+M)`
11//!
12//! See [the original article by Eugene W. Myers](http://www.xmailserver.org/diff2.pdf)
13//! describing it.
14//!
15//! The implementation of this algorithm is based on the implementation by
16//! Brandon Williams.
17//!
18//! # Heuristics
19//!
20//! At present this implementation of Myers' does not implement any more advanced
21//! heuristics that would solve some pathological cases.  For instance passing two
22//! large and completely distinct sequences to the algorithm will make it spin
23//! without making reasonable progress.
24
25use std::ops::{Index, IndexMut, Range};
26
27use super::utils::{common_prefix_len, common_suffix_len, is_empty_range};
28
29pub(super) trait DiffHook: Sized {
30    type Error;
31    fn equal(&mut self, old_index: usize, new_index: usize, len: usize) -> Result<(), Self::Error>;
32    fn delete(
33        &mut self,
34        old_index: usize,
35        old_len: usize,
36        new_index: usize,
37    ) -> Result<(), Self::Error>;
38    fn insert(
39        &mut self,
40        old_index: usize,
41        new_index: usize,
42        new_len: usize,
43    ) -> Result<(), Self::Error>;
44    fn replace(
45        &mut self,
46        old_index: usize,
47        old_len: usize,
48        new_index: usize,
49        new_len: usize,
50    ) -> Result<(), Self::Error>;
51    fn finish(&mut self) -> Result<(), Self::Error>;
52}
53
54/// Myers' diff algorithm.
55///
56/// Diff `old`, between indices `old_range` and `new` between indices `new_range`.
57pub(super) fn diff<Old, New, D>(
58    d: &mut D,
59    old: &Old,
60    old_range: Range<usize>,
61    new: &New,
62    new_range: Range<usize>,
63) -> Result<(), D::Error>
64where
65    Old: Index<usize> + ?Sized,
66    New: Index<usize> + ?Sized,
67    D: DiffHook,
68    New::Output: PartialEq<Old::Output>,
69{
70    let max_d = max_d(old_range.len(), new_range.len());
71    let mut vb = V::new(max_d);
72    let mut vf = V::new(max_d);
73    conquer(d, old, old_range, new, new_range, &mut vf, &mut vb)?;
74    d.finish()
75}
76
77// A D-path is a path which starts at (0,0) that has exactly D non-diagonal
78// edges. All D-paths consist of a (D - 1)-path followed by a non-diagonal edge
79// and then a possibly empty sequence of diagonal edges called a snake.
80
81/// `V` contains the endpoints of the furthest reaching `D-paths`. For each
82/// recorded endpoint `(x,y)` in diagonal `k`, we only need to retain `x` because
83/// `y` can be computed from `x - k`. In other words, `V` is an array of integers
84/// where `V[k]` contains the row index of the endpoint of the furthest reaching
85/// path in diagonal `k`.
86///
87/// We can't use a traditional Vec to represent `V` since we use `k` as an index
88/// and it can take on negative values. So instead `V` is represented as a
89/// light-weight wrapper around a Vec plus an `offset` which is the maximum value
90/// `k` can take on in order to map negative `k`'s back to a value >= 0.
91#[derive(Debug)]
92struct V {
93    offset: isize,
94    v: Vec<usize>, // Look into initializing this to -1 and storing isize
95}
96
97impl V {
98    fn new(max_d: usize) -> Self {
99        Self {
100            offset: max_d as isize,
101            v: vec![0; 2 * max_d],
102        }
103    }
104
105    fn len(&self) -> usize {
106        self.v.len()
107    }
108}
109
110impl Index<isize> for V {
111    type Output = usize;
112
113    fn index(&self, index: isize) -> &Self::Output {
114        &self.v[(index + self.offset) as usize]
115    }
116}
117
118impl IndexMut<isize> for V {
119    fn index_mut(&mut self, index: isize) -> &mut Self::Output {
120        &mut self.v[(index + self.offset) as usize]
121    }
122}
123
124fn max_d(len1: usize, len2: usize) -> usize {
125    // XXX look into reducing the need to have the additional '+ 1'
126    (len1 + len2).div_ceil(2) + 1
127}
128
129#[inline(always)]
130fn split_at(range: Range<usize>, at: usize) -> (Range<usize>, Range<usize>) {
131    (range.start..at, at..range.end)
132}
133
134/// A `Snake` is a sequence of diagonal edges in the edit graph.  Normally
135/// a snake has a start end end point (and it is possible for a snake to have
136/// a length of zero, meaning the start and end points are the same) however
137/// we do not need the end point which is why it's not implemented here.
138///
139/// The divide part of a divide-and-conquer strategy. A D-path has D+1 snakes
140/// some of which may be empty. The divide step requires finding the ceil(D/2) +
141/// 1 or middle snake of an optimal D-path. The idea for doing so is to
142/// simultaneously run the basic algorithm in both the forward and reverse
143/// directions until furthest reaching forward and reverse paths starting at
144/// opposing corners 'overlap'.
145fn find_middle_snake<Old, New>(
146    old: &Old,
147    old_range: Range<usize>,
148    new: &New,
149    new_range: Range<usize>,
150    vf: &mut V,
151    vb: &mut V,
152) -> Option<(usize, usize)>
153where
154    Old: Index<usize> + ?Sized,
155    New: Index<usize> + ?Sized,
156    New::Output: PartialEq<Old::Output>,
157{
158    let n = old_range.len();
159    let m = new_range.len();
160
161    // By Lemma 1 in the paper, the optimal edit script length is odd or even as
162    // `delta` is odd or even.
163    let delta = n as isize - m as isize;
164    let odd = delta & 1 == 1;
165
166    // The initial point at (0, -1)
167    vf[1] = 0;
168    // The initial point at (N, M+1)
169    vb[1] = 0;
170
171    // We only need to explore ceil(D/2) + 1
172    let d_max = max_d(n, m);
173    assert!(vf.len() >= d_max);
174    assert!(vb.len() >= d_max);
175
176    for d in 0..d_max as isize {
177        // Forward path
178        for k in (-d..=d).rev().step_by(2) {
179            let mut x = if k == -d || (k != d && vf[k - 1] < vf[k + 1]) {
180                vf[k + 1]
181            } else {
182                vf[k - 1] + 1
183            };
184            let y = (x as isize - k) as usize;
185
186            // The coordinate of the start of a snake
187            let (x0, y0) = (x, y);
188            //  While these sequences are identical, keep moving through the
189            //  graph with no cost
190            if x < old_range.len() && y < new_range.len() {
191                let advance = common_prefix_len(
192                    old,
193                    old_range.start + x..old_range.end,
194                    new,
195                    new_range.start + y..new_range.end,
196                );
197                x += advance;
198            }
199
200            // This is the new best x value
201            vf[k] = x;
202
203            // Only check for connections from the forward search when N - M is
204            // odd and when there is a reciprocal k line coming from the other
205            // direction.
206            if odd && (k - delta).abs() <= (d - 1) {
207                // TODO optimize this so we don't have to compare against n
208                if vf[k] + vb[-(k - delta)] >= n {
209                    // Return the snake
210                    return Some((x0 + old_range.start, y0 + new_range.start));
211                }
212            }
213        }
214
215        // Backward path
216        for k in (-d..=d).rev().step_by(2) {
217            let mut x = if k == -d || (k != d && vb[k - 1] < vb[k + 1]) {
218                vb[k + 1]
219            } else {
220                vb[k - 1] + 1
221            };
222            let mut y = (x as isize - k) as usize;
223
224            // The coordinate of the start of a snake
225            if x < n && y < m {
226                let advance = common_suffix_len(
227                    old,
228                    old_range.start..old_range.start + n - x,
229                    new,
230                    new_range.start..new_range.start + m - y,
231                );
232                x += advance;
233                y += advance;
234            }
235
236            // This is the new best x value
237            vb[k] = x;
238
239            if !odd && (k - delta).abs() <= d {
240                // TODO optimize this so we don't have to compare against n
241                if vb[k] + vf[-(k - delta)] >= n {
242                    // Return the snake
243                    return Some((n - x + old_range.start, m - y + new_range.start));
244                }
245            }
246        }
247
248        // TODO: Maybe there's an opportunity to optimize and bail early?
249    }
250
251    // deadline reached
252    None
253}
254
255#[allow(clippy::too_many_arguments)]
256fn conquer<Old, New, D>(
257    d: &mut D,
258    old: &Old,
259    mut old_range: Range<usize>,
260    new: &New,
261    mut new_range: Range<usize>,
262    vf: &mut V,
263    vb: &mut V,
264) -> Result<(), D::Error>
265where
266    Old: Index<usize> + ?Sized,
267    New: Index<usize> + ?Sized,
268    D: DiffHook,
269    New::Output: PartialEq<Old::Output>,
270{
271    // Check for common prefix
272    let common_prefix_len = common_prefix_len(old, old_range.clone(), new, new_range.clone());
273    if common_prefix_len > 0 {
274        d.equal(old_range.start, new_range.start, common_prefix_len)?;
275    }
276    old_range.start += common_prefix_len;
277    new_range.start += common_prefix_len;
278
279    // Check for common suffix
280    let common_suffix_len = common_suffix_len(old, old_range.clone(), new, new_range.clone());
281    let common_suffix = (
282        old_range.end - common_suffix_len,
283        new_range.end - common_suffix_len,
284    );
285    old_range.end -= common_suffix_len;
286    new_range.end -= common_suffix_len;
287
288    if is_empty_range(&old_range) && is_empty_range(&new_range) {
289        // Do nothing
290    } else if is_empty_range(&new_range) {
291        d.delete(old_range.start, old_range.len(), new_range.start)?;
292    } else if is_empty_range(&old_range) {
293        d.insert(old_range.start, new_range.start, new_range.len())?;
294    } else if let Some((x_start, y_start)) =
295        find_middle_snake(old, old_range.clone(), new, new_range.clone(), vf, vb)
296    {
297        let (old_a, old_b) = split_at(old_range, x_start);
298        let (new_a, new_b) = split_at(new_range, y_start);
299        conquer(d, old, old_a, new, new_a, vf, vb)?;
300        conquer(d, old, old_b, new, new_b, vf, vb)?;
301    } else {
302        d.delete(
303            old_range.start,
304            old_range.end - old_range.start,
305            new_range.start,
306        )?;
307        d.insert(
308            old_range.start,
309            new_range.start,
310            new_range.end - new_range.start,
311        )?;
312    }
313
314    if common_suffix_len > 0 {
315        d.equal(common_suffix.0, common_suffix.1, common_suffix_len)?;
316    }
317
318    Ok(())
319}
320
321#[test]
322fn test_find_middle_snake() {
323    let a = &b"ABCABBA"[..];
324    let b = &b"CBABAC"[..];
325    let max_d = max_d(a.len(), b.len());
326    let mut vf = V::new(max_d);
327    let mut vb = V::new(max_d);
328    let (x_start, y_start) =
329        find_middle_snake(a, 0..a.len(), b, 0..b.len(), &mut vf, &mut vb).unwrap();
330    assert_eq!(x_start, 4);
331    assert_eq!(y_start, 1);
332}