diffy_fork_filenames/diff/
myers.rs

1use crate::range::{DiffRange, Range};
2use std::ops::{Index, IndexMut};
3
4// A D-path is a path which starts at (0,0) that has exactly D non-diagonal edges. All D-paths
5// consist of a (D - 1)-path followed by a non-diagonal edge and then a possibly empty sequence of
6// diagonal edges called a snake.
7
8/// `V` contains the endpoints of the furthest reaching `D-paths`. For each recorded endpoint
9/// `(x,y)` in diagonal `k`, we only need to retain `x` because `y` can be computed from `x - k`.
10/// In other words, `V` is an array of integers where `V[k]` contains the row index of the endpoint
11/// of the furthest reaching path in diagonal `k`.
12///
13/// We can't use a traditional Vec to represent `V` since we use `k` as an index and it can take on
14/// negative values. So instead `V` is represented as a light-weight wrapper around a Vec plus an
15/// `offset` which is the maximum value `k` can take on in order to map negative `k`'s back to a
16/// value >= 0.
17#[derive(Debug, Clone)]
18struct V {
19    offset: isize,
20    v: Vec<usize>, // Look into initializing this to -1 and storing isize
21}
22
23impl V {
24    fn new(max_d: usize) -> Self {
25        Self {
26            offset: max_d as isize,
27            v: vec![0; 2 * max_d],
28        }
29    }
30
31    fn len(&self) -> usize {
32        self.v.len()
33    }
34}
35
36impl Index<isize> for V {
37    type Output = usize;
38
39    fn index(&self, index: isize) -> &Self::Output {
40        &self.v[(index + self.offset) as usize]
41    }
42}
43
44impl IndexMut<isize> for V {
45    fn index_mut(&mut self, index: isize) -> &mut Self::Output {
46        &mut self.v[(index + self.offset) as usize]
47    }
48}
49
50/// A `Snake` is a sequence of diagonal edges in the edit graph. It is possible for a snake to have
51/// a length of zero, meaning the start and end points are the same.
52#[derive(Debug)]
53struct Snake {
54    x_start: usize,
55    y_start: usize,
56    x_end: usize,
57    y_end: usize,
58}
59
60impl ::std::fmt::Display for Snake {
61    fn fmt(&self, f: &mut ::std::fmt::Formatter<'_>) -> ::std::fmt::Result {
62        write!(
63            f,
64            "({}, {}) -> ({}, {})",
65            self.x_start, self.y_start, self.x_end, self.y_end
66        )
67    }
68}
69
70fn max_d(len1: usize, len2: usize) -> usize {
71    // XXX look into reducing the need to have the additional '+ 1'
72    (len1 + len2 + 1) / 2 + 1
73}
74
75// The divide part of a divide-and-conquer strategy. A D-path has D+1 snakes some of which may
76// be empty. The divide step requires finding the ceil(D/2) + 1 or middle snake of an optimal
77// D-path. The idea for doing so is to simultaneously run the basic algorithm in both the
78// forward and reverse directions until furthest reaching forward and reverse paths starting at
79// opposing corners 'overlap'.
80fn find_middle_snake<T: PartialEq>(
81    old: Range<'_, [T]>,
82    new: Range<'_, [T]>,
83    vf: &mut V,
84    vb: &mut V,
85) -> (isize, Snake) {
86    let n = old.len();
87    let m = new.len();
88
89    // By Lemma 1 in the paper, the optimal edit script length is odd or even as `delta` is odd
90    // or even.
91    let delta = n as isize - m as isize;
92    let odd = delta & 1 == 1;
93
94    // The initial point at (0, -1)
95    vf[1] = 0;
96    // The initial point at (N, M+1)
97    vb[1] = 0;
98
99    // We only need to explore ceil(D/2) + 1
100    let d_max = max_d(n, m);
101    assert!(vf.len() >= d_max);
102    assert!(vb.len() >= d_max);
103
104    for d in 0..d_max as isize {
105        // Forward path
106        for k in (-d..=d).rev().step_by(2) {
107            let mut x = if k == -d || (k != d && vf[k - 1] < vf[k + 1]) {
108                vf[k + 1]
109            } else {
110                vf[k - 1] + 1
111            };
112            let mut y = (x as isize - k) as usize;
113
114            // The coordinate of the start of a snake
115            let (x0, y0) = (x, y);
116            //  While these sequences are identical, keep moving through the graph with no cost
117            if let (Some(s1), Some(s2)) = (old.get(x..), new.get(y..)) {
118                let advance = s1.common_prefix_len(s2);
119                x += advance;
120                y += advance;
121            }
122
123            // This is the new best x value
124            vf[k] = x;
125            // Only check for connections from the forward search when N - M is odd
126            // and when there is a reciprocal k line coming from the other direction.
127            if odd && (k - delta).abs() <= (d - 1) {
128                // TODO optimize this so we don't have to compare against n
129                if vf[k] + vb[-(k - delta)] >= n {
130                    // Return the snake
131                    let snake = Snake {
132                        x_start: x0,
133                        y_start: y0,
134                        x_end: x,
135                        y_end: y,
136                    };
137                    // Edit distance to this snake is `2 * d - 1`
138                    return (2 * d - 1, snake);
139                }
140            }
141        }
142
143        // Backward path
144        for k in (-d..=d).rev().step_by(2) {
145            let mut x = if k == -d || (k != d && vb[k - 1] < vb[k + 1]) {
146                vb[k + 1]
147            } else {
148                vb[k - 1] + 1
149            };
150            let mut y = (x as isize - k) as usize;
151
152            // The coordinate of the start of a snake
153            let (x0, y0) = (x, y);
154            if x < n && y < m {
155                let advance = old.slice(..n - x).common_suffix_len(new.slice(..m - y));
156                x += advance;
157                y += advance;
158            }
159
160            // This is the new best x value
161            vb[k] = x;
162
163            if !odd && (k - delta).abs() <= d {
164                // TODO optimize this so we don't have to compare against n
165                if vb[k] + vf[-(k - delta)] >= n {
166                    // Return the snake
167                    let snake = Snake {
168                        x_start: n - x,
169                        y_start: m - y,
170                        x_end: n - x0,
171                        y_end: m - y0,
172                    };
173                    // Edit distance to this snake is `2 * d`
174                    return (2 * d, snake);
175                }
176            }
177        }
178
179        // TODO: Maybe there's an opportunity to optimize and bail early?
180    }
181
182    unreachable!("unable to find a middle snake");
183}
184
185fn conquer<'a, 'b, T: PartialEq>(
186    mut old: Range<'a, [T]>,
187    mut new: Range<'b, [T]>,
188    vf: &mut V,
189    vb: &mut V,
190    solution: &mut Vec<DiffRange<'a, 'b, [T]>>,
191) {
192    // Check for common prefix
193    let common_prefix_len = old.common_prefix_len(new);
194    if common_prefix_len > 0 {
195        let common_prefix = DiffRange::Equal(
196            old.slice(..common_prefix_len),
197            new.slice(..common_prefix_len),
198        );
199        solution.push(common_prefix);
200    }
201
202    old = old.slice(common_prefix_len..old.len());
203    new = new.slice(common_prefix_len..new.len());
204
205    // Check for common suffix
206    let common_suffix_len = old.common_suffix_len(new);
207    let common_suffix = DiffRange::Equal(
208        old.slice(old.len() - common_suffix_len..),
209        new.slice(new.len() - common_suffix_len..),
210    );
211    old = old.slice(..old.len() - common_suffix_len);
212    new = new.slice(..new.len() - common_suffix_len);
213
214    if old.is_empty() && new.is_empty() {
215        // Do nothing
216    } else if old.is_empty() {
217        // Inserts
218        solution.push(DiffRange::Insert(new));
219    } else if new.is_empty() {
220        // Deletes
221        solution.push(DiffRange::Delete(old));
222    } else {
223        // Divide & Conquer
224        let (_shortest_edit_script_len, snake) = find_middle_snake(old, new, vf, vb);
225
226        let (old_a, old_b) = old.split_at(snake.x_start);
227        let (new_a, new_b) = new.split_at(snake.y_start);
228
229        conquer(old_a, new_a, vf, vb, solution);
230        conquer(old_b, new_b, vf, vb, solution);
231    }
232
233    if common_suffix_len > 0 {
234        solution.push(common_suffix);
235    }
236}
237
238pub fn diff<'a, 'b, T: PartialEq>(old: &'a [T], new: &'b [T]) -> Vec<DiffRange<'a, 'b, [T]>> {
239    let old_recs = Range::new(old, ..);
240    let new_recs = Range::new(new, ..);
241
242    let mut solution = Vec::new();
243
244    // The arrays that hold the 'best possible x values' in search from:
245    // `vf`: top left to bottom right
246    // `vb`: bottom right to top left
247    let max_d = max_d(old.len(), new.len());
248    let mut vf = V::new(max_d);
249    let mut vb = V::new(max_d);
250
251    conquer(old_recs, new_recs, &mut vf, &mut vb, &mut solution);
252
253    solution
254}
255
256#[cfg(test)]
257mod tests {
258    use super::*;
259
260    #[test]
261    fn test_find_middle_snake() {
262        let a = Range::new(&b"ABCABBA"[..], ..);
263        let b = Range::new(&b"CBABAC"[..], ..);
264        let max_d = max_d(a.len(), b.len());
265        let mut vf = V::new(max_d);
266        let mut vb = V::new(max_d);
267        find_middle_snake(a, b, &mut vf, &mut vb);
268    }
269}