pratdiff/
lib.rs

1use std::collections::HashMap;
2use std::iter::zip;
3use std::ops::Range;
4
5#[derive(Clone, Copy, Debug, Eq, PartialEq)]
6pub enum Side {
7  Lhs,
8  Rhs,
9}
10
11#[derive(Clone, Debug, Eq, PartialEq)]
12pub enum DiffItem {
13  Match { lhs: Range<usize>, rhs: Range<usize> },
14  Mutation { lhs: Range<usize>, rhs: Range<usize> },
15}
16
17use DiffItem::*;
18
19impl DiffItem {
20  pub fn lhs(&self) -> Range<usize> {
21    match self {
22      Match { lhs, .. } => lhs.clone(),
23      Mutation { lhs, .. } => lhs.clone(),
24    }
25  }
26
27  pub fn rhs(&self) -> Range<usize> {
28    match self {
29      Match { rhs, .. } => rhs.clone(),
30      Mutation { rhs, .. } => rhs.clone(),
31    }
32  }
33
34  pub fn side(&self, side: Side) -> Range<usize> {
35    match side {
36      Side::Lhs => self.lhs(),
37      Side::Rhs => self.rhs(),
38    }
39  }
40}
41
42#[derive(Clone, Debug, Eq, PartialEq)]
43pub struct Hunk {
44  pub diffs: Vec<DiffItem>,
45}
46
47impl Hunk {
48  pub fn build(context: usize, diffs: &[DiffItem]) -> Vec<Hunk> {
49    let mut res = vec![Hunk { diffs: Vec::new() }];
50
51    for d in diffs {
52      res.last_mut().unwrap().diffs.push(d.clone());
53
54      if matches!(d, Match { lhs, .. } if lhs.len() > 2 * context) {
55        res.push(Hunk { diffs: vec![d.clone()] });
56      }
57    }
58
59    res
60      .into_iter()
61      .filter_map(|mut hunk| {
62        if hunk.diffs.len() <= 1 && matches!(hunk.diffs[0], Match { .. }) {
63          return None;
64        }
65
66        if context == 0 {
67          hunk.diffs.retain(|d| matches!(d, Mutation { .. }));
68          return Some(hunk);
69        }
70
71        if let Some(Match { lhs, rhs }) = hunk.diffs.first_mut() {
72          if lhs.len() > context {
73            lhs.start = lhs.end - context;
74            rhs.start = rhs.end - context;
75          }
76        }
77        if let Some(Match { lhs, rhs }) = hunk.diffs.last_mut() {
78          if lhs.len() > context {
79            lhs.end = lhs.start + context;
80            rhs.end = rhs.start + context;
81          }
82        }
83        Some(hunk)
84      })
85      .collect()
86  }
87
88  pub fn side(&self, side: Side) -> Range<usize> {
89    Range {
90      start: self.diffs.first().map_or(0, |d| d.side(side).start),
91      end: self.diffs.last().map_or(0, |d| d.side(side).end),
92    }
93  }
94
95  pub fn lhs(&self) -> Range<usize> {
96    self.side(Side::Lhs)
97  }
98
99  pub fn rhs(&self) -> Range<usize> {
100    self.side(Side::Rhs)
101  }
102}
103
104#[derive(Clone, Debug, Default, Eq, PartialEq)]
105struct Diffs {
106  vec: Vec<DiffItem>,
107}
108
109impl Diffs {
110  fn add_match(&mut self, len: usize) {
111    if len == 0 {
112      return;
113    }
114    if let Some(Match { lhs, rhs }) = self.vec.last_mut() {
115      lhs.end += len;
116      rhs.end += len;
117    } else {
118      self.vec.push(Match {
119        lhs: Range {
120          start: self.lhs_pos(),
121          end: self.lhs_pos() + len,
122        },
123        rhs: Range {
124          start: self.rhs_pos(),
125          end: self.rhs_pos() + len,
126        },
127      });
128    }
129  }
130
131  fn add_mutation(&mut self, lhs: usize, rhs: usize) {
132    if lhs == 0 && rhs == 0 {
133      return;
134    }
135    if let Some(Mutation { lhs: l, rhs: r }) = self.vec.last_mut() {
136      l.end += lhs;
137      r.end += rhs;
138    } else {
139      self.vec.push(Mutation {
140        lhs: Range {
141          start: self.lhs_pos(),
142          end: self.lhs_pos() + lhs,
143        },
144        rhs: Range {
145          start: self.rhs_pos(),
146          end: self.rhs_pos() + rhs,
147        },
148      });
149    }
150  }
151
152  fn lhs_pos(&self) -> usize {
153    self.vec.last().map_or(0, |d| d.lhs().end)
154  }
155
156  fn rhs_pos(&self) -> usize {
157    self.vec.last().map_or(0, |d| d.rhs().end)
158  }
159}
160
161pub fn diff(lhs: &[&str], rhs: &[&str]) -> Vec<DiffItem> {
162  let mut d = Diffs::default();
163  accumulate_partitions(&mut d, lhs, rhs);
164  d.vec
165}
166
167pub fn tokenize_lines<'a>(lines: &[&'a str]) -> Vec<&'a str> {
168  let re = regex::Regex::new(r"\w+|\s+").unwrap();
169  let mut v = Vec::new();
170  for &line in lines {
171    let mut last_pos = 0;
172    for m in re.find_iter(line) {
173      if m.start() > last_pos {
174        v.push(&line[last_pos..m.start()]);
175      }
176      last_pos = m.end();
177      v.push(m.as_str());
178    }
179    if last_pos < line.len() {
180      v.push(&line[last_pos..]);
181    }
182    v.push("\n");
183  }
184  v.pop();
185  v
186}
187
188// Patience diff algorithm
189//
190// 1. Match the first lines of both if they're identical, then match the second,
191//    third, etc. until a pair doesn't match.
192// 2. Match the last lines of both if they're identical, then match the next to
193//    last, second to last, etc. until a pair doesn't match.
194// 3. Find all lines which occur exactly once on both sides, then do longest
195//    common subsequence on those lines, matching them up.
196// 4. Do steps 1-2 on each section between matched lines.
197fn accumulate_diffs(diffs: &mut Diffs, lhs: &[&str], rhs: &[&str]) {
198  let leading = leading_match_len(lhs, rhs);
199  diffs.add_match(leading);
200  if leading == lhs.len() && leading == rhs.len() {
201    return;
202  }
203
204  let trailing =
205    trailing_match_len(&lhs[leading..lhs.len()], &rhs[leading..rhs.len()]);
206  accumulate_partitions(
207    diffs,
208    &lhs[leading..lhs.len() - trailing],
209    &rhs[leading..rhs.len() - trailing],
210  );
211  diffs.add_match(trailing);
212}
213
214fn leading_match_len(lhs: &[&str], rhs: &[&str]) -> usize {
215  zip(lhs, rhs).take_while(|(&l, &r)| l == r).count()
216}
217
218fn trailing_match_len(lhs: &[&str], rhs: &[&str]) -> usize {
219  zip(lhs.iter().rev(), rhs.iter().rev())
220    .take_while(|(&l, &r)| l == r)
221    .count()
222}
223
224fn accumulate_partitions(diffs: &mut Diffs, lhs: &[&str], rhs: &[&str]) {
225  let matched = match_lines(lhs, rhs);
226  if matched.is_empty() {
227    diffs.add_mutation(lhs.len(), rhs.len());
228    return;
229  }
230  let matched = longest_common_subseq(&matched);
231
232  let mut lhs_pos: usize = 0;
233  let mut rhs_pos: usize = 0;
234  for (lhs_next, rhs_next) in matched {
235    accumulate_diffs(diffs, &lhs[lhs_pos..lhs_next], &rhs[rhs_pos..rhs_next]);
236    diffs.add_match(1);
237    lhs_pos = lhs_next + 1;
238    rhs_pos = rhs_next + 1;
239  }
240  accumulate_diffs(diffs, &lhs[lhs_pos..lhs.len()], &rhs[rhs_pos..rhs.len()]);
241}
242
243fn match_lines(lhs: &[&str], rhs: &[&str]) -> Vec<(usize, usize)> {
244  let mut m: HashMap<&str, (Vec<usize>, Vec<usize>)> = HashMap::new();
245  for (i, l) in lhs.iter().enumerate() {
246    m.entry(l).or_default().0.push(i);
247  }
248  for (i, r) in rhs.iter().enumerate() {
249    m.entry(r).or_default().1.push(i);
250  }
251
252  let mut min = usize::MAX;
253  m.retain(|_, (l, r)| {
254    if l.len() == r.len() {
255      min = min.min(l.len());
256      true
257    } else {
258      false
259    }
260  });
261
262  let mut v: Vec<(usize, usize)> = m
263    .into_values()
264    .filter(|(l, _)| l.len() == min)
265    .flat_map(|(l, r)| zip(l, r))
266    .collect();
267  v.sort();
268  v
269}
270
271fn longest_common_subseq(pairings: &[(usize, usize)]) -> Vec<(usize, usize)> {
272  type PairingStack = Vec<Vec<((usize, usize), usize)>>;
273  fn find_push_pos(stacks: &PairingStack, p: &(usize, usize)) -> usize {
274    for (pos, stack) in stacks.iter().enumerate() {
275      if p.1 < stack.last().unwrap().0 .1 {
276        return pos;
277      }
278    }
279    stacks.len()
280  }
281
282  let mut stacks = PairingStack::new();
283  for p in pairings {
284    let push_pos = find_push_pos(&stacks, p);
285    if push_pos == stacks.len() {
286      stacks.push(vec![]);
287    }
288    let prev = if push_pos == 0 { 0 } else { stacks[push_pos - 1].len() - 1 };
289    stacks[push_pos].push((*p, prev));
290  }
291
292  let mut r = vec![];
293  let mut prev = stacks.last().unwrap().len() - 1;
294  for stack in stacks.iter().rev() {
295    r.push(stack[prev].0);
296    prev = stack[prev].1;
297  }
298  r.reverse();
299  r
300}
301
302#[cfg(test)]
303mod tests {
304  use super::*;
305
306  fn diff_lines(lhs: &str, rhs: &str) -> Vec<DiffItem> {
307    let lhs_lines: Vec<_> = lhs.lines().collect();
308    let rhs_lines: Vec<_> = rhs.lines().collect();
309    diff(&lhs_lines, &rhs_lines)
310  }
311
312  #[test]
313  fn diff_empty() {
314    assert_eq!(diff(&[], &[]), &[]);
315  }
316
317  #[test]
318  fn diff_eq() {
319    assert_eq!(
320      diff(&["a", "b", "c"], &["a", "b", "c"]),
321      &[Match {
322        lhs: Range { start: 0, end: 3 },
323        rhs: Range { start: 0, end: 3 },
324      }]
325    );
326  }
327
328  #[test]
329  fn diff_ne() {
330    assert_eq!(
331      diff(&["a", "b", "c"], &["a", "c"]),
332      &[
333        Match {
334          lhs: Range { start: 0, end: 1 },
335          rhs: Range { start: 0, end: 1 },
336        },
337        Mutation {
338          lhs: Range { start: 1, end: 2 },
339          rhs: Range { start: 1, end: 1 },
340        },
341        Match {
342          lhs: Range { start: 2, end: 3 },
343          rhs: Range { start: 1, end: 2 },
344        },
345      ]
346    );
347    assert_eq!(
348      diff(&["z", "a", "b", "c"], &["a", "c"]),
349      &[
350        Mutation {
351          lhs: Range { start: 0, end: 1 },
352          rhs: Range { start: 0, end: 0 },
353        },
354        Match {
355          lhs: Range { start: 1, end: 2 },
356          rhs: Range { start: 0, end: 1 },
357        },
358        Mutation {
359          lhs: Range { start: 2, end: 3 },
360          rhs: Range { start: 1, end: 1 },
361        },
362        Match {
363          lhs: Range { start: 3, end: 4 },
364          rhs: Range { start: 1, end: 2 },
365        },
366      ]
367    );
368    assert_eq!(
369      diff(&["z", "a", "e", "b", "c"], &["a", "e", "c"]),
370      &[
371        Mutation {
372          lhs: Range { start: 0, end: 1 },
373          rhs: Range { start: 0, end: 0 },
374        },
375        Match {
376          lhs: Range { start: 1, end: 3 },
377          rhs: Range { start: 0, end: 2 },
378        },
379        Mutation {
380          lhs: Range { start: 3, end: 4 },
381          rhs: Range { start: 2, end: 2 },
382        },
383        Match {
384          lhs: Range { start: 4, end: 5 },
385          rhs: Range { start: 2, end: 3 },
386        },
387      ]
388    );
389  }
390
391  #[test]
392  fn diff_only_non_unique() {
393    assert_eq!(
394      diff(&["a", "b", "b", "c"], &["b", "b"]),
395      &[
396        Mutation {
397          lhs: Range { start: 0, end: 1 },
398          rhs: Range { start: 0, end: 0 },
399        },
400        Match {
401          lhs: Range { start: 1, end: 3 },
402          rhs: Range { start: 0, end: 2 },
403        },
404        Mutation {
405          lhs: Range { start: 3, end: 4 },
406          rhs: Range { start: 2, end: 2 },
407        },
408      ]
409    );
410  }
411
412  #[test]
413  fn match_lines_arity1() {
414    assert_eq!(
415      match_lines(&["a", "b", "c", "d", "e", "d"], &["a", "c", "d", "e"]),
416      vec![(0, 0), (2, 1), (4, 3)],
417    );
418  }
419
420  #[test]
421  fn match_lines_arity2() {
422    assert_eq!(
423      match_lines(&["a", "b", "b", "c"], &["b", "b"]),
424      vec![(1, 0), (2, 1)],
425    );
426  }
427
428  #[test]
429  fn longest_common_subseq_basic() {
430    // From https://blog.jcoglan.com/2017/09/19/the-patience-diff-algorithm/
431    assert_eq!(
432      longest_common_subseq(&[
433        (0, 9),
434        (1, 4),
435        (2, 6),
436        (3, 12),
437        (4, 8),
438        (5, 7),
439        (6, 1),
440        (7, 5),
441        (8, 10),
442        (9, 11),
443        (10, 3),
444        (11, 2),
445        (12, 13),
446      ]),
447      &[(1, 4), (2, 6), (5, 7), (8, 10), (9, 11), (12, 13),]
448    );
449  }
450
451  #[test]
452  fn lead_trail_overlap() {
453    assert_eq!(
454      diff(&["a", "b", "d", "b", "c"], &["a", "b", "c"]),
455      &[
456        Match {
457          lhs: Range { start: 0, end: 2 },
458          rhs: Range { start: 0, end: 2 },
459        },
460        Mutation {
461          lhs: Range { start: 2, end: 4 },
462          rhs: Range { start: 2, end: 2 },
463        },
464        Match {
465          lhs: Range { start: 4, end: 5 },
466          rhs: Range { start: 2, end: 3 },
467        },
468      ]
469    );
470  }
471
472  #[test]
473  fn lead_move_txt() {
474    assert_eq!(
475      diff_lines(
476        include_str!("testdata/old/move.txt"),
477        include_str!("testdata/new/move.txt"),
478      ),
479      &[
480        Mutation {
481          lhs: Range { start: 0, end: 8 },
482          rhs: Range { start: 0, end: 0 },
483        },
484        Match {
485          lhs: Range { start: 8, end: 16 },
486          rhs: Range { start: 0, end: 8 },
487        },
488        Mutation {
489          lhs: Range { start: 16, end: 16 },
490          rhs: Range { start: 8, end: 16 },
491        },
492      ]
493    );
494  }
495
496  fn hunk_positions(hunks: &[Hunk]) -> Vec<((usize, usize), (usize, usize))> {
497    hunks
498      .iter()
499      .map(|h| {
500        let (l, r) = (h.lhs(), h.rhs());
501        ((l.start + 1, l.len()), (r.start + 1, r.len()))
502      })
503      .collect::<Vec<_>>()
504  }
505
506  #[test]
507  fn build_hunks() {
508    let diff = diff_lines(
509      include_str!("testdata/old/move.txt"),
510      include_str!("testdata/new/move.txt"),
511    );
512    assert_eq!(
513      hunk_positions(&Hunk::build(3, &diff)),
514      &[((1, 11), (1, 3)), ((14, 3), (6, 11))]
515    );
516    assert_eq!(
517      hunk_positions(&Hunk::build(0, &diff)),
518      &[((1, 8), (1, 0)), ((17, 0), (9, 8))]
519    );
520  }
521
522  #[test]
523  fn tokenize() {
524    assert_eq!(
525      tokenize_lines(&["void func1() {", "  x += 1"]),
526      &[
527        "void", " ", "func1", "()", " ", "{", "\n", "  ", "x", " ", "+=", " ",
528        "1"
529      ],
530    );
531  }
532}