jujutsu_lib/
diff.rs

1// Copyright 2021 The Jujutsu Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::cmp::{max, min, Ordering};
16use std::collections::{BTreeMap, HashMap};
17use std::fmt::{Debug, Formatter};
18use std::ops::Range;
19use std::slice;
20
21use itertools::Itertools;
22
23use crate::nightly_shims::BTreeMapExt;
24
/// Returns the ranges of all lines in `text`. Each range includes the
/// terminating newline (if any); a final line with no trailing newline is
/// also reported.
pub fn find_line_ranges(text: &[u8]) -> Vec<Range<usize>> {
    let mut ranges = vec![];
    let mut pos = 0;
    while pos < text.len() {
        // The line ends just past the next newline, or at end of input.
        let line_end = match text[pos..].iter().position(|&b| b == b'\n') {
            Some(newline_offset) => pos + newline_offset + 1,
            None => text.len(),
        };
        ranges.push(pos..line_end);
        pos = line_end;
    }
    ranges
}
44
/// Returns true if `b` is an ASCII letter, digit, or underscore.
fn is_word_byte(b: u8) -> bool {
    // TODO: Make this configurable (probably higher up in the call stack)
    b.is_ascii_alphanumeric() || b == b'_'
}
49
50pub fn find_word_ranges(text: &[u8]) -> Vec<Range<usize>> {
51    let mut word_ranges = vec![];
52    let mut word_start_pos = 0;
53    let mut in_word = false;
54    for (i, b) in text.iter().enumerate() {
55        if in_word && !is_word_byte(*b) {
56            in_word = false;
57            word_ranges.push(word_start_pos..i);
58            word_start_pos = i;
59        } else if !in_word && is_word_byte(*b) {
60            in_word = true;
61            word_start_pos = i;
62        }
63    }
64    if in_word && word_start_pos < text.len() {
65        word_ranges.push(word_start_pos..text.len());
66    }
67    word_ranges
68}
69
70pub fn find_nonword_ranges(text: &[u8]) -> Vec<Range<usize>> {
71    let mut ranges = vec![];
72    for (i, b) in text.iter().enumerate() {
73        if !is_word_byte(*b) {
74            ranges.push(i..i + 1);
75        }
76    }
77    ranges
78}
79
/// A histogram of the words (token contents) in one input.
struct Histogram<'a> {
    // Maps each distinct word to the indexes (into the token ranges) where it
    // occurs, capped at `max_occurrences + 1` entries.
    word_to_positions: HashMap<&'a [u8], Vec<usize>>,
    // Maps each occurrence count to the words occurring that many times.
    count_to_words: BTreeMap<usize, Vec<&'a [u8]>>,
}

impl Histogram<'_> {
    /// Builds a histogram of the words designated by `ranges` within `text`.
    fn calculate<'a>(
        text: &'a [u8],
        ranges: &[Range<usize>],
        max_occurrences: usize,
    ) -> Histogram<'a> {
        let mut word_to_positions: HashMap<&[u8], Vec<usize>> = HashMap::new();
        for (index, range) in ranges.iter().enumerate() {
            let word = &text[range.clone()];
            let positions = word_to_positions.entry(word).or_default();
            // Record at most max_occurrences + 1 positions, so that words
            // exceeding the limit can still be recognized (and skipped) later.
            if positions.len() <= max_occurrences {
                positions.push(index);
            }
        }
        let mut count_to_words: BTreeMap<usize, Vec<&[u8]>> = BTreeMap::new();
        for (word, positions) in &word_to_positions {
            count_to_words.entry(positions.len()).or_default().push(word);
        }
        Histogram {
            word_to_positions,
            count_to_words,
        }
    }
}
110
/// Finds the LCS given an array where the value of `input[i]` indicates that
/// the position of element `i` in the right array is at position `input[i]` in
/// the left array.
///
/// The result is a vector of `(left_position, right_position)` pairs, in
/// increasing order on both sides.
///
/// For example (some have multiple valid outputs):
///
/// [0,1,2] => [(0,0),(1,1),(2,2)]
/// [2,1,0] => [(2,0)]
/// [0,1,4,2,3,5,6] => [(0,0),(1,1),(2,3),(3,4),(5,5),(6,6)]
/// [0,1,4,3,2,5,6] => [(0,0),(1,1),(2,4),(5,5),(6,6)]
fn find_lcs(input: &[usize]) -> Vec<(usize, usize)> {
    if input.is_empty() {
        return vec![];
    }

    // For each right position, `chain` records (length of the longest increasing
    // chain ending here, the left position, the previous right position in that
    // chain). `usize::MAX` as the previous position marks the start of a chain.
    let mut chain = vec![(0, 0, 0); input.len()];
    let mut global_longest = 0;
    let mut global_longest_right_pos = 0;
    for (right_pos, &left_pos) in input.iter().enumerate() {
        let mut longest_from_here = 1;
        let mut previous_right_pos = usize::MAX;
        // Find the best earlier chain this element can extend.
        for i in (0..right_pos).rev() {
            let (previous_len, previous_left_pos, _) = chain[i];
            if previous_left_pos < left_pos {
                let len = previous_len + 1;
                if len > longest_from_here {
                    longest_from_here = len;
                    previous_right_pos = i;
                    if len > global_longest {
                        global_longest = len;
                        global_longest_right_pos = right_pos;
                        // If this is the longest chain globally so far, we cannot find a
                        // longer one by using a previous value, so break early.
                        break;
                    }
                }
            }
        }
        chain[right_pos] = (longest_from_here, left_pos, previous_right_pos);
    }

    // Walk the globally longest chain backwards from its last element, then
    // reverse to get the pairs in increasing order.
    let mut result = vec![];
    let mut right_pos = global_longest_right_pos;
    loop {
        let (_, left_pos, previous_right_pos) = chain[right_pos];
        result.push((left_pos, right_pos));
        if previous_right_pos == usize::MAX {
            break;
        }
        right_pos = previous_right_pos;
    }
    result.reverse();

    result
}
166
/// Finds unchanged ranges among the ones given as arguments. The data between
/// those ranges is ignored.
///
/// `left_ranges` and `right_ranges` are token ranges (e.g. line or word
/// ranges) into `left` and `right` respectively. Returns matching
/// `(left_range, right_range)` pairs taken from the input ranges, in
/// increasing order on both sides.
pub(crate) fn unchanged_ranges(
    left: &[u8],
    right: &[u8],
    left_ranges: &[Range<usize>],
    right_ranges: &[Range<usize>],
) -> Vec<(Range<usize>, Range<usize>)> {
    if left_ranges.is_empty() || right_ranges.is_empty() {
        return vec![];
    }

    let max_occurrences = 100;
    let mut left_histogram = Histogram::calculate(left, left_ranges, max_occurrences);
    // `first_key()` comes from the crate's BTreeMapExt shim; it's the
    // smallest occurrence count in the histogram.
    if *left_histogram.count_to_words.first_key().unwrap() > max_occurrences {
        // If there are very many occurrences of all words, then we just give up.
        return vec![];
    }
    let mut right_histogram = Histogram::calculate(right, right_ranges, max_occurrences);
    // Look for words with few occurrences in `left` (could equally well have picked
    // `right`?). If any of them also occur in `right`, then we add the words to
    // the LCS.
    let mut uncommon_shared_words = vec![];
    while !left_histogram.count_to_words.is_empty() && uncommon_shared_words.is_empty() {
        // Process words from least common to more common until some are
        // shared with the right side.
        let left_words = left_histogram.count_to_words.pop_first_value().unwrap();
        for left_word in left_words {
            if right_histogram.word_to_positions.contains_key(left_word) {
                uncommon_shared_words.push(left_word);
            }
        }
    }
    if uncommon_shared_words.is_empty() {
        return vec![];
    }

    // Let's say our inputs are "a b a b" and "a b c c b a b". We will have found
    // the least common words to be "a" and "b". We now assume that each
    // occurrence of each word lines up in the left and right input. We do that
    // by numbering the shared occurrences, effectively instead comparing "a1 b1
    // a2 b2" and "a1 b1 c c b2 a2 b". We then walk the common words in the
    // right input in order (["a1", "b1", "b2", "a2"]), and record the index of
    // that word in the left input ([0,1,3,2]). We then find the LCS and split
    // points based on that ([0,1,3] or [0,1,2] are both valid).

    // [(index into left_ranges, word, occurrence #)]
    let mut left_positions = vec![];
    let mut right_positions = vec![];
    for uncommon_shared_word in uncommon_shared_words {
        let left_occurrences = left_histogram
            .word_to_positions
            .get_mut(uncommon_shared_word)
            .unwrap();
        let right_occurrences = right_histogram
            .word_to_positions
            .get_mut(uncommon_shared_word)
            .unwrap();
        // Only pair up the occurrences that exist on both sides.
        let shared_count = min(left_occurrences.len(), right_occurrences.len());
        for occurrence in 0..shared_count {
            left_positions.push((
                left_occurrences[occurrence],
                uncommon_shared_word,
                occurrence,
            ));
            right_positions.push((
                right_occurrences[occurrence],
                uncommon_shared_word,
                occurrence,
            ));
        }
    }
    left_positions.sort();
    right_positions.sort();
    // For each numbered occurrence in the right input, record the index of the
    // same numbered occurrence among the sorted left positions; that array is
    // the input to find_lcs().
    let mut left_position_map = HashMap::new();
    for (i, (_pos, word, occurrence)) in left_positions.iter().enumerate() {
        left_position_map.insert((*word, *occurrence), i);
    }
    let mut left_index_by_right_index = vec![];
    for (_pos, word, occurrence) in &right_positions {
        left_index_by_right_index.push(*left_position_map.get(&(*word, *occurrence)).unwrap());
    }

    let lcs = find_lcs(&left_index_by_right_index);

    // Produce output ranges, recursing into the modified areas between the elements
    // in the LCS.
    let mut result = vec![];
    let mut previous_left_position = 0;
    let mut previous_right_position = 0;
    for (left_index, right_index) in lcs {
        let left_position = left_positions[left_index].0;
        let right_position = right_positions[right_index].0;
        let skipped_left_positions = previous_left_position..left_position;
        let skipped_right_positions = previous_right_position..right_position;
        if !skipped_left_positions.is_empty() || !skipped_right_positions.is_empty() {
            for unchanged_nested_range in unchanged_ranges(
                left,
                right,
                &left_ranges[skipped_left_positions.clone()],
                &right_ranges[skipped_right_positions.clone()],
            ) {
                result.push(unchanged_nested_range);
            }
        }
        result.push((
            left_ranges[left_position].clone(),
            right_ranges[right_position].clone(),
        ));
        previous_left_position = left_position + 1;
        previous_right_position = right_position + 1;
    }
    // Also recurse into range at end (after common ranges).
    let skipped_left_positions = previous_left_position..left_ranges.len();
    let skipped_right_positions = previous_right_position..right_ranges.len();
    if !skipped_left_positions.is_empty() || !skipped_right_positions.is_empty() {
        for unchanged_nested_range in unchanged_ranges(
            left,
            right,
            &left_ranges[skipped_left_positions],
            &right_ranges[skipped_right_positions],
        ) {
            result.push(unchanged_nested_range);
        }
    }

    result
}
293
/// A region that is unchanged between the base input and all other inputs.
/// `base_range` is the region's position in the base input; `offsets[i]` is
/// the (possibly negative) displacement of the same region in other input `i`.
#[derive(Clone, PartialEq, Eq, Debug)]
struct UnchangedRange {
    base_range: Range<usize>,
    offsets: Vec<isize>,
}

impl UnchangedRange {
    /// Start of this region in non-base input `side`.
    fn start(&self, side: usize) -> usize {
        self.base_range.start.wrapping_add(self.offsets[side] as usize)
    }

    /// End of this region in non-base input `side`.
    fn end(&self, side: usize) -> usize {
        self.base_range.end.wrapping_add(self.offsets[side] as usize)
    }
}

impl PartialOrd for UnchangedRange {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for UnchangedRange {
    /// Orders by position in the base input; offsets are not considered.
    fn cmp(&self, other: &Self) -> Ordering {
        (self.base_range.start, self.base_range.end)
            .cmp(&(other.base_range.start, other.base_range.end))
    }
}
328
/// Takes any number of inputs and finds regions that are the same between all
/// of them.
#[derive(Clone, Debug)]
pub struct Diff<'input> {
    // The first input; unchanged regions are expressed as ranges in it.
    base_input: &'input [u8],
    // The remaining inputs, each diffed against `base_input`.
    other_inputs: Vec<&'input [u8]>,
    // The key is a range in the base input. The value is the start of each non-base region
    // relative to the base region's start. By making them relative, they don't need to change
    // when the base range changes.
    unchanged_regions: Vec<UnchangedRange>,
}
340
/// Takes the current regions and intersects it with the new unchanged ranges
/// from a 2-way diff. The result is a map of unchanged regions with one more
/// offset in the map's values.
///
/// NOTE(review): this appears to rely on both `current_ranges` and
/// `new_unchanged_ranges` being sorted by base position and non-overlapping —
/// confirm at call sites.
fn intersect_regions(
    current_ranges: Vec<UnchangedRange>,
    new_unchanged_ranges: &[(Range<usize>, Range<usize>)],
) -> Vec<UnchangedRange> {
    let mut result = vec![];
    let mut current_ranges_iter = current_ranges.into_iter().peekable();
    for (new_base_range, other_range) in new_unchanged_ranges.iter() {
        assert_eq!(new_base_range.len(), other_range.len());
        while let Some(UnchangedRange {
            base_range,
            offsets,
        }) = current_ranges_iter.peek()
        {
            // No need to look further if we're past the new range.
            if base_range.start >= new_base_range.end {
                break;
            }
            // Discard any current unchanged regions that don't match between the base and
            // the new input.
            if base_range.end <= new_base_range.start {
                current_ranges_iter.next();
                continue;
            }
            // Emit the overlap of the current region and the new range, with the
            // new input's start offset (relative to the base) appended.
            let new_start = max(base_range.start, new_base_range.start);
            let new_end = min(base_range.end, new_base_range.end);
            let mut new_offsets = offsets.clone();
            new_offsets.push(other_range.start.wrapping_sub(new_base_range.start) as isize);
            result.push(UnchangedRange {
                base_range: new_start..new_end,
                offsets: new_offsets,
            });
            if base_range.end >= new_base_range.end {
                // Break without consuming the item; there may be other new ranges that overlap
                // with it.
                break;
            }
            current_ranges_iter.next();
        }
    }
    result
}
385
impl<'input> Diff<'input> {
    /// Diffs all `inputs` against the first one (the "base"), using
    /// `tokenizer` to split each input into the token ranges that are
    /// compared.
    ///
    /// Panics if `inputs` is empty.
    pub fn for_tokenizer(
        inputs: &[&'input [u8]],
        tokenizer: &impl Fn(&[u8]) -> Vec<Range<usize>>,
    ) -> Self {
        assert!(!inputs.is_empty());
        let base_input = inputs[0];
        let other_inputs = inputs.iter().skip(1).copied().collect_vec();
        // First tokenize each input
        let base_token_ranges: Vec<Range<usize>> = tokenizer(base_input);
        let other_token_ranges: Vec<Vec<Range<usize>>> = other_inputs
            .iter()
            .map(|other_input| tokenizer(other_input))
            .collect_vec();

        // Look for unchanged regions. Initially consider the whole range of the base
        // input as unchanged (compared to itself). Then diff each other input
        // against the base. Intersect the previously found ranges with the
        // unchanged ranges in the diff.
        let mut unchanged_regions = vec![UnchangedRange {
            base_range: 0..base_input.len(),
            offsets: vec![],
        }];
        for (i, other_token_ranges) in other_token_ranges.iter().enumerate() {
            let unchanged_diff_ranges = unchanged_ranges(
                base_input,
                other_inputs[i],
                &base_token_ranges,
                other_token_ranges,
            );
            unchanged_regions = intersect_regions(unchanged_regions, &unchanged_diff_ranges);
        }
        // Add an empty range at the end to make life easier for hunks().
        let offsets = other_inputs
            .iter()
            .map(|input| input.len().wrapping_sub(base_input.len()) as isize)
            .collect_vec();
        unchanged_regions.push(UnchangedRange {
            base_range: base_input.len()..base_input.len(),
            offsets,
        });

        let mut diff = Self {
            base_input,
            other_inputs,
            unchanged_regions,
        };
        diff.compact_unchanged_regions();
        diff
    }

    /// Diffs the inputs using a tokenizer that produces no tokens, i.e.
    /// without looking for unchanged regions inside the inputs.
    pub fn unrefined(inputs: &[&'input [u8]]) -> Self {
        Diff::for_tokenizer(inputs, &|_| vec![])
    }

    // TODO: At least when merging, it's wasteful to refine the diff if e.g. if 2
    // out of 3 inputs match in the differing regions. Perhaps the refine()
    // method should be on the hunk instead (probably returning a new Diff)?
    // That would let each user decide which hunks to refine. However, it would
    // probably mean that many callers repeat the same code. Perhaps it
    // should be possible to refine a whole diff *or* individual hunks.
    /// Diffs the inputs line by line, then refines the changed regions first
    /// at the word level and then at the level of single non-word bytes.
    pub fn default_refinement(inputs: &[&'input [u8]]) -> Self {
        let mut diff = Diff::for_tokenizer(inputs, &find_line_ranges);
        diff.refine_changed_regions(&find_word_ranges);
        diff.refine_changed_regions(&find_nonword_ranges);
        diff
    }

    /// Returns an iterator over the hunks of this diff, alternating between
    /// matching and differing regions.
    pub fn hunks<'diff>(&'diff self) -> DiffHunkIterator<'diff, 'input> {
        let previous_offsets = vec![0; self.other_inputs.len()];
        DiffHunkIterator {
            diff: self,
            previous: UnchangedRange {
                base_range: 0..0,
                offsets: previous_offsets,
            },
            unchanged_emitted: true,
            unchanged_iter: self.unchanged_regions.iter(),
        }
    }

    /// Uses the given tokenizer to split the changed regions into smaller
    /// regions. Then tries to find unchanged regions among them.
    pub fn refine_changed_regions(&mut self, tokenizer: &impl Fn(&[u8]) -> Vec<Range<usize>>) {
        let mut previous = UnchangedRange {
            base_range: 0..0,
            offsets: vec![0; self.other_inputs.len()],
        };
        let mut new_unchanged_ranges = vec![];
        for current in self.unchanged_regions.iter() {
            // For the changed region between the previous region and the current one,
            // create a new Diff instance. Then adjust the start positions and
            // offsets to be valid in the context of the larger Diff instance
            // (`self`).
            let mut slices =
                vec![&self.base_input[previous.base_range.end..current.base_range.start]];
            for i in 0..current.offsets.len() {
                let changed_range = previous.end(i)..current.start(i);
                slices.push(&self.other_inputs[i][changed_range]);
            }

            let refined_diff = Diff::for_tokenizer(&slices, tokenizer);

            for UnchangedRange {
                base_range,
                offsets,
            } in refined_diff.unchanged_regions
            {
                // Translate from the changed region's coordinates back to
                // `self`'s coordinates.
                let new_base_start = base_range.start + previous.base_range.end;
                let new_base_end = base_range.end + previous.base_range.end;
                let offsets = offsets
                    .into_iter()
                    .enumerate()
                    .map(|(i, offset)| offset + previous.offsets[i])
                    .collect_vec();
                new_unchanged_ranges.push(UnchangedRange {
                    base_range: new_base_start..new_base_end,
                    offsets,
                });
            }
            previous = current.clone();
        }
        // Merge the newly found (sorted) regions into the existing sorted
        // list, then join adjacent regions where possible.
        self.unchanged_regions = self
            .unchanged_regions
            .iter()
            .cloned()
            .merge(new_unchanged_ranges)
            .collect_vec();
        self.compact_unchanged_regions();
    }

    /// Joins adjacent unchanged regions that have identical offsets (i.e.
    /// regions with no changed data between them in any input).
    fn compact_unchanged_regions(&mut self) {
        let mut compacted = vec![];
        let mut maybe_previous: Option<UnchangedRange> = None;
        for current in self.unchanged_regions.iter() {
            if let Some(previous) = maybe_previous {
                if previous.base_range.end == current.base_range.start
                    && previous.offsets == *current.offsets
                {
                    maybe_previous = Some(UnchangedRange {
                        base_range: previous.base_range.start..current.base_range.end,
                        offsets: current.offsets.clone(),
                    });
                    continue;
                }
                compacted.push(previous);
            }
            maybe_previous = Some(current.clone());
        }
        if let Some(previous) = maybe_previous {
            compacted.push(previous);
        }
        self.unchanged_regions = compacted;
    }
}
541
/// A hunk of a diff: either content shared by all inputs, or one slice per
/// input for a differing region.
#[derive(PartialEq, Eq, Clone)]
pub enum DiffHunk<'input> {
    Matching(&'input [u8]),
    Different(Vec<&'input [u8]>),
}

impl Debug for DiffHunk<'_> {
    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
        // Render the byte slices lossily as strings for readability.
        match self {
            DiffHunk::Matching(content) => {
                let text = String::from_utf8_lossy(content);
                f.debug_tuple("DiffHunk::Matching").field(&text).finish()
            }
            DiffHunk::Different(contents) => {
                let texts: Vec<_> = contents
                    .iter()
                    .map(|content| String::from_utf8_lossy(content))
                    .collect();
                f.debug_tuple("DiffHunk::Different").field(&texts).finish()
            }
        }
    }
}
567
/// Iterator over the hunks of a `Diff`, alternating between matching and
/// differing regions (empty hunks are skipped).
pub struct DiffHunkIterator<'diff, 'input> {
    diff: &'diff Diff<'input>,
    // The unchanged region most recently taken from `unchanged_iter`; the
    // next differing hunk starts where this region ends.
    previous: UnchangedRange,
    // Whether the matching hunk for `previous` has already been emitted.
    unchanged_emitted: bool,
    unchanged_iter: slice::Iter<'diff, UnchangedRange>,
}
574
impl<'diff, 'input> Iterator for DiffHunkIterator<'diff, 'input> {
    type Item = DiffHunk<'input>;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            // First emit the matching hunk for the previously seen unchanged
            // region, unless it was empty or has already been emitted.
            if !self.unchanged_emitted {
                self.unchanged_emitted = true;
                if !self.previous.base_range.is_empty() {
                    return Some(DiffHunk::Matching(
                        &self.diff.base_input[self.previous.base_range.clone()],
                    ));
                }
            }
            // Then emit the differing hunk between the previous unchanged
            // region and the next one, skipping it if it's empty in every
            // input. (The sentinel empty region at the end of
            // `unchanged_regions` ensures the trailing differing hunk is
            // covered.)
            if let Some(current) = self.unchanged_iter.next() {
                let mut slices = vec![
                    &self.diff.base_input[self.previous.base_range.end..current.base_range.start],
                ];
                for (i, input) in self.diff.other_inputs.iter().enumerate() {
                    slices.push(&input[self.previous.end(i)..current.start(i)]);
                }
                self.previous = current.clone();
                self.unchanged_emitted = false;
                if slices.iter().any(|slice| !slice.is_empty()) {
                    return Some(DiffHunk::Different(slices));
                }
            } else {
                break;
            }
        }
        None
    }
}
607
608/// Diffs two slices of bytes. The returned diff hunks may be any length (may
609/// span many lines or may be only part of a line). This currently uses
610/// Histogram diff (or maybe something similar; I'm not sure I understood the
611/// algorithm correctly). It first diffs lines in the input and then refines
612/// the changed ranges at the word level.
613pub fn diff<'a>(left: &'a [u8], right: &'a [u8]) -> Vec<DiffHunk<'a>> {
614    if left == right {
615        return vec![DiffHunk::Matching(left)];
616    }
617    if left.is_empty() {
618        return vec![DiffHunk::Different(vec![b"", right])];
619    }
620    if right.is_empty() {
621        return vec![DiffHunk::Different(vec![left, b""])];
622    }
623
624    Diff::default_refinement(&[left, right])
625        .hunks()
626        .collect_vec()
627}
628
629#[cfg(test)]
630mod tests {
631    use super::*;
632
    // Tests for find_line_ranges(): ranges include the trailing newline, and
    // a final line without a newline is still reported.
    #[test]
    fn test_find_line_ranges_empty() {
        assert_eq!(find_line_ranges(b""), vec![]);
    }

    #[test]
    fn test_find_line_ranges_blank_line() {
        assert_eq!(find_line_ranges(b"\n"), vec![0..1]);
    }

    #[test]
    fn test_find_line_ranges_missing_newline_at_eof() {
        assert_eq!(find_line_ranges(b"foo"), vec![0..3]);
    }

    #[test]
    fn test_find_line_ranges_multiple_lines() {
        assert_eq!(find_line_ranges(b"a\nbb\nccc\n"), vec![0..2, 2..5, 5..9]);
    }

    // Tests for find_word_ranges(): words are maximal runs of [A-Za-z0-9_].
    #[test]
    fn test_find_word_ranges_empty() {
        assert_eq!(find_word_ranges(b""), vec![]);
    }

    #[test]
    fn test_find_word_ranges_single_word() {
        assert_eq!(find_word_ranges(b"Abc"), vec![0..3]);
    }

    #[test]
    fn test_find_word_ranges_no_word() {
        assert_eq!(find_word_ranges(b"+-*/"), vec![]);
    }

    #[test]
    fn test_find_word_ranges_word_then_non_word() {
        assert_eq!(find_word_ranges(b"Abc   "), vec![0..3]);
    }

    #[test]
    fn test_find_word_ranges_non_word_then_word() {
        assert_eq!(find_word_ranges(b"   Abc"), vec![3..6]);
    }
677
    // Tests for find_lcs(): the result pairs are (left position, right
    // position), increasing on both sides.
    #[test]
    fn test_find_lcs_empty() {
        let empty: Vec<(usize, usize)> = vec![];
        assert_eq!(find_lcs(&[]), empty);
    }

    #[test]
    fn test_find_lcs_single_element() {
        assert_eq!(find_lcs(&[0]), vec![(0, 0)]);
    }

    #[test]
    fn test_find_lcs_in_order() {
        assert_eq!(find_lcs(&[0, 1, 2]), vec![(0, 0), (1, 1), (2, 2)]);
    }

    #[test]
    fn test_find_lcs_reverse_order() {
        assert_eq!(find_lcs(&[2, 1, 0]), vec![(2, 0)]);
    }

    #[test]
    fn test_find_lcs_two_swapped() {
        assert_eq!(
            find_lcs(&[0, 1, 4, 3, 2, 5, 6]),
            vec![(0, 0), (1, 1), (2, 4), (5, 5), (6, 6)]
        );
    }

    #[test]
    fn test_find_lcs_element_moved_earlier() {
        assert_eq!(
            find_lcs(&[0, 1, 4, 2, 3, 5, 6]),
            vec![(0, 0), (1, 1), (2, 3), (3, 4), (5, 5), (6, 6)]
        );
    }

    #[test]
    fn test_find_lcs_element_moved_later() {
        assert_eq!(
            find_lcs(&[0, 1, 3, 4, 2, 5, 6]),
            vec![(0, 0), (1, 1), (3, 2), (4, 3), (5, 5), (6, 6)]
        );
    }

    #[test]
    fn test_find_lcs_interleaved_longest_chains() {
        assert_eq!(
            find_lcs(&[0, 4, 2, 9, 6, 5, 1, 3, 7, 8]),
            vec![(0, 0), (1, 6), (3, 7), (7, 8), (8, 9)]
        );
    }

    #[test]
    fn test_find_word_ranges_many_words() {
        assert_eq!(
            find_word_ranges(b"fn find_words(text: &[u8])"),
            vec![0..2, 3..13, 14..18, 22..24]
        );
    }
738
    // Tests for unchanged_ranges(): returned pairs pick matching token ranges
    // from the left and right inputs.
    #[test]
    fn test_unchanged_ranges_insert_in_middle() {
        assert_eq!(
            unchanged_ranges(
                b"a b b c",
                b"a b X b c",
                &[0..1, 2..3, 4..5, 6..7],
                &[0..1, 2..3, 4..5, 6..7, 8..9],
            ),
            vec![(0..1, 0..1), (2..3, 2..3), (4..5, 6..7), (6..7, 8..9)]
        );
    }

    #[test]
    fn test_unchanged_ranges_non_unique_removed() {
        assert_eq!(
            unchanged_ranges(
                b"a a a a",
                b"a b a c",
                &[0..1, 2..3, 4..5, 6..7],
                &[0..1, 2..3, 4..5, 6..7],
            ),
            vec![(0..1, 0..1), (2..3, 4..5)]
        );
    }

    #[test]
    fn test_unchanged_ranges_non_unique_added() {
        assert_eq!(
            unchanged_ranges(
                b"a b a c",
                b"a a a a",
                &[0..1, 2..3, 4..5, 6..7],
                &[0..1, 2..3, 4..5, 6..7],
            ),
            vec![(0..1, 0..1), (4..5, 2..3)]
        );
    }
777
    // Tests for intersect_regions(): each new (base, other) range pair adds
    // one offset to the overlapping parts of the existing regions.
    #[test]
    fn test_intersect_regions_existing_empty() {
        let actual = intersect_regions(vec![], &[(20..25, 55..60)]);
        let expected = vec![];
        assert_eq!(actual, expected);
    }

    #[test]
    fn test_intersect_regions_new_ranges_within_existing() {
        let actual = intersect_regions(
            vec![UnchangedRange {
                base_range: 20..70,
                offsets: vec![3],
            }],
            &[(25..30, 35..40), (40..50, 40..50)],
        );
        let expected = vec![
            UnchangedRange {
                base_range: 25..30,
                offsets: vec![3, 10],
            },
            UnchangedRange {
                base_range: 40..50,
                offsets: vec![3, 0],
            },
        ];
        assert_eq!(actual, expected);
    }

    #[test]
    fn test_intersect_regions_partial_overlap() {
        let actual = intersect_regions(
            vec![UnchangedRange {
                base_range: 20..50,
                offsets: vec![-3],
            }],
            &[(15..25, 5..15), (45..60, 55..70)],
        );
        let expected = vec![
            UnchangedRange {
                base_range: 20..25,
                offsets: vec![-3, -10],
            },
            UnchangedRange {
                base_range: 45..50,
                offsets: vec![-3, 10],
            },
        ];
        assert_eq!(actual, expected);
    }

    #[test]
    fn test_intersect_regions_new_range_overlaps_multiple_existing() {
        let actual = intersect_regions(
            vec![
                UnchangedRange {
                    base_range: 20..50,
                    offsets: vec![3, -8],
                },
                UnchangedRange {
                    base_range: 70..80,
                    offsets: vec![7, 1],
                },
            ],
            &[(10..100, 5..95)],
        );
        let expected = vec![
            UnchangedRange {
                base_range: 20..50,
                offsets: vec![3, -8, -5],
            },
            UnchangedRange {
                base_range: 70..80,
                offsets: vec![7, 1, -5],
            },
        ];
        assert_eq!(actual, expected);
    }
856
    // End-to-end tests for Diff construction and hunk iteration.
    #[test]
    fn test_diff_single_input() {
        let diff = Diff::default_refinement(&[b"abc"]);
        assert_eq!(diff.hunks().collect_vec(), vec![DiffHunk::Matching(b"abc")]);
    }

    #[test]
    fn test_diff_single_empty_input() {
        let diff = Diff::default_refinement(&[b""]);
        assert_eq!(diff.hunks().collect_vec(), vec![]);
    }

    #[test]
    fn test_diff_two_inputs_one_different() {
        let diff = Diff::default_refinement(&[b"a b c", b"a X c"]);
        assert_eq!(
            diff.hunks().collect_vec(),
            vec![
                DiffHunk::Matching(b"a "),
                DiffHunk::Different(vec![b"b", b"X"]),
                DiffHunk::Matching(b" c"),
            ]
        );
    }

    #[test]
    fn test_diff_multiple_inputs_one_different() {
        let diff = Diff::default_refinement(&[b"a b c", b"a X c", b"a b c"]);
        assert_eq!(
            diff.hunks().collect_vec(),
            vec![
                DiffHunk::Matching(b"a "),
                DiffHunk::Different(vec![b"b", b"X", b"b"]),
                DiffHunk::Matching(b" c"),
            ]
        );
    }

    #[test]
    fn test_diff_multiple_inputs_all_different() {
        let diff = Diff::default_refinement(&[b"a b c", b"a X c", b"a c X"]);
        assert_eq!(
            diff.hunks().collect_vec(),
            vec![
                DiffHunk::Matching(b"a "),
                DiffHunk::Different(vec![b"b ", b"X ", b""]),
                DiffHunk::Matching(b"c"),
                DiffHunk::Different(vec![b"", b"", b" X"]),
            ]
        );
    }

    #[test]
    fn test_diff_for_tokenizer_compacted() {
        // Tests that unchanged regions are compacted when using for_tokenizer()
        let diff = Diff::for_tokenizer(
            &[b"a\nb\nc\nd\ne\nf\ng", b"a\nb\nc\nX\ne\nf\ng"],
            &find_line_ranges,
        );
        assert_eq!(
            diff.hunks().collect_vec(),
            vec![
                DiffHunk::Matching(b"a\nb\nc\n"),
                DiffHunk::Different(vec![b"d\n", b"X\n"]),
                DiffHunk::Matching(b"e\nf\ng"),
            ]
        );
    }

    #[test]
    fn test_diff_nothing_in_common() {
        assert_eq!(
            diff(b"aaa", b"bb"),
            vec![DiffHunk::Different(vec![b"aaa", b"bb"])]
        );
    }
933
934    #[test]
935    fn test_diff_insert_in_middle() {
936        assert_eq!(
937            diff(b"a z", b"a S z"),
938            vec![
939                DiffHunk::Matching(b"a "),
940                DiffHunk::Different(vec![b"", b"S "]),
941                DiffHunk::Matching(b"z"),
942            ]
943        );
944    }
945
946    #[test]
947    fn test_diff_no_unique_middle_flips() {
948        assert_eq!(
949            diff(b"a R R S S z", b"a S S R R z"),
950            vec![
951                DiffHunk::Matching(b"a "),
952                DiffHunk::Different(vec![b"R R ", b""]),
953                DiffHunk::Matching(b"S S "),
954                DiffHunk::Different(vec![b"", b"R R "]),
955                DiffHunk::Matching(b"z")
956            ],
957        );
958    }
959
960    #[test]
961    fn test_diff_recursion_needed() {
962        assert_eq!(
963            diff(
964                b"a q x q y q z q b q y q x q c",
965                b"a r r x q y z q b y q x r r c",
966            ),
967            vec![
968                DiffHunk::Matching(b"a "),
969                DiffHunk::Different(vec![b"q", b"r"]),
970                DiffHunk::Matching(b" "),
971                DiffHunk::Different(vec![b"", b"r "]),
972                DiffHunk::Matching(b"x q y "),
973                DiffHunk::Different(vec![b"q ", b""]),
974                DiffHunk::Matching(b"z q b "),
975                DiffHunk::Different(vec![b"q ", b""]),
976                DiffHunk::Matching(b"y q x "),
977                DiffHunk::Different(vec![b"q", b"r"]),
978                DiffHunk::Matching(b" "),
979                DiffHunk::Different(vec![b"", b"r "]),
980                DiffHunk::Matching(b"c"),
981            ]
982        );
983    }
984
    #[test]
    fn test_diff_real_case_write_fmt() {
        // This is from src/ui.rs in commit f44d246e3f88 in this repo. It highlights the
        // need for recursion into the range at the end: after splitting at "Arguments"
        // and "formatter", the region at the end has the unique words "write_fmt"
        // and "fmt", but we forgot to recurse into that region, so we ended up
        // saying that "write_fmt(fmt).unwrap()" was replaced by "write_fmt(fmt)".
        assert_eq!(diff(
                b"    pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) {\n        self.styler().write_fmt(fmt).unwrap()\n",
                b"    pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) -> io::Result<()> {\n        self.styler().write_fmt(fmt)\n"
            ),
            vec![
                DiffHunk::Matching(b"    pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) "),
                DiffHunk::Different(vec![b"", b"-> io::Result<()> "]),
                DiffHunk::Matching(b"{\n        self.styler().write_fmt(fmt)"),
                DiffHunk::Different(vec![b".unwrap()", b""]),
                // The trailing ".unwrap()" removal is detected, so the final
                // newline is still recognized as matching.
                DiffHunk::Matching(b"\n")
            ]
        );
    }
1005
    #[test]
    fn test_diff_real_case_gitgit_read_tree_c() {
        // This is the diff from commit e497ea2a9b in the git.git repo
        assert_eq!(
            diff(
                br##"/*
 * GIT - The information manager from hell
 *
 * Copyright (C) Linus Torvalds, 2005
 */
#include "#cache.h"

static int unpack(unsigned char *sha1)
{
	void *buffer;
	unsigned long size;
	char type[20];

	buffer = read_sha1_file(sha1, type, &size);
	if (!buffer)
		usage("unable to read sha1 file");
	if (strcmp(type, "tree"))
		usage("expected a 'tree' node");
	while (size) {
		int len = strlen(buffer)+1;
		unsigned char *sha1 = buffer + len;
		char *path = strchr(buffer, ' ')+1;
		unsigned int mode;
		if (size < len + 20 || sscanf(buffer, "%o", &mode) != 1)
			usage("corrupt 'tree' file");
		buffer = sha1 + 20;
		size -= len + 20;
		printf("%o %s (%s)\n", mode, path, sha1_to_hex(sha1));
	}
	return 0;
}

int main(int argc, char **argv)
{
	int fd;
	unsigned char sha1[20];

	if (argc != 2)
		usage("read-tree <key>");
	if (get_sha1_hex(argv[1], sha1) < 0)
		usage("read-tree <key>");
	sha1_file_directory = getenv(DB_ENVIRONMENT);
	if (!sha1_file_directory)
		sha1_file_directory = DEFAULT_DB_ENVIRONMENT;
	if (unpack(sha1) < 0)
		usage("unpack failed");
	return 0;
}
"##,
                br##"/*
 * GIT - The information manager from hell
 *
 * Copyright (C) Linus Torvalds, 2005
 */
#include "#cache.h"

static void create_directories(const char *path)
{
	int len = strlen(path);
	char *buf = malloc(len + 1);
	const char *slash = path;

	while ((slash = strchr(slash+1, '/')) != NULL) {
		len = slash - path;
		memcpy(buf, path, len);
		buf[len] = 0;
		mkdir(buf, 0700);
	}
}

static int create_file(const char *path)
{
	int fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);
	if (fd < 0) {
		if (errno == ENOENT) {
			create_directories(path);
			fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);
		}
	}
	return fd;
}

static int unpack(unsigned char *sha1)
{
	void *buffer;
	unsigned long size;
	char type[20];

	buffer = read_sha1_file(sha1, type, &size);
	if (!buffer)
		usage("unable to read sha1 file");
	if (strcmp(type, "tree"))
		usage("expected a 'tree' node");
	while (size) {
		int len = strlen(buffer)+1;
		unsigned char *sha1 = buffer + len;
		char *path = strchr(buffer, ' ')+1;
		char *data;
		unsigned long filesize;
		unsigned int mode;
		int fd;

		if (size < len + 20 || sscanf(buffer, "%o", &mode) != 1)
			usage("corrupt 'tree' file");
		buffer = sha1 + 20;
		size -= len + 20;
		data = read_sha1_file(sha1, type, &filesize);
		if (!data || strcmp(type, "blob"))
			usage("tree file refers to bad file data");
		fd = create_file(path);
		if (fd < 0)
			usage("unable to create file");
		if (write(fd, data, filesize) != filesize)
			usage("unable to write file");
		fchmod(fd, mode);
		close(fd);
		free(data);
	}
	return 0;
}

int main(int argc, char **argv)
{
	int fd;
	unsigned char sha1[20];

	if (argc != 2)
		usage("read-tree <key>");
	if (get_sha1_hex(argv[1], sha1) < 0)
		usage("read-tree <key>");
	sha1_file_directory = getenv(DB_ENVIRONMENT);
	if (!sha1_file_directory)
		sha1_file_directory = DEFAULT_DB_ENVIRONMENT;
	if (unpack(sha1) < 0)
		usage("unpack failed");
	return 0;
}
"##,
            ),
            // Expected hunks. Note how the changed
            // `printf(...)` -> `data = read_sha1_file(...)` line is split down
            // to word granularity, producing small Matching hunks like b"("
            // and b"sha1" between the Different pieces.
            vec![
               DiffHunk::Matching(b"/*\n * GIT - The information manager from hell\n *\n * Copyright (C) Linus Torvalds, 2005\n */\n#include \"#cache.h\"\n\n"),
               DiffHunk::Different(vec![b"", b"static void create_directories(const char *path)\n{\n\tint len = strlen(path);\n\tchar *buf = malloc(len + 1);\n\tconst char *slash = path;\n\n\twhile ((slash = strchr(slash+1, \'/\')) != NULL) {\n\t\tlen = slash - path;\n\t\tmemcpy(buf, path, len);\n\t\tbuf[len] = 0;\n\t\tmkdir(buf, 0700);\n\t}\n}\n\nstatic int create_file(const char *path)\n{\n\tint fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);\n\tif (fd < 0) {\n\t\tif (errno == ENOENT) {\n\t\t\tcreate_directories(path);\n\t\t\tfd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);\n\t\t}\n\t}\n\treturn fd;\n}\n\n"]),
               DiffHunk::Matching(b"static int unpack(unsigned char *sha1)\n{\n\tvoid *buffer;\n\tunsigned long size;\n\tchar type[20];\n\n\tbuffer = read_sha1_file(sha1, type, &size);\n\tif (!buffer)\n\t\tusage(\"unable to read sha1 file\");\n\tif (strcmp(type, \"tree\"))\n\t\tusage(\"expected a \'tree\' node\");\n\twhile (size) {\n\t\tint len = strlen(buffer)+1;\n\t\tunsigned char *sha1 = buffer + len;\n\t\tchar *path = strchr(buffer, \' \')+1;\n"),
               DiffHunk::Different(vec![b"", b"\t\tchar *data;\n\t\tunsigned long filesize;\n"]),
               DiffHunk::Matching(b"\t\tunsigned int mode;\n"),
               DiffHunk::Different(vec![b"", b"\t\tint fd;\n\n"]),
               DiffHunk::Matching(b"\t\tif (size < len + 20 || sscanf(buffer, \"%o\", &mode) != 1)\n\t\t\tusage(\"corrupt \'tree\' file\");\n\t\tbuffer = sha1 + 20;\n\t\tsize -= len + 20;\n\t\t"),
               DiffHunk::Different(vec![b"printf", b"data = read_sha1_file"]),
               DiffHunk::Matching(b"("),
               DiffHunk::Different(vec![b"\"%o %s (%s)\\n\", mode, path, sha1_to_hex(", b""]),
               DiffHunk::Matching(b"sha1"),
               DiffHunk::Different(vec![b"", b", type, &filesize"]),
               DiffHunk::Matching(b")"),
               DiffHunk::Different(vec![b")", b""]),
               DiffHunk::Matching(b";\n"),
               DiffHunk::Different(vec![b"", b"\t\tif (!data || strcmp(type, \"blob\"))\n\t\t\tusage(\"tree file refers to bad file data\");\n\t\tfd = create_file(path);\n\t\tif (fd < 0)\n\t\t\tusage(\"unable to create file\");\n\t\tif (write(fd, data, filesize) != filesize)\n\t\t\tusage(\"unable to write file\");\n\t\tfchmod(fd, mode);\n\t\tclose(fd);\n\t\tfree(data);\n"]),
               DiffHunk::Matching(b"\t}\n\treturn 0;\n}\n\nint main(int argc, char **argv)\n{\n\tint fd;\n\tunsigned char sha1[20];\n\n\tif (argc != 2)\n\t\tusage(\"read-tree <key>\");\n\tif (get_sha1_hex(argv[1], sha1) < 0)\n\t\tusage(\"read-tree <key>\");\n\tsha1_file_directory = getenv(DB_ENVIRONMENT);\n\tif (!sha1_file_directory)\n\t\tsha1_file_directory = DEFAULT_DB_ENVIRONMENT;\n\tif (unpack(sha1) < 0)\n\t\tusage(\"unpack failed\");\n\treturn 0;\n}\n")
            ]
        );
    }
1171}