jj_lib/
diff.rs

1// Copyright 2021 The Jujutsu Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#![expect(missing_docs)]
16
17use std::collections::BTreeMap;
18use std::hash::BuildHasher;
19use std::hash::Hash;
20use std::hash::Hasher;
21use std::hash::RandomState;
22use std::iter;
23use std::ops::Range;
24use std::slice;
25
26use bstr::BStr;
27use hashbrown::HashTable;
28use itertools::Itertools as _;
29use smallvec::SmallVec;
30use smallvec::smallvec;
31
/// Splits `text` into the byte ranges of its lines.
///
/// Each range includes the trailing `'\n'` if present; the final line may be
/// unterminated. Empty input yields an empty list.
pub fn find_line_ranges(text: &[u8]) -> Vec<Range<usize>> {
    let mut ranges = Vec::new();
    let mut start = 0;
    for (i, &b) in text.iter().enumerate() {
        if b == b'\n' {
            ranges.push(start..i + 1);
            start = i + 1;
        }
    }
    // Trailing line without a final newline.
    if start < text.len() {
        ranges.push(start..text.len());
    }
    ranges
}
41
/// Returns true if `b` is part of a "word" token: ASCII alphanumerics, `'_'`,
/// and all non-ASCII bytes.
fn is_word_byte(b: u8) -> bool {
    // TODO: Make this configurable (probably higher up in the call stack)
    //
    // Bytes 0x80..=0xff count as word bytes so that multi-byte UTF-8
    // characters are treated as a single unit.
    b.is_ascii_alphanumeric() || b == b'_' || !b.is_ascii()
}
51
52pub fn find_word_ranges(text: &[u8]) -> Vec<Range<usize>> {
53    let mut word_ranges = vec![];
54    let mut word_start_pos = 0;
55    let mut in_word = false;
56    for (i, b) in text.iter().enumerate() {
57        if in_word && !is_word_byte(*b) {
58            in_word = false;
59            word_ranges.push(word_start_pos..i);
60            word_start_pos = i;
61        } else if !in_word && is_word_byte(*b) {
62            in_word = true;
63            word_start_pos = i;
64        }
65    }
66    if in_word && word_start_pos < text.len() {
67        word_ranges.push(word_start_pos..text.len());
68    }
69    word_ranges
70}
71
72pub fn find_nonword_ranges(text: &[u8]) -> Vec<Range<usize>> {
73    text.iter()
74        .positions(|b| !is_word_byte(*b))
75        .map(|i| i..i + 1)
76        .collect()
77}
78
/// Iterates the bytes of `text` with every ASCII whitespace byte removed.
fn bytes_ignore_all_whitespace(text: &[u8]) -> impl Iterator<Item = u8> + use<'_> {
    text.iter().filter(|b| !b.is_ascii_whitespace()).copied()
}
82
/// Iterates the bytes of `text` with each run of ASCII whitespace collapsed
/// to a single `b' '`; non-whitespace bytes pass through unchanged.
fn bytes_ignore_whitespace_amount(text: &[u8]) -> impl Iterator<Item = u8> + use<'_> {
    // True while we are inside a whitespace run whose first byte was already
    // emitted as a single space.
    let mut in_run = false;
    text.iter().filter_map(move |&b| {
        if !b.is_ascii_whitespace() {
            in_run = false;
            Some(b)
        } else if in_run {
            // Subsequent whitespace in the same run is dropped.
            None
        } else {
            // First whitespace byte of a run is normalized to a space.
            in_run = true;
            Some(b' ')
        }
    })
}
96
/// Hashes every item of `data` into `state`, then the item count.
///
/// The trailing length makes sequences with different item counts hash
/// differently even when their concatenated item hashes would collide
/// (e.g. prefix sequences).
fn hash_with_length_suffix<I, H>(data: I, state: &mut H)
where
    I: IntoIterator,
    I::Item: Hash,
    H: Hasher,
{
    let mut count: usize = 0;
    for item in data {
        item.hash(state);
        count += 1;
    }
    state.write_usize(count);
}
110
/// Compares byte sequences based on a certain equivalence property.
///
/// This isn't a newtype `Wrapper<'a>(&'a [u8])` but an external comparison
/// object for the following reasons:
///
/// a. If it were newtype, a generic `wrap` function would be needed. It
///    couldn't be expressed as a simple closure:
///    `for<'a> Fn(&'a [u8]) -> ???<'a>`
/// b. Dynamic comparison object can be implemented intuitively. For example,
///    `pattern: &Regex` would have to be copied to all newtype instances if it
///    were newtype.
/// c. Hash values can be cached if hashing is controlled externally.
pub trait CompareBytes {
    /// Returns true if `left` and `right` are equivalent.
    fn eq(&self, left: &[u8], right: &[u8]) -> bool;

    /// Generates hash which respects the following property:
    /// `eq(left, right) => hash(left) == hash(right)`
    ///
    /// Implementations must hash through the same normalization they use for
    /// `eq()`, otherwise equivalent texts could land in different buckets.
    fn hash<H: Hasher>(&self, text: &[u8], state: &mut H);
}
131
// An instance might have e.g. Regex pattern, which can't be trivially copied.
// Such comparison object can be passed by reference.
impl<C: CompareBytes + ?Sized> CompareBytes for &C {
    fn eq(&self, left: &[u8], right: &[u8]) -> bool {
        // Explicit UFCS call delegates to the referenced comparator without
        // recursing into this blanket impl.
        <C as CompareBytes>::eq(self, left, right)
    }

    fn hash<H: Hasher>(&self, text: &[u8], state: &mut H) {
        <C as CompareBytes>::hash(self, text, state);
    }
}
143
/// Compares byte sequences literally.
#[derive(Clone, Debug, Default)]
pub struct CompareBytesExactly;

impl CompareBytes for CompareBytesExactly {
    fn eq(&self, left: &[u8], right: &[u8]) -> bool {
        // Byte-wise slice equality.
        left == right
    }

    fn hash<H: Hasher>(&self, text: &[u8], state: &mut H) {
        // The std slice Hash impl agrees with byte-wise equality.
        text.hash(state);
    }
}
157
/// Compares byte sequences ignoring any whitespace occurrences.
///
/// Under this comparison, e.g. `"a b"` and `"ab"` are equivalent.
#[derive(Clone, Debug, Default)]
pub struct CompareBytesIgnoreAllWhitespace;

impl CompareBytes for CompareBytesIgnoreAllWhitespace {
    fn eq(&self, left: &[u8], right: &[u8]) -> bool {
        bytes_ignore_all_whitespace(left).eq(bytes_ignore_all_whitespace(right))
    }

    fn hash<H: Hasher>(&self, text: &[u8], state: &mut H) {
        // Hash the whitespace-stripped bytes with a length suffix so that the
        // hash matches eq()'s notion of equivalence.
        hash_with_length_suffix(bytes_ignore_all_whitespace(text), state);
    }
}
171
/// Compares byte sequences ignoring changes in whitespace amount.
///
/// Each run of whitespace is treated as a single space, so e.g. `"a  b"` and
/// `"a\tb"` are equivalent, but `"ab"` is not.
#[derive(Clone, Debug, Default)]
pub struct CompareBytesIgnoreWhitespaceAmount;

impl CompareBytes for CompareBytesIgnoreWhitespaceAmount {
    fn eq(&self, left: &[u8], right: &[u8]) -> bool {
        bytes_ignore_whitespace_amount(left).eq(bytes_ignore_whitespace_amount(right))
    }

    fn hash<H: Hasher>(&self, text: &[u8], state: &mut H) {
        // Hash the whitespace-collapsed bytes with a length suffix so that
        // the hash matches eq()'s notion of equivalence.
        hash_with_length_suffix(bytes_ignore_whitespace_amount(text), state);
    }
}
185
/// A word (token) paired with its precomputed hash.
// Not implementing Eq because the text should be compared by WordComparator.
#[derive(Clone, Copy, Debug)]
struct HashedWord<'input> {
    /// Hash of `text`, computed by `WordComparator::hash_one()`.
    hash: u64,
    /// The word's bytes, borrowed from the input text.
    text: &'input BStr,
}
192
/// Compares words (or tokens) under a certain hasher configuration.
#[derive(Clone, Debug, Default)]
struct WordComparator<C, S> {
    /// Equivalence rules for word bytes (e.g. exact, ignore-whitespace).
    compare: C,
    /// Hasher factory; all hashes produced by one comparator are consistent.
    hash_builder: S,
}
199
impl<C: CompareBytes> WordComparator<C, RandomState> {
    /// Creates a comparator using the given byte-equivalence rules and a
    /// randomly-seeded std hasher.
    fn new(compare: C) -> Self {
        Self {
            compare,
            // TODO: switch to ahash for better performance?
            hash_builder: RandomState::new(),
        }
    }
}
209
impl<C: CompareBytes, S: BuildHasher> WordComparator<C, S> {
    /// Returns true if `left` and `right` are equivalent words.
    fn eq(&self, left: &[u8], right: &[u8]) -> bool {
        self.compare.eq(left, right)
    }

    /// Like `eq()`, but short-circuits on the precomputed hashes first.
    fn eq_hashed(&self, left: HashedWord<'_>, right: HashedWord<'_>) -> bool {
        left.hash == right.hash && self.compare.eq(left.text, right.text)
    }

    /// Hashes `text` under the comparator's equivalence rules.
    fn hash_one(&self, text: &[u8]) -> u64 {
        let mut state = self.hash_builder.build_hasher();
        self.compare.hash(text, &mut state);
        state.finish()
    }
}
225
/// Index in a list of word (or token) ranges in `DiffSource`.
#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
struct WordPosition(usize);

/// Index in a list of word (or token) ranges in `LocalDiffSource`.
///
/// Relative to the narrowed view; convert with
/// `LocalDiffSource::map_to_global()`.
#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
struct LocalWordPosition(usize);
233
/// One input text together with its tokenization and per-token hashes.
#[derive(Clone, Debug)]
struct DiffSource<'input, 'aux> {
    /// The full input text.
    text: &'input BStr,
    /// Byte ranges of the tokens within `text`.
    ranges: &'aux [Range<usize>],
    /// Hash of each token, parallel to `ranges`.
    hashes: Vec<u64>,
}
240
impl<'input, 'aux> DiffSource<'input, 'aux> {
    /// Wraps `text` and its token `ranges`, precomputing each token's hash
    /// with `comp`.
    fn new<T: AsRef<[u8]> + ?Sized, C: CompareBytes, S: BuildHasher>(
        text: &'input T,
        ranges: &'aux [Range<usize>],
        comp: &WordComparator<C, S>,
    ) -> Self {
        let text = BStr::new(text);
        let hashes = ranges
            .iter()
            .map(|range| comp.hash_one(&text[range.clone()]))
            .collect();
        Self {
            text,
            ranges,
            hashes,
        }
    }

    /// Returns a view over all tokens, with local positions equal to global
    /// ones (zero offset).
    fn local(&self) -> LocalDiffSource<'input, '_> {
        LocalDiffSource {
            text: self.text,
            ranges: self.ranges,
            hashes: &self.hashes,
            global_offset: WordPosition(0),
        }
    }

    /// Byte range of the token at the given global position.
    fn range_at(&self, position: WordPosition) -> Range<usize> {
        self.ranges[position.0].clone()
    }
}
272
/// A borrowed, possibly narrowed view into a `DiffSource`.
///
/// Used while recursing into sub-ranges between matched tokens; positions are
/// local to the view and translated back via `global_offset`.
#[derive(Clone, Debug)]
struct LocalDiffSource<'input, 'aux> {
    /// The full input text (not narrowed; `ranges` index into it).
    text: &'input BStr,
    /// Token byte ranges covered by this view.
    ranges: &'aux [Range<usize>],
    /// Token hashes, parallel to `ranges`.
    hashes: &'aux [u64],
    /// The number of preceding word ranges excluded from the self `ranges`.
    global_offset: WordPosition,
}
281
impl<'input> LocalDiffSource<'input, '_> {
    /// Returns a sub-view restricted to the given local position range,
    /// adjusting the global offset accordingly.
    fn narrowed(&self, positions: Range<LocalWordPosition>) -> Self {
        Self {
            text: self.text,
            ranges: &self.ranges[positions.start.0..positions.end.0],
            hashes: &self.hashes[positions.start.0..positions.end.0],
            global_offset: self.map_to_global(positions.start),
        }
    }

    /// Translates a view-local position to a position in the full source.
    fn map_to_global(&self, position: LocalWordPosition) -> WordPosition {
        WordPosition(self.global_offset.0 + position.0)
    }

    /// Iterates the view's tokens together with their precomputed hashes.
    fn hashed_words(
        &self,
    ) -> impl DoubleEndedIterator<Item = HashedWord<'input>> + ExactSizeIterator + use<'_, 'input>
    {
        iter::zip(self.ranges, self.hashes).map(|(range, &hash)| {
            let text = &self.text[range.clone()];
            HashedWord { hash, text }
        })
    }
}
306
/// Occurrence table mapping each distinct word to the positions where it
/// appears in one `LocalDiffSource`.
struct Histogram<'input> {
    word_to_positions: HashTable<HistogramEntry<'input>>,
}

// Many of the words are unique. We can inline up to 2 word positions (16 bytes
// on 64-bit platform) in SmallVec for free.
type HistogramEntry<'input> = (HashedWord<'input>, SmallVec<[LocalWordPosition; 2]>);
314
impl<'input> Histogram<'input> {
    /// Builds the word -> positions table for `source`.
    ///
    /// Records at most `max_occurrences + 1` positions per word; the extra
    /// one lets callers detect words that exceed the cap.
    fn calculate<C: CompareBytes, S: BuildHasher>(
        source: &LocalDiffSource<'input, '_>,
        comp: &WordComparator<C, S>,
        max_occurrences: usize,
    ) -> Self {
        let mut word_to_positions: HashTable<HistogramEntry> = HashTable::new();
        for (i, word) in source.hashed_words().enumerate() {
            let pos = LocalWordPosition(i);
            word_to_positions
                .entry(
                    word.hash,
                    |(w, _)| comp.eq(w.text, word.text),
                    |(w, _)| w.hash,
                )
                .and_modify(|(_, positions)| {
                    // Allow one more than max_occurrences, so we can later skip
                    // those with more than max_occurrences
                    if positions.len() <= max_occurrences {
                        positions.push(pos);
                    }
                })
                .or_insert_with(|| (word, smallvec![pos]));
        }
        Self { word_to_positions }
    }

    /// Groups entries by occurrence count, ascending (BTreeMap keeps the
    /// keys sorted), so rare words can be visited first.
    fn build_count_to_entries(&self) -> BTreeMap<usize, Vec<&HistogramEntry<'input>>> {
        let mut count_to_entries: BTreeMap<usize, Vec<_>> = BTreeMap::new();
        for entry in &self.word_to_positions {
            let (_, positions) = entry;
            let entries = count_to_entries.entry(positions.len()).or_default();
            entries.push(entry);
        }
        count_to_entries
    }

    /// Looks up the positions of `word`, or `None` if it does not occur.
    fn positions_by_word<C: CompareBytes, S: BuildHasher>(
        &self,
        word: HashedWord<'input>,
        comp: &WordComparator<C, S>,
    ) -> Option<&[LocalWordPosition]> {
        let (_, positions) = self
            .word_to_positions
            .find(word.hash, |(w, _)| comp.eq(w.text, word.text))?;
        Some(positions)
    }
}
363
/// Finds the LCS given an array where the value of `input[i]` indicates that
/// the position of element `i` in the right array is at position `input[i]` in
/// the left array.
///
/// Returns `(left, right)` position pairs. For example (some inputs have
/// multiple valid outputs):
///
/// [0,1,2] => [(0,0),(1,1),(2,2)]
/// [2,1,0] => [(0,2)]
/// [0,1,4,2,3,5,6] => [(0,0),(1,1),(2,3),(3,4),(5,5),(6,6)]
/// [0,1,4,3,2,5,6] => [(0,0),(1,1),(4,2),(5,5),(6,6)]
fn find_lcs(input: &[usize]) -> Vec<(usize, usize)> {
    if input.is_empty() {
        return vec![];
    }

    // chain[r] = (length of the longest chain ending at right index r,
    //             left position paired with r,
    //             predecessor right index, or usize::MAX for none)
    let mut chain = vec![(0, 0, 0); input.len()];
    let mut best_len = 0;
    let mut best_right = 0;
    for (right, &left) in input.iter().enumerate() {
        let mut len_here = 1;
        let mut pred_right = usize::MAX;
        for candidate in (0..right).rev() {
            let (cand_len, cand_left, _) = chain[candidate];
            if cand_left < left {
                let extended = cand_len + 1;
                if extended > len_here {
                    len_here = extended;
                    pred_right = candidate;
                    if extended > best_len {
                        best_len = extended;
                        best_right = right;
                        // If this is the longest chain globally so far, we cannot find a
                        // longer one by using a previous value, so break early.
                        break;
                    }
                }
            }
        }
        chain[right] = (len_here, left, pred_right);
    }

    // Walk the predecessor links back from the global best, then reverse to
    // get the pairs in left-to-right order.
    let mut result = vec![];
    let mut right = best_right;
    loop {
        let (_, left, pred_right) = chain[right];
        result.push((left, right));
        if pred_right == usize::MAX {
            break;
        }
        right = pred_right;
    }
    result.reverse();

    result
}
419
/// Finds unchanged word (or token) positions among the ones given as
/// arguments. The data between those words is ignored.
///
/// Appends `(left, right)` global position pairs of matching words to
/// `found_positions`, in order.
fn collect_unchanged_words<C: CompareBytes, S: BuildHasher>(
    found_positions: &mut Vec<(WordPosition, WordPosition)>,
    left: &LocalDiffSource,
    right: &LocalDiffSource,
    comp: &WordComparator<C, S>,
) {
    if left.ranges.is_empty() || right.ranges.is_empty() {
        return;
    }

    // Prefer the LCS-based algorithm over plain leading/trailing matching.
    // Fall through only if it found nothing.
    let old_len = found_positions.len();
    collect_unchanged_words_lcs(found_positions, left, right, comp);
    if found_positions.len() != old_len {
        return;
    }

    // Trim leading common ranges (i.e. grow previous unchanged region)
    let common_leading_len = iter::zip(left.hashed_words(), right.hashed_words())
        .take_while(|&(l, r)| comp.eq_hashed(l, r))
        .count();
    let left_hashed_words = left.hashed_words().skip(common_leading_len);
    let right_hashed_words = right.hashed_words().skip(common_leading_len);

    // Trim trailing common ranges (i.e. grow next unchanged region)
    let common_trailing_len = iter::zip(left_hashed_words.rev(), right_hashed_words.rev())
        .take_while(|&(l, r)| comp.eq_hashed(l, r))
        .count();

    // Emit the leading matches front-to-back, then the trailing matches
    // (also in ascending position order) so the output stays sorted.
    found_positions.extend(itertools::chain(
        (0..common_leading_len).map(|i| {
            (
                left.map_to_global(LocalWordPosition(i)),
                right.map_to_global(LocalWordPosition(i)),
            )
        }),
        (1..=common_trailing_len).rev().map(|i| {
            (
                left.map_to_global(LocalWordPosition(left.ranges.len() - i)),
                right.map_to_global(LocalWordPosition(right.ranges.len() - i)),
            )
        }),
    ));
}
466
/// Histogram-diff step: matches words that occur equally often (and rarely)
/// on both sides, takes the LCS of those matches, and recurses into the gaps.
///
/// Appends `(left, right)` global position pairs to `found_positions`.
fn collect_unchanged_words_lcs<C: CompareBytes, S: BuildHasher>(
    found_positions: &mut Vec<(WordPosition, WordPosition)>,
    left: &LocalDiffSource,
    right: &LocalDiffSource,
    comp: &WordComparator<C, S>,
) {
    let max_occurrences = 100;
    let left_histogram = Histogram::calculate(left, comp, max_occurrences);
    let left_count_to_entries = left_histogram.build_count_to_entries();
    // The BTreeMap's first key is the smallest occurrence count.
    if *left_count_to_entries.keys().next().unwrap() > max_occurrences {
        // If there are very many occurrences of all words, then we just give up.
        return;
    }
    let right_histogram = Histogram::calculate(right, comp, max_occurrences);
    // Look for words with few occurrences in `left` (could equally well have picked
    // `right`?). If any of them also occur in `right`, then we add the words to
    // the LCS.
    let Some(uncommon_shared_word_positions) =
        left_count_to_entries.values().find_map(|left_entries| {
            let mut both_positions = left_entries
                .iter()
                .filter_map(|&(word, left_positions)| {
                    // Only pair up words occurring the same number of times
                    // on both sides, so occurrences map 1:1 by index.
                    let right_positions = right_histogram.positions_by_word(*word, comp)?;
                    (left_positions.len() == right_positions.len())
                        .then_some((left_positions, right_positions))
                })
                .peekable();
            both_positions.peek().is_some().then_some(both_positions)
        })
    else {
        return;
    };

    // [(index into ranges, serial to identify {word, occurrence #})]
    let (mut left_positions, mut right_positions): (Vec<_>, Vec<_>) =
        uncommon_shared_word_positions
            .flat_map(|(lefts, rights)| iter::zip(lefts, rights))
            .enumerate()
            .map(|(serial, (&left_pos, &right_pos))| ((left_pos, serial), (right_pos, serial)))
            .unzip();
    left_positions.sort_unstable_by_key(|&(pos, _serial)| pos);
    right_positions.sort_unstable_by_key(|&(pos, _serial)| pos);
    // For each entry of right_positions (in textual order), the index into
    // left_positions of the matching occurrence — the input find_lcs() wants.
    let left_index_by_right_index: Vec<usize> = {
        let mut left_index_map = vec![0; left_positions.len()];
        for (i, &(_pos, serial)) in left_positions.iter().enumerate() {
            left_index_map[serial] = i;
        }
        right_positions
            .iter()
            .map(|&(_pos, serial)| left_index_map[serial])
            .collect()
    };

    let lcs = find_lcs(&left_index_by_right_index);

    // Produce output word positions, recursing into the modified areas between
    // the elements in the LCS.
    let mut previous_left_position = LocalWordPosition(0);
    let mut previous_right_position = LocalWordPosition(0);
    for (left_index, right_index) in lcs {
        let (left_position, _) = left_positions[left_index];
        let (right_position, _) = right_positions[right_index];
        collect_unchanged_words(
            found_positions,
            &left.narrowed(previous_left_position..left_position),
            &right.narrowed(previous_right_position..right_position),
            comp,
        );
        found_positions.push((
            left.map_to_global(left_position),
            right.map_to_global(right_position),
        ));
        previous_left_position = LocalWordPosition(left_position.0 + 1);
        previous_right_position = LocalWordPosition(right_position.0 + 1);
    }
    // Also recurse into range at end (after common ranges).
    collect_unchanged_words(
        found_positions,
        &left.narrowed(previous_left_position..LocalWordPosition(left.ranges.len())),
        &right.narrowed(previous_right_position..LocalWordPosition(right.ranges.len())),
        comp,
    );
}
550
/// Intersects two sorted sequences of `(base, other)` word positions by
/// `base`. `base` positions should refer to the same source text.
///
/// Keeps only base positions present in both inputs, appending the new
/// other-side position to the kept entry's list.
fn intersect_unchanged_words(
    current_positions: Vec<(WordPosition, Vec<WordPosition>)>,
    new_positions: &[(WordPosition, WordPosition)],
) -> Vec<(WordPosition, Vec<WordPosition>)> {
    itertools::merge_join_by(
        current_positions,
        new_positions,
        |(cur_base_pos, _), (new_base_pos, _)| cur_base_pos.cmp(new_base_pos),
    )
    // `both()` drops entries that appear on only one side of the merge.
    .filter_map(|entry| entry.both())
    .map(|((base_pos, mut other_positions), &(_, new_other_pos))| {
        other_positions.push(new_other_pos);
        (base_pos, other_positions)
    })
    .collect()
}
569
/// Byte ranges of one unchanged region across all inputs: the base input plus
/// each "other" input.
#[derive(Clone, PartialEq, Eq, Debug)]
struct UnchangedRange {
    // Inline up to two sides (base + one other)
    base: Range<usize>,
    others: SmallVec<[Range<usize>; 1]>,
}
576
impl UnchangedRange {
    /// Translates word positions to byte ranges in the source texts.
    fn from_word_positions(
        base_source: &DiffSource,
        other_sources: &[DiffSource],
        base_position: WordPosition,
        other_positions: &[WordPosition],
    ) -> Self {
        assert_eq!(other_sources.len(), other_positions.len());
        let base = base_source.range_at(base_position);
        let others = iter::zip(other_sources, other_positions)
            .map(|(source, pos)| source.range_at(*pos))
            .collect();
        Self { base, others }
    }

    /// Returns true if the region is empty in the base and all other inputs.
    fn is_all_empty(&self) -> bool {
        self.base.is_empty() && self.others.iter().all(|r| r.is_empty())
    }
}
597
/// Takes any number of inputs and finds regions that are the same between all
/// of them.
#[derive(Clone, Debug)]
pub struct ContentDiff<'input> {
    /// The first input; all others are diffed against it.
    base_input: &'input BStr,
    other_inputs: SmallVec<[&'input BStr; 1]>,
    /// Sorted list of ranges of unchanged regions in bytes.
    ///
    /// The list should never be empty. The first and the last region may be
    /// empty if inputs start/end with changes.
    unchanged_regions: Vec<UnchangedRange>,
}
610
611impl<'input> ContentDiff<'input> {
    /// Diffs `inputs` using the token ranges produced by `tokenizer`,
    /// comparing tokens with `compare`.
    ///
    /// The first input is the base; the rest are diffed against it.
    /// Panics if `inputs` is empty.
    pub fn for_tokenizer<T: AsRef<[u8]> + ?Sized + 'input>(
        inputs: impl IntoIterator<Item = &'input T>,
        tokenizer: impl Fn(&[u8]) -> Vec<Range<usize>>,
        compare: impl CompareBytes,
    ) -> Self {
        let mut inputs = inputs.into_iter().map(BStr::new);
        let base_input = inputs.next().expect("inputs must not be empty");
        let other_inputs: SmallVec<[&BStr; 1]> = inputs.collect();
        // First tokenize each input
        let base_token_ranges: Vec<Range<usize>>;
        let other_token_ranges: Vec<Vec<Range<usize>>>;
        // No need to tokenize if one of the inputs is empty. Non-empty inputs
        // are all different as long as the tokenizer emits non-empty ranges.
        // This means "" and " " are different even if the compare function is
        // ignore-whitespace. They are tokenized as [] and [" "] respectively.
        if base_input.is_empty() || other_inputs.iter().any(|input| input.is_empty()) {
            base_token_ranges = vec![];
            other_token_ranges = std::iter::repeat_n(vec![], other_inputs.len()).collect();
        } else {
            base_token_ranges = tokenizer(base_input);
            other_token_ranges = other_inputs
                .iter()
                .map(|other_input| tokenizer(other_input))
                .collect();
        }
        Self::with_inputs_and_token_ranges(
            base_input,
            other_inputs,
            &base_token_ranges,
            &other_token_ranges,
            compare,
        )
    }
645
    /// Builds the diff from already-tokenized inputs.
    ///
    /// Computes unchanged regions of each other input against the base, then
    /// intersects them so the result is unchanged across all inputs.
    fn with_inputs_and_token_ranges(
        base_input: &'input BStr,
        other_inputs: SmallVec<[&'input BStr; 1]>,
        base_token_ranges: &[Range<usize>],
        other_token_ranges: &[Vec<Range<usize>>],
        compare: impl CompareBytes,
    ) -> Self {
        assert_eq!(other_inputs.len(), other_token_ranges.len());
        let comp = WordComparator::new(compare);
        let base_source = DiffSource::new(base_input, base_token_ranges, &comp);
        let other_sources = iter::zip(&other_inputs, other_token_ranges)
            .map(|(input, token_ranges)| DiffSource::new(input, token_ranges, &comp))
            .collect_vec();
        let unchanged_regions = match &*other_sources {
            // Consider the whole range of the base input as unchanged compared
            // to itself.
            [] => {
                let whole_range = UnchangedRange {
                    base: 0..base_source.text.len(),
                    others: smallvec![],
                };
                vec![whole_range]
            }
            // Diff each other input against the base. Intersect the previously
            // found ranges with the ranges in the diff.
            [first_other_source, tail_other_sources @ ..] => {
                let mut unchanged_regions = Vec::new();
                // Add an empty range at the start to make life easier for hunks().
                unchanged_regions.push(UnchangedRange {
                    base: 0..0,
                    others: smallvec![0..0; other_inputs.len()],
                });
                let mut first_positions = Vec::new();
                collect_unchanged_words(
                    &mut first_positions,
                    &base_source.local(),
                    &first_other_source.local(),
                    &comp,
                );
                if tail_other_sources.is_empty() {
                    // Two-input case: the positions map directly to regions.
                    unchanged_regions.extend(first_positions.iter().map(
                        |&(base_pos, other_pos)| {
                            UnchangedRange::from_word_positions(
                                &base_source,
                                &other_sources,
                                base_pos,
                                &[other_pos],
                            )
                        },
                    ));
                } else {
                    // Three or more inputs: fold each remaining source into
                    // the intersection keyed by base position.
                    let first_positions = first_positions
                        .iter()
                        .map(|&(base_pos, other_pos)| (base_pos, vec![other_pos]))
                        .collect();
                    let intersected_positions = tail_other_sources.iter().fold(
                        first_positions,
                        |current_positions, other_source| {
                            let mut new_positions = Vec::new();
                            collect_unchanged_words(
                                &mut new_positions,
                                &base_source.local(),
                                &other_source.local(),
                                &comp,
                            );
                            intersect_unchanged_words(current_positions, &new_positions)
                        },
                    );
                    unchanged_regions.extend(intersected_positions.iter().map(
                        |(base_pos, other_positions)| {
                            UnchangedRange::from_word_positions(
                                &base_source,
                                &other_sources,
                                *base_pos,
                                other_positions,
                            )
                        },
                    ));
                };
                // Add an empty range at the end to make life easier for hunks().
                unchanged_regions.push(UnchangedRange {
                    base: base_input.len()..base_input.len(),
                    others: other_inputs
                        .iter()
                        .map(|input| input.len()..input.len())
                        .collect(),
                });
                unchanged_regions
            }
        };

        let mut diff = Self {
            base_input,
            other_inputs,
            unchanged_regions,
        };
        diff.compact_unchanged_regions();
        diff
    }
745
    /// Compares `inputs` as whole blobs, without tokenizing.
    ///
    /// The result only distinguishes "all identical" from "different"; use
    /// `refine_changed_regions()` or the `by_*` constructors for detail.
    pub fn unrefined<T: AsRef<[u8]> + ?Sized + 'input>(
        inputs: impl IntoIterator<Item = &'input T>,
    ) -> Self {
        ContentDiff::for_tokenizer(inputs, |_| vec![], CompareBytesExactly)
    }
751
    /// Compares `inputs` line by line.
    pub fn by_line<T: AsRef<[u8]> + ?Sized + 'input>(
        inputs: impl IntoIterator<Item = &'input T>,
    ) -> Self {
        ContentDiff::for_tokenizer(inputs, find_line_ranges, CompareBytesExactly)
    }
758
    /// Compares `inputs` word by word.
    ///
    /// The `inputs` is usually a changed hunk (e.g. a `DiffHunk::Different`)
    /// that was the output from a line-by-line diff.
    pub fn by_word<T: AsRef<[u8]> + ?Sized + 'input>(
        inputs: impl IntoIterator<Item = &'input T>,
    ) -> Self {
        let mut diff = ContentDiff::for_tokenizer(inputs, find_word_ranges, CompareBytesExactly);
        // Also match individual non-word bytes inside the changed regions, so
        // e.g. punctuation between changed words can still line up.
        diff.refine_changed_regions(find_nonword_ranges, CompareBytesExactly);
        diff
    }
770
    /// Returns iterator over matching and different texts.
    pub fn hunks(&self) -> DiffHunkIterator<'_, 'input> {
        let ranges = self.hunk_ranges();
        DiffHunkIterator { diff: self, ranges }
    }
776
    /// Returns iterator over matching and different ranges in bytes.
    pub fn hunk_ranges(&self) -> DiffHunkRangeIterator<'_> {
        DiffHunkRangeIterator::new(self)
    }
781
    /// Returns contents at the unchanged `range`.
    ///
    /// Yields the base input's slice first, then one slice per other input.
    fn hunk_at<'a, 'b>(
        &'a self,
        range: &'b UnchangedRange,
    ) -> impl Iterator<Item = &'input BStr> + use<'a, 'b, 'input> {
        itertools::chain(
            iter::once(&self.base_input[range.base.clone()]),
            iter::zip(&self.other_inputs, &range.others).map(|(input, r)| &input[r.clone()]),
        )
    }
792
    /// Returns contents between the `previous` ends and the `current` starts.
    ///
    /// Yields the base input's slice first, then one slice per other input.
    fn hunk_between<'a, 'b>(
        &'a self,
        previous: &'b UnchangedRange,
        current: &'b UnchangedRange,
    ) -> impl Iterator<Item = &'input BStr> + use<'a, 'b, 'input> {
        itertools::chain(
            iter::once(&self.base_input[previous.base.end..current.base.start]),
            itertools::izip!(&self.other_inputs, &previous.others, &current.others)
                .map(|(input, prev, cur)| &input[prev.end..cur.start]),
        )
    }
805
    /// Uses the given tokenizer to split the changed regions into smaller
    /// regions. Then tries to find unchanged regions among them.
    pub fn refine_changed_regions(
        &mut self,
        tokenizer: impl Fn(&[u8]) -> Vec<Range<usize>>,
        compare: impl CompareBytes,
    ) {
        // Keep the leading (possibly empty) unchanged region as-is.
        let mut new_unchanged_ranges = vec![self.unchanged_regions[0].clone()];
        for window in self.unchanged_regions.windows(2) {
            let [previous, current]: &[_; 2] = window.try_into().unwrap();
            // For the changed region between the previous region and the current one,
            // create a new Diff instance. Then adjust the start positions and
            // offsets to be valid in the context of the larger Diff instance
            // (`self`).
            let refined_diff = ContentDiff::for_tokenizer(
                self.hunk_between(previous, current),
                &tokenizer,
                &compare,
            );
            for refined in &refined_diff.unchanged_regions {
                // Shift the refined ranges from hunk-relative coordinates to
                // positions within the full inputs.
                let new_base_start = refined.base.start + previous.base.end;
                let new_base_end = refined.base.end + previous.base.end;
                let new_others = iter::zip(&refined.others, &previous.others)
                    .map(|(refi, prev)| (refi.start + prev.end)..(refi.end + prev.end))
                    .collect();
                new_unchanged_ranges.push(UnchangedRange {
                    base: new_base_start..new_base_end,
                    others: new_others,
                });
            }
            new_unchanged_ranges.push(current.clone());
        }
        self.unchanged_regions = new_unchanged_ranges;
        self.compact_unchanged_regions();
    }
841
842    fn compact_unchanged_regions(&mut self) {
843        let mut compacted = vec![];
844        let mut maybe_previous: Option<UnchangedRange> = None;
845        for current in &self.unchanged_regions {
846            if let Some(previous) = maybe_previous {
847                if previous.base.end == current.base.start
848                    && iter::zip(&previous.others, &current.others)
849                        .all(|(prev, cur)| prev.end == cur.start)
850                {
851                    maybe_previous = Some(UnchangedRange {
852                        base: previous.base.start..current.base.end,
853                        others: iter::zip(&previous.others, &current.others)
854                            .map(|(prev, cur)| prev.start..cur.end)
855                            .collect(),
856                    });
857                    continue;
858                }
859                compacted.push(previous);
860            }
861            maybe_previous = Some(current.clone());
862        }
863        if let Some(previous) = maybe_previous {
864            compacted.push(previous);
865        }
866        self.unchanged_regions = compacted;
867    }
868}
869
/// Hunk texts.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct DiffHunk<'input> {
    /// Whether `contents` match across all inputs or not.
    pub kind: DiffHunkKind,
    /// One text slice per diffed input, in input order.
    pub contents: DiffHunkContentVec<'input>,
}
876
877impl<'input> DiffHunk<'input> {
878    pub fn matching<T: AsRef<[u8]> + ?Sized + 'input>(
879        contents: impl IntoIterator<Item = &'input T>,
880    ) -> Self {
881        Self {
882            kind: DiffHunkKind::Matching,
883            contents: contents.into_iter().map(BStr::new).collect(),
884        }
885    }
886
887    pub fn different<T: AsRef<[u8]> + ?Sized + 'input>(
888        contents: impl IntoIterator<Item = &'input T>,
889    ) -> Self {
890        Self {
891            kind: DiffHunkKind::Different,
892            contents: contents.into_iter().map(BStr::new).collect(),
893        }
894    }
895}
896
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum DiffHunkKind {
    /// The contents are the same in all inputs.
    Matching,
    /// The contents are not the same in all inputs.
    Different,
}

/// Backing vector for hunk contents; inlines up to two sides so the common
/// two-input diff avoids a heap allocation.
pub type DiffHunkContentVec<'input> = SmallVec<[&'input BStr; 2]>;
905
/// Iterator over matching and different texts.
#[derive(Clone, Debug)]
pub struct DiffHunkIterator<'diff, 'input> {
    // Diff whose inputs are sliced to produce the hunk contents.
    diff: &'diff ContentDiff<'input>,
    // Underlying iterator yielding the byte ranges of each hunk.
    ranges: DiffHunkRangeIterator<'diff>,
}
912
913impl<'input> Iterator for DiffHunkIterator<'_, 'input> {
914    type Item = DiffHunk<'input>;
915
916    fn next(&mut self) -> Option<Self::Item> {
917        self.ranges.next_with(
918            |previous| {
919                let contents = self.diff.hunk_at(previous).collect();
920                let kind = DiffHunkKind::Matching;
921                DiffHunk { kind, contents }
922            },
923            |previous, current| {
924                let contents: DiffHunkContentVec =
925                    self.diff.hunk_between(previous, current).collect();
926                debug_assert!(
927                    contents.iter().any(|content| !content.is_empty()),
928                    "unchanged regions should have been compacted"
929                );
930                let kind = DiffHunkKind::Different;
931                DiffHunk { kind, contents }
932            },
933        )
934    }
935}
936
/// Hunk ranges in bytes.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct DiffHunkRange {
    /// Whether the ranges cover matching or differing content.
    pub kind: DiffHunkKind,
    /// One byte range per diffed input, in input order.
    pub ranges: DiffHunkRangeVec,
}

/// Backing vector for hunk ranges; inlines up to two sides so the common
/// two-input diff avoids a heap allocation.
pub type DiffHunkRangeVec = SmallVec<[Range<usize>; 2]>;
946
/// Iterator over matching and different ranges in bytes.
#[derive(Clone, Debug)]
pub struct DiffHunkRangeIterator<'diff> {
    // The unchanged region most recently reached (emitted as a matching
    // hunk once `unchanged_emitted` is set).
    previous: &'diff UnchangedRange,
    // Whether `previous` has already been emitted (or is empty in every
    // input and should be skipped).
    unchanged_emitted: bool,
    // Remaining unchanged regions after `previous`.
    unchanged_iter: slice::Iter<'diff, UnchangedRange>,
}
954
955impl<'diff> DiffHunkRangeIterator<'diff> {
956    fn new(diff: &'diff ContentDiff) -> Self {
957        let mut unchanged_iter = diff.unchanged_regions.iter();
958        let previous = unchanged_iter.next().unwrap();
959        Self {
960            previous,
961            unchanged_emitted: previous.is_all_empty(),
962            unchanged_iter,
963        }
964    }
965
966    fn next_with<T>(
967        &mut self,
968        hunk_at: impl FnOnce(&UnchangedRange) -> T,
969        hunk_between: impl FnOnce(&UnchangedRange, &UnchangedRange) -> T,
970    ) -> Option<T> {
971        if !self.unchanged_emitted {
972            self.unchanged_emitted = true;
973            return Some(hunk_at(self.previous));
974        }
975        let current = self.unchanged_iter.next()?;
976        let hunk = hunk_between(self.previous, current);
977        self.previous = current;
978        self.unchanged_emitted = self.previous.is_all_empty();
979        Some(hunk)
980    }
981}
982
983impl Iterator for DiffHunkRangeIterator<'_> {
984    type Item = DiffHunkRange;
985
986    fn next(&mut self) -> Option<Self::Item> {
987        self.next_with(
988            |previous| {
989                let ranges = itertools::chain(iter::once(&previous.base), &previous.others)
990                    .cloned()
991                    .collect();
992                let kind = DiffHunkKind::Matching;
993                DiffHunkRange { kind, ranges }
994            },
995            |previous, current| {
996                let ranges: DiffHunkRangeVec = itertools::chain(
997                    iter::once(previous.base.end..current.base.start),
998                    iter::zip(&previous.others, &current.others)
999                        .map(|(prev, cur)| prev.end..cur.start),
1000                )
1001                .collect();
1002                debug_assert!(
1003                    ranges.iter().any(|range| !range.is_empty()),
1004                    "unchanged regions should have been compacted"
1005                );
1006                let kind = DiffHunkKind::Different;
1007                DiffHunkRange { kind, ranges }
1008            },
1009        )
1010    }
1011}
1012
1013/// Diffs slices of bytes.
1014///
1015/// The returned diff hunks may be any length (may span many lines or
1016/// may be only part of a line). This currently uses Histogram diff
1017/// (or maybe something similar; I'm not sure I understood the
1018/// algorithm correctly). It first diffs lines in the input and then
1019/// refines the changed ranges at the word level.
1020pub fn diff<'a, T: AsRef<[u8]> + ?Sized + 'a>(
1021    inputs: impl IntoIterator<Item = &'a T>,
1022) -> Vec<DiffHunk<'a>> {
1023    let mut diff = ContentDiff::for_tokenizer(inputs, find_line_ranges, CompareBytesExactly);
1024    diff.refine_changed_regions(find_word_ranges, CompareBytesExactly);
1025    diff.refine_changed_regions(find_nonword_ranges, CompareBytesExactly);
1026    diff.hunks().collect()
1027}
1028
1029#[cfg(test)]
1030mod tests {
1031    use super::*;
1032
1033    // Extracted to a function because type inference is ambiguous due to
1034    // `impl PartialEq<aho_corasick::util::search::Span> for std::ops::Range<usize>`
1035    fn no_ranges() -> Vec<Range<usize>> {
1036        vec![]
1037    }
1038
1039    #[test]
1040    fn test_find_line_ranges_empty() {
1041        assert_eq!(find_line_ranges(b""), no_ranges());
1042    }
1043
1044    #[test]
1045    fn test_find_line_ranges_blank_line() {
1046        assert_eq!(find_line_ranges(b"\n"), vec![0..1]);
1047    }
1048
1049    #[test]
1050    fn test_find_line_ranges_missing_newline_at_eof() {
1051        assert_eq!(find_line_ranges(b"foo"), vec![0..3]);
1052    }
1053
1054    #[test]
1055    fn test_find_line_ranges_multiple_lines() {
1056        assert_eq!(find_line_ranges(b"a\nbb\nccc\n"), vec![0..2, 2..5, 5..9]);
1057    }
1058
1059    #[test]
1060    fn test_find_word_ranges_empty() {
1061        assert_eq!(find_word_ranges(b""), no_ranges());
1062    }
1063
1064    #[test]
1065    fn test_find_word_ranges_single_word() {
1066        assert_eq!(find_word_ranges(b"Abc"), vec![0..3]);
1067    }
1068
1069    #[test]
1070    fn test_find_word_ranges_no_word() {
1071        assert_eq!(find_word_ranges(b"+-*/"), no_ranges());
1072    }
1073
1074    #[test]
1075    fn test_find_word_ranges_word_then_non_word() {
1076        assert_eq!(find_word_ranges(b"Abc   "), vec![0..3]);
1077    }
1078
1079    #[test]
1080    fn test_find_word_ranges_non_word_then_word() {
1081        assert_eq!(find_word_ranges(b"   Abc"), vec![3..6]);
1082    }
1083
1084    #[test]
1085    fn test_find_word_ranges_multibyte() {
1086        assert_eq!(find_word_ranges("⊢".as_bytes()), vec![0..3]);
1087    }
1088
    #[test]
    fn test_find_lcs_empty() {
        let empty: Vec<(usize, usize)> = vec![];
        assert_eq!(find_lcs(&[]), empty);
    }

    #[test]
    fn test_find_lcs_single_element() {
        assert_eq!(find_lcs(&[0]), vec![(0, 0)]);
    }

    #[test]
    fn test_find_lcs_in_order() {
        // Fully sorted input: every element is part of the LCS.
        assert_eq!(find_lcs(&[0, 1, 2]), vec![(0, 0), (1, 1), (2, 2)]);
    }

    #[test]
    fn test_find_lcs_reverse_order() {
        // Strictly decreasing input: only one element can be kept.
        assert_eq!(find_lcs(&[2, 1, 0]), vec![(2, 0)]);
    }

    #[test]
    fn test_find_lcs_two_swapped() {
        // Of the shuffled middle elements, only one survives in the LCS.
        assert_eq!(
            find_lcs(&[0, 1, 4, 3, 2, 5, 6]),
            vec![(0, 0), (1, 1), (2, 4), (5, 5), (6, 6)]
        );
    }

    #[test]
    fn test_find_lcs_element_moved_earlier() {
        assert_eq!(
            find_lcs(&[0, 1, 4, 2, 3, 5, 6]),
            vec![(0, 0), (1, 1), (2, 3), (3, 4), (5, 5), (6, 6)]
        );
    }

    #[test]
    fn test_find_lcs_element_moved_later() {
        assert_eq!(
            find_lcs(&[0, 1, 3, 4, 2, 5, 6]),
            vec![(0, 0), (1, 1), (3, 2), (4, 3), (5, 5), (6, 6)]
        );
    }

    #[test]
    fn test_find_lcs_interleaved_longest_chains() {
        // Multiple equal-length increasing chains are interleaved; one of
        // them must be picked consistently.
        assert_eq!(
            find_lcs(&[0, 4, 2, 9, 6, 5, 1, 3, 7, 8]),
            vec![(0, 0), (1, 6), (3, 7), (7, 8), (8, 9)]
        );
    }
1141
1142    #[test]
1143    fn test_find_word_ranges_many_words() {
1144        assert_eq!(
1145            find_word_ranges(b"fn find_words(text: &[u8])"),
1146            vec![0..2, 3..13, 14..18, 22..24]
1147        );
1148    }
1149
1150    #[test]
1151    fn test_compare_bytes_ignore_all_whitespace() {
1152        let comp = WordComparator::new(CompareBytesIgnoreAllWhitespace);
1153        let hash = |data: &[u8]| comp.hash_one(data);
1154
1155        assert!(comp.eq(b"", b""));
1156        assert!(comp.eq(b"", b" "));
1157        assert!(comp.eq(b"\t", b"\r"));
1158        assert_eq!(hash(b""), hash(b""));
1159        assert_eq!(hash(b""), hash(b" "));
1160        assert_eq!(hash(b""), hash(b"\t"));
1161        assert_eq!(hash(b""), hash(b"\r"));
1162
1163        assert!(comp.eq(b"ab", b" a  b\t"));
1164        assert_eq!(hash(b"ab"), hash(b" a  b\t"));
1165
1166        assert!(!comp.eq(b"a", b""));
1167        assert!(!comp.eq(b"a", b" "));
1168        assert!(!comp.eq(b"a", b"ab"));
1169        assert!(!comp.eq(b"ab", b"ba"));
1170    }
1171
1172    #[test]
1173    fn test_compare_bytes_ignore_whitespace_amount() {
1174        let comp = WordComparator::new(CompareBytesIgnoreWhitespaceAmount);
1175        let hash = |data: &[u8]| comp.hash_one(data);
1176
1177        assert!(comp.eq(b"", b""));
1178        assert!(comp.eq(b"\n", b" \n"));
1179        assert!(comp.eq(b"\t", b"\r"));
1180        assert_eq!(hash(b""), hash(b""));
1181        assert_eq!(hash(b" "), hash(b"\n"));
1182        assert_eq!(hash(b" "), hash(b" \n"));
1183        assert_eq!(hash(b" "), hash(b"\t"));
1184        assert_eq!(hash(b" "), hash(b"\r"));
1185
1186        assert!(comp.eq(b"a b c\n", b"a  b\tc\r\n"));
1187        assert_eq!(hash(b"a b c\n"), hash(b"a  b\tc\r\n"));
1188
1189        assert!(!comp.eq(b"", b" "));
1190        assert!(!comp.eq(b"a", b""));
1191        assert!(!comp.eq(b"a", b" "));
1192        assert!(!comp.eq(b"a", b"a "));
1193        assert!(!comp.eq(b"a", b" a"));
1194        assert!(!comp.eq(b"a", b"ab"));
1195        assert!(!comp.eq(b"ab", b"ba"));
1196        assert!(!comp.eq(b"ab", b"a b"));
1197    }
1198
    /// Test helper: collects the unchanged word positions of a two-sided
    /// diff and maps them back to byte ranges in the left/right texts.
    fn unchanged_ranges(
        (left_text, left_ranges): (&[u8], &[Range<usize>]),
        (right_text, right_ranges): (&[u8], &[Range<usize>]),
    ) -> Vec<(Range<usize>, Range<usize>)> {
        let comp = WordComparator::new(CompareBytesExactly);
        let left = DiffSource::new(left_text, left_ranges, &comp);
        let right = DiffSource::new(right_text, right_ranges, &comp);
        let mut positions = Vec::new();
        collect_unchanged_words(&mut positions, &left.local(), &right.local(), &comp);
        positions
            .into_iter()
            .map(|(left_pos, right_pos)| (left.range_at(left_pos), right.range_at(right_pos)))
            .collect()
    }
1213
1214    #[test]
1215    fn test_unchanged_ranges_insert_in_middle() {
1216        assert_eq!(
1217            unchanged_ranges(
1218                (b"a b b c", &[0..1, 2..3, 4..5, 6..7]),
1219                (b"a b X b c", &[0..1, 2..3, 4..5, 6..7, 8..9]),
1220            ),
1221            vec![(0..1, 0..1), (2..3, 2..3), (4..5, 6..7), (6..7, 8..9)]
1222        );
1223    }
1224
1225    #[test]
1226    fn test_unchanged_ranges_non_unique_removed() {
1227        // We used to consider the first two "a" in the first input to match the two
1228        // "a"s in the second input. We no longer do.
1229        assert_eq!(
1230            unchanged_ranges(
1231                (b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
1232                (b"a b a c", &[0..1, 2..3, 4..5, 6..7]),
1233            ),
1234            vec![(0..1, 0..1)]
1235        );
1236        assert_eq!(
1237            unchanged_ranges(
1238                (b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
1239                (b"b a c a", &[0..1, 2..3, 4..5, 6..7]),
1240            ),
1241            vec![(6..7, 6..7)]
1242        );
1243        assert_eq!(
1244            unchanged_ranges(
1245                (b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
1246                (b"b a a c", &[0..1, 2..3, 4..5, 6..7]),
1247            ),
1248            vec![]
1249        );
1250        assert_eq!(
1251            unchanged_ranges(
1252                (b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
1253                (b"a b c a", &[0..1, 2..3, 4..5, 6..7]),
1254            ),
1255            vec![(0..1, 0..1), (6..7, 6..7)]
1256        );
1257    }
1258
1259    #[test]
1260    fn test_unchanged_ranges_non_unique_added() {
1261        // We used to consider the first two "a" in the first input to match the two
1262        // "a"s in the second input. We no longer do.
1263        assert_eq!(
1264            unchanged_ranges(
1265                (b"a b a c", &[0..1, 2..3, 4..5, 6..7]),
1266                (b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
1267            ),
1268            vec![(0..1, 0..1)]
1269        );
1270        assert_eq!(
1271            unchanged_ranges(
1272                (b"b a c a", &[0..1, 2..3, 4..5, 6..7]),
1273                (b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
1274            ),
1275            vec![(6..7, 6..7)]
1276        );
1277        assert_eq!(
1278            unchanged_ranges(
1279                (b"b a a c", &[0..1, 2..3, 4..5, 6..7]),
1280                (b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
1281            ),
1282            vec![]
1283        );
1284        assert_eq!(
1285            unchanged_ranges(
1286                (b"a b c a", &[0..1, 2..3, 4..5, 6..7]),
1287                (b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
1288            ),
1289            vec![(0..1, 0..1), (6..7, 6..7)]
1290        );
1291    }
1292
1293    #[test]
1294    fn test_unchanged_ranges_recursion_needed() {
1295        // "|" matches first, then "b" matches within the left/right range.
1296        assert_eq!(
1297            unchanged_ranges(
1298                (b"a b | b", &[0..1, 2..3, 4..5, 6..7]),
1299                (b"b c d |", &[0..1, 2..3, 4..5, 6..7]),
1300            ),
1301            vec![(2..3, 0..1), (4..5, 6..7)]
1302        );
1303        assert_eq!(
1304            unchanged_ranges(
1305                (b"| b c d", &[0..1, 2..3, 4..5, 6..7]),
1306                (b"b | a b", &[0..1, 2..3, 4..5, 6..7]),
1307            ),
1308            vec![(0..1, 2..3), (2..3, 6..7)]
1309        );
1310        // "|" matches first, then the middle range is trimmed.
1311        assert_eq!(
1312            unchanged_ranges(
1313                (b"| b c |", &[0..1, 2..3, 4..5, 6..7]),
1314                (b"| b b |", &[0..1, 2..3, 4..5, 6..7]),
1315            ),
1316            vec![(0..1, 0..1), (2..3, 2..3), (6..7, 6..7)]
1317        );
1318        assert_eq!(
1319            unchanged_ranges(
1320                (b"| c c |", &[0..1, 2..3, 4..5, 6..7]),
1321                (b"| b c |", &[0..1, 2..3, 4..5, 6..7]),
1322            ),
1323            vec![(0..1, 0..1), (4..5, 4..5), (6..7, 6..7)]
1324        );
1325        // "|" matches first, then "a", then "b".
1326        assert_eq!(
1327            unchanged_ranges(
1328                (b"a b c | a", &[0..1, 2..3, 4..5, 6..7, 8..9]),
1329                (b"b a b |", &[0..1, 2..3, 4..5, 6..7]),
1330            ),
1331            vec![(0..1, 2..3), (2..3, 4..5), (6..7, 6..7)]
1332        );
1333        assert_eq!(
1334            unchanged_ranges(
1335                (b"| b a b", &[0..1, 2..3, 4..5, 6..7]),
1336                (b"a | a b c", &[0..1, 2..3, 4..5, 6..7, 8..9]),
1337            ),
1338            vec![(0..1, 2..3), (4..5, 4..5), (6..7, 6..7)]
1339        );
1340    }
1341
1342    #[test]
1343    fn test_diff_single_input() {
1344        assert_eq!(diff(["abc"]), vec![DiffHunk::matching(["abc"])]);
1345    }
1346
1347    #[test]
1348    fn test_diff_some_empty_inputs() {
1349        // All empty
1350        assert_eq!(diff([""]), vec![]);
1351        assert_eq!(diff(["", ""]), vec![]);
1352        assert_eq!(diff(["", "", ""]), vec![]);
1353
1354        // One empty
1355        assert_eq!(diff(["a b", ""]), vec![DiffHunk::different(["a b", ""])]);
1356        assert_eq!(diff(["", "a b"]), vec![DiffHunk::different(["", "a b"])]);
1357
1358        // One empty, two match
1359        assert_eq!(
1360            diff(["a b", "", "a b"]),
1361            vec![DiffHunk::different(["a b", "", "a b"])]
1362        );
1363        assert_eq!(
1364            diff(["", "a b", "a b"]),
1365            vec![DiffHunk::different(["", "a b", "a b"])]
1366        );
1367
1368        // Two empty, one differs
1369        assert_eq!(
1370            diff(["a b", "", ""]),
1371            vec![DiffHunk::different(["a b", "", ""])]
1372        );
1373        assert_eq!(
1374            diff(["", "a b", ""]),
1375            vec![DiffHunk::different(["", "a b", ""])]
1376        );
1377    }
1378
1379    #[test]
1380    fn test_diff_two_inputs_one_different() {
1381        assert_eq!(
1382            diff(["a b c", "a X c"]),
1383            vec![
1384                DiffHunk::matching(["a "].repeat(2)),
1385                DiffHunk::different(["b", "X"]),
1386                DiffHunk::matching([" c"].repeat(2)),
1387            ]
1388        );
1389    }
1390
1391    #[test]
1392    fn test_diff_multiple_inputs_one_different() {
1393        assert_eq!(
1394            diff(["a b c", "a X c", "a b c"]),
1395            vec![
1396                DiffHunk::matching(["a "].repeat(3)),
1397                DiffHunk::different(["b", "X", "b"]),
1398                DiffHunk::matching([" c"].repeat(3)),
1399            ]
1400        );
1401    }
1402
1403    #[test]
1404    fn test_diff_multiple_inputs_all_different() {
1405        assert_eq!(
1406            diff(["a b c", "a X c", "a c X"]),
1407            vec![
1408                DiffHunk::matching(["a "].repeat(3)),
1409                DiffHunk::different(["b ", "X ", ""]),
1410                DiffHunk::matching(["c"].repeat(3)),
1411                DiffHunk::different(["", "", " X"]),
1412            ]
1413        );
1414    }
1415
1416    #[test]
1417    fn test_diff_for_tokenizer_compacted() {
1418        // Tests that unchanged regions are compacted when using for_tokenizer()
1419        let diff = ContentDiff::for_tokenizer(
1420            ["a\nb\nc\nd\ne\nf\ng", "a\nb\nc\nX\ne\nf\ng"],
1421            find_line_ranges,
1422            CompareBytesExactly,
1423        );
1424        assert_eq!(
1425            diff.hunks().collect_vec(),
1426            vec![
1427                DiffHunk::matching(["a\nb\nc\n"].repeat(2)),
1428                DiffHunk::different(["d\n", "X\n"]),
1429                DiffHunk::matching(["e\nf\ng"].repeat(2)),
1430            ]
1431        );
1432    }
1433
1434    #[test]
1435    fn test_diff_nothing_in_common() {
1436        assert_eq!(
1437            diff(["aaa", "bb"]),
1438            vec![DiffHunk::different(["aaa", "bb"])]
1439        );
1440    }
1441
1442    #[test]
1443    fn test_diff_insert_in_middle() {
1444        assert_eq!(
1445            diff(["a z", "a S z"]),
1446            vec![
1447                DiffHunk::matching(["a "].repeat(2)),
1448                DiffHunk::different(["", "S "]),
1449                DiffHunk::matching(["z"].repeat(2)),
1450            ]
1451        );
1452    }
1453
1454    #[test]
1455    fn test_diff_no_unique_middle_flips() {
1456        assert_eq!(
1457            diff(["a R R S S z", "a S S R R z"]),
1458            vec![
1459                DiffHunk::matching(["a "].repeat(2)),
1460                DiffHunk::different(["R R ", ""]),
1461                DiffHunk::matching(["S S "].repeat(2)),
1462                DiffHunk::different(["", "R R "]),
1463                DiffHunk::matching(["z"].repeat(2))
1464            ],
1465        );
1466    }
1467
1468    #[test]
1469    fn test_diff_recursion_needed() {
1470        assert_eq!(
1471            diff([
1472                "a q x q y q z q b q y q x q c",
1473                "a r r x q y z q b y q x r r c",
1474            ]),
1475            vec![
1476                DiffHunk::matching(["a "].repeat(2)),
1477                DiffHunk::different(["q", "r"]),
1478                DiffHunk::matching([" "].repeat(2)),
1479                DiffHunk::different(["", "r "]),
1480                DiffHunk::matching(["x q y "].repeat(2)),
1481                DiffHunk::different(["q ", ""]),
1482                DiffHunk::matching(["z q b "].repeat(2)),
1483                DiffHunk::different(["q ", ""]),
1484                DiffHunk::matching(["y q x "].repeat(2)),
1485                DiffHunk::different(["q", "r"]),
1486                DiffHunk::matching([" "].repeat(2)),
1487                DiffHunk::different(["", "r "]),
1488                DiffHunk::matching(["c"].repeat(2)),
1489            ]
1490        );
1491    }
1492
1493    #[test]
1494    fn test_diff_ignore_all_whitespace() {
1495        fn diff(inputs: [&str; 2]) -> Vec<DiffHunk<'_>> {
1496            let diff = ContentDiff::for_tokenizer(
1497                inputs,
1498                find_line_ranges,
1499                CompareBytesIgnoreAllWhitespace,
1500            );
1501            diff.hunks().collect()
1502        }
1503
1504        assert_eq!(diff(["", "\n"]), vec![DiffHunk::different(["", "\n"])]);
1505        assert_eq!(
1506            diff(["a\n", " a\r\n"]),
1507            vec![DiffHunk::matching(["a\n", " a\r\n"])]
1508        );
1509        assert_eq!(
1510            diff(["a\n", " a\nb"]),
1511            vec![
1512                DiffHunk::matching(["a\n", " a\n"]),
1513                DiffHunk::different(["", "b"]),
1514            ]
1515        );
1516
1517        // No LCS matches, so trim leading/trailing common lines
1518        assert_eq!(
1519            diff(["a\nc\n", " a\n a\n"]),
1520            vec![
1521                DiffHunk::matching(["a\n", " a\n"]),
1522                DiffHunk::different(["c\n", " a\n"]),
1523            ]
1524        );
1525        assert_eq!(
1526            diff(["c\na\n", " a\n a\n"]),
1527            vec![
1528                DiffHunk::different(["c\n", " a\n"]),
1529                DiffHunk::matching(["a\n", " a\n"]),
1530            ]
1531        );
1532    }
1533
1534    #[test]
1535    fn test_diff_ignore_whitespace_amount() {
1536        fn diff(inputs: [&str; 2]) -> Vec<DiffHunk<'_>> {
1537            let diff = ContentDiff::for_tokenizer(
1538                inputs,
1539                find_line_ranges,
1540                CompareBytesIgnoreWhitespaceAmount,
1541            );
1542            diff.hunks().collect()
1543        }
1544
1545        assert_eq!(diff(["", "\n"]), vec![DiffHunk::different(["", "\n"])]);
1546        // whitespace at line end is ignored
1547        assert_eq!(
1548            diff(["a\n", "a\r\n"]),
1549            vec![DiffHunk::matching(["a\n", "a\r\n"])]
1550        );
1551        // but whitespace at line start isn't
1552        assert_eq!(
1553            diff(["a\n", " a\n"]),
1554            vec![DiffHunk::different(["a\n", " a\n"])]
1555        );
1556        assert_eq!(
1557            diff(["a\n", "a \nb"]),
1558            vec![
1559                DiffHunk::matching(["a\n", "a \n"]),
1560                DiffHunk::different(["", "b"]),
1561            ]
1562        );
1563    }
1564
1565    #[test]
1566    fn test_diff_hunk_iterator() {
1567        let diff = ContentDiff::by_word(["a b c", "a XX c", "a b "]);
1568        assert_eq!(
1569            diff.hunks().collect_vec(),
1570            vec![
1571                DiffHunk::matching(["a "].repeat(3)),
1572                DiffHunk::different(["b", "XX", "b"]),
1573                DiffHunk::matching([" "].repeat(3)),
1574                DiffHunk::different(["c", "c", ""]),
1575            ]
1576        );
1577        assert_eq!(
1578            diff.hunk_ranges().collect_vec(),
1579            vec![
1580                DiffHunkRange {
1581                    kind: DiffHunkKind::Matching,
1582                    ranges: smallvec![0..2, 0..2, 0..2],
1583                },
1584                DiffHunkRange {
1585                    kind: DiffHunkKind::Different,
1586                    ranges: smallvec![2..3, 2..4, 2..3],
1587                },
1588                DiffHunkRange {
1589                    kind: DiffHunkKind::Matching,
1590                    ranges: smallvec![3..4, 4..5, 3..4],
1591                },
1592                DiffHunkRange {
1593                    kind: DiffHunkKind::Different,
1594                    ranges: smallvec![4..5, 5..6, 4..4],
1595                },
1596            ]
1597        );
1598    }
1599
1600    #[test]
1601    fn test_diff_real_case_write_fmt() {
1602        // This is from src/ui.rs in commit f44d246e3f88 in this repo. It highlights the
1603        // need for recursion into the range at the end: after splitting at "Arguments"
1604        // and "formatter", the region at the end has the unique words "write_fmt"
1605        // and "fmt", but we forgot to recurse into that region, so we ended up
1606        // saying that "write_fmt(fmt).unwrap()" was replaced by b"write_fmt(fmt)".
1607        #[rustfmt::skip]
1608        assert_eq!(
1609            diff([
1610                "    pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) {\n        self.styler().write_fmt(fmt).unwrap()\n",
1611                "    pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) -> io::Result<()> {\n        self.styler().write_fmt(fmt)\n"
1612            ]),
1613            vec![
1614                DiffHunk::matching(["    pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) "].repeat(2)),
1615                DiffHunk::different(["", "-> io::Result<()> "]),
1616                DiffHunk::matching(["{\n        self.styler().write_fmt(fmt)"].repeat(2)),
1617                DiffHunk::different([".unwrap()", ""]),
1618                DiffHunk::matching(["\n"].repeat(2))
1619            ]
1620        );
1621    }
1622
1623    #[test]
1624    fn test_diff_real_case_gitgit_read_tree_c() {
1625        // This is the diff from commit e497ea2a9b in the git.git repo
1626        #[rustfmt::skip]
1627        assert_eq!(
1628            diff([
1629                r##"/*
1630 * GIT - The information manager from hell
1631 *
1632 * Copyright (C) Linus Torvalds, 2005
1633 */
1634#include "#cache.h"
1635
1636static int unpack(unsigned char *sha1)
1637{
1638	void *buffer;
1639	unsigned long size;
1640	char type[20];
1641
1642	buffer = read_sha1_file(sha1, type, &size);
1643	if (!buffer)
1644		usage("unable to read sha1 file");
1645	if (strcmp(type, "tree"))
1646		usage("expected a 'tree' node");
1647	while (size) {
1648		int len = strlen(buffer)+1;
1649		unsigned char *sha1 = buffer + len;
1650		char *path = strchr(buffer, ' ')+1;
1651		unsigned int mode;
1652		if (size < len + 20 || sscanf(buffer, "%o", &mode) != 1)
1653			usage("corrupt 'tree' file");
1654		buffer = sha1 + 20;
1655		size -= len + 20;
1656		printf("%o %s (%s)\n", mode, path, sha1_to_hex(sha1));
1657	}
1658	return 0;
1659}
1660
1661int main(int argc, char **argv)
1662{
1663	int fd;
1664	unsigned char sha1[20];
1665
1666	if (argc != 2)
1667		usage("read-tree <key>");
1668	if (get_sha1_hex(argv[1], sha1) < 0)
1669		usage("read-tree <key>");
1670	sha1_file_directory = getenv(DB_ENVIRONMENT);
1671	if (!sha1_file_directory)
1672		sha1_file_directory = DEFAULT_DB_ENVIRONMENT;
1673	if (unpack(sha1) < 0)
1674		usage("unpack failed");
1675	return 0;
1676}
1677"##,
1678                r##"/*
1679 * GIT - The information manager from hell
1680 *
1681 * Copyright (C) Linus Torvalds, 2005
1682 */
1683#include "#cache.h"
1684
1685static void create_directories(const char *path)
1686{
1687	int len = strlen(path);
1688	char *buf = malloc(len + 1);
1689	const char *slash = path;
1690
1691	while ((slash = strchr(slash+1, '/')) != NULL) {
1692		len = slash - path;
1693		memcpy(buf, path, len);
1694		buf[len] = 0;
1695		mkdir(buf, 0700);
1696	}
1697}
1698
1699static int create_file(const char *path)
1700{
1701	int fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);
1702	if (fd < 0) {
1703		if (errno == ENOENT) {
1704			create_directories(path);
1705			fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);
1706		}
1707	}
1708	return fd;
1709}
1710
1711static int unpack(unsigned char *sha1)
1712{
1713	void *buffer;
1714	unsigned long size;
1715	char type[20];
1716
1717	buffer = read_sha1_file(sha1, type, &size);
1718	if (!buffer)
1719		usage("unable to read sha1 file");
1720	if (strcmp(type, "tree"))
1721		usage("expected a 'tree' node");
1722	while (size) {
1723		int len = strlen(buffer)+1;
1724		unsigned char *sha1 = buffer + len;
1725		char *path = strchr(buffer, ' ')+1;
1726		char *data;
1727		unsigned long filesize;
1728		unsigned int mode;
1729		int fd;
1730
1731		if (size < len + 20 || sscanf(buffer, "%o", &mode) != 1)
1732			usage("corrupt 'tree' file");
1733		buffer = sha1 + 20;
1734		size -= len + 20;
1735		data = read_sha1_file(sha1, type, &filesize);
1736		if (!data || strcmp(type, "blob"))
1737			usage("tree file refers to bad file data");
1738		fd = create_file(path);
1739		if (fd < 0)
1740			usage("unable to create file");
1741		if (write(fd, data, filesize) != filesize)
1742			usage("unable to write file");
1743		fchmod(fd, mode);
1744		close(fd);
1745		free(data);
1746	}
1747	return 0;
1748}
1749
1750int main(int argc, char **argv)
1751{
1752	int fd;
1753	unsigned char sha1[20];
1754
1755	if (argc != 2)
1756		usage("read-tree <key>");
1757	if (get_sha1_hex(argv[1], sha1) < 0)
1758		usage("read-tree <key>");
1759	sha1_file_directory = getenv(DB_ENVIRONMENT);
1760	if (!sha1_file_directory)
1761		sha1_file_directory = DEFAULT_DB_ENVIRONMENT;
1762	if (unpack(sha1) < 0)
1763		usage("unpack failed");
1764	return 0;
1765}
1766"##,
1767            ]),
1768            vec![
1769               DiffHunk::matching(["/*\n * GIT - The information manager from hell\n *\n * Copyright (C) Linus Torvalds, 2005\n */\n#include \"#cache.h\"\n\n"].repeat(2)),
1770               DiffHunk::different(["", "static void create_directories(const char *path)\n{\n\tint len = strlen(path);\n\tchar *buf = malloc(len + 1);\n\tconst char *slash = path;\n\n\twhile ((slash = strchr(slash+1, \'/\')) != NULL) {\n\t\tlen = slash - path;\n\t\tmemcpy(buf, path, len);\n\t\tbuf[len] = 0;\n\t\tmkdir(buf, 0700);\n\t}\n}\n\nstatic int create_file(const char *path)\n{\n\tint fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);\n\tif (fd < 0) {\n\t\tif (errno == ENOENT) {\n\t\t\tcreate_directories(path);\n\t\t\tfd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);\n\t\t}\n\t}\n\treturn fd;\n}\n\n"]),
1771               DiffHunk::matching(["static int unpack(unsigned char *sha1)\n{\n\tvoid *buffer;\n\tunsigned long size;\n\tchar type[20];\n\n\tbuffer = read_sha1_file(sha1, type, &size);\n\tif (!buffer)\n\t\tusage(\"unable to read sha1 file\");\n\tif (strcmp(type, \"tree\"))\n\t\tusage(\"expected a \'tree\' node\");\n\twhile (size) {\n\t\tint len = strlen(buffer)+1;\n\t\tunsigned char *sha1 = buffer + len;\n\t\tchar *path = strchr(buffer, \' \')+1;\n"].repeat(2)),
1772               DiffHunk::different(["", "\t\tchar *data;\n\t\tunsigned long filesize;\n"]),
1773               DiffHunk::matching(["\t\tunsigned int mode;\n"].repeat(2)),
1774               DiffHunk::different(["", "\t\tint fd;\n\n"]),
1775               DiffHunk::matching(["\t\tif (size < len + 20 || sscanf(buffer, \"%o\", &mode) != 1)\n\t\t\tusage(\"corrupt \'tree\' file\");\n\t\tbuffer = sha1 + 20;\n\t\tsize -= len + 20;\n\t\t"].repeat(2)),
1776               DiffHunk::different(["printf(\"%o %s (%s)\\n\", mode, path,", "data ="]),
1777               DiffHunk::matching([" "].repeat(2)),
1778               DiffHunk::different(["sha1_to_hex", "read_sha1_file"]),
1779               DiffHunk::matching(["(sha1"].repeat(2)),
1780               DiffHunk::different([")", ", type, &filesize);\n\t\tif (!data || strcmp(type, \"blob\"))\n\t\t\tusage(\"tree file refers to bad file data\");\n\t\tfd = create_file(path);\n\t\tif (fd < 0)\n\t\t\tusage(\"unable to create file\");\n\t\tif (write(fd, data, filesize) != filesize)\n\t\t\tusage(\"unable to write file\");\n\t\tfchmod(fd, mode);\n\t\tclose(fd);\n\t\tfree(data"]),
1781               DiffHunk::matching([");\n\t}\n\treturn 0;\n}\n\nint main(int argc, char **argv)\n{\n\tint fd;\n\tunsigned char sha1[20];\n\n\tif (argc != 2)\n\t\tusage(\"read-tree <key>\");\n\tif (get_sha1_hex(argv[1], sha1) < 0)\n\t\tusage(\"read-tree <key>\");\n\tsha1_file_directory = getenv(DB_ENVIRONMENT);\n\tif (!sha1_file_directory)\n\t\tsha1_file_directory = DEFAULT_DB_ENVIRONMENT;\n\tif (unpack(sha1) < 0)\n\t\tusage(\"unpack failed\");\n\treturn 0;\n}\n"].repeat(2)),
1782            ]
1783        );
1784    }
1785}