#![expect(missing_docs)]

use std::collections::BTreeMap;
use std::hash::BuildHasher;
use std::hash::Hash;
use std::hash::Hasher;
use std::hash::RandomState;
use std::iter;
use std::ops::Range;
use std::slice;

use bstr::BStr;
use hashbrown::HashTable;
use itertools::Itertools as _;
use smallvec::SmallVec;
use smallvec::smallvec;

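/// Splits `text` into line ranges. Each range covers one line including its
/// trailing newline, if any; e.g. `find_line_ranges(b"a\nbb\nccc\n")` yields
/// `[0..2, 2..5, 5..9]` (mirroring the tests below).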
pub fn find_line_ranges(text: &[u8]) -> Vec<Range<usize>> {
    text.split_inclusive(|b| *b == b'\n')
        .scan(0, |total, line| {
            let start = *total;
            *total += line.len();
            Some(start..*total)
        })
        .collect()
}

fn is_word_byte(b: u8) -> bool {
    matches!(
        b,
        b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_' | b'\x80'..=b'\xff'
    )
}

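/// Returns the byte ranges of maximal runs of word bytes (ASCII alphanumerics,
/// `_`, and any non-ASCII byte, per `is_word_byte`).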
pub fn find_word_ranges(text: &[u8]) -> Vec<Range<usize>> {
    let mut word_ranges = vec![];
    let mut word_start_pos = 0;
    let mut in_word = false;
    for (i, b) in text.iter().enumerate() {
        if in_word && !is_word_byte(*b) {
            in_word = false;
            word_ranges.push(word_start_pos..i);
            word_start_pos = i;
        } else if !in_word && is_word_byte(*b) {
            in_word = true;
            word_start_pos = i;
        }
    }
    if in_word && word_start_pos < text.len() {
        word_ranges.push(word_start_pos..text.len());
    }
    word_ranges
}

pub fn find_nonword_ranges(text: &[u8]) -> Vec<Range<usize>> {
    text.iter()
        .positions(|b| !is_word_byte(*b))
        .map(|i| i..i + 1)
        .collect()
}

fn bytes_ignore_all_whitespace(text: &[u8]) -> impl Iterator<Item = u8> + use<'_> {
    text.iter().copied().filter(|b| !b.is_ascii_whitespace())
}

fn bytes_ignore_whitespace_amount(text: &[u8]) -> impl Iterator<Item = u8> + use<'_> {
    let mut prev_was_space = false;
    text.iter().filter_map(move |&b| {
        let was_space = prev_was_space;
        let is_space = b.is_ascii_whitespace();
        prev_was_space = is_space;
        match (was_space, is_space) {
            (_, false) => Some(b),
            (false, true) => Some(b' '),
            (true, true) => None,
        }
    })
}

fn hash_with_length_suffix<I, H>(data: I, state: &mut H)
where
    I: IntoIterator,
    I::Item: Hash,
    H: Hasher,
{
    let mut len: usize = 0;
    for d in data {
        d.hash(state);
        len += 1;
    }
    state.write_usize(len);
}

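/// Defines how token contents are compared and hashed, e.g. exactly or
/// ignoring whitespace. Implementations must keep `hash` consistent with `eq`:
/// tokens that compare equal must produce equal hashes.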
pub trait CompareBytes {
    fn eq(&self, left: &[u8], right: &[u8]) -> bool;

    fn hash<H: Hasher>(&self, text: &[u8], state: &mut H);
}

impl<C: CompareBytes + ?Sized> CompareBytes for &C {
    fn eq(&self, left: &[u8], right: &[u8]) -> bool {
        <C as CompareBytes>::eq(self, left, right)
    }

    fn hash<H: Hasher>(&self, text: &[u8], state: &mut H) {
        <C as CompareBytes>::hash(self, text, state);
    }
}

#[derive(Clone, Debug, Default)]
pub struct CompareBytesExactly;

impl CompareBytes for CompareBytesExactly {
    fn eq(&self, left: &[u8], right: &[u8]) -> bool {
        left == right
    }

    fn hash<H: Hasher>(&self, text: &[u8], state: &mut H) {
        text.hash(state);
    }
}

#[derive(Clone, Debug, Default)]
pub struct CompareBytesIgnoreAllWhitespace;

impl CompareBytes for CompareBytesIgnoreAllWhitespace {
    fn eq(&self, left: &[u8], right: &[u8]) -> bool {
        bytes_ignore_all_whitespace(left).eq(bytes_ignore_all_whitespace(right))
    }

    fn hash<H: Hasher>(&self, text: &[u8], state: &mut H) {
        hash_with_length_suffix(bytes_ignore_all_whitespace(text), state);
    }
}

#[derive(Clone, Debug, Default)]
pub struct CompareBytesIgnoreWhitespaceAmount;

impl CompareBytes for CompareBytesIgnoreWhitespaceAmount {
    fn eq(&self, left: &[u8], right: &[u8]) -> bool {
        bytes_ignore_whitespace_amount(left).eq(bytes_ignore_whitespace_amount(right))
    }

    fn hash<H: Hasher>(&self, text: &[u8], state: &mut H) {
        hash_with_length_suffix(bytes_ignore_whitespace_amount(text), state);
    }
}

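/// A token's text paired with the hash precomputed for it by a
/// `WordComparator`, so comparisons can short-circuit on the hash.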
#[derive(Clone, Copy, Debug)]
struct HashedWord<'input> {
    hash: u64,
    text: &'input BStr,
}

#[derive(Clone, Debug, Default)]
struct WordComparator<C, S> {
    compare: C,
    hash_builder: S,
}

impl<C: CompareBytes> WordComparator<C, RandomState> {
    fn new(compare: C) -> Self {
        Self {
            compare,
            hash_builder: RandomState::new(),
        }
    }
}

impl<C: CompareBytes, S: BuildHasher> WordComparator<C, S> {
    fn eq(&self, left: &[u8], right: &[u8]) -> bool {
        self.compare.eq(left, right)
    }

    fn eq_hashed(&self, left: HashedWord<'_>, right: HashedWord<'_>) -> bool {
        left.hash == right.hash && self.compare.eq(left.text, right.text)
    }

    fn hash_one(&self, text: &[u8]) -> u64 {
        let mut state = self.hash_builder.build_hasher();
        self.compare.hash(text, &mut state);
        state.finish()
    }
}

#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
struct WordPosition(usize);

#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
struct LocalWordPosition(usize);

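/// One input text together with its token ranges and the per-token hashes
/// computed with the active comparator.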
#[derive(Clone, Debug)]
struct DiffSource<'input, 'aux> {
    text: &'input BStr,
    ranges: &'aux [Range<usize>],
    hashes: Vec<u64>,
}

impl<'input, 'aux> DiffSource<'input, 'aux> {
    fn new<T: AsRef<[u8]> + ?Sized, C: CompareBytes, S: BuildHasher>(
        text: &'input T,
        ranges: &'aux [Range<usize>],
        comp: &WordComparator<C, S>,
    ) -> Self {
        let text = BStr::new(text);
        let hashes = ranges
            .iter()
            .map(|range| comp.hash_one(&text[range.clone()]))
            .collect();
        Self {
            text,
            ranges,
            hashes,
        }
    }

    fn local(&self) -> LocalDiffSource<'input, '_> {
        LocalDiffSource {
            text: self.text,
            ranges: self.ranges,
            hashes: &self.hashes,
            global_offset: WordPosition(0),
        }
    }

    fn range_at(&self, position: WordPosition) -> Range<usize> {
        self.ranges[position.0].clone()
    }
}

#[derive(Clone, Debug)]
struct LocalDiffSource<'input, 'aux> {
    text: &'input BStr,
    ranges: &'aux [Range<usize>],
    hashes: &'aux [u64],
    global_offset: WordPosition,
}

impl<'input> LocalDiffSource<'input, '_> {
    fn narrowed(&self, positions: Range<LocalWordPosition>) -> Self {
        Self {
            text: self.text,
            ranges: &self.ranges[positions.start.0..positions.end.0],
            hashes: &self.hashes[positions.start.0..positions.end.0],
            global_offset: self.map_to_global(positions.start),
        }
    }

    fn map_to_global(&self, position: LocalWordPosition) -> WordPosition {
        WordPosition(self.global_offset.0 + position.0)
    }

    fn hashed_words(
        &self,
    ) -> impl DoubleEndedIterator<Item = HashedWord<'input>> + ExactSizeIterator + use<'_, 'input>
    {
        iter::zip(self.ranges, self.hashes).map(|(range, &hash)| {
            let text = &self.text[range.clone()];
            HashedWord { hash, text }
        })
    }
}

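/// Occurrence positions of each distinct token (per the comparator) in a local
/// source. A token's position list is allowed to grow one past
/// `max_occurrences` so that overly common tokens can be detected and skipped.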
struct Histogram<'input> {
    word_to_positions: HashTable<HistogramEntry<'input>>,
}

type HistogramEntry<'input> = (HashedWord<'input>, SmallVec<[LocalWordPosition; 2]>);

impl<'input> Histogram<'input> {
    fn calculate<C: CompareBytes, S: BuildHasher>(
        source: &LocalDiffSource<'input, '_>,
        comp: &WordComparator<C, S>,
        max_occurrences: usize,
    ) -> Self {
        let mut word_to_positions: HashTable<HistogramEntry> = HashTable::new();
        for (i, word) in source.hashed_words().enumerate() {
            let pos = LocalWordPosition(i);
            word_to_positions
                .entry(
                    word.hash,
                    |(w, _)| comp.eq(w.text, word.text),
                    |(w, _)| w.hash,
                )
                .and_modify(|(_, positions)| {
                    if positions.len() <= max_occurrences {
                        positions.push(pos);
                    }
                })
                .or_insert_with(|| (word, smallvec![pos]));
        }
        Self { word_to_positions }
    }

    fn build_count_to_entries(&self) -> BTreeMap<usize, Vec<&HistogramEntry<'input>>> {
        let mut count_to_entries: BTreeMap<usize, Vec<_>> = BTreeMap::new();
        for entry in &self.word_to_positions {
            let (_, positions) = entry;
            let entries = count_to_entries.entry(positions.len()).or_default();
            entries.push(entry);
        }
        count_to_entries
    }

    fn positions_by_word<C: CompareBytes, S: BuildHasher>(
        &self,
        word: HashedWord<'input>,
        comp: &WordComparator<C, S>,
    ) -> Option<&[LocalWordPosition]> {
        let (_, positions) = self
            .word_to_positions
            .find(word.hash, |(w, _)| comp.eq(w.text, word.text))?;
        Some(positions)
    }
}

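/// Finds a longest common subsequence, given `input` where `input[r]` is the
/// left-side index paired with right-side index `r`. Returns `(left, right)`
/// index pairs in increasing order; e.g. `find_lcs(&[0, 1, 4, 3, 2, 5, 6])`
/// yields `[(0, 0), (1, 1), (2, 4), (5, 5), (6, 6)]` (see the tests below).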
fn find_lcs(input: &[usize]) -> Vec<(usize, usize)> {
    if input.is_empty() {
        return vec![];
    }

    let mut chain = vec![(0, 0, 0); input.len()];
    let mut global_longest = 0;
    let mut global_longest_right_pos = 0;
    for (right_pos, &left_pos) in input.iter().enumerate() {
        let mut longest_from_here = 1;
        let mut previous_right_pos = usize::MAX;
        for i in (0..right_pos).rev() {
            let (previous_len, previous_left_pos, _) = chain[i];
            if previous_left_pos < left_pos {
                let len = previous_len + 1;
                if len > longest_from_here {
                    longest_from_here = len;
                    previous_right_pos = i;
                    if len > global_longest {
                        global_longest = len;
                        global_longest_right_pos = right_pos;
                        break;
                    }
                }
            }
        }
        chain[right_pos] = (longest_from_here, left_pos, previous_right_pos);
    }

    let mut result = vec![];
    let mut right_pos = global_longest_right_pos;
    loop {
        let (_, left_pos, previous_right_pos) = chain[right_pos];
        result.push((left_pos, right_pos));
        if previous_right_pos == usize::MAX {
            break;
        }
        right_pos = previous_right_pos;
    }
    result.reverse();

    result
}

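/// Appends the `(left, right)` positions of tokens that are unchanged between
/// the two sources. The histogram/LCS strategy is tried first; if it finds
/// nothing, the common leading and trailing tokens are matched instead.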
fn collect_unchanged_words<C: CompareBytes, S: BuildHasher>(
    found_positions: &mut Vec<(WordPosition, WordPosition)>,
    left: &LocalDiffSource,
    right: &LocalDiffSource,
    comp: &WordComparator<C, S>,
) {
    if left.ranges.is_empty() || right.ranges.is_empty() {
        return;
    }

    let old_len = found_positions.len();
    collect_unchanged_words_lcs(found_positions, left, right, comp);
    if found_positions.len() != old_len {
        return;
    }

    let common_leading_len = iter::zip(left.hashed_words(), right.hashed_words())
        .take_while(|&(l, r)| comp.eq_hashed(l, r))
        .count();
    let left_hashed_words = left.hashed_words().skip(common_leading_len);
    let right_hashed_words = right.hashed_words().skip(common_leading_len);

    let common_trailing_len = iter::zip(left_hashed_words.rev(), right_hashed_words.rev())
        .take_while(|&(l, r)| comp.eq_hashed(l, r))
        .count();

    found_positions.extend(itertools::chain(
        (0..common_leading_len).map(|i| {
            (
                left.map_to_global(LocalWordPosition(i)),
                right.map_to_global(LocalWordPosition(i)),
            )
        }),
        (1..=common_trailing_len).rev().map(|i| {
            (
                left.map_to_global(LocalWordPosition(left.ranges.len() - i)),
                right.map_to_global(LocalWordPosition(right.ranges.len() - i)),
            )
        }),
    ));
}

fn collect_unchanged_words_lcs<C: CompareBytes, S: BuildHasher>(
    found_positions: &mut Vec<(WordPosition, WordPosition)>,
    left: &LocalDiffSource,
    right: &LocalDiffSource,
    comp: &WordComparator<C, S>,
) {
    let max_occurrences = 100;
    let left_histogram = Histogram::calculate(left, comp, max_occurrences);
    let left_count_to_entries = left_histogram.build_count_to_entries();
    if *left_count_to_entries.keys().next().unwrap() > max_occurrences {
        return;
    }
    let right_histogram = Histogram::calculate(right, comp, max_occurrences);
    let Some(uncommon_shared_word_positions) =
        left_count_to_entries.values().find_map(|left_entries| {
            let mut both_positions = left_entries
                .iter()
                .filter_map(|&(word, left_positions)| {
                    let right_positions = right_histogram.positions_by_word(*word, comp)?;
                    (left_positions.len() == right_positions.len())
                        .then_some((left_positions, right_positions))
                })
                .peekable();
            both_positions.peek().is_some().then_some(both_positions)
        })
    else {
        return;
    };

    let (mut left_positions, mut right_positions): (Vec<_>, Vec<_>) =
        uncommon_shared_word_positions
            .flat_map(|(lefts, rights)| iter::zip(lefts, rights))
            .enumerate()
            .map(|(serial, (&left_pos, &right_pos))| ((left_pos, serial), (right_pos, serial)))
            .unzip();
    left_positions.sort_unstable_by_key(|&(pos, _serial)| pos);
    right_positions.sort_unstable_by_key(|&(pos, _serial)| pos);
    let left_index_by_right_index: Vec<usize> = {
        let mut left_index_map = vec![0; left_positions.len()];
        for (i, &(_pos, serial)) in left_positions.iter().enumerate() {
            left_index_map[serial] = i;
        }
        right_positions
            .iter()
            .map(|&(_pos, serial)| left_index_map[serial])
            .collect()
    };

    let lcs = find_lcs(&left_index_by_right_index);

    let mut previous_left_position = LocalWordPosition(0);
    let mut previous_right_position = LocalWordPosition(0);
    for (left_index, right_index) in lcs {
        let (left_position, _) = left_positions[left_index];
        let (right_position, _) = right_positions[right_index];
        collect_unchanged_words(
            found_positions,
            &left.narrowed(previous_left_position..left_position),
            &right.narrowed(previous_right_position..right_position),
            comp,
        );
        found_positions.push((
            left.map_to_global(left_position),
            right.map_to_global(right_position),
        ));
        previous_left_position = LocalWordPosition(left_position.0 + 1);
        previous_right_position = LocalWordPosition(right_position.0 + 1);
    }
    collect_unchanged_words(
        found_positions,
        &left.narrowed(previous_left_position..LocalWordPosition(left.ranges.len())),
        &right.narrowed(previous_right_position..LocalWordPosition(right.ranges.len())),
        comp,
    );
}

fn intersect_unchanged_words(
    current_positions: Vec<(WordPosition, Vec<WordPosition>)>,
    new_positions: &[(WordPosition, WordPosition)],
) -> Vec<(WordPosition, Vec<WordPosition>)> {
    itertools::merge_join_by(
        current_positions,
        new_positions,
        |(cur_base_pos, _), (new_base_pos, _)| cur_base_pos.cmp(new_base_pos),
    )
    .filter_map(|entry| entry.both())
    .map(|((base_pos, mut other_positions), &(_, new_other_pos))| {
        other_positions.push(new_other_pos);
        (base_pos, other_positions)
    })
    .collect()
}

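/// A byte range in the base input plus the corresponding byte range in each of
/// the other inputs, all of which compare equal under the active comparator.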
#[derive(Clone, PartialEq, Eq, Debug)]
struct UnchangedRange {
    base: Range<usize>,
    others: SmallVec<[Range<usize>; 1]>,
}

impl UnchangedRange {
    fn from_word_positions(
        base_source: &DiffSource,
        other_sources: &[DiffSource],
        base_position: WordPosition,
        other_positions: &[WordPosition],
    ) -> Self {
        assert_eq!(other_sources.len(), other_positions.len());
        let base = base_source.range_at(base_position);
        let others = iter::zip(other_sources, other_positions)
            .map(|(source, pos)| source.range_at(*pos))
            .collect();
        Self { base, others }
    }

    fn is_all_empty(&self) -> bool {
        self.base.is_empty() && self.others.iter().all(|r| r.is_empty())
    }
}

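/// A diff of a base input against any number of other inputs, stored as the
/// regions that are unchanged across all inputs; hunks are derived from these
/// regions and the gaps between them.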
#[derive(Clone, Debug)]
pub struct ContentDiff<'input> {
    base_input: &'input BStr,
    other_inputs: SmallVec<[&'input BStr; 1]>,
    unchanged_regions: Vec<UnchangedRange>,
}

impl<'input> ContentDiff<'input> {
    pub fn for_tokenizer<T: AsRef<[u8]> + ?Sized + 'input>(
        inputs: impl IntoIterator<Item = &'input T>,
        tokenizer: impl Fn(&[u8]) -> Vec<Range<usize>>,
        compare: impl CompareBytes,
    ) -> Self {
        let mut inputs = inputs.into_iter().map(BStr::new);
        let base_input = inputs.next().expect("inputs must not be empty");
        let other_inputs: SmallVec<[&BStr; 1]> = inputs.collect();
        let base_token_ranges: Vec<Range<usize>>;
        let other_token_ranges: Vec<Vec<Range<usize>>>;
        if base_input.is_empty() || other_inputs.iter().any(|input| input.is_empty()) {
            base_token_ranges = vec![];
            other_token_ranges = std::iter::repeat_n(vec![], other_inputs.len()).collect();
        } else {
            base_token_ranges = tokenizer(base_input);
            other_token_ranges = other_inputs
                .iter()
                .map(|other_input| tokenizer(other_input))
                .collect();
        }
        Self::with_inputs_and_token_ranges(
            base_input,
            other_inputs,
            &base_token_ranges,
            &other_token_ranges,
            compare,
        )
    }

    fn with_inputs_and_token_ranges(
        base_input: &'input BStr,
        other_inputs: SmallVec<[&'input BStr; 1]>,
        base_token_ranges: &[Range<usize>],
        other_token_ranges: &[Vec<Range<usize>>],
        compare: impl CompareBytes,
    ) -> Self {
        assert_eq!(other_inputs.len(), other_token_ranges.len());
        let comp = WordComparator::new(compare);
        let base_source = DiffSource::new(base_input, base_token_ranges, &comp);
        let other_sources = iter::zip(&other_inputs, other_token_ranges)
            .map(|(input, token_ranges)| DiffSource::new(input, token_ranges, &comp))
            .collect_vec();
        let unchanged_regions = match &*other_sources {
            [] => {
                let whole_range = UnchangedRange {
                    base: 0..base_source.text.len(),
                    others: smallvec![],
                };
                vec![whole_range]
            }
            [first_other_source, tail_other_sources @ ..] => {
                let mut unchanged_regions = Vec::new();
                unchanged_regions.push(UnchangedRange {
                    base: 0..0,
                    others: smallvec![0..0; other_inputs.len()],
                });
                let mut first_positions = Vec::new();
                collect_unchanged_words(
                    &mut first_positions,
                    &base_source.local(),
                    &first_other_source.local(),
                    &comp,
                );
                if tail_other_sources.is_empty() {
                    unchanged_regions.extend(first_positions.iter().map(
                        |&(base_pos, other_pos)| {
                            UnchangedRange::from_word_positions(
                                &base_source,
                                &other_sources,
                                base_pos,
                                &[other_pos],
                            )
                        },
                    ));
                } else {
                    let first_positions = first_positions
                        .iter()
                        .map(|&(base_pos, other_pos)| (base_pos, vec![other_pos]))
                        .collect();
                    let intersected_positions = tail_other_sources.iter().fold(
                        first_positions,
                        |current_positions, other_source| {
                            let mut new_positions = Vec::new();
                            collect_unchanged_words(
                                &mut new_positions,
                                &base_source.local(),
                                &other_source.local(),
                                &comp,
                            );
                            intersect_unchanged_words(current_positions, &new_positions)
                        },
                    );
                    unchanged_regions.extend(intersected_positions.iter().map(
                        |(base_pos, other_positions)| {
                            UnchangedRange::from_word_positions(
                                &base_source,
                                &other_sources,
                                *base_pos,
                                other_positions,
                            )
                        },
                    ));
                };
                unchanged_regions.push(UnchangedRange {
                    base: base_input.len()..base_input.len(),
                    others: other_inputs
                        .iter()
                        .map(|input| input.len()..input.len())
                        .collect(),
                });
                unchanged_regions
            }
        };

        let mut diff = Self {
            base_input,
            other_inputs,
            unchanged_regions,
        };
        diff.compact_unchanged_regions();
        diff
    }

    pub fn unrefined<T: AsRef<[u8]> + ?Sized + 'input>(
        inputs: impl IntoIterator<Item = &'input T>,
    ) -> Self {
        ContentDiff::for_tokenizer(inputs, |_| vec![], CompareBytesExactly)
    }

    pub fn by_line<T: AsRef<[u8]> + ?Sized + 'input>(
        inputs: impl IntoIterator<Item = &'input T>,
    ) -> Self {
        ContentDiff::for_tokenizer(inputs, find_line_ranges, CompareBytesExactly)
    }

    pub fn by_word<T: AsRef<[u8]> + ?Sized + 'input>(
        inputs: impl IntoIterator<Item = &'input T>,
    ) -> Self {
        let mut diff = ContentDiff::for_tokenizer(inputs, find_word_ranges, CompareBytesExactly);
        diff.refine_changed_regions(find_nonword_ranges, CompareBytesExactly);
        diff
    }

    pub fn hunks(&self) -> DiffHunkIterator<'_, 'input> {
        let ranges = self.hunk_ranges();
        DiffHunkIterator { diff: self, ranges }
    }

    pub fn hunk_ranges(&self) -> DiffHunkRangeIterator<'_> {
        DiffHunkRangeIterator::new(self)
    }

    fn hunk_at<'a, 'b>(
        &'a self,
        range: &'b UnchangedRange,
    ) -> impl Iterator<Item = &'input BStr> + use<'a, 'b, 'input> {
        itertools::chain(
            iter::once(&self.base_input[range.base.clone()]),
            iter::zip(&self.other_inputs, &range.others).map(|(input, r)| &input[r.clone()]),
        )
    }

    fn hunk_between<'a, 'b>(
        &'a self,
        previous: &'b UnchangedRange,
        current: &'b UnchangedRange,
    ) -> impl Iterator<Item = &'input BStr> + use<'a, 'b, 'input> {
        itertools::chain(
            iter::once(&self.base_input[previous.base.end..current.base.start]),
            itertools::izip!(&self.other_inputs, &previous.others, &current.others)
                .map(|(input, prev, cur)| &input[prev.end..cur.start]),
        )
    }

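    /// Re-diffs each changed gap between unchanged regions using a finer
    /// tokenizer and comparator, splicing any newly found unchanged regions
    /// back into the diff (e.g. refining a line-level diff by words).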
    pub fn refine_changed_regions(
        &mut self,
        tokenizer: impl Fn(&[u8]) -> Vec<Range<usize>>,
        compare: impl CompareBytes,
    ) {
        let mut new_unchanged_ranges = vec![self.unchanged_regions[0].clone()];
        for window in self.unchanged_regions.windows(2) {
            let [previous, current]: &[_; 2] = window.try_into().unwrap();
            let refined_diff = ContentDiff::for_tokenizer(
                self.hunk_between(previous, current),
                &tokenizer,
                &compare,
            );
            for refined in &refined_diff.unchanged_regions {
                let new_base_start = refined.base.start + previous.base.end;
                let new_base_end = refined.base.end + previous.base.end;
                let new_others = iter::zip(&refined.others, &previous.others)
                    .map(|(refi, prev)| (refi.start + prev.end)..(refi.end + prev.end))
                    .collect();
                new_unchanged_ranges.push(UnchangedRange {
                    base: new_base_start..new_base_end,
                    others: new_others,
                });
            }
            new_unchanged_ranges.push(current.clone());
        }
        self.unchanged_regions = new_unchanged_ranges;
        self.compact_unchanged_regions();
    }

    fn compact_unchanged_regions(&mut self) {
        let mut compacted = vec![];
        let mut maybe_previous: Option<UnchangedRange> = None;
        for current in &self.unchanged_regions {
            if let Some(previous) = maybe_previous {
                if previous.base.end == current.base.start
                    && iter::zip(&previous.others, &current.others)
                        .all(|(prev, cur)| prev.end == cur.start)
                {
                    maybe_previous = Some(UnchangedRange {
                        base: previous.base.start..current.base.end,
                        others: iter::zip(&previous.others, &current.others)
                            .map(|(prev, cur)| prev.start..cur.end)
                            .collect(),
                    });
                    continue;
                }
                compacted.push(previous);
            }
            maybe_previous = Some(current.clone());
        }
        if let Some(previous) = maybe_previous {
            compacted.push(previous);
        }
        self.unchanged_regions = compacted;
    }
}

#[derive(Clone, Debug, Eq, PartialEq)]
pub struct DiffHunk<'input> {
    pub kind: DiffHunkKind,
    pub contents: DiffHunkContentVec<'input>,
}

impl<'input> DiffHunk<'input> {
    pub fn matching<T: AsRef<[u8]> + ?Sized + 'input>(
        contents: impl IntoIterator<Item = &'input T>,
    ) -> Self {
        Self {
            kind: DiffHunkKind::Matching,
            contents: contents.into_iter().map(BStr::new).collect(),
        }
    }

    pub fn different<T: AsRef<[u8]> + ?Sized + 'input>(
        contents: impl IntoIterator<Item = &'input T>,
    ) -> Self {
        Self {
            kind: DiffHunkKind::Different,
            contents: contents.into_iter().map(BStr::new).collect(),
        }
    }
}

#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum DiffHunkKind {
    Matching,
    Different,
}

pub type DiffHunkContentVec<'input> = SmallVec<[&'input BStr; 2]>;

#[derive(Clone, Debug)]
pub struct DiffHunkIterator<'diff, 'input> {
    diff: &'diff ContentDiff<'input>,
    ranges: DiffHunkRangeIterator<'diff>,
}

impl<'input> Iterator for DiffHunkIterator<'_, 'input> {
    type Item = DiffHunk<'input>;

    fn next(&mut self) -> Option<Self::Item> {
        self.ranges.next_with(
            |previous| {
                let contents = self.diff.hunk_at(previous).collect();
                let kind = DiffHunkKind::Matching;
                DiffHunk { kind, contents }
            },
            |previous, current| {
                let contents: DiffHunkContentVec =
                    self.diff.hunk_between(previous, current).collect();
                debug_assert!(
                    contents.iter().any(|content| !content.is_empty()),
                    "unchanged regions should have been compacted"
                );
                let kind = DiffHunkKind::Different;
                DiffHunk { kind, contents }
            },
        )
    }
}

#[derive(Clone, Debug, Eq, PartialEq)]
pub struct DiffHunkRange {
    pub kind: DiffHunkKind,
    pub ranges: DiffHunkRangeVec,
}

pub type DiffHunkRangeVec = SmallVec<[Range<usize>; 2]>;

#[derive(Clone, Debug)]
pub struct DiffHunkRangeIterator<'diff> {
    previous: &'diff UnchangedRange,
    unchanged_emitted: bool,
    unchanged_iter: slice::Iter<'diff, UnchangedRange>,
}

impl<'diff> DiffHunkRangeIterator<'diff> {
    fn new(diff: &'diff ContentDiff) -> Self {
        let mut unchanged_iter = diff.unchanged_regions.iter();
        let previous = unchanged_iter.next().unwrap();
        Self {
            previous,
            unchanged_emitted: previous.is_all_empty(),
            unchanged_iter,
        }
    }

    fn next_with<T>(
        &mut self,
        hunk_at: impl FnOnce(&UnchangedRange) -> T,
        hunk_between: impl FnOnce(&UnchangedRange, &UnchangedRange) -> T,
    ) -> Option<T> {
        if !self.unchanged_emitted {
            self.unchanged_emitted = true;
            return Some(hunk_at(self.previous));
        }
        let current = self.unchanged_iter.next()?;
        let hunk = hunk_between(self.previous, current);
        self.previous = current;
        self.unchanged_emitted = self.previous.is_all_empty();
        Some(hunk)
    }
}

impl Iterator for DiffHunkRangeIterator<'_> {
    type Item = DiffHunkRange;

    fn next(&mut self) -> Option<Self::Item> {
        self.next_with(
            |previous| {
                let ranges = itertools::chain(iter::once(&previous.base), &previous.others)
                    .cloned()
                    .collect();
                let kind = DiffHunkKind::Matching;
                DiffHunkRange { kind, ranges }
            },
            |previous, current| {
                let ranges: DiffHunkRangeVec = itertools::chain(
                    iter::once(previous.base.end..current.base.start),
                    iter::zip(&previous.others, &current.others)
                        .map(|(prev, cur)| prev.end..cur.start),
                )
                .collect();
                debug_assert!(
                    ranges.iter().any(|range| !range.is_empty()),
                    "unchanged regions should have been compacted"
                );
                let kind = DiffHunkKind::Different;
                DiffHunkRange { kind, ranges }
            },
        )
    }
}

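/// Diffs `inputs` line by line, then refines the changed regions by words and
/// finally by non-word characters, and returns the resulting hunks.
///
/// A minimal usage sketch; the expected hunks mirror
/// `test_diff_two_inputs_one_different` below:
///
/// ```ignore
/// let hunks = diff(["a b c", "a X c"]);
/// assert_eq!(
///     hunks,
///     vec![
///         DiffHunk::matching(["a "].repeat(2)),
///         DiffHunk::different(["b", "X"]),
///         DiffHunk::matching([" c"].repeat(2)),
///     ]
/// );
/// ```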
pub fn diff<'a, T: AsRef<[u8]> + ?Sized + 'a>(
    inputs: impl IntoIterator<Item = &'a T>,
) -> Vec<DiffHunk<'a>> {
    let mut diff = ContentDiff::for_tokenizer(inputs, find_line_ranges, CompareBytesExactly);
    diff.refine_changed_regions(find_word_ranges, CompareBytesExactly);
    diff.refine_changed_regions(find_nonword_ranges, CompareBytesExactly);
    diff.hunks().collect()
}

#[cfg(test)]
mod tests {
    use super::*;

    fn no_ranges() -> Vec<Range<usize>> {
        vec![]
    }

    #[test]
    fn test_find_line_ranges_empty() {
        assert_eq!(find_line_ranges(b""), no_ranges());
    }

    #[test]
    fn test_find_line_ranges_blank_line() {
        assert_eq!(find_line_ranges(b"\n"), vec![0..1]);
    }

    #[test]
    fn test_find_line_ranges_missing_newline_at_eof() {
        assert_eq!(find_line_ranges(b"foo"), vec![0..3]);
    }

    #[test]
    fn test_find_line_ranges_multiple_lines() {
        assert_eq!(find_line_ranges(b"a\nbb\nccc\n"), vec![0..2, 2..5, 5..9]);
    }

    #[test]
    fn test_find_word_ranges_empty() {
        assert_eq!(find_word_ranges(b""), no_ranges());
    }

    #[test]
    fn test_find_word_ranges_single_word() {
        assert_eq!(find_word_ranges(b"Abc"), vec![0..3]);
    }

    #[test]
    fn test_find_word_ranges_no_word() {
        assert_eq!(find_word_ranges(b"+-*/"), no_ranges());
    }

    #[test]
    fn test_find_word_ranges_word_then_non_word() {
        assert_eq!(find_word_ranges(b"Abc "), vec![0..3]);
    }

    #[test]
    fn test_find_word_ranges_non_word_then_word() {
        assert_eq!(find_word_ranges(b" Abc"), vec![3..6]);
    }

    #[test]
    fn test_find_word_ranges_multibyte() {
        assert_eq!(find_word_ranges("⊢".as_bytes()), vec![0..3]);
    }

    #[test]
    fn test_find_lcs_empty() {
        let empty: Vec<(usize, usize)> = vec![];
        assert_eq!(find_lcs(&[]), empty);
    }

    #[test]
    fn test_find_lcs_single_element() {
        assert_eq!(find_lcs(&[0]), vec![(0, 0)]);
    }

    #[test]
    fn test_find_lcs_in_order() {
        assert_eq!(find_lcs(&[0, 1, 2]), vec![(0, 0), (1, 1), (2, 2)]);
    }

    #[test]
    fn test_find_lcs_reverse_order() {
        assert_eq!(find_lcs(&[2, 1, 0]), vec![(2, 0)]);
    }

    #[test]
    fn test_find_lcs_two_swapped() {
        assert_eq!(
            find_lcs(&[0, 1, 4, 3, 2, 5, 6]),
            vec![(0, 0), (1, 1), (2, 4), (5, 5), (6, 6)]
        );
    }

    #[test]
    fn test_find_lcs_element_moved_earlier() {
        assert_eq!(
            find_lcs(&[0, 1, 4, 2, 3, 5, 6]),
            vec![(0, 0), (1, 1), (2, 3), (3, 4), (5, 5), (6, 6)]
        );
    }

    #[test]
    fn test_find_lcs_element_moved_later() {
        assert_eq!(
            find_lcs(&[0, 1, 3, 4, 2, 5, 6]),
            vec![(0, 0), (1, 1), (3, 2), (4, 3), (5, 5), (6, 6)]
        );
    }

    #[test]
    fn test_find_lcs_interleaved_longest_chains() {
        assert_eq!(
            find_lcs(&[0, 4, 2, 9, 6, 5, 1, 3, 7, 8]),
            vec![(0, 0), (1, 6), (3, 7), (7, 8), (8, 9)]
        );
    }

    #[test]
    fn test_find_word_ranges_many_words() {
        assert_eq!(
            find_word_ranges(b"fn find_words(text: &[u8])"),
            vec![0..2, 3..13, 14..18, 22..24]
        );
    }

    #[test]
    fn test_compare_bytes_ignore_all_whitespace() {
        let comp = WordComparator::new(CompareBytesIgnoreAllWhitespace);
        let hash = |data: &[u8]| comp.hash_one(data);

        assert!(comp.eq(b"", b""));
        assert!(comp.eq(b"", b" "));
        assert!(comp.eq(b"\t", b"\r"));
        assert_eq!(hash(b""), hash(b""));
        assert_eq!(hash(b""), hash(b" "));
        assert_eq!(hash(b""), hash(b"\t"));
        assert_eq!(hash(b""), hash(b"\r"));

        assert!(comp.eq(b"ab", b" a b\t"));
        assert_eq!(hash(b"ab"), hash(b" a b\t"));

        assert!(!comp.eq(b"a", b""));
        assert!(!comp.eq(b"a", b" "));
        assert!(!comp.eq(b"a", b"ab"));
        assert!(!comp.eq(b"ab", b"ba"));
    }

    #[test]
    fn test_compare_bytes_ignore_whitespace_amount() {
        let comp = WordComparator::new(CompareBytesIgnoreWhitespaceAmount);
        let hash = |data: &[u8]| comp.hash_one(data);

        assert!(comp.eq(b"", b""));
        assert!(comp.eq(b"\n", b" \n"));
        assert!(comp.eq(b"\t", b"\r"));
        assert_eq!(hash(b""), hash(b""));
        assert_eq!(hash(b" "), hash(b"\n"));
        assert_eq!(hash(b" "), hash(b" \n"));
        assert_eq!(hash(b" "), hash(b"\t"));
        assert_eq!(hash(b" "), hash(b"\r"));

        assert!(comp.eq(b"a b c\n", b"a b\tc\r\n"));
        assert_eq!(hash(b"a b c\n"), hash(b"a b\tc\r\n"));

        assert!(!comp.eq(b"", b" "));
        assert!(!comp.eq(b"a", b""));
        assert!(!comp.eq(b"a", b" "));
        assert!(!comp.eq(b"a", b"a "));
        assert!(!comp.eq(b"a", b" a"));
        assert!(!comp.eq(b"a", b"ab"));
        assert!(!comp.eq(b"ab", b"ba"));
        assert!(!comp.eq(b"ab", b"a b"));
    }

    fn unchanged_ranges(
        (left_text, left_ranges): (&[u8], &[Range<usize>]),
        (right_text, right_ranges): (&[u8], &[Range<usize>]),
    ) -> Vec<(Range<usize>, Range<usize>)> {
        let comp = WordComparator::new(CompareBytesExactly);
        let left = DiffSource::new(left_text, left_ranges, &comp);
        let right = DiffSource::new(right_text, right_ranges, &comp);
        let mut positions = Vec::new();
        collect_unchanged_words(&mut positions, &left.local(), &right.local(), &comp);
        positions
            .into_iter()
            .map(|(left_pos, right_pos)| (left.range_at(left_pos), right.range_at(right_pos)))
            .collect()
    }

    #[test]
    fn test_unchanged_ranges_insert_in_middle() {
        assert_eq!(
            unchanged_ranges(
                (b"a b b c", &[0..1, 2..3, 4..5, 6..7]),
                (b"a b X b c", &[0..1, 2..3, 4..5, 6..7, 8..9]),
            ),
            vec![(0..1, 0..1), (2..3, 2..3), (4..5, 6..7), (6..7, 8..9)]
        );
    }

    #[test]
    fn test_unchanged_ranges_non_unique_removed() {
        assert_eq!(
            unchanged_ranges(
                (b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
                (b"a b a c", &[0..1, 2..3, 4..5, 6..7]),
            ),
            vec![(0..1, 0..1)]
        );
        assert_eq!(
            unchanged_ranges(
                (b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
                (b"b a c a", &[0..1, 2..3, 4..5, 6..7]),
            ),
            vec![(6..7, 6..7)]
        );
        assert_eq!(
            unchanged_ranges(
                (b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
                (b"b a a c", &[0..1, 2..3, 4..5, 6..7]),
            ),
            vec![]
        );
        assert_eq!(
            unchanged_ranges(
                (b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
                (b"a b c a", &[0..1, 2..3, 4..5, 6..7]),
            ),
            vec![(0..1, 0..1), (6..7, 6..7)]
        );
    }

    #[test]
    fn test_unchanged_ranges_non_unique_added() {
        assert_eq!(
            unchanged_ranges(
                (b"a b a c", &[0..1, 2..3, 4..5, 6..7]),
                (b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
            ),
            vec![(0..1, 0..1)]
        );
        assert_eq!(
            unchanged_ranges(
                (b"b a c a", &[0..1, 2..3, 4..5, 6..7]),
                (b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
            ),
            vec![(6..7, 6..7)]
        );
        assert_eq!(
            unchanged_ranges(
                (b"b a a c", &[0..1, 2..3, 4..5, 6..7]),
                (b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
            ),
            vec![]
        );
        assert_eq!(
            unchanged_ranges(
                (b"a b c a", &[0..1, 2..3, 4..5, 6..7]),
                (b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
            ),
            vec![(0..1, 0..1), (6..7, 6..7)]
        );
    }

    #[test]
    fn test_unchanged_ranges_recursion_needed() {
        assert_eq!(
            unchanged_ranges(
                (b"a b | b", &[0..1, 2..3, 4..5, 6..7]),
                (b"b c d |", &[0..1, 2..3, 4..5, 6..7]),
            ),
            vec![(2..3, 0..1), (4..5, 6..7)]
        );
        assert_eq!(
            unchanged_ranges(
                (b"| b c d", &[0..1, 2..3, 4..5, 6..7]),
                (b"b | a b", &[0..1, 2..3, 4..5, 6..7]),
            ),
            vec![(0..1, 2..3), (2..3, 6..7)]
        );
        assert_eq!(
            unchanged_ranges(
                (b"| b c |", &[0..1, 2..3, 4..5, 6..7]),
                (b"| b b |", &[0..1, 2..3, 4..5, 6..7]),
            ),
            vec![(0..1, 0..1), (2..3, 2..3), (6..7, 6..7)]
        );
        assert_eq!(
            unchanged_ranges(
                (b"| c c |", &[0..1, 2..3, 4..5, 6..7]),
                (b"| b c |", &[0..1, 2..3, 4..5, 6..7]),
            ),
            vec![(0..1, 0..1), (4..5, 4..5), (6..7, 6..7)]
        );
        assert_eq!(
            unchanged_ranges(
                (b"a b c | a", &[0..1, 2..3, 4..5, 6..7, 8..9]),
                (b"b a b |", &[0..1, 2..3, 4..5, 6..7]),
            ),
            vec![(0..1, 2..3), (2..3, 4..5), (6..7, 6..7)]
        );
        assert_eq!(
            unchanged_ranges(
                (b"| b a b", &[0..1, 2..3, 4..5, 6..7]),
                (b"a | a b c", &[0..1, 2..3, 4..5, 6..7, 8..9]),
            ),
            vec![(0..1, 2..3), (4..5, 4..5), (6..7, 6..7)]
        );
    }

    #[test]
    fn test_diff_single_input() {
        assert_eq!(diff(["abc"]), vec![DiffHunk::matching(["abc"])]);
    }

    #[test]
    fn test_diff_some_empty_inputs() {
        assert_eq!(diff([""]), vec![]);
        assert_eq!(diff(["", ""]), vec![]);
        assert_eq!(diff(["", "", ""]), vec![]);

        assert_eq!(diff(["a b", ""]), vec![DiffHunk::different(["a b", ""])]);
        assert_eq!(diff(["", "a b"]), vec![DiffHunk::different(["", "a b"])]);

        assert_eq!(
            diff(["a b", "", "a b"]),
            vec![DiffHunk::different(["a b", "", "a b"])]
        );
        assert_eq!(
            diff(["", "a b", "a b"]),
            vec![DiffHunk::different(["", "a b", "a b"])]
        );

        assert_eq!(
            diff(["a b", "", ""]),
            vec![DiffHunk::different(["a b", "", ""])]
        );
        assert_eq!(
            diff(["", "a b", ""]),
            vec![DiffHunk::different(["", "a b", ""])]
        );
    }

    #[test]
    fn test_diff_two_inputs_one_different() {
        assert_eq!(
            diff(["a b c", "a X c"]),
            vec![
                DiffHunk::matching(["a "].repeat(2)),
                DiffHunk::different(["b", "X"]),
                DiffHunk::matching([" c"].repeat(2)),
            ]
        );
    }

    #[test]
    fn test_diff_multiple_inputs_one_different() {
        assert_eq!(
            diff(["a b c", "a X c", "a b c"]),
            vec![
                DiffHunk::matching(["a "].repeat(3)),
                DiffHunk::different(["b", "X", "b"]),
                DiffHunk::matching([" c"].repeat(3)),
            ]
        );
    }

    #[test]
    fn test_diff_multiple_inputs_all_different() {
        assert_eq!(
            diff(["a b c", "a X c", "a c X"]),
            vec![
                DiffHunk::matching(["a "].repeat(3)),
                DiffHunk::different(["b ", "X ", ""]),
                DiffHunk::matching(["c"].repeat(3)),
                DiffHunk::different(["", "", " X"]),
            ]
        );
    }

    #[test]
    fn test_diff_for_tokenizer_compacted() {
        let diff = ContentDiff::for_tokenizer(
            ["a\nb\nc\nd\ne\nf\ng", "a\nb\nc\nX\ne\nf\ng"],
            find_line_ranges,
            CompareBytesExactly,
        );
        assert_eq!(
            diff.hunks().collect_vec(),
            vec![
                DiffHunk::matching(["a\nb\nc\n"].repeat(2)),
                DiffHunk::different(["d\n", "X\n"]),
                DiffHunk::matching(["e\nf\ng"].repeat(2)),
            ]
        );
    }

    #[test]
    fn test_diff_nothing_in_common() {
        assert_eq!(
            diff(["aaa", "bb"]),
            vec![DiffHunk::different(["aaa", "bb"])]
        );
    }

    #[test]
    fn test_diff_insert_in_middle() {
        assert_eq!(
            diff(["a z", "a S z"]),
            vec![
                DiffHunk::matching(["a "].repeat(2)),
                DiffHunk::different(["", "S "]),
                DiffHunk::matching(["z"].repeat(2)),
            ]
        );
    }

    #[test]
    fn test_diff_no_unique_middle_flips() {
        assert_eq!(
            diff(["a R R S S z", "a S S R R z"]),
            vec![
                DiffHunk::matching(["a "].repeat(2)),
                DiffHunk::different(["R R ", ""]),
                DiffHunk::matching(["S S "].repeat(2)),
                DiffHunk::different(["", "R R "]),
                DiffHunk::matching(["z"].repeat(2))
            ],
        );
    }

    #[test]
    fn test_diff_recursion_needed() {
        assert_eq!(
            diff([
                "a q x q y q z q b q y q x q c",
                "a r r x q y z q b y q x r r c",
            ]),
            vec![
                DiffHunk::matching(["a "].repeat(2)),
                DiffHunk::different(["q", "r"]),
                DiffHunk::matching([" "].repeat(2)),
                DiffHunk::different(["", "r "]),
                DiffHunk::matching(["x q y "].repeat(2)),
                DiffHunk::different(["q ", ""]),
                DiffHunk::matching(["z q b "].repeat(2)),
                DiffHunk::different(["q ", ""]),
                DiffHunk::matching(["y q x "].repeat(2)),
                DiffHunk::different(["q", "r"]),
                DiffHunk::matching([" "].repeat(2)),
                DiffHunk::different(["", "r "]),
                DiffHunk::matching(["c"].repeat(2)),
            ]
        );
    }

    #[test]
    fn test_diff_ignore_all_whitespace() {
        fn diff(inputs: [&str; 2]) -> Vec<DiffHunk<'_>> {
            let diff = ContentDiff::for_tokenizer(
                inputs,
                find_line_ranges,
                CompareBytesIgnoreAllWhitespace,
            );
            diff.hunks().collect()
        }

        assert_eq!(diff(["", "\n"]), vec![DiffHunk::different(["", "\n"])]);
        assert_eq!(
            diff(["a\n", " a\r\n"]),
            vec![DiffHunk::matching(["a\n", " a\r\n"])]
        );
        assert_eq!(
            diff(["a\n", " a\nb"]),
            vec![
                DiffHunk::matching(["a\n", " a\n"]),
                DiffHunk::different(["", "b"]),
            ]
        );

        assert_eq!(
            diff(["a\nc\n", " a\n a\n"]),
            vec![
                DiffHunk::matching(["a\n", " a\n"]),
                DiffHunk::different(["c\n", " a\n"]),
            ]
        );
        assert_eq!(
            diff(["c\na\n", " a\n a\n"]),
            vec![
                DiffHunk::different(["c\n", " a\n"]),
                DiffHunk::matching(["a\n", " a\n"]),
            ]
        );
    }

    #[test]
    fn test_diff_ignore_whitespace_amount() {
        fn diff(inputs: [&str; 2]) -> Vec<DiffHunk<'_>> {
            let diff = ContentDiff::for_tokenizer(
                inputs,
                find_line_ranges,
                CompareBytesIgnoreWhitespaceAmount,
            );
            diff.hunks().collect()
        }

        assert_eq!(diff(["", "\n"]), vec![DiffHunk::different(["", "\n"])]);
        assert_eq!(
            diff(["a\n", "a\r\n"]),
            vec![DiffHunk::matching(["a\n", "a\r\n"])]
        );
        assert_eq!(
            diff(["a\n", " a\n"]),
            vec![DiffHunk::different(["a\n", " a\n"])]
        );
        assert_eq!(
            diff(["a\n", "a \nb"]),
            vec![
                DiffHunk::matching(["a\n", "a \n"]),
                DiffHunk::different(["", "b"]),
            ]
        );
    }

    #[test]
    fn test_diff_hunk_iterator() {
        let diff = ContentDiff::by_word(["a b c", "a XX c", "a b "]);
        assert_eq!(
            diff.hunks().collect_vec(),
            vec![
                DiffHunk::matching(["a "].repeat(3)),
                DiffHunk::different(["b", "XX", "b"]),
                DiffHunk::matching([" "].repeat(3)),
                DiffHunk::different(["c", "c", ""]),
            ]
        );
        assert_eq!(
            diff.hunk_ranges().collect_vec(),
            vec![
                DiffHunkRange {
                    kind: DiffHunkKind::Matching,
                    ranges: smallvec![0..2, 0..2, 0..2],
                },
                DiffHunkRange {
                    kind: DiffHunkKind::Different,
                    ranges: smallvec![2..3, 2..4, 2..3],
                },
                DiffHunkRange {
                    kind: DiffHunkKind::Matching,
                    ranges: smallvec![3..4, 4..5, 3..4],
                },
                DiffHunkRange {
                    kind: DiffHunkKind::Different,
                    ranges: smallvec![4..5, 5..6, 4..4],
                },
            ]
        );
    }

    #[test]
    fn test_diff_real_case_write_fmt() {
        #[rustfmt::skip]
        assert_eq!(
            diff([
                " pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) {\n self.styler().write_fmt(fmt).unwrap()\n",
                " pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) -> io::Result<()> {\n self.styler().write_fmt(fmt)\n"
            ]),
            vec![
                DiffHunk::matching([" pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) "].repeat(2)),
                DiffHunk::different(["", "-> io::Result<()> "]),
                DiffHunk::matching(["{\n self.styler().write_fmt(fmt)"].repeat(2)),
                DiffHunk::different([".unwrap()", ""]),
                DiffHunk::matching(["\n"].repeat(2))
            ]
        );
    }

    #[test]
    fn test_diff_real_case_gitgit_read_tree_c() {
        assert_eq!(
            diff([
                r##"/*
 * GIT - The information manager from hell
 *
 * Copyright (C) Linus Torvalds, 2005
 */
#include "#cache.h"

static int unpack(unsigned char *sha1)
{
	void *buffer;
	unsigned long size;
	char type[20];

	buffer = read_sha1_file(sha1, type, &size);
	if (!buffer)
		usage("unable to read sha1 file");
	if (strcmp(type, "tree"))
		usage("expected a 'tree' node");
	while (size) {
		int len = strlen(buffer)+1;
		unsigned char *sha1 = buffer + len;
		char *path = strchr(buffer, ' ')+1;
		unsigned int mode;
		if (size < len + 20 || sscanf(buffer, "%o", &mode) != 1)
			usage("corrupt 'tree' file");
		buffer = sha1 + 20;
		size -= len + 20;
		printf("%o %s (%s)\n", mode, path, sha1_to_hex(sha1));
	}
	return 0;
}

int main(int argc, char **argv)
{
	int fd;
	unsigned char sha1[20];

	if (argc != 2)
		usage("read-tree <key>");
	if (get_sha1_hex(argv[1], sha1) < 0)
		usage("read-tree <key>");
	sha1_file_directory = getenv(DB_ENVIRONMENT);
	if (!sha1_file_directory)
		sha1_file_directory = DEFAULT_DB_ENVIRONMENT;
	if (unpack(sha1) < 0)
		usage("unpack failed");
	return 0;
}
"##,
                r##"/*
 * GIT - The information manager from hell
 *
 * Copyright (C) Linus Torvalds, 2005
 */
#include "#cache.h"

static void create_directories(const char *path)
{
	int len = strlen(path);
	char *buf = malloc(len + 1);
	const char *slash = path;

	while ((slash = strchr(slash+1, '/')) != NULL) {
		len = slash - path;
		memcpy(buf, path, len);
		buf[len] = 0;
		mkdir(buf, 0700);
	}
}

static int create_file(const char *path)
{
	int fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);
	if (fd < 0) {
		if (errno == ENOENT) {
			create_directories(path);
			fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);
		}
	}
	return fd;
}

static int unpack(unsigned char *sha1)
{
	void *buffer;
	unsigned long size;
	char type[20];

	buffer = read_sha1_file(sha1, type, &size);
	if (!buffer)
		usage("unable to read sha1 file");
	if (strcmp(type, "tree"))
		usage("expected a 'tree' node");
	while (size) {
		int len = strlen(buffer)+1;
		unsigned char *sha1 = buffer + len;
		char *path = strchr(buffer, ' ')+1;
		char *data;
		unsigned long filesize;
		unsigned int mode;
		int fd;

		if (size < len + 20 || sscanf(buffer, "%o", &mode) != 1)
			usage("corrupt 'tree' file");
		buffer = sha1 + 20;
		size -= len + 20;
		data = read_sha1_file(sha1, type, &filesize);
		if (!data || strcmp(type, "blob"))
			usage("tree file refers to bad file data");
		fd = create_file(path);
		if (fd < 0)
			usage("unable to create file");
		if (write(fd, data, filesize) != filesize)
			usage("unable to write file");
		fchmod(fd, mode);
		close(fd);
		free(data);
	}
	return 0;
}

int main(int argc, char **argv)
{
	int fd;
	unsigned char sha1[20];

	if (argc != 2)
		usage("read-tree <key>");
	if (get_sha1_hex(argv[1], sha1) < 0)
		usage("read-tree <key>");
	sha1_file_directory = getenv(DB_ENVIRONMENT);
	if (!sha1_file_directory)
		sha1_file_directory = DEFAULT_DB_ENVIRONMENT;
	if (unpack(sha1) < 0)
		usage("unpack failed");
	return 0;
}
"##,
            ]),
            vec![
                DiffHunk::matching(["/*\n * GIT - The information manager from hell\n *\n * Copyright (C) Linus Torvalds, 2005\n */\n#include \"#cache.h\"\n\n"].repeat(2)),
                DiffHunk::different(["", "static void create_directories(const char *path)\n{\n\tint len = strlen(path);\n\tchar *buf = malloc(len + 1);\n\tconst char *slash = path;\n\n\twhile ((slash = strchr(slash+1, \'/\')) != NULL) {\n\t\tlen = slash - path;\n\t\tmemcpy(buf, path, len);\n\t\tbuf[len] = 0;\n\t\tmkdir(buf, 0700);\n\t}\n}\n\nstatic int create_file(const char *path)\n{\n\tint fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);\n\tif (fd < 0) {\n\t\tif (errno == ENOENT) {\n\t\t\tcreate_directories(path);\n\t\t\tfd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);\n\t\t}\n\t}\n\treturn fd;\n}\n\n"]),
                DiffHunk::matching(["static int unpack(unsigned char *sha1)\n{\n\tvoid *buffer;\n\tunsigned long size;\n\tchar type[20];\n\n\tbuffer = read_sha1_file(sha1, type, &size);\n\tif (!buffer)\n\t\tusage(\"unable to read sha1 file\");\n\tif (strcmp(type, \"tree\"))\n\t\tusage(\"expected a \'tree\' node\");\n\twhile (size) {\n\t\tint len = strlen(buffer)+1;\n\t\tunsigned char *sha1 = buffer + len;\n\t\tchar *path = strchr(buffer, \' \')+1;\n"].repeat(2)),
                DiffHunk::different(["", "\t\tchar *data;\n\t\tunsigned long filesize;\n"]),
                DiffHunk::matching(["\t\tunsigned int mode;\n"].repeat(2)),
                DiffHunk::different(["", "\t\tint fd;\n\n"]),
                DiffHunk::matching(["\t\tif (size < len + 20 || sscanf(buffer, \"%o\", &mode) != 1)\n\t\t\tusage(\"corrupt \'tree\' file\");\n\t\tbuffer = sha1 + 20;\n\t\tsize -= len + 20;\n\t\t"].repeat(2)),
                DiffHunk::different(["printf(\"%o %s (%s)\\n\", mode, path,", "data ="]),
                DiffHunk::matching([" "].repeat(2)),
                DiffHunk::different(["sha1_to_hex", "read_sha1_file"]),
                DiffHunk::matching(["(sha1"].repeat(2)),
                DiffHunk::different([")", ", type, &filesize);\n\t\tif (!data || strcmp(type, \"blob\"))\n\t\t\tusage(\"tree file refers to bad file data\");\n\t\tfd = create_file(path);\n\t\tif (fd < 0)\n\t\t\tusage(\"unable to create file\");\n\t\tif (write(fd, data, filesize) != filesize)\n\t\t\tusage(\"unable to write file\");\n\t\tfchmod(fd, mode);\n\t\tclose(fd);\n\t\tfree(data"]),
                DiffHunk::matching([");\n\t}\n\treturn 0;\n}\n\nint main(int argc, char **argv)\n{\n\tint fd;\n\tunsigned char sha1[20];\n\n\tif (argc != 2)\n\t\tusage(\"read-tree <key>\");\n\tif (get_sha1_hex(argv[1], sha1) < 0)\n\t\tusage(\"read-tree <key>\");\n\tsha1_file_directory = getenv(DB_ENVIRONMENT);\n\tif (!sha1_file_directory)\n\t\tsha1_file_directory = DEFAULT_DB_ENVIRONMENT;\n\tif (unpack(sha1) < 0)\n\t\tusage(\"unpack failed\");\n\treturn 0;\n}\n"].repeat(2)),
            ]
        );
    }
}