#![allow(missing_docs)]

use std::collections::BTreeMap;
use std::hash::BuildHasher;
use std::hash::Hash;
use std::hash::Hasher;
use std::hash::RandomState;
use std::iter;
use std::ops::Range;
use std::slice;

use bstr::BStr;
use hashbrown::HashTable;
use itertools::Itertools;
use smallvec::smallvec;
use smallvec::SmallVec;

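/// Splits `text` into ranges of lines, where each range includes the trailing
/// newline (if any).
///
/// For example, `find_line_ranges(b"a\nbb\n")` returns `[0..2, 2..5]`.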
pub fn find_line_ranges(text: &[u8]) -> Vec<Range<usize>> {
    text.split_inclusive(|b| *b == b'\n')
        .scan(0, |total, line| {
            let start = *total;
            *total += line.len();
            Some(start..*total)
        })
        .collect()
}

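/// Returns true if `b` is considered part of a word: ASCII alphanumerics,
/// `_`, and any byte >= 0x80 (so multi-byte UTF-8 characters count as word
/// bytes).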
fn is_word_byte(b: u8) -> bool {
    matches!(
        b,
        b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_' | b'\x80'..=b'\xff'
    )
}

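/// Finds the ranges of maximal runs of word bytes in `text`.
///
/// For example, `find_word_ranges(b"fn foo()")` returns `[0..2, 3..6]`.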
pub fn find_word_ranges(text: &[u8]) -> Vec<Range<usize>> {
    let mut word_ranges = vec![];
    let mut word_start_pos = 0;
    let mut in_word = false;
    for (i, b) in text.iter().enumerate() {
        if in_word && !is_word_byte(*b) {
            in_word = false;
            word_ranges.push(word_start_pos..i);
            word_start_pos = i;
        } else if !in_word && is_word_byte(*b) {
            in_word = true;
            word_start_pos = i;
        }
    }
    if in_word && word_start_pos < text.len() {
        word_ranges.push(word_start_pos..text.len());
    }
    word_ranges
}

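/// Finds the ranges of individual non-word bytes in `text`, one range per
/// byte.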
pub fn find_nonword_ranges(text: &[u8]) -> Vec<Range<usize>> {
    text.iter()
        .positions(|b| !is_word_byte(*b))
        .map(|i| i..i + 1)
        .collect()
}

fn bytes_ignore_all_whitespace(text: &[u8]) -> impl Iterator<Item = u8> + '_ {
    text.iter().copied().filter(|b| !b.is_ascii_whitespace())
}

fn bytes_ignore_whitespace_amount(text: &[u8]) -> impl Iterator<Item = u8> + '_ {
    let mut prev_was_space = false;
    text.iter().filter_map(move |&b| {
        let was_space = prev_was_space;
        let is_space = b.is_ascii_whitespace();
        prev_was_space = is_space;
        match (was_space, is_space) {
            (_, false) => Some(b),
            (false, true) => Some(b' '),
            (true, true) => None,
        }
    })
}

fn hash_with_length_suffix<I, H>(data: I, state: &mut H)
where
    I: IntoIterator,
    I::Item: Hash,
    H: Hasher,
{
    let mut len: usize = 0;
    for d in data {
        d.hash(state);
        len += 1;
    }
    state.write_usize(len);
}

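/// Compares byte sequences for the purpose of tokenized diffing.
///
/// Implementations must keep `eq()` and `hash()` consistent: if two slices
/// compare equal, they must also hash to the same value.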
pub trait CompareBytes {
    fn eq(&self, left: &[u8], right: &[u8]) -> bool;

    fn hash<H: Hasher>(&self, text: &[u8], state: &mut H);
}

impl<C: CompareBytes + ?Sized> CompareBytes for &C {
    fn eq(&self, left: &[u8], right: &[u8]) -> bool {
        <C as CompareBytes>::eq(self, left, right)
    }

    fn hash<H: Hasher>(&self, text: &[u8], state: &mut H) {
        <C as CompareBytes>::hash(self, text, state);
    }
}

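/// Compares byte contents exactly, byte for byte.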
#[derive(Clone, Debug, Default)]
pub struct CompareBytesExactly;

impl CompareBytes for CompareBytesExactly {
    fn eq(&self, left: &[u8], right: &[u8]) -> bool {
        left == right
    }

    fn hash<H: Hasher>(&self, text: &[u8], state: &mut H) {
        text.hash(state);
    }
}

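/// Compares byte contents after removing all ASCII whitespace.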
#[derive(Clone, Debug, Default)]
pub struct CompareBytesIgnoreAllWhitespace;

impl CompareBytes for CompareBytesIgnoreAllWhitespace {
    fn eq(&self, left: &[u8], right: &[u8]) -> bool {
        bytes_ignore_all_whitespace(left).eq(bytes_ignore_all_whitespace(right))
    }

    fn hash<H: Hasher>(&self, text: &[u8], state: &mut H) {
        hash_with_length_suffix(bytes_ignore_all_whitespace(text), state);
    }
}

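/// Compares byte contents, treating any run of ASCII whitespace as a single
/// space.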
#[derive(Clone, Debug, Default)]
pub struct CompareBytesIgnoreWhitespaceAmount;

impl CompareBytes for CompareBytesIgnoreWhitespaceAmount {
    fn eq(&self, left: &[u8], right: &[u8]) -> bool {
        bytes_ignore_whitespace_amount(left).eq(bytes_ignore_whitespace_amount(right))
    }

    fn hash<H: Hasher>(&self, text: &[u8], state: &mut H) {
        hash_with_length_suffix(bytes_ignore_whitespace_amount(text), state);
    }
}

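/// A word (or token) of the diff input, together with its precomputed hash.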
#[derive(Clone, Copy, Debug)]
struct HashedWord<'input> {
    hash: u64,
    text: &'input BStr,
}

#[derive(Clone, Debug, Default)]
struct WordComparator<C, S> {
    compare: C,
    hash_builder: S,
}

impl<C: CompareBytes> WordComparator<C, RandomState> {
    fn new(compare: C) -> Self {
        WordComparator {
            compare,
            hash_builder: RandomState::new(),
        }
    }
}

impl<C: CompareBytes, S: BuildHasher> WordComparator<C, S> {
    fn eq(&self, left: &[u8], right: &[u8]) -> bool {
        self.compare.eq(left, right)
    }

    fn eq_hashed(&self, left: HashedWord<'_>, right: HashedWord<'_>) -> bool {
        left.hash == right.hash && self.compare.eq(left.text, right.text)
    }

    fn hash_one(&self, text: &[u8]) -> u64 {
        let mut state = self.hash_builder.build_hasher();
        self.compare.hash(text, &mut state);
        state.finish()
    }
}

#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
struct WordPosition(usize);

#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
struct LocalWordPosition(usize);

#[derive(Clone, Debug)]
struct DiffSource<'input, 'aux> {
    text: &'input BStr,
    ranges: &'aux [Range<usize>],
    hashes: Vec<u64>,
}

impl<'input, 'aux> DiffSource<'input, 'aux> {
    fn new<T: AsRef<[u8]> + ?Sized, C: CompareBytes, S: BuildHasher>(
        text: &'input T,
        ranges: &'aux [Range<usize>],
        comp: &WordComparator<C, S>,
    ) -> Self {
        let text = BStr::new(text);
        let hashes = ranges
            .iter()
            .map(|range| comp.hash_one(&text[range.clone()]))
            .collect();
        DiffSource {
            text,
            ranges,
            hashes,
        }
    }

    fn local(&self) -> LocalDiffSource<'input, '_> {
        LocalDiffSource {
            text: self.text,
            ranges: self.ranges,
            hashes: &self.hashes,
            global_offset: WordPosition(0),
        }
    }

    fn range_at(&self, position: WordPosition) -> Range<usize> {
        self.ranges[position.0].clone()
    }
}

#[derive(Clone, Debug)]
struct LocalDiffSource<'input, 'aux> {
    text: &'input BStr,
    ranges: &'aux [Range<usize>],
    hashes: &'aux [u64],
    global_offset: WordPosition,
}

impl<'input> LocalDiffSource<'input, '_> {
    fn narrowed(&self, positions: Range<LocalWordPosition>) -> Self {
        LocalDiffSource {
            text: self.text,
            ranges: &self.ranges[positions.start.0..positions.end.0],
            hashes: &self.hashes[positions.start.0..positions.end.0],
            global_offset: self.map_to_global(positions.start),
        }
    }

    fn map_to_global(&self, position: LocalWordPosition) -> WordPosition {
        WordPosition(self.global_offset.0 + position.0)
    }

    fn hashed_words(
        &self,
    ) -> impl DoubleEndedIterator<Item = HashedWord<'input>> + ExactSizeIterator + '_ {
        iter::zip(self.ranges, self.hashes).map(|(range, &hash)| {
            let text = &self.text[range.clone()];
            HashedWord { hash, text }
        })
    }
}

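/// Frequency table mapping each distinct word to the positions where it
/// occurs, used to pick the least common words as synchronization points
/// (histogram diff).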
struct Histogram<'input> {
    word_to_positions: HashTable<HistogramEntry<'input>>,
}

type HistogramEntry<'input> = (HashedWord<'input>, SmallVec<[LocalWordPosition; 2]>);

impl<'input> Histogram<'input> {
    fn calculate<C: CompareBytes, S: BuildHasher>(
        source: &LocalDiffSource<'input, '_>,
        comp: &WordComparator<C, S>,
        max_occurrences: usize,
    ) -> Self {
        let mut word_to_positions: HashTable<HistogramEntry> = HashTable::new();
        for (i, word) in source.hashed_words().enumerate() {
            let pos = LocalWordPosition(i);
            word_to_positions
                .entry(
                    word.hash,
                    |(w, _)| comp.eq(w.text, word.text),
                    |(w, _)| w.hash,
                )
                .and_modify(|(_, positions)| {
                    if positions.len() <= max_occurrences {
                        positions.push(pos);
                    }
                })
                .or_insert_with(|| (word, smallvec![pos]));
        }
        Histogram { word_to_positions }
    }

    fn build_count_to_entries(&self) -> BTreeMap<usize, Vec<&HistogramEntry<'input>>> {
        let mut count_to_entries: BTreeMap<usize, Vec<_>> = BTreeMap::new();
        for entry in &self.word_to_positions {
            let (_, positions) = entry;
            let entries = count_to_entries.entry(positions.len()).or_default();
            entries.push(entry);
        }
        count_to_entries
    }

    fn positions_by_word<C: CompareBytes, S: BuildHasher>(
        &self,
        word: HashedWord<'input>,
        comp: &WordComparator<C, S>,
    ) -> Option<&[LocalWordPosition]> {
        let (_, positions) = self
            .word_to_positions
            .find(word.hash, |(w, _)| comp.eq(w.text, word.text))?;
        Some(positions)
    }
}

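/// Finds a longest strictly increasing subsequence, where `input[right_pos]`
/// is the left position matched to `right_pos`. Returns the selected
/// `(left_pos, right_pos)` pairs in increasing order.
///
/// For example (see the tests below), `find_lcs(&[0, 1, 4, 3, 2, 5, 6])`
/// returns `[(0, 0), (1, 1), (2, 4), (5, 5), (6, 6)]`.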
fn find_lcs(input: &[usize]) -> Vec<(usize, usize)> {
    if input.is_empty() {
        return vec![];
    }

    let mut chain = vec![(0, 0, 0); input.len()];
    let mut global_longest = 0;
    let mut global_longest_right_pos = 0;
    for (right_pos, &left_pos) in input.iter().enumerate() {
        let mut longest_from_here = 1;
        let mut previous_right_pos = usize::MAX;
        for i in (0..right_pos).rev() {
            let (previous_len, previous_left_pos, _) = chain[i];
            if previous_left_pos < left_pos {
                let len = previous_len + 1;
                if len > longest_from_here {
                    longest_from_here = len;
                    previous_right_pos = i;
                    if len > global_longest {
                        global_longest = len;
                        global_longest_right_pos = right_pos;
                        break;
                    }
                }
            }
        }
        chain[right_pos] = (longest_from_here, left_pos, previous_right_pos);
    }

    let mut result = vec![];
    let mut right_pos = global_longest_right_pos;
    loop {
        let (_, left_pos, previous_right_pos) = chain[right_pos];
        result.push((left_pos, right_pos));
        if previous_right_pos == usize::MAX {
            break;
        }
        right_pos = previous_right_pos;
    }
    result.reverse();

    result
}

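/// Collects matching word positions between `left` and `right` into
/// `found_positions`. Tries the histogram/LCS strategy first and falls back
/// to matching the common prefix and suffix when that finds nothing.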
fn collect_unchanged_words<C: CompareBytes, S: BuildHasher>(
    found_positions: &mut Vec<(WordPosition, WordPosition)>,
    left: &LocalDiffSource,
    right: &LocalDiffSource,
    comp: &WordComparator<C, S>,
) {
    if left.ranges.is_empty() || right.ranges.is_empty() {
        return;
    }

    let old_len = found_positions.len();
    collect_unchanged_words_lcs(found_positions, left, right, comp);
    if found_positions.len() != old_len {
        return;
    }

    let common_leading_len = iter::zip(left.hashed_words(), right.hashed_words())
        .take_while(|&(l, r)| comp.eq_hashed(l, r))
        .count();
    let left_hashed_words = left.hashed_words().skip(common_leading_len);
    let right_hashed_words = right.hashed_words().skip(common_leading_len);

    let common_trailing_len = iter::zip(left_hashed_words.rev(), right_hashed_words.rev())
        .take_while(|&(l, r)| comp.eq_hashed(l, r))
        .count();

    found_positions.extend(itertools::chain(
        (0..common_leading_len).map(|i| {
            (
                left.map_to_global(LocalWordPosition(i)),
                right.map_to_global(LocalWordPosition(i)),
            )
        }),
        (1..=common_trailing_len).rev().map(|i| {
            (
                left.map_to_global(LocalWordPosition(left.ranges.len() - i)),
                right.map_to_global(LocalWordPosition(right.ranges.len() - i)),
            )
        }),
    ));
}

fn collect_unchanged_words_lcs<C: CompareBytes, S: BuildHasher>(
    found_positions: &mut Vec<(WordPosition, WordPosition)>,
    left: &LocalDiffSource,
    right: &LocalDiffSource,
    comp: &WordComparator<C, S>,
) {
    let max_occurrences = 100;
    let left_histogram = Histogram::calculate(left, comp, max_occurrences);
    let left_count_to_entries = left_histogram.build_count_to_entries();
    if *left_count_to_entries.keys().next().unwrap() > max_occurrences {
        return;
    }
    let right_histogram = Histogram::calculate(right, comp, max_occurrences);
    let Some(uncommon_shared_word_positions) =
        left_count_to_entries.values().find_map(|left_entries| {
            let mut both_positions = left_entries
                .iter()
                .filter_map(|&(word, left_positions)| {
                    let right_positions = right_histogram.positions_by_word(*word, comp)?;
                    (left_positions.len() == right_positions.len())
                        .then_some((left_positions, right_positions))
                })
                .peekable();
            both_positions.peek().is_some().then_some(both_positions)
        })
    else {
        return;
    };

    let (mut left_positions, mut right_positions): (Vec<_>, Vec<_>) =
        uncommon_shared_word_positions
            .flat_map(|(lefts, rights)| iter::zip(lefts, rights))
            .enumerate()
            .map(|(serial, (&left_pos, &right_pos))| ((left_pos, serial), (right_pos, serial)))
            .unzip();
    left_positions.sort_unstable_by_key(|&(pos, _serial)| pos);
    right_positions.sort_unstable_by_key(|&(pos, _serial)| pos);
    let left_index_by_right_index: Vec<usize> = {
        let mut left_index_map = vec![0; left_positions.len()];
        for (i, &(_pos, serial)) in left_positions.iter().enumerate() {
            left_index_map[serial] = i;
        }
        right_positions
            .iter()
            .map(|&(_pos, serial)| left_index_map[serial])
            .collect()
    };

    let lcs = find_lcs(&left_index_by_right_index);

    let mut previous_left_position = LocalWordPosition(0);
    let mut previous_right_position = LocalWordPosition(0);
    for (left_index, right_index) in lcs {
        let (left_position, _) = left_positions[left_index];
        let (right_position, _) = right_positions[right_index];
        collect_unchanged_words(
            found_positions,
            &left.narrowed(previous_left_position..left_position),
            &right.narrowed(previous_right_position..right_position),
            comp,
        );
        found_positions.push((
            left.map_to_global(left_position),
            right.map_to_global(right_position),
        ));
        previous_left_position = LocalWordPosition(left_position.0 + 1);
        previous_right_position = LocalWordPosition(right_position.0 + 1);
    }
    collect_unchanged_words(
        found_positions,
        &left.narrowed(previous_left_position..LocalWordPosition(left.ranges.len())),
        &right.narrowed(previous_right_position..LocalWordPosition(right.ranges.len())),
        comp,
    );
}

fn intersect_unchanged_words(
    current_positions: Vec<(WordPosition, Vec<WordPosition>)>,
    new_positions: &[(WordPosition, WordPosition)],
) -> Vec<(WordPosition, Vec<WordPosition>)> {
    itertools::merge_join_by(
        current_positions,
        new_positions,
        |(cur_base_pos, _), (new_base_pos, _)| cur_base_pos.cmp(new_base_pos),
    )
    .filter_map(|entry| entry.both())
    .map(|((base_pos, mut other_positions), &(_, new_other_pos))| {
        other_positions.push(new_other_pos);
        (base_pos, other_positions)
    })
    .collect()
}

#[derive(Clone, PartialEq, Eq, Debug)]
struct UnchangedRange {
    base: Range<usize>,
    others: SmallVec<[Range<usize>; 1]>,
}

impl UnchangedRange {
    fn from_word_positions(
        base_source: &DiffSource,
        other_sources: &[DiffSource],
        base_position: WordPosition,
        other_positions: &[WordPosition],
    ) -> Self {
        assert_eq!(other_sources.len(), other_positions.len());
        let base = base_source.range_at(base_position);
        let others = iter::zip(other_sources, other_positions)
            .map(|(source, pos)| source.range_at(*pos))
            .collect();
        UnchangedRange { base, others }
    }

    fn is_all_empty(&self) -> bool {
        self.base.is_empty() && self.others.iter().all(|r| r.is_empty())
    }
}

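/// A diff of two or more byte sequences, stored as the list of regions that
/// are unchanged across all inputs.
///
/// Illustrative usage, based on the tests below:
///
/// ```ignore
/// let diff = Diff::by_word(["a b c", "a XX c", "a b "]);
/// for hunk in diff.hunks() {
///     match hunk.kind {
///         DiffHunkKind::Matching => { /* hunk.contents compare equal in all inputs */ }
///         DiffHunkKind::Different => { /* hunk.contents differ between inputs */ }
///     }
/// }
/// ```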
#[derive(Clone, Debug)]
pub struct Diff<'input> {
    base_input: &'input BStr,
    other_inputs: SmallVec<[&'input BStr; 1]>,
    unchanged_regions: Vec<UnchangedRange>,
}

impl<'input> Diff<'input> {
    pub fn for_tokenizer<T: AsRef<[u8]> + ?Sized + 'input>(
        inputs: impl IntoIterator<Item = &'input T>,
        tokenizer: impl Fn(&[u8]) -> Vec<Range<usize>>,
        compare: impl CompareBytes,
    ) -> Self {
        let mut inputs = inputs.into_iter().map(BStr::new);
        let base_input = inputs.next().expect("inputs must not be empty");
        let other_inputs: SmallVec<[&BStr; 1]> = inputs.collect();
        let base_token_ranges: Vec<Range<usize>>;
        let other_token_ranges: Vec<Vec<Range<usize>>>;
        if base_input.is_empty() || other_inputs.iter().any(|input| input.is_empty()) {
            base_token_ranges = vec![];
            other_token_ranges = iter::repeat(vec![]).take(other_inputs.len()).collect();
        } else {
            base_token_ranges = tokenizer(base_input);
            other_token_ranges = other_inputs
                .iter()
                .map(|other_input| tokenizer(other_input))
                .collect();
        }
        Self::with_inputs_and_token_ranges(
            base_input,
            other_inputs,
            &base_token_ranges,
            &other_token_ranges,
            compare,
        )
    }

    fn with_inputs_and_token_ranges(
        base_input: &'input BStr,
        other_inputs: SmallVec<[&'input BStr; 1]>,
        base_token_ranges: &[Range<usize>],
        other_token_ranges: &[Vec<Range<usize>>],
        compare: impl CompareBytes,
    ) -> Self {
        assert_eq!(other_inputs.len(), other_token_ranges.len());
        let comp = WordComparator::new(compare);
        let base_source = DiffSource::new(base_input, base_token_ranges, &comp);
        let other_sources = iter::zip(&other_inputs, other_token_ranges)
            .map(|(input, token_ranges)| DiffSource::new(input, token_ranges, &comp))
            .collect_vec();
        let unchanged_regions = match &*other_sources {
            [] => {
                let whole_range = UnchangedRange {
                    base: 0..base_source.text.len(),
                    others: smallvec![],
                };
                vec![whole_range]
            }
            [first_other_source, tail_other_sources @ ..] => {
                let mut unchanged_regions = Vec::new();
                unchanged_regions.push(UnchangedRange {
                    base: 0..0,
                    others: smallvec![0..0; other_inputs.len()],
                });
                let mut first_positions = Vec::new();
                collect_unchanged_words(
                    &mut first_positions,
                    &base_source.local(),
                    &first_other_source.local(),
                    &comp,
                );
                if tail_other_sources.is_empty() {
                    unchanged_regions.extend(first_positions.iter().map(
                        |&(base_pos, other_pos)| {
                            UnchangedRange::from_word_positions(
                                &base_source,
                                &other_sources,
                                base_pos,
                                &[other_pos],
                            )
                        },
                    ));
                } else {
                    let first_positions = first_positions
                        .iter()
                        .map(|&(base_pos, other_pos)| (base_pos, vec![other_pos]))
                        .collect();
                    let intersected_positions = tail_other_sources.iter().fold(
                        first_positions,
                        |current_positions, other_source| {
                            let mut new_positions = Vec::new();
                            collect_unchanged_words(
                                &mut new_positions,
                                &base_source.local(),
                                &other_source.local(),
                                &comp,
                            );
                            intersect_unchanged_words(current_positions, &new_positions)
                        },
                    );
                    unchanged_regions.extend(intersected_positions.iter().map(
                        |(base_pos, other_positions)| {
                            UnchangedRange::from_word_positions(
                                &base_source,
                                &other_sources,
                                *base_pos,
                                other_positions,
                            )
                        },
                    ));
                };
                unchanged_regions.push(UnchangedRange {
                    base: base_input.len()..base_input.len(),
                    others: other_inputs
                        .iter()
                        .map(|input| input.len()..input.len())
                        .collect(),
                });
                unchanged_regions
            }
        };

        let mut diff = Self {
            base_input,
            other_inputs,
            unchanged_regions,
        };
        diff.compact_unchanged_regions();
        diff
    }

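    /// Compares `inputs` without tokenizing them; no matching regions are
    /// detected, so non-empty inputs produce a single "different" hunk.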
    pub fn unrefined<T: AsRef<[u8]> + ?Sized + 'input>(
        inputs: impl IntoIterator<Item = &'input T>,
    ) -> Self {
        Diff::for_tokenizer(inputs, |_| vec![], CompareBytesExactly)
    }

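    /// Compares `inputs` line by line.
    ///
    /// For example (cf. `test_diff_for_tokenizer_compacted` below), diffing
    /// `"a\nb\nc\nd\ne\nf\ng"` against `"a\nb\nc\nX\ne\nf\ng"` yields matching
    /// hunks for the common leading and trailing lines and a differing hunk
    /// for `["d\n", "X\n"]`.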
    pub fn by_line<T: AsRef<[u8]> + ?Sized + 'input>(
        inputs: impl IntoIterator<Item = &'input T>,
    ) -> Self {
        Diff::for_tokenizer(inputs, find_line_ranges, CompareBytesExactly)
    }

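    /// Compares `inputs` word by word, then refines the changed regions at
    /// the level of individual non-word bytes.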
    pub fn by_word<T: AsRef<[u8]> + ?Sized + 'input>(
        inputs: impl IntoIterator<Item = &'input T>,
    ) -> Self {
        let mut diff = Diff::for_tokenizer(inputs, find_word_ranges, CompareBytesExactly);
        diff.refine_changed_regions(find_nonword_ranges, CompareBytesExactly);
        diff
    }

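    /// Iterates over the matching and differing hunks of the diff in input
    /// order.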
    pub fn hunks(&self) -> DiffHunkIterator<'_, 'input> {
        let ranges = self.hunk_ranges();
        DiffHunkIterator { diff: self, ranges }
    }

    pub fn hunk_ranges(&self) -> DiffHunkRangeIterator<'_> {
        DiffHunkRangeIterator::new(self)
    }

    fn hunk_at<'a>(&'a self, range: &'a UnchangedRange) -> impl Iterator<Item = &'input BStr> + 'a {
        itertools::chain(
            iter::once(&self.base_input[range.base.clone()]),
            iter::zip(&self.other_inputs, &range.others).map(|(input, r)| &input[r.clone()]),
        )
    }

    fn hunk_between<'a>(
        &'a self,
        previous: &'a UnchangedRange,
        current: &'a UnchangedRange,
    ) -> impl Iterator<Item = &'input BStr> + 'a {
        itertools::chain(
            iter::once(&self.base_input[previous.base.end..current.base.start]),
            itertools::izip!(&self.other_inputs, &previous.others, &current.others)
                .map(|(input, prev, cur)| &input[prev.end..cur.start]),
        )
    }

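    /// Re-runs the diff on each changed region using a finer-grained
    /// `tokenizer` and `compare`, splitting previously changed regions into
    /// smaller matching and differing parts.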
    pub fn refine_changed_regions(
        &mut self,
        tokenizer: impl Fn(&[u8]) -> Vec<Range<usize>>,
        compare: impl CompareBytes,
    ) {
        let mut new_unchanged_ranges = vec![self.unchanged_regions[0].clone()];
        for window in self.unchanged_regions.windows(2) {
            let [previous, current]: &[_; 2] = window.try_into().unwrap();
            let refined_diff =
                Diff::for_tokenizer(self.hunk_between(previous, current), &tokenizer, &compare);
            for refined in &refined_diff.unchanged_regions {
                let new_base_start = refined.base.start + previous.base.end;
                let new_base_end = refined.base.end + previous.base.end;
                let new_others = iter::zip(&refined.others, &previous.others)
                    .map(|(refi, prev)| (refi.start + prev.end)..(refi.end + prev.end))
                    .collect();
                new_unchanged_ranges.push(UnchangedRange {
                    base: new_base_start..new_base_end,
                    others: new_others,
                });
            }
            new_unchanged_ranges.push(current.clone());
        }
        self.unchanged_regions = new_unchanged_ranges;
        self.compact_unchanged_regions();
    }

    fn compact_unchanged_regions(&mut self) {
        let mut compacted = vec![];
        let mut maybe_previous: Option<UnchangedRange> = None;
        for current in &self.unchanged_regions {
            if let Some(previous) = maybe_previous {
                if previous.base.end == current.base.start
                    && iter::zip(&previous.others, &current.others)
                        .all(|(prev, cur)| prev.end == cur.start)
                {
                    maybe_previous = Some(UnchangedRange {
                        base: previous.base.start..current.base.end,
                        others: iter::zip(&previous.others, &current.others)
                            .map(|(prev, cur)| prev.start..cur.end)
                            .collect(),
                    });
                    continue;
                }
                compacted.push(previous);
            }
            maybe_previous = Some(current.clone());
        }
        if let Some(previous) = maybe_previous {
            compacted.push(previous);
        }
        self.unchanged_regions = compacted;
    }
}

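/// A region of the diffed inputs, either matching across all inputs or
/// differing between them, along with its content from each input.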
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct DiffHunk<'input> {
    pub kind: DiffHunkKind,
    pub contents: DiffHunkContentVec<'input>,
}

impl<'input> DiffHunk<'input> {
    pub fn matching<T: AsRef<[u8]> + ?Sized + 'input>(
        contents: impl IntoIterator<Item = &'input T>,
    ) -> Self {
        DiffHunk {
            kind: DiffHunkKind::Matching,
            contents: contents.into_iter().map(BStr::new).collect(),
        }
    }

    pub fn different<T: AsRef<[u8]> + ?Sized + 'input>(
        contents: impl IntoIterator<Item = &'input T>,
    ) -> Self {
        DiffHunk {
            kind: DiffHunkKind::Different,
            contents: contents.into_iter().map(BStr::new).collect(),
        }
    }
}

#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum DiffHunkKind {
    Matching,
    Different,
}

pub type DiffHunkContentVec<'input> = SmallVec<[&'input BStr; 2]>;

#[derive(Clone, Debug)]
pub struct DiffHunkIterator<'diff, 'input> {
    diff: &'diff Diff<'input>,
    ranges: DiffHunkRangeIterator<'diff>,
}

impl<'input> Iterator for DiffHunkIterator<'_, 'input> {
    type Item = DiffHunk<'input>;

    fn next(&mut self) -> Option<Self::Item> {
        self.ranges.next_with(
            |previous| {
                let contents = self.diff.hunk_at(previous).collect();
                let kind = DiffHunkKind::Matching;
                DiffHunk { kind, contents }
            },
            |previous, current| {
                let contents: DiffHunkContentVec =
                    self.diff.hunk_between(previous, current).collect();
                debug_assert!(
                    contents.iter().any(|content| !content.is_empty()),
                    "unchanged regions should have been compacted"
                );
                let kind = DiffHunkKind::Different;
                DiffHunk { kind, contents }
            },
        )
    }
}

#[derive(Clone, Debug, Eq, PartialEq)]
pub struct DiffHunkRange {
    pub kind: DiffHunkKind,
    pub ranges: DiffHunkRangeVec,
}

pub type DiffHunkRangeVec = SmallVec<[Range<usize>; 2]>;

#[derive(Clone, Debug)]
pub struct DiffHunkRangeIterator<'diff> {
    previous: &'diff UnchangedRange,
    unchanged_emitted: bool,
    unchanged_iter: slice::Iter<'diff, UnchangedRange>,
}

impl<'diff> DiffHunkRangeIterator<'diff> {
    fn new(diff: &'diff Diff) -> Self {
        let mut unchanged_iter = diff.unchanged_regions.iter();
        let previous = unchanged_iter.next().unwrap();
        DiffHunkRangeIterator {
            previous,
            unchanged_emitted: previous.is_all_empty(),
            unchanged_iter,
        }
    }

    fn next_with<T>(
        &mut self,
        hunk_at: impl FnOnce(&UnchangedRange) -> T,
        hunk_between: impl FnOnce(&UnchangedRange, &UnchangedRange) -> T,
    ) -> Option<T> {
        if !self.unchanged_emitted {
            self.unchanged_emitted = true;
            return Some(hunk_at(self.previous));
        }
        let current = self.unchanged_iter.next()?;
        let hunk = hunk_between(self.previous, current);
        self.previous = current;
        self.unchanged_emitted = self.previous.is_all_empty();
        Some(hunk)
    }
}

impl Iterator for DiffHunkRangeIterator<'_> {
    type Item = DiffHunkRange;

    fn next(&mut self) -> Option<Self::Item> {
        self.next_with(
            |previous| {
                let ranges = itertools::chain(iter::once(&previous.base), &previous.others)
                    .cloned()
                    .collect();
                let kind = DiffHunkKind::Matching;
                DiffHunkRange { kind, ranges }
            },
            |previous, current| {
                let ranges: DiffHunkRangeVec = itertools::chain(
                    iter::once(previous.base.end..current.base.start),
                    iter::zip(&previous.others, &current.others)
                        .map(|(prev, cur)| prev.end..cur.start),
                )
                .collect();
                debug_assert!(
                    ranges.iter().any(|range| !range.is_empty()),
                    "unchanged regions should have been compacted"
                );
                let kind = DiffHunkKind::Different;
                DiffHunkRange { kind, ranges }
            },
        )
    }
}

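/// Diffs `inputs` line by line, then refines the changed regions at the word
/// and non-word level.
///
/// Illustrative usage, matching `test_diff_two_inputs_one_different` below:
///
/// ```ignore
/// let hunks = diff(["a b c", "a X c"]);
/// assert_eq!(hunks[0], DiffHunk::matching(["a ", "a "]));
/// assert_eq!(hunks[1], DiffHunk::different(["b", "X"]));
/// assert_eq!(hunks[2], DiffHunk::matching([" c", " c"]));
/// ```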
pub fn diff<'a, T: AsRef<[u8]> + ?Sized + 'a>(
    inputs: impl IntoIterator<Item = &'a T>,
) -> Vec<DiffHunk<'a>> {
    let mut diff = Diff::for_tokenizer(inputs, find_line_ranges, CompareBytesExactly);
    diff.refine_changed_regions(find_word_ranges, CompareBytesExactly);
    diff.refine_changed_regions(find_nonword_ranges, CompareBytesExactly);
    diff.hunks().collect()
}

#[cfg(test)]
mod tests {
    use super::*;

    fn no_ranges() -> Vec<Range<usize>> {
        vec![]
    }

    #[test]
    fn test_find_line_ranges_empty() {
        assert_eq!(find_line_ranges(b""), no_ranges());
    }

    #[test]
    fn test_find_line_ranges_blank_line() {
        assert_eq!(find_line_ranges(b"\n"), vec![0..1]);
    }

    #[test]
    fn test_find_line_ranges_missing_newline_at_eof() {
        assert_eq!(find_line_ranges(b"foo"), vec![0..3]);
    }

    #[test]
    fn test_find_line_ranges_multiple_lines() {
        assert_eq!(find_line_ranges(b"a\nbb\nccc\n"), vec![0..2, 2..5, 5..9]);
    }

    #[test]
    fn test_find_word_ranges_empty() {
        assert_eq!(find_word_ranges(b""), no_ranges());
    }

    #[test]
    fn test_find_word_ranges_single_word() {
        assert_eq!(find_word_ranges(b"Abc"), vec![0..3]);
    }

    #[test]
    fn test_find_word_ranges_no_word() {
        assert_eq!(find_word_ranges(b"+-*/"), no_ranges());
    }

    #[test]
    fn test_find_word_ranges_word_then_non_word() {
        assert_eq!(find_word_ranges(b"Abc "), vec![0..3]);
    }

    #[test]
    fn test_find_word_ranges_non_word_then_word() {
        assert_eq!(find_word_ranges(b"   Abc"), vec![3..6]);
    }

    #[test]
    fn test_find_word_ranges_multibyte() {
        assert_eq!(find_word_ranges("⊢".as_bytes()), vec![0..3]);
    }

    #[test]
    fn test_find_lcs_empty() {
        let empty: Vec<(usize, usize)> = vec![];
        assert_eq!(find_lcs(&[]), empty);
    }

    #[test]
    fn test_find_lcs_single_element() {
        assert_eq!(find_lcs(&[0]), vec![(0, 0)]);
    }

    #[test]
    fn test_find_lcs_in_order() {
        assert_eq!(find_lcs(&[0, 1, 2]), vec![(0, 0), (1, 1), (2, 2)]);
    }

    #[test]
    fn test_find_lcs_reverse_order() {
        assert_eq!(find_lcs(&[2, 1, 0]), vec![(2, 0)]);
    }

    #[test]
    fn test_find_lcs_two_swapped() {
        assert_eq!(
            find_lcs(&[0, 1, 4, 3, 2, 5, 6]),
            vec![(0, 0), (1, 1), (2, 4), (5, 5), (6, 6)]
        );
    }

    #[test]
    fn test_find_lcs_element_moved_earlier() {
        assert_eq!(
            find_lcs(&[0, 1, 4, 2, 3, 5, 6]),
            vec![(0, 0), (1, 1), (2, 3), (3, 4), (5, 5), (6, 6)]
        );
    }

    #[test]
    fn test_find_lcs_element_moved_later() {
        assert_eq!(
            find_lcs(&[0, 1, 3, 4, 2, 5, 6]),
            vec![(0, 0), (1, 1), (3, 2), (4, 3), (5, 5), (6, 6)]
        );
    }

    #[test]
    fn test_find_lcs_interleaved_longest_chains() {
        assert_eq!(
            find_lcs(&[0, 4, 2, 9, 6, 5, 1, 3, 7, 8]),
            vec![(0, 0), (1, 6), (3, 7), (7, 8), (8, 9)]
        );
    }

    #[test]
    fn test_find_word_ranges_many_words() {
        assert_eq!(
            find_word_ranges(b"fn find_words(text: &[u8])"),
            vec![0..2, 3..13, 14..18, 22..24]
        );
    }

    #[test]
    fn test_compare_bytes_ignore_all_whitespace() {
        let comp = WordComparator::new(CompareBytesIgnoreAllWhitespace);
        let hash = |data: &[u8]| comp.hash_one(data);

        assert!(comp.eq(b"", b""));
        assert!(comp.eq(b"", b" "));
        assert!(comp.eq(b"\t", b"\r"));
        assert_eq!(hash(b""), hash(b""));
        assert_eq!(hash(b""), hash(b" "));
        assert_eq!(hash(b""), hash(b"\t"));
        assert_eq!(hash(b""), hash(b"\r"));

        assert!(comp.eq(b"ab", b" a b\t"));
        assert_eq!(hash(b"ab"), hash(b" a b\t"));

        assert!(!comp.eq(b"a", b""));
        assert!(!comp.eq(b"a", b" "));
        assert!(!comp.eq(b"a", b"ab"));
        assert!(!comp.eq(b"ab", b"ba"));
    }

    #[test]
    fn test_compare_bytes_ignore_whitespace_amount() {
        let comp = WordComparator::new(CompareBytesIgnoreWhitespaceAmount);
        let hash = |data: &[u8]| comp.hash_one(data);

        assert!(comp.eq(b"", b""));
        assert!(comp.eq(b"\n", b" \n"));
        assert!(comp.eq(b"\t", b"\r"));
        assert_eq!(hash(b""), hash(b""));
        assert_eq!(hash(b" "), hash(b"\n"));
        assert_eq!(hash(b" "), hash(b" \n"));
        assert_eq!(hash(b" "), hash(b"\t"));
        assert_eq!(hash(b" "), hash(b"\r"));

        assert!(comp.eq(b"a b c\n", b"a b\tc\r\n"));
        assert_eq!(hash(b"a b c\n"), hash(b"a b\tc\r\n"));

        assert!(!comp.eq(b"", b" "));
        assert!(!comp.eq(b"a", b""));
        assert!(!comp.eq(b"a", b" "));
        assert!(!comp.eq(b"a", b"a "));
        assert!(!comp.eq(b"a", b" a"));
        assert!(!comp.eq(b"a", b"ab"));
        assert!(!comp.eq(b"ab", b"ba"));
        assert!(!comp.eq(b"ab", b"a b"));
    }

    fn unchanged_ranges(
        (left_text, left_ranges): (&[u8], &[Range<usize>]),
        (right_text, right_ranges): (&[u8], &[Range<usize>]),
    ) -> Vec<(Range<usize>, Range<usize>)> {
        let comp = WordComparator::new(CompareBytesExactly);
        let left = DiffSource::new(left_text, left_ranges, &comp);
        let right = DiffSource::new(right_text, right_ranges, &comp);
        let mut positions = Vec::new();
        collect_unchanged_words(&mut positions, &left.local(), &right.local(), &comp);
        positions
            .into_iter()
            .map(|(left_pos, right_pos)| (left.range_at(left_pos), right.range_at(right_pos)))
            .collect()
    }

    #[test]
    fn test_unchanged_ranges_insert_in_middle() {
        assert_eq!(
            unchanged_ranges(
                (b"a b b c", &[0..1, 2..3, 4..5, 6..7]),
                (b"a b X b c", &[0..1, 2..3, 4..5, 6..7, 8..9]),
            ),
            vec![(0..1, 0..1), (2..3, 2..3), (4..5, 6..7), (6..7, 8..9)]
        );
    }

    #[test]
    fn test_unchanged_ranges_non_unique_removed() {
        assert_eq!(
            unchanged_ranges(
                (b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
                (b"a b a c", &[0..1, 2..3, 4..5, 6..7]),
            ),
            vec![(0..1, 0..1)]
        );
        assert_eq!(
            unchanged_ranges(
                (b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
                (b"b a c a", &[0..1, 2..3, 4..5, 6..7]),
            ),
            vec![(6..7, 6..7)]
        );
        assert_eq!(
            unchanged_ranges(
                (b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
                (b"b a a c", &[0..1, 2..3, 4..5, 6..7]),
            ),
            vec![]
        );
        assert_eq!(
            unchanged_ranges(
                (b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
                (b"a b c a", &[0..1, 2..3, 4..5, 6..7]),
            ),
            vec![(0..1, 0..1), (6..7, 6..7)]
        );
    }

    #[test]
    fn test_unchanged_ranges_non_unique_added() {
        assert_eq!(
            unchanged_ranges(
                (b"a b a c", &[0..1, 2..3, 4..5, 6..7]),
                (b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
            ),
            vec![(0..1, 0..1)]
        );
        assert_eq!(
            unchanged_ranges(
                (b"b a c a", &[0..1, 2..3, 4..5, 6..7]),
                (b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
            ),
            vec![(6..7, 6..7)]
        );
        assert_eq!(
            unchanged_ranges(
                (b"b a a c", &[0..1, 2..3, 4..5, 6..7]),
                (b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
            ),
            vec![]
        );
        assert_eq!(
            unchanged_ranges(
                (b"a b c a", &[0..1, 2..3, 4..5, 6..7]),
                (b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
            ),
            vec![(0..1, 0..1), (6..7, 6..7)]
        );
    }

    #[test]
    fn test_unchanged_ranges_recursion_needed() {
        assert_eq!(
            unchanged_ranges(
                (b"a b | b", &[0..1, 2..3, 4..5, 6..7]),
                (b"b c d |", &[0..1, 2..3, 4..5, 6..7]),
            ),
            vec![(2..3, 0..1), (4..5, 6..7)]
        );
        assert_eq!(
            unchanged_ranges(
                (b"| b c d", &[0..1, 2..3, 4..5, 6..7]),
                (b"b | a b", &[0..1, 2..3, 4..5, 6..7]),
            ),
            vec![(0..1, 2..3), (2..3, 6..7)]
        );
        assert_eq!(
            unchanged_ranges(
                (b"| b c |", &[0..1, 2..3, 4..5, 6..7]),
                (b"| b b |", &[0..1, 2..3, 4..5, 6..7]),
            ),
            vec![(0..1, 0..1), (2..3, 2..3), (6..7, 6..7)]
        );
        assert_eq!(
            unchanged_ranges(
                (b"| c c |", &[0..1, 2..3, 4..5, 6..7]),
                (b"| b c |", &[0..1, 2..3, 4..5, 6..7]),
            ),
            vec![(0..1, 0..1), (4..5, 4..5), (6..7, 6..7)]
        );
        assert_eq!(
            unchanged_ranges(
                (b"a b c | a", &[0..1, 2..3, 4..5, 6..7, 8..9]),
                (b"b a b |", &[0..1, 2..3, 4..5, 6..7]),
            ),
            vec![(0..1, 2..3), (2..3, 4..5), (6..7, 6..7)]
        );
        assert_eq!(
            unchanged_ranges(
                (b"| b a b", &[0..1, 2..3, 4..5, 6..7]),
                (b"a | a b c", &[0..1, 2..3, 4..5, 6..7, 8..9]),
            ),
            vec![(0..1, 2..3), (4..5, 4..5), (6..7, 6..7)]
        );
    }

    #[test]
    fn test_diff_single_input() {
        assert_eq!(diff(["abc"]), vec![DiffHunk::matching(["abc"])]);
    }

    #[test]
    fn test_diff_some_empty_inputs() {
        assert_eq!(diff([""]), vec![]);
        assert_eq!(diff(["", ""]), vec![]);
        assert_eq!(diff(["", "", ""]), vec![]);

        assert_eq!(diff(["a b", ""]), vec![DiffHunk::different(["a b", ""])]);
        assert_eq!(diff(["", "a b"]), vec![DiffHunk::different(["", "a b"])]);

        assert_eq!(
            diff(["a b", "", "a b"]),
            vec![DiffHunk::different(["a b", "", "a b"])]
        );
        assert_eq!(
            diff(["", "a b", "a b"]),
            vec![DiffHunk::different(["", "a b", "a b"])]
        );

        assert_eq!(
            diff(["a b", "", ""]),
            vec![DiffHunk::different(["a b", "", ""])]
        );
        assert_eq!(
            diff(["", "a b", ""]),
            vec![DiffHunk::different(["", "a b", ""])]
        );
    }

    #[test]
    fn test_diff_two_inputs_one_different() {
        assert_eq!(
            diff(["a b c", "a X c"]),
            vec![
                DiffHunk::matching(["a "].repeat(2)),
                DiffHunk::different(["b", "X"]),
                DiffHunk::matching([" c"].repeat(2)),
            ]
        );
    }

    #[test]
    fn test_diff_multiple_inputs_one_different() {
        assert_eq!(
            diff(["a b c", "a X c", "a b c"]),
            vec![
                DiffHunk::matching(["a "].repeat(3)),
                DiffHunk::different(["b", "X", "b"]),
                DiffHunk::matching([" c"].repeat(3)),
            ]
        );
    }

    #[test]
    fn test_diff_multiple_inputs_all_different() {
        assert_eq!(
            diff(["a b c", "a X c", "a c X"]),
            vec![
                DiffHunk::matching(["a "].repeat(3)),
                DiffHunk::different(["b ", "X ", ""]),
                DiffHunk::matching(["c"].repeat(3)),
                DiffHunk::different(["", "", " X"]),
            ]
        );
    }

    #[test]
    fn test_diff_for_tokenizer_compacted() {
        let diff = Diff::for_tokenizer(
            ["a\nb\nc\nd\ne\nf\ng", "a\nb\nc\nX\ne\nf\ng"],
            find_line_ranges,
            CompareBytesExactly,
        );
        assert_eq!(
            diff.hunks().collect_vec(),
            vec![
                DiffHunk::matching(["a\nb\nc\n"].repeat(2)),
                DiffHunk::different(["d\n", "X\n"]),
                DiffHunk::matching(["e\nf\ng"].repeat(2)),
            ]
        );
    }

    #[test]
    fn test_diff_nothing_in_common() {
        assert_eq!(
            diff(["aaa", "bb"]),
            vec![DiffHunk::different(["aaa", "bb"])]
        );
    }

    #[test]
    fn test_diff_insert_in_middle() {
        assert_eq!(
            diff(["a z", "a S z"]),
            vec![
                DiffHunk::matching(["a "].repeat(2)),
                DiffHunk::different(["", "S "]),
                DiffHunk::matching(["z"].repeat(2)),
            ]
        );
    }

    #[test]
    fn test_diff_no_unique_middle_flips() {
        assert_eq!(
            diff(["a R R S S z", "a S S R R z"]),
            vec![
                DiffHunk::matching(["a "].repeat(2)),
                DiffHunk::different(["R R ", ""]),
                DiffHunk::matching(["S S "].repeat(2)),
                DiffHunk::different(["", "R R "]),
                DiffHunk::matching(["z"].repeat(2))
            ],
        );
    }

    #[test]
    fn test_diff_recursion_needed() {
        assert_eq!(
            diff([
                "a q x q y q z q b q y q x q c",
                "a r r x q y z q b y q x r r c",
            ]),
            vec![
                DiffHunk::matching(["a "].repeat(2)),
                DiffHunk::different(["q", "r"]),
                DiffHunk::matching([" "].repeat(2)),
                DiffHunk::different(["", "r "]),
                DiffHunk::matching(["x q y "].repeat(2)),
                DiffHunk::different(["q ", ""]),
                DiffHunk::matching(["z q b "].repeat(2)),
                DiffHunk::different(["q ", ""]),
                DiffHunk::matching(["y q x "].repeat(2)),
                DiffHunk::different(["q", "r"]),
                DiffHunk::matching([" "].repeat(2)),
                DiffHunk::different(["", "r "]),
                DiffHunk::matching(["c"].repeat(2)),
            ]
        );
    }

    #[test]
    fn test_diff_ignore_all_whitespace() {
        fn diff(inputs: [&str; 2]) -> Vec<DiffHunk<'_>> {
            let diff =
                Diff::for_tokenizer(inputs, find_line_ranges, CompareBytesIgnoreAllWhitespace);
            diff.hunks().collect()
        }

        assert_eq!(diff(["", "\n"]), vec![DiffHunk::different(["", "\n"])]);
        assert_eq!(
            diff(["a\n", " a\r\n"]),
            vec![DiffHunk::matching(["a\n", " a\r\n"])]
        );
        assert_eq!(
            diff(["a\n", " a\nb"]),
            vec![
                DiffHunk::matching(["a\n", " a\n"]),
                DiffHunk::different(["", "b"]),
            ]
        );

        assert_eq!(
            diff(["a\nc\n", " a\n a\n"]),
            vec![
                DiffHunk::matching(["a\n", " a\n"]),
                DiffHunk::different(["c\n", " a\n"]),
            ]
        );
        assert_eq!(
            diff(["c\na\n", " a\n a\n"]),
            vec![
                DiffHunk::different(["c\n", " a\n"]),
                DiffHunk::matching(["a\n", " a\n"]),
            ]
        );
    }

    #[test]
    fn test_diff_ignore_whitespace_amount() {
        fn diff(inputs: [&str; 2]) -> Vec<DiffHunk<'_>> {
            let diff =
                Diff::for_tokenizer(inputs, find_line_ranges, CompareBytesIgnoreWhitespaceAmount);
            diff.hunks().collect()
        }

        assert_eq!(diff(["", "\n"]), vec![DiffHunk::different(["", "\n"])]);
        assert_eq!(
            diff(["a\n", "a\r\n"]),
            vec![DiffHunk::matching(["a\n", "a\r\n"])]
        );
        assert_eq!(
            diff(["a\n", " a\n"]),
            vec![DiffHunk::different(["a\n", " a\n"])]
        );
        assert_eq!(
            diff(["a\n", "a \nb"]),
            vec![
                DiffHunk::matching(["a\n", "a \n"]),
                DiffHunk::different(["", "b"]),
            ]
        );
    }

    #[test]
    fn test_diff_hunk_iterator() {
        let diff = Diff::by_word(["a b c", "a XX c", "a b "]);
        assert_eq!(
            diff.hunks().collect_vec(),
            vec![
                DiffHunk::matching(["a "].repeat(3)),
                DiffHunk::different(["b", "XX", "b"]),
                DiffHunk::matching([" "].repeat(3)),
                DiffHunk::different(["c", "c", ""]),
            ]
        );
        assert_eq!(
            diff.hunk_ranges().collect_vec(),
            vec![
                DiffHunkRange {
                    kind: DiffHunkKind::Matching,
                    ranges: smallvec![0..2, 0..2, 0..2],
                },
                DiffHunkRange {
                    kind: DiffHunkKind::Different,
                    ranges: smallvec![2..3, 2..4, 2..3],
                },
                DiffHunkRange {
                    kind: DiffHunkKind::Matching,
                    ranges: smallvec![3..4, 4..5, 3..4],
                },
                DiffHunkRange {
                    kind: DiffHunkKind::Different,
                    ranges: smallvec![4..5, 5..6, 4..4],
                },
            ]
        );
    }

    #[test]
    fn test_diff_real_case_write_fmt() {
        #[rustfmt::skip]
        assert_eq!(
            diff([
                " pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) {\n self.styler().write_fmt(fmt).unwrap()\n",
                " pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) -> io::Result<()> {\n self.styler().write_fmt(fmt)\n"
            ]),
            vec![
                DiffHunk::matching([" pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) "].repeat(2)),
                DiffHunk::different(["", "-> io::Result<()> "]),
                DiffHunk::matching(["{\n self.styler().write_fmt(fmt)"].repeat(2)),
                DiffHunk::different([".unwrap()", ""]),
                DiffHunk::matching(["\n"].repeat(2))
            ]
        );
    }

    #[test]
    fn test_diff_real_case_gitgit_read_tree_c() {
        #[rustfmt::skip]
        assert_eq!(
            diff([
                r##"/*
 * GIT - The information manager from hell
 *
 * Copyright (C) Linus Torvalds, 2005
 */
#include "#cache.h"

static int unpack(unsigned char *sha1)
{
	void *buffer;
	unsigned long size;
	char type[20];

	buffer = read_sha1_file(sha1, type, &size);
	if (!buffer)
		usage("unable to read sha1 file");
	if (strcmp(type, "tree"))
		usage("expected a 'tree' node");
	while (size) {
		int len = strlen(buffer)+1;
		unsigned char *sha1 = buffer + len;
		char *path = strchr(buffer, ' ')+1;
		unsigned int mode;
		if (size < len + 20 || sscanf(buffer, "%o", &mode) != 1)
			usage("corrupt 'tree' file");
		buffer = sha1 + 20;
		size -= len + 20;
		printf("%o %s (%s)\n", mode, path, sha1_to_hex(sha1));
	}
	return 0;
}

int main(int argc, char **argv)
{
	int fd;
	unsigned char sha1[20];

	if (argc != 2)
		usage("read-tree <key>");
	if (get_sha1_hex(argv[1], sha1) < 0)
		usage("read-tree <key>");
	sha1_file_directory = getenv(DB_ENVIRONMENT);
	if (!sha1_file_directory)
		sha1_file_directory = DEFAULT_DB_ENVIRONMENT;
	if (unpack(sha1) < 0)
		usage("unpack failed");
	return 0;
}
"##,
                r##"/*
 * GIT - The information manager from hell
 *
 * Copyright (C) Linus Torvalds, 2005
 */
#include "#cache.h"

static void create_directories(const char *path)
{
	int len = strlen(path);
	char *buf = malloc(len + 1);
	const char *slash = path;

	while ((slash = strchr(slash+1, '/')) != NULL) {
		len = slash - path;
		memcpy(buf, path, len);
		buf[len] = 0;
		mkdir(buf, 0700);
	}
}

static int create_file(const char *path)
{
	int fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);
	if (fd < 0) {
		if (errno == ENOENT) {
			create_directories(path);
			fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);
		}
	}
	return fd;
}

static int unpack(unsigned char *sha1)
{
	void *buffer;
	unsigned long size;
	char type[20];

	buffer = read_sha1_file(sha1, type, &size);
	if (!buffer)
		usage("unable to read sha1 file");
	if (strcmp(type, "tree"))
		usage("expected a 'tree' node");
	while (size) {
		int len = strlen(buffer)+1;
		unsigned char *sha1 = buffer + len;
		char *path = strchr(buffer, ' ')+1;
		char *data;
		unsigned long filesize;
		unsigned int mode;
		int fd;

		if (size < len + 20 || sscanf(buffer, "%o", &mode) != 1)
			usage("corrupt 'tree' file");
		buffer = sha1 + 20;
		size -= len + 20;
		data = read_sha1_file(sha1, type, &filesize);
		if (!data || strcmp(type, "blob"))
			usage("tree file refers to bad file data");
		fd = create_file(path);
		if (fd < 0)
			usage("unable to create file");
		if (write(fd, data, filesize) != filesize)
			usage("unable to write file");
		fchmod(fd, mode);
		close(fd);
		free(data);
	}
	return 0;
}

int main(int argc, char **argv)
{
	int fd;
	unsigned char sha1[20];

	if (argc != 2)
		usage("read-tree <key>");
	if (get_sha1_hex(argv[1], sha1) < 0)
		usage("read-tree <key>");
	sha1_file_directory = getenv(DB_ENVIRONMENT);
	if (!sha1_file_directory)
		sha1_file_directory = DEFAULT_DB_ENVIRONMENT;
	if (unpack(sha1) < 0)
		usage("unpack failed");
	return 0;
}
"##,
            ]),
            vec![
                DiffHunk::matching(["/*\n * GIT - The information manager from hell\n *\n * Copyright (C) Linus Torvalds, 2005\n */\n#include \"#cache.h\"\n\n"].repeat(2)),
                DiffHunk::different(["", "static void create_directories(const char *path)\n{\n\tint len = strlen(path);\n\tchar *buf = malloc(len + 1);\n\tconst char *slash = path;\n\n\twhile ((slash = strchr(slash+1, \'/\')) != NULL) {\n\t\tlen = slash - path;\n\t\tmemcpy(buf, path, len);\n\t\tbuf[len] = 0;\n\t\tmkdir(buf, 0700);\n\t}\n}\n\nstatic int create_file(const char *path)\n{\n\tint fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);\n\tif (fd < 0) {\n\t\tif (errno == ENOENT) {\n\t\t\tcreate_directories(path);\n\t\t\tfd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);\n\t\t}\n\t}\n\treturn fd;\n}\n\n"]),
                DiffHunk::matching(["static int unpack(unsigned char *sha1)\n{\n\tvoid *buffer;\n\tunsigned long size;\n\tchar type[20];\n\n\tbuffer = read_sha1_file(sha1, type, &size);\n\tif (!buffer)\n\t\tusage(\"unable to read sha1 file\");\n\tif (strcmp(type, \"tree\"))\n\t\tusage(\"expected a \'tree\' node\");\n\twhile (size) {\n\t\tint len = strlen(buffer)+1;\n\t\tunsigned char *sha1 = buffer + len;\n\t\tchar *path = strchr(buffer, \' \')+1;\n"].repeat(2)),
                DiffHunk::different(["", "\t\tchar *data;\n\t\tunsigned long filesize;\n"]),
                DiffHunk::matching(["\t\tunsigned int mode;\n"].repeat(2)),
                DiffHunk::different(["", "\t\tint fd;\n\n"]),
                DiffHunk::matching(["\t\tif (size < len + 20 || sscanf(buffer, \"%o\", &mode) != 1)\n\t\t\tusage(\"corrupt \'tree\' file\");\n\t\tbuffer = sha1 + 20;\n\t\tsize -= len + 20;\n\t\t"].repeat(2)),
                DiffHunk::different(["printf(\"%o %s (%s)\\n\", mode, path,", "data ="]),
                DiffHunk::matching([" "].repeat(2)),
                DiffHunk::different(["sha1_to_hex", "read_sha1_file"]),
                DiffHunk::matching(["(sha1"].repeat(2)),
                DiffHunk::different([")", ", type, &filesize);\n\t\tif (!data || strcmp(type, \"blob\"))\n\t\t\tusage(\"tree file refers to bad file data\");\n\t\tfd = create_file(path);\n\t\tif (fd < 0)\n\t\t\tusage(\"unable to create file\");\n\t\tif (write(fd, data, filesize) != filesize)\n\t\t\tusage(\"unable to write file\");\n\t\tfchmod(fd, mode);\n\t\tclose(fd);\n\t\tfree(data"]),
                DiffHunk::matching([");\n\t}\n\treturn 0;\n}\n\nint main(int argc, char **argv)\n{\n\tint fd;\n\tunsigned char sha1[20];\n\n\tif (argc != 2)\n\t\tusage(\"read-tree <key>\");\n\tif (get_sha1_hex(argv[1], sha1) < 0)\n\t\tusage(\"read-tree <key>\");\n\tsha1_file_directory = getenv(DB_ENVIRONMENT);\n\tif (!sha1_file_directory)\n\t\tsha1_file_directory = DEFAULT_DB_ENVIRONMENT;\n\tif (unpack(sha1) < 0)\n\t\tusage(\"unpack failed\");\n\treturn 0;\n}\n"].repeat(2)),
            ]
        );
    }
}