1use std::cmp::{max, min, Ordering};
16use std::collections::{BTreeMap, HashMap};
17use std::fmt::{Debug, Formatter};
18use std::ops::Range;
19use std::slice;
20
21use itertools::Itertools;
22
23use crate::nightly_shims::BTreeMapExt;
24
/// Returns the ranges of the lines in `text`.
///
/// Each range includes its terminating newline byte, if any; a final line
/// without a terminating newline is also included. Empty input yields an
/// empty vector.
pub fn find_line_ranges(text: &[u8]) -> Vec<Range<usize>> {
    let mut ranges = Vec::new();
    let mut line_start = 0;
    for (i, &b) in text.iter().enumerate() {
        if b == b'\n' {
            // The newline byte belongs to the line it terminates.
            ranges.push(line_start..i + 1);
            line_start = i + 1;
        }
    }
    if line_start < text.len() {
        // Trailing line with no newline at end of input.
        ranges.push(line_start..text.len());
    }
    ranges
}
44
/// Returns true for bytes that make up "words": ASCII letters, ASCII digits,
/// and underscore.
fn is_word_byte(b: u8) -> bool {
    b.is_ascii_alphanumeric() || b == b'_'
}
49
50pub fn find_word_ranges(text: &[u8]) -> Vec<Range<usize>> {
51 let mut word_ranges = vec![];
52 let mut word_start_pos = 0;
53 let mut in_word = false;
54 for (i, b) in text.iter().enumerate() {
55 if in_word && !is_word_byte(*b) {
56 in_word = false;
57 word_ranges.push(word_start_pos..i);
58 word_start_pos = i;
59 } else if !in_word && is_word_byte(*b) {
60 in_word = true;
61 word_start_pos = i;
62 }
63 }
64 if in_word && word_start_pos < text.len() {
65 word_ranges.push(word_start_pos..text.len());
66 }
67 word_ranges
68}
69
/// Returns a one-byte range for every non-word byte in `text` (anything that
/// is not an ASCII alphanumeric or underscore).
pub fn find_nonword_ranges(text: &[u8]) -> Vec<Range<usize>> {
    text.iter()
        .enumerate()
        .filter(|&(_, &b)| !matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_'))
        .map(|(i, _)| i..i + 1)
        .collect()
}
79
/// A histogram of the words (tokens) in a text, indexed both by word and by
/// occurrence count.
struct Histogram<'a> {
    word_to_positions: HashMap<&'a [u8], Vec<usize>>,
    count_to_words: BTreeMap<usize, Vec<&'a [u8]>>,
}

impl Histogram<'_> {
    /// Counts the words given by `ranges` into `text`. A word's position list
    /// holds the indexes (into `ranges`) where it occurs.
    fn calculate<'a>(
        text: &'a [u8],
        ranges: &[Range<usize>],
        max_occurrences: usize,
    ) -> Histogram<'a> {
        let mut word_to_positions: HashMap<&[u8], Vec<usize>> = HashMap::new();
        for (index, range) in ranges.iter().enumerate() {
            let word = &text[range.clone()];
            let positions = word_to_positions.entry(word).or_default();
            // Record one entry past the cap so callers can distinguish
            // "exactly max_occurrences" from "more than max_occurrences".
            if positions.len() <= max_occurrences {
                positions.push(index);
            }
        }
        let mut count_to_words: BTreeMap<usize, Vec<&[u8]>> = BTreeMap::new();
        for (word, positions) in &word_to_positions {
            count_to_words.entry(positions.len()).or_default().push(word);
        }
        Histogram {
            word_to_positions,
            count_to_words,
        }
    }
}
110
/// Finds a longest-common-subsequence-like chain of matches.
///
/// `input[right_pos]` is the position in the left sequence of the element at
/// `right_pos` in the right sequence. Returns the matched
/// `(left_pos, right_pos)` pairs in increasing order of both components.
///
/// NOTE(review): the inner scan breaks out as soon as a chain beats the
/// current global best, so this appears to be a greedy approximation rather
/// than a guaranteed-longest subsequence — confirm callers only rely on
/// finding "a reasonably long" common subsequence.
fn find_lcs(input: &[usize]) -> Vec<(usize, usize)> {
    if input.is_empty() {
        return vec![];
    }

    // chain[i] = (length of the best chain ending at right index i,
    //             left position of element i,
    //             previous right index in the chain, or usize::MAX if none)
    let mut chain = vec![(0, 0, 0); input.len()];
    let mut global_longest = 0;
    let mut global_longest_right_pos = 0;
    for (right_pos, &left_pos) in input.iter().enumerate() {
        let mut longest_from_here = 1;
        let mut previous_right_pos = usize::MAX;
        // Scan backwards for the best earlier chain this element can extend
        // (its left position must be smaller to keep both sides increasing).
        for i in (0..right_pos).rev() {
            let (previous_len, previous_left_pos, _) = chain[i];
            if previous_left_pos < left_pos {
                let len = previous_len + 1;
                if len > longest_from_here {
                    longest_from_here = len;
                    previous_right_pos = i;
                    if len > global_longest {
                        global_longest = len;
                        global_longest_right_pos = right_pos;
                        // Stop scanning once we have beaten the global best.
                        break;
                    }
                }
            }
        }
        chain[right_pos] = (longest_from_here, left_pos, previous_right_pos);
    }

    // Walk the chain links backwards from the globally best end point, then
    // reverse to get the pairs in increasing order.
    let mut result = vec![];
    let mut right_pos = global_longest_right_pos;
    loop {
        let (_, left_pos, previous_right_pos) = chain[right_pos];
        result.push((left_pos, right_pos));
        if previous_right_pos == usize::MAX {
            break;
        }
        right_pos = previous_right_pos;
    }
    result.reverse();

    result
}
166
/// Finds the ranges of `left` and `right` that are unchanged between them.
///
/// `left_ranges`/`right_ranges` are the tokens to compare, given as ranges
/// into `left`/`right`. Returns matching `(left range, right range)` pairs in
/// order of appearance.
///
/// Roughly follows the histogram-diff idea: match up occurrences of the
/// least-common shared words, take an LCS of those matches, then recurse into
/// the gaps between the matched tokens.
pub(crate) fn unchanged_ranges(
    left: &[u8],
    right: &[u8],
    left_ranges: &[Range<usize>],
    right_ranges: &[Range<usize>],
) -> Vec<(Range<usize>, Range<usize>)> {
    if left_ranges.is_empty() || right_ranges.is_empty() {
        return vec![];
    }

    // If even the least common word on the left side occurs more than
    // `max_occurrences` times, give up on finding anything here.
    let max_occurrences = 100;
    let mut left_histogram = Histogram::calculate(left, left_ranges, max_occurrences);
    if *left_histogram.count_to_words.first_key().unwrap() > max_occurrences {
        return vec![];
    }
    let mut right_histogram = Histogram::calculate(right, right_ranges, max_occurrences);
    // Look for the words with the lowest occurrence count on the left side
    // that also occur on the right side.
    let mut uncommon_shared_words = vec![];
    while !left_histogram.count_to_words.is_empty() && uncommon_shared_words.is_empty() {
        let left_words = left_histogram.count_to_words.pop_first_value().unwrap();
        for left_word in left_words {
            if right_histogram.word_to_positions.contains_key(left_word) {
                uncommon_shared_words.push(left_word);
            }
        }
    }
    if uncommon_shared_words.is_empty() {
        return vec![];
    }

    // Pair up the n'th occurrence of each shared word on the left with the
    // n'th occurrence on the right (ignoring surplus occurrences on either
    // side).
    let mut left_positions = vec![];
    let mut right_positions = vec![];
    for uncommon_shared_word in uncommon_shared_words {
        let left_occurrences = left_histogram
            .word_to_positions
            .get_mut(uncommon_shared_word)
            .unwrap();
        let right_occurrences = right_histogram
            .word_to_positions
            .get_mut(uncommon_shared_word)
            .unwrap();
        let shared_count = min(left_occurrences.len(), right_occurrences.len());
        for occurrence in 0..shared_count {
            left_positions.push((
                left_occurrences[occurrence],
                uncommon_shared_word,
                occurrence,
            ));
            right_positions.push((
                right_occurrences[occurrence],
                uncommon_shared_word,
                occurrence,
            ));
        }
    }
    left_positions.sort();
    right_positions.sort();
    // Express each right-side match as its index in the sorted left-side
    // matches, then find a longest common subsequence over those indexes.
    let mut left_position_map = HashMap::new();
    for (i, (_pos, word, occurrence)) in left_positions.iter().enumerate() {
        left_position_map.insert((*word, *occurrence), i);
    }
    let mut left_index_by_right_index = vec![];
    for (_pos, word, occurrence) in &right_positions {
        left_index_by_right_index.push(*left_position_map.get(&(*word, *occurrence)).unwrap());
    }

    let lcs = find_lcs(&left_index_by_right_index);

    // Emit the matched tokens in order, recursing into the runs of tokens
    // skipped between consecutive LCS entries.
    let mut result = vec![];
    let mut previous_left_position = 0;
    let mut previous_right_position = 0;
    for (left_index, right_index) in lcs {
        let left_position = left_positions[left_index].0;
        let right_position = right_positions[right_index].0;
        let skipped_left_positions = previous_left_position..left_position;
        let skipped_right_positions = previous_right_position..right_position;
        if !skipped_left_positions.is_empty() || !skipped_right_positions.is_empty() {
            for unchanged_nested_range in unchanged_ranges(
                left,
                right,
                &left_ranges[skipped_left_positions.clone()],
                &right_ranges[skipped_right_positions.clone()],
            ) {
                result.push(unchanged_nested_range);
            }
        }
        result.push((
            left_ranges[left_position].clone(),
            right_ranges[right_position].clone(),
        ));
        previous_left_position = left_position + 1;
        previous_right_position = right_position + 1;
    }
    // Also recurse into the tokens after the last match.
    let skipped_left_positions = previous_left_position..left_ranges.len();
    let skipped_right_positions = previous_right_position..right_ranges.len();
    if !skipped_left_positions.is_empty() || !skipped_right_positions.is_empty() {
        for unchanged_nested_range in unchanged_ranges(
            left,
            right,
            &left_ranges[skipped_left_positions],
            &right_ranges[skipped_right_positions],
        ) {
            result.push(unchanged_nested_range);
        }
    }

    result
}
293
/// A range of the base input that is unchanged in all other inputs.
///
/// `offsets[side]` is the (possibly negative) displacement that maps
/// `base_range` into other input number `side`.
#[derive(Clone, PartialEq, Eq, Debug)]
struct UnchangedRange {
    base_range: Range<usize>,
    offsets: Vec<isize>,
}

impl UnchangedRange {
    /// The start of this range within other input number `side`.
    fn start(&self, side: usize) -> usize {
        let offset = self.offsets[side] as usize;
        self.base_range.start.wrapping_add(offset)
    }

    /// The end of this range within other input number `side`.
    fn end(&self, side: usize) -> usize {
        let offset = self.offsets[side] as usize;
        self.base_range.end.wrapping_add(offset)
    }
}

impl PartialOrd for UnchangedRange {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for UnchangedRange {
    // Ordered by position in the base input only; offsets don't participate.
    fn cmp(&self, other: &Self) -> Ordering {
        (self.base_range.start, self.base_range.end)
            .cmp(&(other.base_range.start, other.base_range.end))
    }
}
328
/// A diff of one "base" input against any number of other inputs.
#[derive(Clone, Debug)]
pub struct Diff<'input> {
    // The first input; all comparisons are expressed relative to it.
    base_input: &'input [u8],
    // The remaining inputs, in order.
    other_inputs: Vec<&'input [u8]>,
    // Regions of `base_input` that are unchanged in every other input, with
    // per-side offsets mapping each region into the other inputs.
    // NOTE(review): appears to be kept sorted by `base_range` (compaction and
    // the merge in `refine_changed_regions()` rely on order) — confirm.
    unchanged_regions: Vec<UnchangedRange>,
}
340
/// Intersects the existing unchanged regions with a new set of unchanged
/// ranges obtained by diffing the base against one additional input.
///
/// Both sequences are consumed in order of position in the base input. Each
/// output region is the overlap of an existing region with a new range, and
/// gains one extra offset entry for the newly added input.
fn intersect_regions(
    current_ranges: Vec<UnchangedRange>,
    new_unchanged_ranges: &[(Range<usize>, Range<usize>)],
) -> Vec<UnchangedRange> {
    let mut result = vec![];
    let mut current_ranges_iter = current_ranges.into_iter().peekable();
    for (new_base_range, other_range) in new_unchanged_ranges.iter() {
        assert_eq!(new_base_range.len(), other_range.len());
        while let Some(UnchangedRange {
            base_range,
            offsets,
        }) = current_ranges_iter.peek()
        {
            // Existing region is entirely after the new range: advance to
            // the next new range.
            if base_range.start >= new_base_range.end {
                break;
            }
            // Existing region is entirely before the new range: discard it.
            if base_range.end <= new_base_range.start {
                current_ranges_iter.next();
                continue;
            }
            // Overlap: emit the intersection, appending the offset into the
            // newly added input.
            let new_start = max(base_range.start, new_base_range.start);
            let new_end = min(base_range.end, new_base_range.end);
            let mut new_offsets = offsets.clone();
            new_offsets.push(other_range.start.wrapping_sub(new_base_range.start) as isize);
            result.push(UnchangedRange {
                base_range: new_start..new_end,
                offsets: new_offsets,
            });
            // Keep the existing region around if it extends past the new
            // range; it may overlap the next new range as well.
            if base_range.end >= new_base_range.end {
                break;
            }
            current_ranges_iter.next();
        }
    }
    result
}
385
impl<'input> Diff<'input> {
    /// Diffs `inputs[0]` against each of the remaining inputs, using
    /// `tokenizer` to split every input into comparable token ranges.
    ///
    /// Panics if `inputs` is empty.
    pub fn for_tokenizer(
        inputs: &[&'input [u8]],
        tokenizer: &impl Fn(&[u8]) -> Vec<Range<usize>>,
    ) -> Self {
        assert!(!inputs.is_empty());
        let base_input = inputs[0];
        let other_inputs = inputs.iter().skip(1).copied().collect_vec();
        let base_token_ranges: Vec<Range<usize>> = tokenizer(base_input);
        let other_token_ranges: Vec<Vec<Range<usize>>> = other_inputs
            .iter()
            .map(|other_input| tokenizer(other_input))
            .collect_vec();

        // Start with the whole base input considered unchanged, then narrow
        // it by intersecting with the unchanged ranges found against each
        // other input in turn.
        let mut unchanged_regions = vec![UnchangedRange {
            base_range: 0..base_input.len(),
            offsets: vec![],
        }];
        for (i, other_token_ranges) in other_token_ranges.iter().enumerate() {
            let unchanged_diff_ranges = unchanged_ranges(
                base_input,
                other_inputs[i],
                &base_token_ranges,
                other_token_ranges,
            );
            unchanged_regions = intersect_regions(unchanged_regions, &unchanged_diff_ranges);
        }
        // Append an empty sentinel region at the end of the base input so
        // iteration also visits any trailing changed region.
        let offsets = other_inputs
            .iter()
            .map(|input| input.len().wrapping_sub(base_input.len()) as isize)
            .collect_vec();
        unchanged_regions.push(UnchangedRange {
            base_range: base_input.len()..base_input.len(),
            offsets,
        });

        let mut diff = Self {
            base_input,
            other_inputs,
            unchanged_regions,
        };
        diff.compact_unchanged_regions();
        diff
    }

    /// Creates a diff with no tokenization: the inputs are compared as
    /// opaque blobs, so everything is either fully matching or fully
    /// different.
    pub fn unrefined(inputs: &[&'input [u8]]) -> Self {
        Diff::for_tokenizer(inputs, &|_| vec![])
    }

    /// Compares the inputs line by line, then refines the changed regions at
    /// word granularity, then at non-word (single byte) granularity.
    pub fn default_refinement(inputs: &[&'input [u8]]) -> Self {
        let mut diff = Diff::for_tokenizer(inputs, &find_line_ranges);
        diff.refine_changed_regions(&find_word_ranges);
        diff.refine_changed_regions(&find_nonword_ranges);
        diff
    }

    /// Iterates over the matching and differing hunks of this diff, in input
    /// order.
    pub fn hunks<'diff>(&'diff self) -> DiffHunkIterator<'diff, 'input> {
        let previous_offsets = vec![0; self.other_inputs.len()];
        DiffHunkIterator {
            diff: self,
            previous: UnchangedRange {
                base_range: 0..0,
                offsets: previous_offsets,
            },
            unchanged_emitted: true,
            unchanged_iter: self.unchanged_regions.iter(),
        }
    }

    /// Uses the given tokenizer to find unchanged regions within each
    /// currently changed region, and merges the newly found regions into
    /// `self.unchanged_regions`.
    pub fn refine_changed_regions(&mut self, tokenizer: &impl Fn(&[u8]) -> Vec<Range<usize>>) {
        let mut previous = UnchangedRange {
            base_range: 0..0,
            offsets: vec![0; self.other_inputs.len()],
        };
        let mut new_unchanged_ranges = vec![];
        for current in self.unchanged_regions.iter() {
            // Collect the changed text between `previous` and `current` from
            // every input and diff just those slices.
            let mut slices =
                vec![&self.base_input[previous.base_range.end..current.base_range.start]];
            for i in 0..current.offsets.len() {
                let changed_range = previous.end(i)..current.start(i);
                slices.push(&self.other_inputs[i][changed_range]);
            }

            let refined_diff = Diff::for_tokenizer(&slices, tokenizer);

            // Translate the sub-diff's regions back into coordinates of the
            // full inputs.
            for UnchangedRange {
                base_range,
                offsets,
            } in refined_diff.unchanged_regions
            {
                let new_base_start = base_range.start + previous.base_range.end;
                let new_base_end = base_range.end + previous.base_range.end;
                let offsets = offsets
                    .into_iter()
                    .enumerate()
                    .map(|(i, offset)| offset + previous.offsets[i])
                    .collect_vec();
                new_unchanged_ranges.push(UnchangedRange {
                    base_range: new_base_start..new_base_end,
                    offsets,
                });
            }
            previous = current.clone();
        }
        // `merge` produces a sorted sequence from two sorted sequences.
        self.unchanged_regions = self
            .unchanged_regions
            .iter()
            .cloned()
            .merge(new_unchanged_ranges)
            .collect_vec();
        self.compact_unchanged_regions();
    }

    /// Joins adjacent unchanged regions that have identical offsets into a
    /// single region.
    fn compact_unchanged_regions(&mut self) {
        let mut compacted = vec![];
        let mut maybe_previous: Option<UnchangedRange> = None;
        for current in self.unchanged_regions.iter() {
            if let Some(previous) = maybe_previous {
                if previous.base_range.end == current.base_range.start
                    && previous.offsets == *current.offsets
                {
                    maybe_previous = Some(UnchangedRange {
                        base_range: previous.base_range.start..current.base_range.end,
                        offsets: current.offsets.clone(),
                    });
                    continue;
                }
                compacted.push(previous);
            }
            maybe_previous = Some(current.clone());
        }
        if let Some(previous) = maybe_previous {
            compacted.push(previous);
        }
        self.unchanged_regions = compacted;
    }
}
541
/// One hunk of a diff: either text shared by all inputs, or one slice per
/// input where they differ.
#[derive(PartialEq, Eq, Clone)]
pub enum DiffHunk<'input> {
    Matching(&'input [u8]),
    Different(Vec<&'input [u8]>),
}

impl Debug for DiffHunk<'_> {
    // Renders the byte slices lossily as UTF-8 so hunks are readable in
    // test failures and debug logs.
    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
        match self {
            DiffHunk::Matching(content) => {
                let text = String::from_utf8_lossy(content);
                f.debug_tuple("DiffHunk::Matching").field(&text).finish()
            }
            DiffHunk::Different(contents) => {
                let texts: Vec<_> = contents
                    .iter()
                    .map(|content| String::from_utf8_lossy(content))
                    .collect();
                f.debug_tuple("DiffHunk::Different").field(&texts).finish()
            }
        }
    }
}
567
/// Iterator over the hunks of a [`Diff`], alternating between matching and
/// differing hunks (empty hunks are skipped).
pub struct DiffHunkIterator<'diff, 'input> {
    diff: &'diff Diff<'input>,
    // The unchanged region most recently taken from `unchanged_iter`.
    previous: UnchangedRange,
    // Whether the `Matching` hunk for `previous` has already been yielded.
    unchanged_emitted: bool,
    unchanged_iter: slice::Iter<'diff, UnchangedRange>,
}
574
impl<'diff, 'input> Iterator for DiffHunkIterator<'diff, 'input> {
    type Item = DiffHunk<'input>;

    // Alternates between yielding the `Matching` hunk for the most recently
    // visited unchanged region and the `Different` hunk between it and the
    // next unchanged region. Empty hunks on either side are skipped.
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            if !self.unchanged_emitted {
                self.unchanged_emitted = true;
                if !self.previous.base_range.is_empty() {
                    return Some(DiffHunk::Matching(
                        &self.diff.base_input[self.previous.base_range.clone()],
                    ));
                }
            }
            if let Some(current) = self.unchanged_iter.next() {
                // Collect the changed text between `previous` and `current`
                // from every input.
                let mut slices = vec![
                    &self.diff.base_input[self.previous.base_range.end..current.base_range.start],
                ];
                for (i, input) in self.diff.other_inputs.iter().enumerate() {
                    slices.push(&input[self.previous.end(i)..current.start(i)]);
                }
                self.previous = current.clone();
                self.unchanged_emitted = false;
                // Only yield a `Different` hunk if at least one side is
                // non-empty.
                if slices.iter().any(|slice| !slice.is_empty()) {
                    return Some(DiffHunk::Different(slices));
                }
            } else {
                break;
            }
        }
        None
    }
}
607
608pub fn diff<'a>(left: &'a [u8], right: &'a [u8]) -> Vec<DiffHunk<'a>> {
614 if left == right {
615 return vec![DiffHunk::Matching(left)];
616 }
617 if left.is_empty() {
618 return vec![DiffHunk::Different(vec![b"", right])];
619 }
620 if right.is_empty() {
621 return vec![DiffHunk::Different(vec![left, b""])];
622 }
623
624 Diff::default_refinement(&[left, right])
625 .hunks()
626 .collect_vec()
627}
628
629#[cfg(test)]
630mod tests {
631 use super::*;
632
    // find_line_ranges(): ranges include the terminating newline; a final
    // line without a newline is still reported.
    #[test]
    fn test_find_line_ranges_empty() {
        assert_eq!(find_line_ranges(b""), vec![]);
    }

    #[test]
    fn test_find_line_ranges_blank_line() {
        assert_eq!(find_line_ranges(b"\n"), vec![0..1]);
    }

    #[test]
    fn test_find_line_ranges_missing_newline_at_eof() {
        assert_eq!(find_line_ranges(b"foo"), vec![0..3]);
    }

    #[test]
    fn test_find_line_ranges_multiple_lines() {
        assert_eq!(find_line_ranges(b"a\nbb\nccc\n"), vec![0..2, 2..5, 5..9]);
    }
652
    // find_word_ranges(): only maximal runs of word bytes (ASCII
    // alphanumerics and '_') are reported; non-word bytes are excluded.
    #[test]
    fn test_find_word_ranges_empty() {
        assert_eq!(find_word_ranges(b""), vec![]);
    }

    #[test]
    fn test_find_word_ranges_single_word() {
        assert_eq!(find_word_ranges(b"Abc"), vec![0..3]);
    }

    #[test]
    fn test_find_word_ranges_no_word() {
        assert_eq!(find_word_ranges(b"+-*/"), vec![]);
    }

    #[test]
    fn test_find_word_ranges_word_then_non_word() {
        assert_eq!(find_word_ranges(b"Abc "), vec![0..3]);
    }

    #[test]
    fn test_find_word_ranges_non_word_then_word() {
        assert_eq!(find_word_ranges(b" Abc"), vec![3..6]);
    }
677
    // find_lcs(): input[i] is the left position of right element i; the
    // result is the matched (left, right) index pairs, increasing on both
    // sides.
    #[test]
    fn test_find_lcs_empty() {
        let empty: Vec<(usize, usize)> = vec![];
        assert_eq!(find_lcs(&[]), empty);
    }

    #[test]
    fn test_find_lcs_single_element() {
        assert_eq!(find_lcs(&[0]), vec![(0, 0)]);
    }

    #[test]
    fn test_find_lcs_in_order() {
        assert_eq!(find_lcs(&[0, 1, 2]), vec![(0, 0), (1, 1), (2, 2)]);
    }

    // Fully reversed input: only a single pair can be kept.
    #[test]
    fn test_find_lcs_reverse_order() {
        assert_eq!(find_lcs(&[2, 1, 0]), vec![(2, 0)]);
    }

    #[test]
    fn test_find_lcs_two_swapped() {
        assert_eq!(
            find_lcs(&[0, 1, 4, 3, 2, 5, 6]),
            vec![(0, 0), (1, 1), (2, 4), (5, 5), (6, 6)]
        );
    }

    #[test]
    fn test_find_lcs_element_moved_earlier() {
        assert_eq!(
            find_lcs(&[0, 1, 4, 2, 3, 5, 6]),
            vec![(0, 0), (1, 1), (2, 3), (3, 4), (5, 5), (6, 6)]
        );
    }

    #[test]
    fn test_find_lcs_element_moved_later() {
        assert_eq!(
            find_lcs(&[0, 1, 3, 4, 2, 5, 6]),
            vec![(0, 0), (1, 1), (3, 2), (4, 3), (5, 5), (6, 6)]
        );
    }

    // Pins the behavior of the greedy early-exit when several chains
    // interleave.
    #[test]
    fn test_find_lcs_interleaved_longest_chains() {
        assert_eq!(
            find_lcs(&[0, 4, 2, 9, 6, 5, 1, 3, 7, 8]),
            vec![(0, 0), (1, 6), (3, 7), (7, 8), (8, 9)]
        );
    }
730
    // A realistic mix of words and punctuation.
    #[test]
    fn test_find_word_ranges_many_words() {
        assert_eq!(
            find_word_ranges(b"fn find_words(text: &[u8])"),
            vec![0..2, 3..13, 14..18, 22..24]
        );
    }
738
    // unchanged_ranges(): matching token pairs, with non-unique tokens
    // paired up by occurrence number.
    #[test]
    fn test_unchanged_ranges_insert_in_middle() {
        assert_eq!(
            unchanged_ranges(
                b"a b b c",
                b"a b X b c",
                &[0..1, 2..3, 4..5, 6..7],
                &[0..1, 2..3, 4..5, 6..7, 8..9],
            ),
            vec![(0..1, 0..1), (2..3, 2..3), (4..5, 6..7), (6..7, 8..9)]
        );
    }

    #[test]
    fn test_unchanged_ranges_non_unique_removed() {
        assert_eq!(
            unchanged_ranges(
                b"a a a a",
                b"a b a c",
                &[0..1, 2..3, 4..5, 6..7],
                &[0..1, 2..3, 4..5, 6..7],
            ),
            vec![(0..1, 0..1), (2..3, 4..5)]
        );
    }

    #[test]
    fn test_unchanged_ranges_non_unique_added() {
        assert_eq!(
            unchanged_ranges(
                b"a b a c",
                b"a a a a",
                &[0..1, 2..3, 4..5, 6..7],
                &[0..1, 2..3, 4..5, 6..7],
            ),
            vec![(0..1, 0..1), (4..5, 2..3)]
        );
    }
777
    // intersect_regions(): output regions are the overlaps of the existing
    // regions with the new ranges, each gaining one more offset entry.
    #[test]
    fn test_intersect_regions_existing_empty() {
        let actual = intersect_regions(vec![], &[(20..25, 55..60)]);
        let expected = vec![];
        assert_eq!(actual, expected);
    }

    #[test]
    fn test_intersect_regions_new_ranges_within_existing() {
        let actual = intersect_regions(
            vec![UnchangedRange {
                base_range: 20..70,
                offsets: vec![3],
            }],
            &[(25..30, 35..40), (40..50, 40..50)],
        );
        let expected = vec![
            UnchangedRange {
                base_range: 25..30,
                offsets: vec![3, 10],
            },
            UnchangedRange {
                base_range: 40..50,
                offsets: vec![3, 0],
            },
        ];
        assert_eq!(actual, expected);
    }

    #[test]
    fn test_intersect_regions_partial_overlap() {
        let actual = intersect_regions(
            vec![UnchangedRange {
                base_range: 20..50,
                offsets: vec![-3],
            }],
            &[(15..25, 5..15), (45..60, 55..70)],
        );
        let expected = vec![
            UnchangedRange {
                base_range: 20..25,
                offsets: vec![-3, -10],
            },
            UnchangedRange {
                base_range: 45..50,
                offsets: vec![-3, 10],
            },
        ];
        assert_eq!(actual, expected);
    }

    // One new range can overlap several existing regions.
    #[test]
    fn test_intersect_regions_new_range_overlaps_multiple_existing() {
        let actual = intersect_regions(
            vec![
                UnchangedRange {
                    base_range: 20..50,
                    offsets: vec![3, -8],
                },
                UnchangedRange {
                    base_range: 70..80,
                    offsets: vec![7, 1],
                },
            ],
            &[(10..100, 5..95)],
        );
        let expected = vec![
            UnchangedRange {
                base_range: 20..50,
                offsets: vec![3, -8, -5],
            },
            UnchangedRange {
                base_range: 70..80,
                offsets: vec![7, 1, -5],
            },
        ];
        assert_eq!(actual, expected);
    }
856
    // Diff::default_refinement() / hunks() over one or more inputs.
    #[test]
    fn test_diff_single_input() {
        let diff = Diff::default_refinement(&[b"abc"]);
        assert_eq!(diff.hunks().collect_vec(), vec![DiffHunk::Matching(b"abc")]);
    }

    #[test]
    fn test_diff_single_empty_input() {
        let diff = Diff::default_refinement(&[b""]);
        assert_eq!(diff.hunks().collect_vec(), vec![]);
    }

    #[test]
    fn test_diff_two_inputs_one_different() {
        let diff = Diff::default_refinement(&[b"a b c", b"a X c"]);
        assert_eq!(
            diff.hunks().collect_vec(),
            vec![
                DiffHunk::Matching(b"a "),
                DiffHunk::Different(vec![b"b", b"X"]),
                DiffHunk::Matching(b" c"),
            ]
        );
    }

    #[test]
    fn test_diff_multiple_inputs_one_different() {
        let diff = Diff::default_refinement(&[b"a b c", b"a X c", b"a b c"]);
        assert_eq!(
            diff.hunks().collect_vec(),
            vec![
                DiffHunk::Matching(b"a "),
                DiffHunk::Different(vec![b"b", b"X", b"b"]),
                DiffHunk::Matching(b" c"),
            ]
        );
    }

    #[test]
    fn test_diff_multiple_inputs_all_different() {
        let diff = Diff::default_refinement(&[b"a b c", b"a X c", b"a c X"]);
        assert_eq!(
            diff.hunks().collect_vec(),
            vec![
                DiffHunk::Matching(b"a "),
                DiffHunk::Different(vec![b"b ", b"X ", b""]),
                DiffHunk::Matching(b"c"),
                DiffHunk::Different(vec![b"", b"", b" X"]),
            ]
        );
    }

    // Adjacent unchanged line regions should be compacted into one hunk.
    #[test]
    fn test_diff_for_tokenizer_compacted() {
        let diff = Diff::for_tokenizer(
            &[b"a\nb\nc\nd\ne\nf\ng", b"a\nb\nc\nX\ne\nf\ng"],
            &find_line_ranges,
        );
        assert_eq!(
            diff.hunks().collect_vec(),
            vec![
                DiffHunk::Matching(b"a\nb\nc\n"),
                DiffHunk::Different(vec![b"d\n", b"X\n"]),
                DiffHunk::Matching(b"e\nf\ng"),
            ]
        );
    }
925
    // diff(): end-to-end behavior of the two-input convenience function.
    #[test]
    fn test_diff_nothing_in_common() {
        assert_eq!(
            diff(b"aaa", b"bb"),
            vec![DiffHunk::Different(vec![b"aaa", b"bb"])]
        );
    }

    #[test]
    fn test_diff_insert_in_middle() {
        assert_eq!(
            diff(b"a z", b"a S z"),
            vec![
                DiffHunk::Matching(b"a "),
                DiffHunk::Different(vec![b"", b"S "]),
                DiffHunk::Matching(b"z"),
            ]
        );
    }

    // When no token is unique, the matched middle may come from either side.
    #[test]
    fn test_diff_no_unique_middle_flips() {
        assert_eq!(
            diff(b"a R R S S z", b"a S S R R z"),
            vec![
                DiffHunk::Matching(b"a "),
                DiffHunk::Different(vec![b"R R ", b""]),
                DiffHunk::Matching(b"S S "),
                DiffHunk::Different(vec![b"", b"R R "]),
                DiffHunk::Matching(b"z")
            ],
        );
    }

    // Exercises the recursion into skipped token runs in unchanged_ranges().
    #[test]
    fn test_diff_recursion_needed() {
        assert_eq!(
            diff(
                b"a q x q y q z q b q y q x q c",
                b"a r r x q y z q b y q x r r c",
            ),
            vec![
                DiffHunk::Matching(b"a "),
                DiffHunk::Different(vec![b"q", b"r"]),
                DiffHunk::Matching(b" "),
                DiffHunk::Different(vec![b"", b"r "]),
                DiffHunk::Matching(b"x q y "),
                DiffHunk::Different(vec![b"q ", b""]),
                DiffHunk::Matching(b"z q b "),
                DiffHunk::Different(vec![b"q ", b""]),
                DiffHunk::Matching(b"y q x "),
                DiffHunk::Different(vec![b"q", b"r"]),
                DiffHunk::Matching(b" "),
                DiffHunk::Different(vec![b"", b"r "]),
                DiffHunk::Matching(b"c"),
            ]
        );
    }
984
    // Real-world input: a return-type change plus removal of `.unwrap()`.
    #[test]
    fn test_diff_real_case_write_fmt() {
        assert_eq!(diff(
            b"    pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) {\n        self.styler().write_fmt(fmt).unwrap()\n",
            b"    pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) -> io::Result<()> {\n        self.styler().write_fmt(fmt)\n"
        ),
        vec![
            DiffHunk::Matching(b"    pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) "),
            DiffHunk::Different(vec![b"", b"-> io::Result<()> "]),
            DiffHunk::Matching(b"{\n        self.styler().write_fmt(fmt)"),
            DiffHunk::Different(vec![b".unwrap()", b""]),
            DiffHunk::Matching(b"\n")
        ]
        );
    }
1005
    // Real-world input: an early version of read-tree.c from the git repo,
    // before and after file-writing support was added.
    #[test]
    fn test_diff_real_case_gitgit_read_tree_c() {
        assert_eq!(
            diff(
                br##"/*
 * GIT - The information manager from hell
 *
 * Copyright (C) Linus Torvalds, 2005
 */
#include "#cache.h"

static int unpack(unsigned char *sha1)
{
	void *buffer;
	unsigned long size;
	char type[20];

	buffer = read_sha1_file(sha1, type, &size);
	if (!buffer)
		usage("unable to read sha1 file");
	if (strcmp(type, "tree"))
		usage("expected a 'tree' node");
	while (size) {
		int len = strlen(buffer)+1;
		unsigned char *sha1 = buffer + len;
		char *path = strchr(buffer, ' ')+1;
		unsigned int mode;
		if (size < len + 20 || sscanf(buffer, "%o", &mode) != 1)
			usage("corrupt 'tree' file");
		buffer = sha1 + 20;
		size -= len + 20;
		printf("%o %s (%s)\n", mode, path, sha1_to_hex(sha1));
	}
	return 0;
}

int main(int argc, char **argv)
{
	int fd;
	unsigned char sha1[20];

	if (argc != 2)
		usage("read-tree <key>");
	if (get_sha1_hex(argv[1], sha1) < 0)
		usage("read-tree <key>");
	sha1_file_directory = getenv(DB_ENVIRONMENT);
	if (!sha1_file_directory)
		sha1_file_directory = DEFAULT_DB_ENVIRONMENT;
	if (unpack(sha1) < 0)
		usage("unpack failed");
	return 0;
}
"##,
                br##"/*
 * GIT - The information manager from hell
 *
 * Copyright (C) Linus Torvalds, 2005
 */
#include "#cache.h"

static void create_directories(const char *path)
{
	int len = strlen(path);
	char *buf = malloc(len + 1);
	const char *slash = path;

	while ((slash = strchr(slash+1, '/')) != NULL) {
		len = slash - path;
		memcpy(buf, path, len);
		buf[len] = 0;
		mkdir(buf, 0700);
	}
}

static int create_file(const char *path)
{
	int fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);
	if (fd < 0) {
		if (errno == ENOENT) {
			create_directories(path);
			fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);
		}
	}
	return fd;
}

static int unpack(unsigned char *sha1)
{
	void *buffer;
	unsigned long size;
	char type[20];

	buffer = read_sha1_file(sha1, type, &size);
	if (!buffer)
		usage("unable to read sha1 file");
	if (strcmp(type, "tree"))
		usage("expected a 'tree' node");
	while (size) {
		int len = strlen(buffer)+1;
		unsigned char *sha1 = buffer + len;
		char *path = strchr(buffer, ' ')+1;
		char *data;
		unsigned long filesize;
		unsigned int mode;
		int fd;

		if (size < len + 20 || sscanf(buffer, "%o", &mode) != 1)
			usage("corrupt 'tree' file");
		buffer = sha1 + 20;
		size -= len + 20;
		data = read_sha1_file(sha1, type, &filesize);
		if (!data || strcmp(type, "blob"))
			usage("tree file refers to bad file data");
		fd = create_file(path);
		if (fd < 0)
			usage("unable to create file");
		if (write(fd, data, filesize) != filesize)
			usage("unable to write file");
		fchmod(fd, mode);
		close(fd);
		free(data);
	}
	return 0;
}

int main(int argc, char **argv)
{
	int fd;
	unsigned char sha1[20];

	if (argc != 2)
		usage("read-tree <key>");
	if (get_sha1_hex(argv[1], sha1) < 0)
		usage("read-tree <key>");
	sha1_file_directory = getenv(DB_ENVIRONMENT);
	if (!sha1_file_directory)
		sha1_file_directory = DEFAULT_DB_ENVIRONMENT;
	if (unpack(sha1) < 0)
		usage("unpack failed");
	return 0;
}
"##,
            ),
            vec![
                DiffHunk::Matching(b"/*\n * GIT - The information manager from hell\n *\n * Copyright (C) Linus Torvalds, 2005\n */\n#include \"#cache.h\"\n\n"),
                DiffHunk::Different(vec![b"", b"static void create_directories(const char *path)\n{\n\tint len = strlen(path);\n\tchar *buf = malloc(len + 1);\n\tconst char *slash = path;\n\n\twhile ((slash = strchr(slash+1, \'/\')) != NULL) {\n\t\tlen = slash - path;\n\t\tmemcpy(buf, path, len);\n\t\tbuf[len] = 0;\n\t\tmkdir(buf, 0700);\n\t}\n}\n\nstatic int create_file(const char *path)\n{\n\tint fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);\n\tif (fd < 0) {\n\t\tif (errno == ENOENT) {\n\t\t\tcreate_directories(path);\n\t\t\tfd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);\n\t\t}\n\t}\n\treturn fd;\n}\n\n"]),
                DiffHunk::Matching(b"static int unpack(unsigned char *sha1)\n{\n\tvoid *buffer;\n\tunsigned long size;\n\tchar type[20];\n\n\tbuffer = read_sha1_file(sha1, type, &size);\n\tif (!buffer)\n\t\tusage(\"unable to read sha1 file\");\n\tif (strcmp(type, \"tree\"))\n\t\tusage(\"expected a \'tree\' node\");\n\twhile (size) {\n\t\tint len = strlen(buffer)+1;\n\t\tunsigned char *sha1 = buffer + len;\n\t\tchar *path = strchr(buffer, \' \')+1;\n"),
                DiffHunk::Different(vec![b"", b"\t\tchar *data;\n\t\tunsigned long filesize;\n"]),
                DiffHunk::Matching(b"\t\tunsigned int mode;\n"),
                DiffHunk::Different(vec![b"", b"\t\tint fd;\n\n"]),
                DiffHunk::Matching(b"\t\tif (size < len + 20 || sscanf(buffer, \"%o\", &mode) != 1)\n\t\t\tusage(\"corrupt \'tree\' file\");\n\t\tbuffer = sha1 + 20;\n\t\tsize -= len + 20;\n\t\t"),
                DiffHunk::Different(vec![b"printf", b"data = read_sha1_file"]),
                DiffHunk::Matching(b"("),
                DiffHunk::Different(vec![b"\"%o %s (%s)\\n\", mode, path, sha1_to_hex(", b""]),
                DiffHunk::Matching(b"sha1"),
                DiffHunk::Different(vec![b"", b", type, &filesize"]),
                DiffHunk::Matching(b")"),
                DiffHunk::Different(vec![b")", b""]),
                DiffHunk::Matching(b";\n"),
                DiffHunk::Different(vec![b"", b"\t\tif (!data || strcmp(type, \"blob\"))\n\t\t\tusage(\"tree file refers to bad file data\");\n\t\tfd = create_file(path);\n\t\tif (fd < 0)\n\t\t\tusage(\"unable to create file\");\n\t\tif (write(fd, data, filesize) != filesize)\n\t\t\tusage(\"unable to write file\");\n\t\tfchmod(fd, mode);\n\t\tclose(fd);\n\t\tfree(data);\n"]),
                DiffHunk::Matching(b"\t}\n\treturn 0;\n}\n\nint main(int argc, char **argv)\n{\n\tint fd;\n\tunsigned char sha1[20];\n\n\tif (argc != 2)\n\t\tusage(\"read-tree <key>\");\n\tif (get_sha1_hex(argv[1], sha1) < 0)\n\t\tusage(\"read-tree <key>\");\n\tsha1_file_directory = getenv(DB_ENVIRONMENT);\n\tif (!sha1_file_directory)\n\t\tsha1_file_directory = DEFAULT_DB_ENVIRONMENT;\n\tif (unpack(sha1) < 0)\n\t\tusage(\"unpack failed\");\n\treturn 0;\n}\n")
            ]
        );
    }
1171}