#![allow(missing_docs)]
use std::cmp::{max, min, Ordering};
use std::collections::{BTreeMap, HashMap};
use std::ops::Range;
use std::{iter, slice};
use bstr::BStr;
use itertools::Itertools;
/// Returns the byte range of every line in `text`, in order.
///
/// Each range includes the line's terminating `\n`; a final line without a
/// trailing newline is returned as well. Consecutive ranges are contiguous
/// and together cover all of `text`, so empty input yields no ranges.
pub fn find_line_ranges(text: &[u8]) -> Vec<Range<usize>> {
    let mut line_ranges = Vec::new();
    let mut line_start = 0;
    for line in text.split_inclusive(|&byte| byte == b'\n') {
        let line_end = line_start + line.len();
        line_ranges.push(line_start..line_end);
        line_start = line_end;
    }
    line_ranges
}
/// Reports whether `b` counts as part of a word: an ASCII letter or digit,
/// `_`, or any non-ASCII byte (`0x80..=0xff`), which includes every byte of a
/// multi-byte UTF-8 sequence.
fn is_word_byte(b: u8) -> bool {
    b.is_ascii_alphanumeric() || b == b'_' || !b.is_ascii()
}
pub fn find_word_ranges(text: &[u8]) -> Vec<Range<usize>> {
let mut word_ranges = vec![];
let mut word_start_pos = 0;
let mut in_word = false;
for (i, b) in text.iter().enumerate() {
if in_word && !is_word_byte(*b) {
in_word = false;
word_ranges.push(word_start_pos..i);
word_start_pos = i;
} else if !in_word && is_word_byte(*b) {
in_word = true;
word_start_pos = i;
}
}
if in_word && word_start_pos < text.len() {
word_ranges.push(word_start_pos..text.len());
}
word_ranges
}
pub fn find_nonword_ranges(text: &[u8]) -> Vec<Range<usize>> {
text.iter()
.positions(|b| !is_word_byte(*b))
.map(|i| i..i + 1)
.collect()
}
/// Occurrence statistics for the tokens ("words") of one input, as built by
/// `Histogram::calculate`.
struct Histogram<'a> {
    /// Maps the bytes of each distinct token to the indexes (into the token
    /// range slice passed to `calculate`) at which that token occurs.
    /// `calculate` stops recording once a token has `max_occurrences + 1`
    /// entries.
    word_to_positions: HashMap<&'a [u8], Vec<usize>>,
    /// Groups the distinct tokens by their recorded number of occurrences;
    /// being a `BTreeMap`, iteration goes from the smallest count upwards.
    count_to_words: BTreeMap<usize, Vec<&'a [u8]>>,
}
impl Histogram<'_> {
fn calculate<'a>(
text: &'a [u8],
ranges: &[Range<usize>],
max_occurrences: usize,
) -> Histogram<'a> {
let mut word_to_positions: HashMap<&[u8], Vec<usize>> = HashMap::new();
for (i, range) in ranges.iter().enumerate() {
let positions = word_to_positions.entry(&text[range.clone()]).or_default();
if positions.len() <= max_occurrences {
positions.push(i);
}
}
let mut count_to_words: BTreeMap<usize, Vec<&[u8]>> = BTreeMap::new();
for (word, ranges) in &word_to_positions {
count_to_words.entry(ranges.len()).or_default().push(word);
}
Histogram {
word_to_positions,
count_to_words,
}
}
}
/// Finds a longest strictly increasing subsequence of `input`.
///
/// `input[right_pos]` is interpreted as a "left position"; the result lists
/// the chosen elements as `(left_pos, right_pos)` pairs, ordered by
/// increasing `right_pos` (and therefore also by increasing `left_pos`).
///
/// Empty input yields an empty result. If no two elements are in increasing
/// order, the result is the single pair for the first element.
fn find_lcs(input: &[usize]) -> Vec<(usize, usize)> {
    // Marks a chain entry that has no predecessor.
    const NO_PREDECESSOR: usize = usize::MAX;

    if input.is_empty() {
        return Vec::new();
    }

    // chain[r] = (length of the best chain ending at r, input[r], predecessor of r).
    let mut chain: Vec<(usize, usize, usize)> = Vec::with_capacity(input.len());
    let mut best_len = 0;
    let mut best_end = 0;
    for (right_pos, &left_pos) in input.iter().enumerate() {
        let mut len_here = 1;
        let mut predecessor = NO_PREDECESSOR;
        // Scan the already-processed elements from nearest to farthest.
        for (candidate, &(candidate_len, candidate_left, _)) in chain.iter().enumerate().rev() {
            if candidate_left >= left_pos {
                continue;
            }
            let extended_len = candidate_len + 1;
            if extended_len <= len_here {
                continue;
            }
            len_here = extended_len;
            predecessor = candidate;
            if extended_len > best_len {
                best_len = extended_len;
                best_end = right_pos;
                // No earlier element can give a longer chain than the new
                // global best, so stop scanning.
                break;
            }
        }
        chain.push((len_here, left_pos, predecessor));
    }

    // Walk the predecessor links back from the end of the best chain.
    let mut result = Vec::new();
    let mut cursor = best_end;
    while cursor != NO_PREDECESSOR {
        let (_, left_pos, predecessor) = chain[cursor];
        result.push((left_pos, cursor));
        cursor = predecessor;
    }
    result.reverse();
    result
}
pub(crate) fn unchanged_ranges(
left: &[u8],
right: &[u8],
left_ranges: &[Range<usize>],
right_ranges: &[Range<usize>],
) -> Vec<(Range<usize>, Range<usize>)> {
if left_ranges.is_empty() || right_ranges.is_empty() {
return vec![];
}
let result = unchanged_ranges_lcs(left, right, left_ranges, right_ranges);
if !result.is_empty() {
return result;
}
let common_leading_len = iter::zip(left_ranges, right_ranges)
.take_while(|&(l, r)| left[l.clone()] == right[r.clone()])
.count();
let (left_leading_ranges, left_ranges) = left_ranges.split_at(common_leading_len);
let (right_leading_ranges, right_ranges) = right_ranges.split_at(common_leading_len);
let common_trailing_len = iter::zip(left_ranges.iter().rev(), right_ranges.iter().rev())
.take_while(|&(l, r)| left[l.clone()] == right[r.clone()])
.count();
let left_trailing_ranges = &left_ranges[(left_ranges.len() - common_trailing_len)..];
let right_trailing_ranges = &right_ranges[(right_ranges.len() - common_trailing_len)..];
itertools::chain(
iter::zip(
left_leading_ranges.iter().cloned(),
right_leading_ranges.iter().cloned(),
),
iter::zip(
left_trailing_ranges.iter().cloned(),
right_trailing_ranges.iter().cloned(),
),
)
.collect()
}
/// LCS-based matcher behind `unchanged_ranges`.
///
/// Picks the tokens of `left` with the lowest occurrence count that occur
/// equally often in `right`, pairs their occurrences in order, keeps a
/// longest in-order subset of those pairs (`find_lcs`), and then recurses via
/// `unchanged_ranges` into the token gaps before, between and after the kept
/// pairs.
///
/// Returns an empty list if every token of `left` occurs more than 100 times
/// or if no token satisfies the equal-count condition.
///
/// Panics if `left_ranges` is empty; `unchanged_ranges` checks for that
/// before calling.
fn unchanged_ranges_lcs(
    left: &[u8],
    right: &[u8],
    left_ranges: &[Range<usize>],
    right_ranges: &[Range<usize>],
) -> Vec<(Range<usize>, Range<usize>)> {
    let max_occurrences = 100;
    let left_histogram = Histogram::calculate(left, left_ranges, max_occurrences);
    // The map is non-empty whenever `left_ranges` is non-empty.
    let rarest_left_count = *left_histogram.count_to_words.keys().next().unwrap();
    if rarest_left_count > max_occurrences {
        // Even the rarest token is too common; give up.
        return Vec::new();
    }
    let right_histogram = Histogram::calculate(right, right_ranges, max_occurrences);

    let right_count_of = |word: &[u8]| {
        right_histogram
            .word_to_positions
            .get(&word)
            .map_or(0, Vec::len)
    };

    // Walk the left counts from rarest to most common and take the first
    // group that contains tokens occurring equally often on the right.
    let shared_candidates = left_histogram
        .count_to_words
        .iter()
        .map(|(&left_count, left_words)| {
            left_words
                .iter()
                .copied()
                .filter(|&word| right_count_of(word) == left_count)
                .collect::<Vec<&[u8]>>()
        })
        .find(|words| !words.is_empty());
    let uncommon_shared_words = match shared_candidates {
        Some(words) => words,
        None => return Vec::new(),
    };

    // Entries are (index into the token ranges, word, occurrence number).
    let mut left_positions = Vec::new();
    let mut right_positions = Vec::new();
    for word in uncommon_shared_words {
        let left_occurrences = &left_histogram.word_to_positions[word];
        let right_occurrences = &right_histogram.word_to_positions[word];
        assert_eq!(left_occurrences.len(), right_occurrences.len());
        let paired = left_occurrences.iter().zip(right_occurrences);
        for (occurrence, (&left_position, &right_position)) in paired.enumerate() {
            left_positions.push((left_position, word, occurrence));
            right_positions.push((right_position, word, occurrence));
        }
    }
    left_positions.sort();
    right_positions.sort();

    // For every right-side entry, find the index of its left-side partner.
    let left_index_by_key: HashMap<(&[u8], usize), usize> = left_positions
        .iter()
        .enumerate()
        .map(|(index, &(_, word, occurrence))| ((word, occurrence), index))
        .collect();
    let left_index_by_right_index: Vec<usize> = right_positions
        .iter()
        .map(|&(_, word, occurrence)| *left_index_by_key.get(&(word, occurrence)).unwrap())
        .collect();

    let lcs = find_lcs(&left_index_by_right_index);

    // Emit the kept pairs, recursing into the unmatched gap before each one.
    let mut result = Vec::new();
    let mut next_left = 0;
    let mut next_right = 0;
    for (left_index, right_index) in lcs {
        let (left_position, ..) = left_positions[left_index];
        let (right_position, ..) = right_positions[right_index];
        if next_left < left_position || next_right < right_position {
            result.extend(unchanged_ranges(
                left,
                right,
                &left_ranges[next_left..left_position],
                &right_ranges[next_right..right_position],
            ));
        }
        result.push((
            left_ranges[left_position].clone(),
            right_ranges[right_position].clone(),
        ));
        next_left = left_position + 1;
        next_right = right_position + 1;
    }

    // Finally recurse into whatever follows the last kept pair.
    if next_left < left_ranges.len() || next_right < right_ranges.len() {
        result.extend(unchanged_ranges(
            left,
            right,
            &left_ranges[next_left..],
            &right_ranges[next_right..],
        ));
    }
    result
}
/// A byte range of the base input together with, for each other input, the
/// offset at which the corresponding range is found in that input.
#[derive(Clone, PartialEq, Eq, Debug)]
struct UnchangedRange {
    /// Byte range within the base input.
    base_range: Range<usize>,
    /// One entry per other input: the signed amount added to a position in
    /// `base_range` to obtain the corresponding position in that input (see
    /// `start` and `end`).
    offsets: Vec<isize>,
}
impl UnchangedRange {
    /// Start of this range within other input number `side`: the base start
    /// shifted by `offsets[side]`.
    ///
    /// Panics if `side` is not a valid index into `offsets`.
    fn start(&self, side: usize) -> usize {
        // Casting a negative offset to `usize` and adding with wrap-around
        // subtracts its magnitude.
        let offset = self.offsets[side] as usize;
        self.base_range.start.wrapping_add(offset)
    }

    /// End of this range within other input number `side`: the base end
    /// shifted by `offsets[side]`.
    ///
    /// Panics if `side` is not a valid index into `offsets`.
    fn end(&self, side: usize) -> usize {
        let offset = self.offsets[side] as usize;
        self.base_range.end.wrapping_add(offset)
    }
}
impl PartialOrd for UnchangedRange {
    /// Always returns `Some`, delegating to the total order defined by the
    /// `Ord` implementation.
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(Ord::cmp(self, other))
    }
}
impl Ord for UnchangedRange {
    /// Orders regions by the start of `base_range`, then by its end.
    ///
    /// `offsets` is not part of the ordering, so two regions can compare as
    /// `Ordering::Equal` here while being unequal under `==`.
    fn cmp(&self, other: &Self) -> Ordering {
        let sort_key = |region: &Self| (region.base_range.start, region.base_range.end);
        sort_key(self).cmp(&sort_key(other))
    }
}
/// The result of comparing one base input against zero or more other inputs.
#[derive(Clone, Debug)]
pub struct Diff<'input> {
    /// The first input; all unchanged regions are expressed relative to it.
    base_input: &'input BStr,
    /// The remaining inputs, in the order they were supplied.
    other_inputs: Vec<&'input BStr>,
    /// Regions of the base input found unchanged in every other input. The
    /// constructor appends an empty range at the end of the base input
    /// before compacting the list.
    unchanged_regions: Vec<UnchangedRange>,
}
/// Narrows `current_ranges` to the parts that are also covered by
/// `new_unchanged_ranges`, recording the offset of one more input.
///
/// Each element of `new_unchanged_ranges` is a `(base range, other range)`
/// pair of equal length (asserted). Every overlap between an existing region
/// and a new base range becomes an output region whose `offsets` are the
/// existing offsets followed by the new input's offset
/// (`other.start - base.start`). Existing regions, or parts of them, with no
/// overlap are dropped.
///
/// Both lists are walked front to back in a single pass, so the result is
/// only complete if both are ordered by base position.
fn intersect_regions(
    current_ranges: Vec<UnchangedRange>,
    new_unchanged_ranges: &[(Range<usize>, Range<usize>)],
) -> Vec<UnchangedRange> {
    let mut intersected = Vec::new();
    // Index of the first existing region that may still overlap a new range.
    let mut cursor = 0;
    for (new_base_range, other_range) in new_unchanged_ranges {
        assert_eq!(new_base_range.len(), other_range.len());
        let new_offset = other_range.start.wrapping_sub(new_base_range.start) as isize;
        while let Some(current) = current_ranges.get(cursor) {
            let base_range = &current.base_range;
            if base_range.start >= new_base_range.end {
                // The existing region lies entirely after the new range.
                break;
            }
            if base_range.end <= new_base_range.start {
                // The existing region lies entirely before the new range.
                cursor += 1;
                continue;
            }
            let mut offsets = current.offsets.clone();
            offsets.push(new_offset);
            intersected.push(UnchangedRange {
                base_range: max(base_range.start, new_base_range.start)
                    ..min(base_range.end, new_base_range.end),
                offsets,
            });
            if base_range.end >= new_base_range.end {
                // Keep the existing region: a later new range may overlap it too.
                break;
            }
            cursor += 1;
        }
    }
    intersected
}
impl<'input> Diff<'input> {
pub fn for_tokenizer<T: AsRef<[u8]> + ?Sized + 'input>(
inputs: impl IntoIterator<Item = &'input T>,
tokenizer: impl Fn(&[u8]) -> Vec<Range<usize>>,
) -> Self {
let mut inputs = inputs.into_iter().map(BStr::new);
let base_input = inputs.next().expect("inputs must not be empty");
let other_inputs = inputs.collect_vec();
let base_token_ranges = tokenizer(base_input);
let other_token_ranges = other_inputs
.iter()
.map(|other_input| tokenizer(other_input))
.collect_vec();
Self::with_inputs_and_token_ranges(
base_input,
other_inputs,
&base_token_ranges,
&other_token_ranges,
)
}
fn with_inputs_and_token_ranges(
base_input: &'input BStr,
other_inputs: Vec<&'input BStr>,
base_token_ranges: &[Range<usize>],
other_token_ranges: &[Vec<Range<usize>>],
) -> Self {
let mut unchanged_regions = vec![UnchangedRange {
base_range: 0..base_input.len(),
offsets: vec![],
}];
for (i, other_token_ranges) in other_token_ranges.iter().enumerate() {
let unchanged_diff_ranges = unchanged_ranges(
base_input,
other_inputs[i],
base_token_ranges,
other_token_ranges,
);
unchanged_regions = intersect_regions(unchanged_regions, &unchanged_diff_ranges);
}
let offsets = other_inputs
.iter()
.map(|input| input.len().wrapping_sub(base_input.len()) as isize)
.collect_vec();
unchanged_regions.push(UnchangedRange {
base_range: base_input.len()..base_input.len(),
offsets,
});
let mut diff = Self {
base_input,
other_inputs,
unchanged_regions,
};
diff.compact_unchanged_regions();
diff
}
pub fn unrefined<T: AsRef<[u8]> + ?Sized + 'input>(
inputs: impl IntoIterator<Item = &'input T>,
) -> Self {
Diff::for_tokenizer(inputs, |_| vec![])
}
pub fn by_line<T: AsRef<[u8]> + ?Sized + 'input>(
inputs: impl IntoIterator<Item = &'input T>,
) -> Self {
Diff::for_tokenizer(inputs, find_line_ranges)
}
pub fn default_refinement<T: AsRef<[u8]> + ?Sized + 'input>(
inputs: impl IntoIterator<Item = &'input T>,
) -> Self {
let mut diff = Diff::for_tokenizer(inputs, find_line_ranges);
diff.refine_changed_regions(find_word_ranges);
diff.refine_changed_regions(find_nonword_ranges);
diff
}
pub fn hunks<'diff>(&'diff self) -> DiffHunkIterator<'diff, 'input> {
let previous_offsets = vec![0; self.other_inputs.len()];
DiffHunkIterator {
diff: self,
previous: UnchangedRange {
base_range: 0..0,
offsets: previous_offsets,
},
unchanged_emitted: true,
unchanged_iter: self.unchanged_regions.iter(),
}
}
pub fn refine_changed_regions(&mut self, tokenizer: impl Fn(&[u8]) -> Vec<Range<usize>>) {
let mut previous = UnchangedRange {
base_range: 0..0,
offsets: vec![0; self.other_inputs.len()],
};
let mut new_unchanged_ranges = vec![];
for current in self.unchanged_regions.iter() {
let mut slices =
vec![&self.base_input[previous.base_range.end..current.base_range.start]];
for i in 0..current.offsets.len() {
let changed_range = previous.end(i)..current.start(i);
slices.push(&self.other_inputs[i][changed_range]);
}
let refined_diff = Diff::for_tokenizer(slices, &tokenizer);
for UnchangedRange {
base_range,
offsets,
} in refined_diff.unchanged_regions
{
let new_base_start = base_range.start + previous.base_range.end;
let new_base_end = base_range.end + previous.base_range.end;
let offsets = offsets
.into_iter()
.enumerate()
.map(|(i, offset)| offset + previous.offsets[i])
.collect_vec();
new_unchanged_ranges.push(UnchangedRange {
base_range: new_base_start..new_base_end,
offsets,
});
}
previous = current.clone();
}
self.unchanged_regions = self
.unchanged_regions
.iter()
.cloned()
.merge(new_unchanged_ranges)
.collect_vec();
self.compact_unchanged_regions();
}
fn compact_unchanged_regions(&mut self) {
let mut compacted = vec![];
let mut maybe_previous: Option<UnchangedRange> = None;
for current in self.unchanged_regions.iter() {
if let Some(previous) = maybe_previous {
if previous.base_range.end == current.base_range.start
&& previous.offsets == *current.offsets
{
maybe_previous = Some(UnchangedRange {
base_range: previous.base_range.start..current.base_range.end,
offsets: current.offsets.clone(),
});
continue;
}
compacted.push(previous);
}
maybe_previous = Some(current.clone());
}
if let Some(previous) = maybe_previous {
compacted.push(previous);
}
self.unchanged_regions = compacted;
}
}
/// One piece of a diff, as yielded by `DiffHunkIterator`.
#[derive(PartialEq, Eq, Clone, Debug)]
pub enum DiffHunk<'input> {
    /// Content of an unchanged region, given as the slice of the base input.
    Matching(&'input BStr),
    /// Content of a changed region: one slice per input, the base input's
    /// slice first, followed by the other inputs in order.
    Different(Vec<&'input BStr>),
}
impl<'input> DiffHunk<'input> {
pub fn matching<T: AsRef<[u8]> + ?Sized>(content: &'input T) -> Self {
DiffHunk::Matching(BStr::new(content))
}
pub fn different<T: AsRef<[u8]> + ?Sized + 'input>(
contents: impl IntoIterator<Item = &'input T>,
) -> Self {
DiffHunk::Different(contents.into_iter().map(BStr::new).collect())
}
}
/// Iterator over the hunks of a `Diff`, created by `Diff::hunks`.
pub struct DiffHunkIterator<'diff, 'input> {
    /// The diff being iterated.
    diff: &'diff Diff<'input>,
    /// The unchanged region most recently taken from `unchanged_iter`;
    /// initially an empty region at the start of every input.
    previous: UnchangedRange,
    /// Whether `previous` has already been handled (yielded as a `Matching`
    /// hunk, or skipped because it is empty).
    unchanged_emitted: bool,
    /// The unchanged regions that have not been visited yet.
    unchanged_iter: slice::Iter<'diff, UnchangedRange>,
}
impl<'diff, 'input> Iterator for DiffHunkIterator<'diff, 'input> {
type Item = DiffHunk<'input>;
fn next(&mut self) -> Option<Self::Item> {
loop {
if !self.unchanged_emitted {
self.unchanged_emitted = true;
if !self.previous.base_range.is_empty() {
return Some(DiffHunk::Matching(
&self.diff.base_input[self.previous.base_range.clone()],
));
}
}
if let Some(current) = self.unchanged_iter.next() {
let mut slices = vec![
&self.diff.base_input[self.previous.base_range.end..current.base_range.start],
];
for (i, input) in self.diff.other_inputs.iter().enumerate() {
slices.push(&input[self.previous.end(i)..current.start(i)]);
}
self.previous = current.clone();
self.unchanged_emitted = false;
if slices.iter().any(|slice| !slice.is_empty()) {
return Some(DiffHunk::Different(slices));
}
} else {
break;
}
}
None
}
}
/// Diffs two byte strings and returns the resulting hunks.
///
/// Identical inputs produce a single `Matching` hunk. If exactly one input
/// is empty, a single `Different` hunk is produced. Everything else goes
/// through `Diff::default_refinement`.
pub fn diff<'a>(left: &'a [u8], right: &'a [u8]) -> Vec<DiffHunk<'a>> {
    if left == right {
        vec![DiffHunk::matching(left)]
    } else if left.is_empty() {
        vec![DiffHunk::different([b"", right])]
    } else if right.is_empty() {
        vec![DiffHunk::different([left, b""])]
    } else {
        Diff::default_refinement([left, right]).hunks().collect()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty list of ranges with the element type spelled out, so that
    /// `assert_eq!` can infer both sides.
    fn no_ranges() -> Vec<Range<usize>> {
        vec![]
    }

    // Tokenizers.

    #[test]
    fn test_find_line_ranges_empty() {
        assert_eq!(find_line_ranges(b""), no_ranges());
    }

    #[test]
    fn test_find_line_ranges_blank_line() {
        assert_eq!(find_line_ranges(b"\n"), vec![0..1]);
    }

    #[test]
    fn test_find_line_ranges_missing_newline_at_eof() {
        assert_eq!(find_line_ranges(b"foo"), vec![0..3]);
    }

    #[test]
    fn test_find_line_ranges_multiple_lines() {
        assert_eq!(find_line_ranges(b"a\nbb\nccc\n"), vec![0..2, 2..5, 5..9]);
    }

    #[test]
    fn test_find_word_ranges_empty() {
        assert_eq!(find_word_ranges(b""), no_ranges());
    }

    #[test]
    fn test_find_word_ranges_single_word() {
        assert_eq!(find_word_ranges(b"Abc"), vec![0..3]);
    }

    #[test]
    fn test_find_word_ranges_no_word() {
        assert_eq!(find_word_ranges(b"+-*/"), no_ranges());
    }

    #[test]
    fn test_find_word_ranges_word_then_non_word() {
        assert_eq!(find_word_ranges(b"Abc "), vec![0..3]);
    }

    #[test]
    fn test_find_word_ranges_non_word_then_word() {
        // One leading separator byte, so the word occupies bytes 1..4.
        assert_eq!(find_word_ranges(b" Abc"), vec![1..4]);
    }

    #[test]
    fn test_find_word_ranges_multibyte() {
        assert_eq!(find_word_ranges("⊢".as_bytes()), vec![0..3])
    }

    #[test]
    fn test_find_word_ranges_many_words() {
        assert_eq!(
            find_word_ranges(b"fn find_words(text: &[u8])"),
            vec![0..2, 3..13, 14..18, 22..24]
        );
    }

    #[test]
    fn test_find_nonword_ranges() {
        assert_eq!(find_nonword_ranges(b"a+b-"), vec![1..2, 3..4]);
    }

    // Longest increasing subsequence.

    #[test]
    fn test_find_lcs_empty() {
        let empty: Vec<(usize, usize)> = vec![];
        assert_eq!(find_lcs(&[]), empty);
    }

    #[test]
    fn test_find_lcs_single_element() {
        assert_eq!(find_lcs(&[0]), vec![(0, 0)]);
    }

    #[test]
    fn test_find_lcs_in_order() {
        assert_eq!(find_lcs(&[0, 1, 2]), vec![(0, 0), (1, 1), (2, 2)]);
    }

    #[test]
    fn test_find_lcs_reverse_order() {
        assert_eq!(find_lcs(&[2, 1, 0]), vec![(2, 0)]);
    }

    #[test]
    fn test_find_lcs_two_swapped() {
        assert_eq!(
            find_lcs(&[0, 1, 4, 3, 2, 5, 6]),
            vec![(0, 0), (1, 1), (2, 4), (5, 5), (6, 6)]
        );
    }

    #[test]
    fn test_find_lcs_element_moved_earlier() {
        assert_eq!(
            find_lcs(&[0, 1, 4, 2, 3, 5, 6]),
            vec![(0, 0), (1, 1), (2, 3), (3, 4), (5, 5), (6, 6)]
        );
    }

    #[test]
    fn test_find_lcs_element_moved_later() {
        assert_eq!(
            find_lcs(&[0, 1, 3, 4, 2, 5, 6]),
            vec![(0, 0), (1, 1), (3, 2), (4, 3), (5, 5), (6, 6)]
        );
    }

    #[test]
    fn test_find_lcs_interleaved_longest_chains() {
        assert_eq!(
            find_lcs(&[0, 4, 2, 9, 6, 5, 1, 3, 7, 8]),
            vec![(0, 0), (1, 6), (3, 7), (7, 8), (8, 9)]
        );
    }

    // Token matching.

    #[test]
    fn test_unchanged_ranges_insert_in_middle() {
        assert_eq!(
            unchanged_ranges(
                b"a b b c",
                b"a b X b c",
                &[0..1, 2..3, 4..5, 6..7],
                &[0..1, 2..3, 4..5, 6..7, 8..9],
            ),
            vec![(0..1, 0..1), (2..3, 2..3), (4..5, 6..7), (6..7, 8..9)]
        );
    }

    #[test]
    fn test_unchanged_ranges_non_unique_removed() {
        assert_eq!(
            unchanged_ranges(
                b"a a a a",
                b"a b a c",
                &[0..1, 2..3, 4..5, 6..7],
                &[0..1, 2..3, 4..5, 6..7],
            ),
            vec![(0..1, 0..1)]
        );
        assert_eq!(
            unchanged_ranges(
                b"a a a a",
                b"b a c a",
                &[0..1, 2..3, 4..5, 6..7],
                &[0..1, 2..3, 4..5, 6..7],
            ),
            vec![(6..7, 6..7)]
        );
        assert_eq!(
            unchanged_ranges(
                b"a a a a",
                b"b a a c",
                &[0..1, 2..3, 4..5, 6..7],
                &[0..1, 2..3, 4..5, 6..7],
            ),
            vec![]
        );
        assert_eq!(
            unchanged_ranges(
                b"a a a a",
                b"a b c a",
                &[0..1, 2..3, 4..5, 6..7],
                &[0..1, 2..3, 4..5, 6..7],
            ),
            vec![(0..1, 0..1), (6..7, 6..7)]
        );
    }

    #[test]
    fn test_unchanged_ranges_non_unique_added() {
        assert_eq!(
            unchanged_ranges(
                b"a b a c",
                b"a a a a",
                &[0..1, 2..3, 4..5, 6..7],
                &[0..1, 2..3, 4..5, 6..7],
            ),
            vec![(0..1, 0..1)]
        );
        assert_eq!(
            unchanged_ranges(
                b"b a c a",
                b"a a a a",
                &[0..1, 2..3, 4..5, 6..7],
                &[0..1, 2..3, 4..5, 6..7],
            ),
            vec![(6..7, 6..7)]
        );
        assert_eq!(
            unchanged_ranges(
                b"b a a c",
                b"a a a a",
                &[0..1, 2..3, 4..5, 6..7],
                &[0..1, 2..3, 4..5, 6..7],
            ),
            vec![]
        );
        assert_eq!(
            unchanged_ranges(
                b"a b c a",
                b"a a a a",
                &[0..1, 2..3, 4..5, 6..7],
                &[0..1, 2..3, 4..5, 6..7],
            ),
            vec![(0..1, 0..1), (6..7, 6..7)]
        );
    }

    // Region intersection.

    #[test]
    fn test_intersect_regions_existing_empty() {
        let actual = intersect_regions(vec![], &[(20..25, 55..60)]);
        let expected = vec![];
        assert_eq!(actual, expected);
    }

    #[test]
    fn test_intersect_regions_new_ranges_within_existing() {
        let actual = intersect_regions(
            vec![UnchangedRange {
                base_range: 20..70,
                offsets: vec![3],
            }],
            &[(25..30, 35..40), (40..50, 40..50)],
        );
        let expected = vec![
            UnchangedRange {
                base_range: 25..30,
                offsets: vec![3, 10],
            },
            UnchangedRange {
                base_range: 40..50,
                offsets: vec![3, 0],
            },
        ];
        assert_eq!(actual, expected);
    }

    #[test]
    fn test_intersect_regions_partial_overlap() {
        let actual = intersect_regions(
            vec![UnchangedRange {
                base_range: 20..50,
                offsets: vec![-3],
            }],
            &[(15..25, 5..15), (45..60, 55..70)],
        );
        let expected = vec![
            UnchangedRange {
                base_range: 20..25,
                offsets: vec![-3, -10],
            },
            UnchangedRange {
                base_range: 45..50,
                offsets: vec![-3, 10],
            },
        ];
        assert_eq!(actual, expected);
    }

    #[test]
    fn test_intersect_regions_new_range_overlaps_multiple_existing() {
        let actual = intersect_regions(
            vec![
                UnchangedRange {
                    base_range: 20..50,
                    offsets: vec![3, -8],
                },
                UnchangedRange {
                    base_range: 70..80,
                    offsets: vec![7, 1],
                },
            ],
            &[(10..100, 5..95)],
        );
        let expected = vec![
            UnchangedRange {
                base_range: 20..50,
                offsets: vec![3, -8, -5],
            },
            UnchangedRange {
                base_range: 70..80,
                offsets: vec![7, 1, -5],
            },
        ];
        assert_eq!(actual, expected);
    }

    // `Diff` and its hunks.

    #[test]
    fn test_diff_single_input() {
        let diff = Diff::default_refinement(["abc"]);
        assert_eq!(diff.hunks().collect_vec(), vec![DiffHunk::matching("abc")]);
    }

    #[test]
    fn test_diff_single_empty_input() {
        let diff = Diff::default_refinement([""]);
        assert_eq!(diff.hunks().collect_vec(), vec![]);
    }

    #[test]
    fn test_diff_two_inputs_one_different() {
        let diff = Diff::default_refinement(["a b c", "a X c"]);
        assert_eq!(
            diff.hunks().collect_vec(),
            vec![
                DiffHunk::matching("a "),
                DiffHunk::different(["b", "X"]),
                DiffHunk::matching(" c"),
            ]
        );
    }

    #[test]
    fn test_diff_multiple_inputs_one_different() {
        let diff = Diff::default_refinement(["a b c", "a X c", "a b c"]);
        assert_eq!(
            diff.hunks().collect_vec(),
            vec![
                DiffHunk::matching("a "),
                DiffHunk::different(["b", "X", "b"]),
                DiffHunk::matching(" c"),
            ]
        );
    }

    #[test]
    fn test_diff_multiple_inputs_all_different() {
        let diff = Diff::default_refinement(["a b c", "a X c", "a c X"]);
        assert_eq!(
            diff.hunks().collect_vec(),
            vec![
                DiffHunk::matching("a "),
                DiffHunk::different(["b ", "X ", ""]),
                DiffHunk::matching("c"),
                DiffHunk::different(["", "", " X"]),
            ]
        );
    }

    #[test]
    fn test_diff_for_tokenizer_compacted() {
        // The unchanged lines on each side of the change come out as a
        // single hunk each.
        let diff = Diff::for_tokenizer(
            ["a\nb\nc\nd\ne\nf\ng", "a\nb\nc\nX\ne\nf\ng"],
            find_line_ranges,
        );
        assert_eq!(
            diff.hunks().collect_vec(),
            vec![
                DiffHunk::matching("a\nb\nc\n"),
                DiffHunk::different(["d\n", "X\n"]),
                DiffHunk::matching("e\nf\ng"),
            ]
        );
    }

    // The `diff()` convenience function.

    #[test]
    fn test_diff_nothing_in_common() {
        assert_eq!(
            diff(b"aaa", b"bb"),
            vec![DiffHunk::different(["aaa", "bb"])]
        );
    }

    #[test]
    fn test_diff_insert_in_middle() {
        assert_eq!(
            diff(b"a z", b"a S z"),
            vec![
                DiffHunk::matching("a "),
                DiffHunk::different(["", "S "]),
                DiffHunk::matching("z"),
            ]
        );
    }

    #[test]
    fn test_diff_no_unique_middle_flips() {
        assert_eq!(
            diff(b"a R R S S z", b"a S S R R z"),
            vec![
                DiffHunk::matching("a "),
                DiffHunk::different(["R R ", ""]),
                DiffHunk::matching("S S "),
                DiffHunk::different(["", "R R "]),
                DiffHunk::matching("z")
            ],
        );
    }

    #[test]
    fn test_diff_recursion_needed() {
        assert_eq!(
            diff(
                b"a q x q y q z q b q y q x q c",
                b"a r r x q y z q b y q x r r c",
            ),
            vec![
                DiffHunk::matching("a "),
                DiffHunk::different(["q", "r"]),
                DiffHunk::matching(" "),
                DiffHunk::different(["", "r "]),
                DiffHunk::matching("x q y "),
                DiffHunk::different(["q ", ""]),
                DiffHunk::matching("z q b "),
                DiffHunk::different(["q ", ""]),
                DiffHunk::matching("y q x "),
                DiffHunk::different(["q", "r"]),
                DiffHunk::matching(" "),
                DiffHunk::different(["", "r "]),
                DiffHunk::matching("c"),
            ]
        );
    }

    #[test]
    fn test_diff_real_case_write_fmt() {
        #[rustfmt::skip]
        assert_eq!(diff(
                b" pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) {\n self.styler().write_fmt(fmt).unwrap()\n",
                b" pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) -> io::Result<()> {\n self.styler().write_fmt(fmt)\n"
            ),
            vec![
                DiffHunk::matching(" pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) "),
                DiffHunk::different(["", "-> io::Result<()> "]),
                DiffHunk::matching("{\n self.styler().write_fmt(fmt)"),
                DiffHunk::different([".unwrap()", ""]),
                DiffHunk::matching("\n")
            ]
        );
    }

    #[test]
    fn test_diff_real_case_gitgit_read_tree_c() {
        // Both inputs are spelled with explicit `\t`, `\n` and `\x20`
        // escapes so that their whitespace is exactly the whitespace that
        // appears in the expected hunks below: joining the matching hunks
        // with the first (second) side of the differing hunks reproduces the
        // left (right) input byte for byte. A backslash at the end of a
        // source line continues the literal and skips the indentation of the
        // next source line.
        #[rustfmt::skip]
        assert_eq!(
            diff(
                b"/*\n\
                \x20* GIT - The information manager from hell\n\
                \x20*\n\
                \x20* Copyright (C) Linus Torvalds, 2005\n\
                \x20*/\n\
                #include \"#cache.h\"\n\
                \n\
                static int unpack(unsigned char *sha1)\n\
                {\n\
                \tvoid *buffer;\n\
                \tunsigned long size;\n\
                \tchar type[20];\n\
                \n\
                \tbuffer = read_sha1_file(sha1, type, &size);\n\
                \tif (!buffer)\n\
                \t\tusage(\"unable to read sha1 file\");\n\
                \tif (strcmp(type, \"tree\"))\n\
                \t\tusage(\"expected a 'tree' node\");\n\
                \twhile (size) {\n\
                \t\tint len = strlen(buffer)+1;\n\
                \t\tunsigned char *sha1 = buffer + len;\n\
                \t\tchar *path = strchr(buffer, ' ')+1;\n\
                \t\tunsigned int mode;\n\
                \t\tif (size < len + 20 || sscanf(buffer, \"%o\", &mode) != 1)\n\
                \t\t\tusage(\"corrupt 'tree' file\");\n\
                \t\tbuffer = sha1 + 20;\n\
                \t\tsize -= len + 20;\n\
                \t\tprintf(\"%o %s (%s)\\n\", mode, path, sha1_to_hex(sha1));\n\
                \t}\n\
                \treturn 0;\n\
                }\n\
                \n\
                int main(int argc, char **argv)\n\
                {\n\
                \tint fd;\n\
                \tunsigned char sha1[20];\n\
                \n\
                \tif (argc != 2)\n\
                \t\tusage(\"read-tree <key>\");\n\
                \tif (get_sha1_hex(argv[1], sha1) < 0)\n\
                \t\tusage(\"read-tree <key>\");\n\
                \tsha1_file_directory = getenv(DB_ENVIRONMENT);\n\
                \tif (!sha1_file_directory)\n\
                \t\tsha1_file_directory = DEFAULT_DB_ENVIRONMENT;\n\
                \tif (unpack(sha1) < 0)\n\
                \t\tusage(\"unpack failed\");\n\
                \treturn 0;\n\
                }\n",
                b"/*\n\
                \x20* GIT - The information manager from hell\n\
                \x20*\n\
                \x20* Copyright (C) Linus Torvalds, 2005\n\
                \x20*/\n\
                #include \"#cache.h\"\n\
                \n\
                static void create_directories(const char *path)\n\
                {\n\
                \tint len = strlen(path);\n\
                \tchar *buf = malloc(len + 1);\n\
                \tconst char *slash = path;\n\
                \n\
                \twhile ((slash = strchr(slash+1, '/')) != NULL) {\n\
                \t\tlen = slash - path;\n\
                \t\tmemcpy(buf, path, len);\n\
                \t\tbuf[len] = 0;\n\
                \t\tmkdir(buf, 0700);\n\
                \t}\n\
                }\n\
                \n\
                static int create_file(const char *path)\n\
                {\n\
                \tint fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);\n\
                \tif (fd < 0) {\n\
                \t\tif (errno == ENOENT) {\n\
                \t\t\tcreate_directories(path);\n\
                \t\t\tfd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);\n\
                \t\t}\n\
                \t}\n\
                \treturn fd;\n\
                }\n\
                \n\
                static int unpack(unsigned char *sha1)\n\
                {\n\
                \tvoid *buffer;\n\
                \tunsigned long size;\n\
                \tchar type[20];\n\
                \n\
                \tbuffer = read_sha1_file(sha1, type, &size);\n\
                \tif (!buffer)\n\
                \t\tusage(\"unable to read sha1 file\");\n\
                \tif (strcmp(type, \"tree\"))\n\
                \t\tusage(\"expected a 'tree' node\");\n\
                \twhile (size) {\n\
                \t\tint len = strlen(buffer)+1;\n\
                \t\tunsigned char *sha1 = buffer + len;\n\
                \t\tchar *path = strchr(buffer, ' ')+1;\n\
                \t\tchar *data;\n\
                \t\tunsigned long filesize;\n\
                \t\tunsigned int mode;\n\
                \t\tint fd;\n\
                \n\
                \t\tif (size < len + 20 || sscanf(buffer, \"%o\", &mode) != 1)\n\
                \t\t\tusage(\"corrupt 'tree' file\");\n\
                \t\tbuffer = sha1 + 20;\n\
                \t\tsize -= len + 20;\n\
                \t\tdata = read_sha1_file(sha1, type, &filesize);\n\
                \t\tif (!data || strcmp(type, \"blob\"))\n\
                \t\t\tusage(\"tree file refers to bad file data\");\n\
                \t\tfd = create_file(path);\n\
                \t\tif (fd < 0)\n\
                \t\t\tusage(\"unable to create file\");\n\
                \t\tif (write(fd, data, filesize) != filesize)\n\
                \t\t\tusage(\"unable to write file\");\n\
                \t\tfchmod(fd, mode);\n\
                \t\tclose(fd);\n\
                \t\tfree(data);\n\
                \t}\n\
                \treturn 0;\n\
                }\n\
                \n\
                int main(int argc, char **argv)\n\
                {\n\
                \tint fd;\n\
                \tunsigned char sha1[20];\n\
                \n\
                \tif (argc != 2)\n\
                \t\tusage(\"read-tree <key>\");\n\
                \tif (get_sha1_hex(argv[1], sha1) < 0)\n\
                \t\tusage(\"read-tree <key>\");\n\
                \tsha1_file_directory = getenv(DB_ENVIRONMENT);\n\
                \tif (!sha1_file_directory)\n\
                \t\tsha1_file_directory = DEFAULT_DB_ENVIRONMENT;\n\
                \tif (unpack(sha1) < 0)\n\
                \t\tusage(\"unpack failed\");\n\
                \treturn 0;\n\
                }\n",
            ),
            vec![
                DiffHunk::matching("/*\n * GIT - The information manager from hell\n *\n * Copyright (C) Linus Torvalds, 2005\n */\n#include \"#cache.h\"\n\n"),
                DiffHunk::different(["", "static void create_directories(const char *path)\n{\n\tint len = strlen(path);\n\tchar *buf = malloc(len + 1);\n\tconst char *slash = path;\n\n\twhile ((slash = strchr(slash+1, \'/\')) != NULL) {\n\t\tlen = slash - path;\n\t\tmemcpy(buf, path, len);\n\t\tbuf[len] = 0;\n\t\tmkdir(buf, 0700);\n\t}\n}\n\nstatic int create_file(const char *path)\n{\n\tint fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);\n\tif (fd < 0) {\n\t\tif (errno == ENOENT) {\n\t\t\tcreate_directories(path);\n\t\t\tfd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);\n\t\t}\n\t}\n\treturn fd;\n}\n\n"]),
                DiffHunk::matching("static int unpack(unsigned char *sha1)\n{\n\tvoid *buffer;\n\tunsigned long size;\n\tchar type[20];\n\n\tbuffer = read_sha1_file(sha1, type, &size);\n\tif (!buffer)\n\t\tusage(\"unable to read sha1 file\");\n\tif (strcmp(type, \"tree\"))\n\t\tusage(\"expected a \'tree\' node\");\n\twhile (size) {\n\t\tint len = strlen(buffer)+1;\n\t\tunsigned char *sha1 = buffer + len;\n\t\tchar *path = strchr(buffer, \' \')+1;\n"),
                DiffHunk::different(["", "\t\tchar *data;\n\t\tunsigned long filesize;\n"]),
                DiffHunk::matching("\t\tunsigned int mode;\n"),
                DiffHunk::different(["", "\t\tint fd;\n\n"]),
                DiffHunk::matching("\t\tif (size < len + 20 || sscanf(buffer, \"%o\", &mode) != 1)\n\t\t\tusage(\"corrupt \'tree\' file\");\n\t\tbuffer = sha1 + 20;\n\t\tsize -= len + 20;\n\t\t"),
                DiffHunk::different(["printf(\"%o %s (%s)\\n\", mode, path,", "data ="]),
                DiffHunk::matching(" "),
                DiffHunk::different(["sha1_to_hex", "read_sha1_file"]),
                DiffHunk::matching("(sha1"),
                DiffHunk::different([")", ", type, &filesize);\n\t\tif (!data || strcmp(type, \"blob\"))\n\t\t\tusage(\"tree file refers to bad file data\");\n\t\tfd = create_file(path);\n\t\tif (fd < 0)\n\t\t\tusage(\"unable to create file\");\n\t\tif (write(fd, data, filesize) != filesize)\n\t\t\tusage(\"unable to write file\");\n\t\tfchmod(fd, mode);\n\t\tclose(fd);\n\t\tfree(data"]),
                DiffHunk::matching(");\n\t}\n\treturn 0;\n}\n\nint main(int argc, char **argv)\n{\n\tint fd;\n\tunsigned char sha1[20];\n\n\tif (argc != 2)\n\t\tusage(\"read-tree <key>\");\n\tif (get_sha1_hex(argv[1], sha1) < 0)\n\t\tusage(\"read-tree <key>\");\n\tsha1_file_directory = getenv(DB_ENVIRONMENT);\n\tif (!sha1_file_directory)\n\t\tsha1_file_directory = DEFAULT_DB_ENVIRONMENT;\n\tif (unpack(sha1) < 0)\n\t\tusage(\"unpack failed\");\n\treturn 0;\n}\n"),
            ]
        );
    }
}