use std::{hash::Hash, ops::Range};
use rustc_hash::FxHashMap;
use crate::utils::ChangeTag;
struct ClearableArray {
values: Vec<usize>,
generations: Vec<u32>,
generation: u32,
}
impl ClearableArray {
fn new(len: usize) -> Self {
Self {
values: vec![0; len],
generations: vec![0; len],
generation: 1,
}
}
fn clear(&mut self) {
if let Some(new_gen) = self.generation.checked_add(1) {
self.generation = new_gen;
} else {
let len = self.values.len();
self.generation = 1;
self.generations = vec![0; len];
}
}
}
impl std::ops::Index<usize> for ClearableArray {
type Output = usize;
fn index(&self, i: usize) -> &usize {
if self.generations[i] == self.generation {
&self.values[i]
} else {
&0
}
}
}
impl std::ops::IndexMut<usize> for ClearableArray {
fn index_mut(&mut self, i: usize) -> &mut usize {
if self.generations[i] != self.generation {
self.values[i] = 0;
self.generations[i] = self.generation;
}
&mut self.values[i]
}
}
type ABRange = (Range<usize>, Range<usize>);
struct Differ<'a, T: Hash + Eq> {
a: &'a [T],
b: &'a [T],
b_indices: FxHashMap<&'a T, Vec<usize>>,
last_row: ClearableArray,
this_row: ClearableArray,
}
impl<'a, T: Hash + Eq> Differ<'a, T> {
fn new(a: &'a [T], b: &'a [T]) -> Self {
Self {
a,
b,
b_indices: Self::build_elem_indices(b),
last_row: ClearableArray::new(b.len()),
this_row: ClearableArray::new(b.len()),
}
}
fn build_elem_indices(list: &'a [T]) -> FxHashMap<&'a T, Vec<usize>> {
let mut elem_indices = FxHashMap::default();
for (i, elem) in list.iter().enumerate() {
elem_indices
.entry(elem)
.and_modify(|indices: &mut Vec<usize>| indices.push(i))
.or_insert_with(|| vec![i]);
}
let len = list.len();
if len >= 200 {
let max_count = 1 + len / 100;
elem_indices.retain(|_, indices| indices.len() <= max_count);
}
elem_indices
}
#[doc = include_str!("difflib_find_longest_match.md")]
fn find_longest_match(&mut self, (a_range, b_range): ABRange) -> Option<ABRange> {
let mut longest_a_b = None;
let mut longest_match = 0;
self.this_row.clear();
self.last_row.clear();
for i in a_range {
std::mem::swap(&mut self.this_row, &mut self.last_row);
self.this_row.clear();
let a_elem = &self.a[i];
let Some(b_positions) = self.b_indices.get(a_elem) else {
continue;
};
for &j in b_positions {
if j < b_range.start {
continue;
}
if j >= b_range.end {
break;
}
let match_len = if j > 0 { self.last_row[j - 1] + 1 } else { 1 };
self.this_row[j] = match_len;
if match_len > longest_match {
let longest_a_range = i - (match_len - 1)..i + 1;
let longest_b_range = j - (match_len - 1)..j + 1;
longest_a_b = Some((longest_a_range, longest_b_range));
longest_match = match_len;
}
}
}
longest_a_b
}
fn expand_match_within_window(&self, window: ABRange, match_block: ABRange) -> ABRange {
let (window_a, window_b) = window;
let (a_range, b_range) = match_block;
if a_range.is_empty() || b_range.is_empty() {
return (a_range, b_range);
}
let mut expanded_a = a_range;
let mut expanded_b = b_range;
while expanded_a.start > window_a.start
&& expanded_b.start > window_b.start
&& self.a[expanded_a.start - 1] == self.b[expanded_b.start - 1]
{
expanded_a.start -= 1;
expanded_b.start -= 1;
}
while expanded_a.end < window_a.end
&& expanded_b.end < window_b.end
&& self.a[expanded_a.end] == self.b[expanded_b.end]
{
expanded_a.end += 1;
expanded_b.end += 1;
}
(expanded_a, expanded_b)
}
fn find_matching_blocks(&mut self) -> Vec<ABRange> {
let mut match_blocks = Vec::new();
let mut work_queue = vec![(0..self.a.len(), 0..self.b.len())];
while let Some(window) = work_queue.pop() {
if let Some(match_block) = self.find_longest_match(window.clone()) {
let match_block = self.expand_match_within_window(window.clone(), match_block);
match_blocks.push(match_block.clone());
let left = (
window.0.start..match_block.0.start,
window.1.start..match_block.1.start,
);
let right = (
match_block.0.end..window.0.end,
match_block.1.end..window.1.end,
);
if !left.0.is_empty() && !left.1.is_empty() {
work_queue.push(left);
}
if !right.0.is_empty() && !right.1.is_empty() {
work_queue.push(right);
}
}
}
match_blocks.sort_by_key(|(a_range, _)| a_range.start);
match_blocks
}
fn push_gap_diffops(
&self,
diff_ops: &'_ mut Vec<(ChangeTag, &'a T)>,
(a_range, b_range): ABRange,
) {
for i in a_range {
diff_ops.push((ChangeTag::Delete, &self.a[i]));
}
for j in b_range {
diff_ops.push((ChangeTag::Insert, &self.b[j]));
}
}
fn push_equal_diffops(
&self,
diff_ops: &'_ mut Vec<(ChangeTag, &'a T)>,
(a_range, _b_range): ABRange,
) {
for i in a_range {
diff_ops.push((ChangeTag::Equal, &self.a[i]));
}
}
}
pub fn compare<'a, T: Hash + Eq>(a: &'a [T], b: &'a [T]) -> Vec<(ChangeTag, &'a T)> {
let mut differ = Differ::new(a, b);
let matching_blocks = differ.find_matching_blocks();
let mut diff_ops = Vec::with_capacity(a.len() + b.len());
let (mut last_a, mut last_b) = (0, 0);
for matching_block in matching_blocks {
let (a_range, b_range) = matching_block.clone();
debug_assert!(
last_a <= a_range.start,
"matching blocks overlap or are out of order in a: previous end {}, current {:?}",
last_a,
a_range
);
debug_assert!(
last_b <= b_range.start,
"matching blocks overlap or are out of order in b: previous end {}, current {:?}",
last_b,
b_range
);
debug_assert_eq!(
a_range.len(),
b_range.len(),
"equal blocks must have the same length: {:?} vs {:?}",
a_range,
b_range
);
differ.push_gap_diffops(
&mut diff_ops,
(last_a..a_range.start, last_b..b_range.start),
);
differ.push_equal_diffops(&mut diff_ops, matching_block);
last_a = a_range.end;
last_b = b_range.end;
}
differ.push_gap_diffops(&mut diff_ops, (last_a..a.len(), last_b..b.len()));
diff_ops
}
#[cfg(test)]
mod tests {
use super::*;
fn format_ops<T: std::fmt::Debug>(ops: &[(ChangeTag, &'_ T)]) -> Vec<String> {
ops.iter()
.map(|op| {
let prefix = match op.0 {
ChangeTag::Equal => " ",
ChangeTag::Insert => "+ ",
ChangeTag::Delete => "- ",
};
format!("{prefix}{:?}", op.1)
})
.collect()
}
fn tags<T>(ops: &[(ChangeTag, &'_ T)]) -> Vec<ChangeTag> {
ops.iter().map(|op| op.0).collect()
}
#[test]
fn empty_sequences() {
let empty: &[&str] = &[];
assert!(compare::<&str>(empty, empty).is_empty());
}
#[test]
fn one_empty_sequence() {
let a: &[&str] = &["x", "y"];
let empty: &[&str] = &[];
let ops = compare(a, empty);
assert_eq!(tags(&ops), vec![ChangeTag::Delete, ChangeTag::Delete]);
let ops = compare(empty, a);
assert_eq!(tags(&ops), vec![ChangeTag::Insert, ChangeTag::Insert]);
}
#[test]
fn identical_sequences() {
let a = &["a", "b", "c"];
let ops = compare(a, a);
assert_eq!(tags(&ops), vec![ChangeTag::Equal; 3]);
}
#[test]
fn completely_different() {
let a = &["a", "b"];
let b = &["x", "y", "z"];
let ops = compare(a, b);
assert_eq!(
tags(&ops),
vec![
ChangeTag::Delete,
ChangeTag::Delete,
ChangeTag::Insert,
ChangeTag::Insert,
ChangeTag::Insert,
]
);
}
#[test]
fn simple_insert() {
let a = &["a", "c"];
let b = &["a", "b", "c"];
let ops = compare(a, b);
assert_eq!(
tags(&ops),
vec![ChangeTag::Equal, ChangeTag::Insert, ChangeTag::Equal]
);
assert_eq!(ops[1].1, &"b");
}
#[test]
fn simple_delete() {
let a = &["a", "b", "c"];
let b = &["a", "c"];
let ops = compare(a, b);
assert_eq!(
tags(&ops),
vec![ChangeTag::Equal, ChangeTag::Delete, ChangeTag::Equal]
);
assert_eq!(ops[1].1, &"b");
}
#[test]
fn replace_shorter_deletes_first() {
let a = &["a", "x", "b"];
let b = &["a", "w", "y", "z", "b"];
let ops = compare(a, b);
assert_eq!(
format_ops(&ops),
vec![
" \"a\"", "- \"x\"", "+ \"w\"", "+ \"y\"", "+ \"z\"", " \"b\"",
]
);
}
#[test]
fn replace_equal_length_deletes_first() {
let a = &["a", "x", "b"];
let b = &["a", "y", "b"];
let ops = compare(a, b);
assert_eq!(
format_ops(&ops),
vec![" \"a\"", "- \"x\"", "+ \"y\"", " \"b\""]
);
}
#[test]
fn python_docstring_example() {
let a: Vec<char> = "qabxcd".chars().collect();
let b: Vec<char> = "abycdf".chars().collect();
let ops = compare(&a, &b);
let formatted: Vec<String> = ops
.iter()
.map(|op| {
let prefix = match op.0 {
ChangeTag::Equal => " ",
ChangeTag::Insert => "+",
ChangeTag::Delete => "-",
};
format!("{prefix}{}", op.1)
})
.collect();
assert_eq!(
formatted,
vec!["-q", " a", " b", "-x", "+y", " c", " d", "+f"]
);
}
#[test]
fn no_autojunk_below_200() {
let b199: Vec<&str> = vec!["pop"; 199];
let differ = Differ::new(&["pop"], &b199);
assert!(
differ.b_indices.contains_key(&"pop"),
"'pop' should be in b2j when b.len() < 200"
);
let b200: Vec<&str> = vec!["pop"; 200];
let differ = Differ::new(&["pop"], &b200);
assert!(
!differ.b_indices.contains_key(&"pop"),
"'pop' should be excluded from b2j when b.len() >= 200"
);
}
#[test]
fn autojunk_excludes_popular_from_anchoring() {
let n = 200;
let mut b: Vec<&str> = vec!["pop"; n];
b[100] = "rare";
let a = &["pop", "rare", "pop"];
let differ = Differ::new(a, &b);
assert!(
!differ.b_indices.contains_key(&"pop"),
"popular element 'pop' should be excluded from b2j"
);
assert!(
differ.b_indices.contains_key(&"rare"),
"rare element should remain in b2j"
);
}
#[test]
fn autojunk_extends_into_popular_tokens() {
let n = 200;
let mut b: Vec<&str> = vec!["pop"; n]; b[50] = "rare_left";
b[51] = "anchor";
b[52] = "rare_right";
let a = &["pop", "rare_left", "anchor", "rare_right", "pop"];
let mut differ = Differ::new(a, &b);
assert!(!differ.b_indices.contains_key(&"pop"));
let match_block = differ
.find_longest_match((0..a.len(), 0..b.len()))
.expect("match should be found");
let (a_range, b_range) =
differ.expand_match_within_window((0..a.len(), 0..b.len()), match_block);
assert_eq!(
a_range.len(),
5,
"match should extend into adjacent popular 'pop' tokens"
);
assert_eq!(a_range.start, 0);
assert_eq!(b_range.start, 49); }
#[test]
fn works_with_integers() {
let a = &[1, 2, 3, 4, 5];
let b = &[1, 3, 4, 6];
let ops = compare(a, b);
assert_eq!(
ops,
vec![
(ChangeTag::Equal, &1),
(ChangeTag::Delete, &2),
(ChangeTag::Equal, &3),
(ChangeTag::Equal, &4),
(ChangeTag::Delete, &5),
(ChangeTag::Insert, &6),
]
);
}
#[test]
fn repeated_elements() {
let a = &["a", "b", "a", "b", "c"];
let b = &["a", "b", "c", "a", "b"];
let ops = compare(a, b);
let formatted: Vec<String> = ops
.iter()
.map(|op| {
let c = match op.0 {
ChangeTag::Equal => ' ',
ChangeTag::Insert => '+',
ChangeTag::Delete => '-',
};
format!("{c}{}", op.1)
})
.collect();
assert_eq!(formatted, vec!["-a", "-b", " a", " b", " c", "+a", "+b"]);
}
#[test]
fn matching_blocks_do_not_overlap() {
let a = &["a", "b", "a"];
let b = &["a", "b", "b", "a"];
let mut differ = Differ::new(a, b);
let matches = differ.find_matching_blocks();
assert_eq!(matches.len(), 2);
let first = &matches[0];
let second = &matches[1];
assert!(
first.0.end <= second.0.start,
"matching blocks overlap in a: {matches:?}"
);
assert!(
first.1.end <= second.1.start,
"matching blocks overlap in b: {matches:?}"
);
}
}