/// A byte range within one line that should be rendered as changed.
///
/// `start` is inclusive and `end` is exclusive; both are byte offsets into
/// the string the highlight was computed from, so they are safe to use for
/// slicing (`&line[h.start..h.end]`).
//
// Added `Copy, PartialEq, Eq`: the type is a plain pair of `usize`, so the
// extra derives are free, backward-compatible, and let callers compare and
// copy highlights without cloning.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct WordHighlight {
    /// Inclusive start byte offset.
    pub start: usize,
    /// Exclusive end byte offset.
    pub end: usize,
}
/// Skip word-level diffing on lines that tokenize into more than this many
/// tokens: the LCS table below is O(m * n) in token counts.
const MAX_TOKENS: usize = 200;
/// Minimum byte-weighted similarity (matched bytes / total bytes) required
/// before intra-line highlights are produced; below this the pair is treated
/// as a full replacement and no highlights are emitted.
const MIN_SIMILARITY: f32 = 0.5;
#[cfg(test)]
pub fn compute_word_highlights(
removed: &str,
added: &str,
) -> (Vec<WordHighlight>, Vec<WordHighlight>) {
match score_pair(removed, added) {
Some((_, rh, ah)) => (rh, ah),
None => (Vec::new(), Vec::new()),
}
}
/// Scores how similar `removed` and `added` are and, when similar enough,
/// returns word-level highlight ranges for the parts that differ.
///
/// Returns `None` when either side is blank (whitespace-only), either side
/// tokenizes into more than `MAX_TOKENS` tokens, or the byte-weighted
/// similarity falls below `MIN_SIMILARITY`. Otherwise returns
/// `(similarity, removed_highlights, added_highlights)` with `similarity`
/// in `[0.0, 1.0]`.
pub fn score_pair(
removed: &str,
added: &str,
) -> Option<(f32, Vec<WordHighlight>, Vec<WordHighlight>)> {
// Blank lines carry no word-level signal; bail out early.
if removed.trim().is_empty() || added.trim().is_empty() {
return None;
}
let old_words = tokenize(removed);
let new_words = tokenize(added);
// Guard against very long lines: the LCS table is O(m * n).
if old_words.len() > MAX_TOKENS || new_words.len() > MAX_TOKENS {
return None;
}
let lcs = lcs_table(&old_words, &new_words);
// Backtrack through the LCS table from the bottom-right corner, marking
// the tokens that belong to the longest common subsequence as "matched"
// (i.e. unchanged). Everything left unmarked is part of the diff.
let mut old_matched = vec![false; old_words.len()];
let mut new_matched = vec![false; new_words.len()];
let mut i = old_words.len();
let mut j = new_words.len();
while i > 0 && j > 0 {
if old_words[i - 1].text == new_words[j - 1].text {
old_matched[i - 1] = true;
new_matched[j - 1] = true;
i -= 1;
j -= 1;
// On a tie the `else` branch moves `j`, preferring to skip tokens on the
// added side; this tie-break determines exactly which occurrences get
// highlighted when a token appears multiple times.
} else if lcs[i - 1][j] > lcs[i][j - 1] {
i -= 1;
} else {
j -= 1;
}
}
// Similarity is byte-weighted: sum the byte lengths of matched tokens on
// both sides, then divide by the combined byte length of both inputs.
// Weighting by bytes (not token count) keeps long identifiers from being
// drowned out by single-char punctuation tokens.
let unchanged: usize = old_words
.iter()
.zip(&old_matched)
.filter(|(_, m)| **m)
.map(|(t, _)| t.end - t.start)
.sum::<usize>()
+ new_words
.iter()
.zip(&new_matched)
.filter(|(_, m)| **m)
.map(|(t, _)| t.end - t.start)
.sum::<usize>();
let total = removed.len() + added.len();
// Defensive division-by-zero guard; cannot normally trigger because both
// inputs were verified non-blank above.
if total == 0 {
return None;
}
let similarity = unchanged as f32 / total as f32;
if similarity < MIN_SIMILARITY {
return None;
}
// Coalesce runs of unmatched tokens into contiguous highlight ranges.
let removed_hl = merge_unmatched(&old_words, &old_matched);
let added_hl = merge_unmatched(&new_words, &new_matched);
Some((similarity, removed_hl, added_hl))
}
/// A single lexical token: either a maximal run of "word" characters
/// (alphanumeric or `_`) or exactly one non-word character (punctuation,
/// whitespace, ...). Offsets are byte positions into the source string.
#[derive(Debug)]
struct Token {
    // Owned text so tokens from different lines can be compared directly.
    text: String,
    // Inclusive start byte offset.
    start: usize,
    // Exclusive end byte offset.
    end: usize,
}

/// Splits `s` into `Token`s covering every byte of the input: word runs
/// become one token each, and every other character becomes its own token.
fn tokenize(s: &str) -> Vec<Token> {
    let is_word = |c: char| c.is_alphanumeric() || c == '_';
    let mut tokens = Vec::new();
    let mut iter = s.char_indices().peekable();
    while let Some((start, first)) = iter.next() {
        // A token always covers at least the character we just consumed.
        let mut end = start + first.len_utf8();
        if is_word(first) {
            // Extend across the remainder of the word-character run.
            while let Some(&(pos, c)) = iter.peek() {
                if !is_word(c) {
                    break;
                }
                end = pos + c.len_utf8();
                iter.next();
            }
        }
        tokens.push(Token {
            text: s[start..end].to_string(),
            start,
            end,
        });
    }
    tokens
}
/// Builds the classic dynamic-programming table of longest-common-subsequence
/// lengths for two token sequences, comparing tokens by text.
///
/// `table[i][j]` is the LCS length of `a[..i]` and `b[..j]`; row 0 and
/// column 0 stay zero as the empty-prefix base case.
fn lcs_table(a: &[Token], b: &[Token]) -> Vec<Vec<usize>> {
    let mut table = vec![vec![0usize; b.len() + 1]; a.len() + 1];
    for (i, ta) in a.iter().enumerate() {
        for (j, tb) in b.iter().enumerate() {
            table[i + 1][j + 1] = if ta.text == tb.text {
                // Matching tokens extend the diagonal.
                table[i][j] + 1
            } else {
                // Otherwise carry forward the better of dropping one token
                // from either side.
                table[i][j + 1].max(table[i + 1][j])
            };
        }
    }
    table
}
/// Coalesces consecutive unmatched tokens into single highlight ranges.
///
/// `matched[k]` tells whether `tokens[k]` is part of the common subsequence;
/// each maximal run of unmatched tokens yields one `WordHighlight` spanning
/// from the run's first token start to its last token end.
fn merge_unmatched(tokens: &[Token], matched: &[bool]) -> Vec<WordHighlight> {
    let mut highlights = Vec::new();
    // Current open run of unmatched tokens as (start, end) byte offsets.
    let mut run: Option<(usize, usize)> = None;
    for (tok, &is_matched) in tokens.iter().zip(matched) {
        if is_matched {
            // A matched token closes any open run.
            if let Some((start, end)) = run.take() {
                highlights.push(WordHighlight { start, end });
            }
        } else if let Some((_, end)) = run.as_mut() {
            // Extend the open run to cover this token too.
            *end = tok.end;
        } else {
            // Start a new run at this token.
            run = Some((tok.start, tok.end));
        }
    }
    // Flush a run that reaches the end of the token list.
    if let Some((start, end)) = run {
        highlights.push(WordHighlight { start, end });
    }
    highlights
}
#[cfg(test)]
mod tests {
use super::*;
// A single differing word on each side should yield exactly one highlight
// per side, covering just that word's bytes.
#[test]
fn simple_word_change() {
let (rem, add) = compute_word_highlights("fn foo() {}", "fn bar() {}");
assert_eq!(rem.len(), 1);
assert_eq!(&"fn foo() {}"[rem[0].start..rem[0].end], "foo");
assert_eq!(add.len(), 1);
assert_eq!(&"fn bar() {}"[add[0].start..add[0].end], "bar");
}
// Identical lines have nothing to highlight.
#[test]
fn no_change() {
let (rem, add) = compute_word_highlights("hello world", "hello world");
assert!(rem.is_empty());
assert!(add.is_empty());
}
// Completely different content (similarity 0) must not produce a word
// diff; the pair is treated as a full replacement.
#[test]
fn full_change_skips_word_diff() {
let (rem, add) = compute_word_highlights("aaa", "bbb");
assert!(rem.is_empty());
assert!(add.is_empty());
}
// Lines that share some tokens (fn, parens, etc.) but fall below
// MIN_SIMILARITY by byte weight must also skip word-level highlighting.
#[test]
fn below_threshold_skips() {
let (rem, add) = compute_word_highlights(
"fn totally_different_function_name_here(x: i32)",
"fn some_other_completely_unrelated_name(y: String, z: bool)",
);
assert!(rem.is_empty());
assert!(add.is_empty());
}
}