#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
pub text: String,
pub highlight: bool,
}
impl Token {
fn new(text: impl Into<String>, highlight: bool) -> Self {
Self {
text: text.into(),
highlight,
}
}
}
const LCS_CELL_BUDGET: usize = 4_000_000;
pub fn highlight_line_pair(old: &str, new: &str) -> (Vec<Token>, Vec<Token>) {
if old == new {
return (vec![Token::new(old, false)], vec![Token::new(new, false)]);
}
let old_tokens = tokenize(old);
let new_tokens = tokenize(new);
if old_tokens.is_empty() {
let new_out = new_tokens
.into_iter()
.map(|t| Token::new(t, true))
.collect();
return (vec![Token::new(old, false)], new_out);
}
if new_tokens.is_empty() {
let old_out = old_tokens
.into_iter()
.map(|t| Token::new(t, true))
.collect();
return (old_out, vec![Token::new(new, false)]);
}
if old_tokens
.len()
.checked_mul(new_tokens.len())
.is_none_or(|n| n > LCS_CELL_BUDGET)
{
return (vec![Token::new(old, true)], vec![Token::new(new, true)]);
}
let rows = old_tokens.len();
let cols = new_tokens.len();
let mut lcs = vec![0u32; (rows + 1) * (cols + 1)];
let mut dir = vec![0u8; (rows + 1) * (cols + 1)];
for i in 1..=rows {
for j in 1..=cols {
let idx = i * (cols + 1) + j;
if old_tokens[i - 1] == new_tokens[j - 1] {
lcs[idx] = lcs[(i - 1) * (cols + 1) + (j - 1)] + 1;
dir[idx] = 0;
} else {
let up = lcs[(i - 1) * (cols + 1) + j];
let left = lcs[i * (cols + 1) + (j - 1)];
if up >= left {
lcs[idx] = up;
dir[idx] = 1;
} else {
lcs[idx] = left;
dir[idx] = 2;
}
}
}
}
let mut old_flags: Vec<bool> = vec![false; rows];
let mut new_flags: Vec<bool> = vec![false; cols];
let mut i = rows;
let mut j = cols;
while i > 0 || j > 0 {
if i > 0 && j > 0 && dir[i * (cols + 1) + j] == 0 {
i -= 1;
j -= 1;
} else if j == 0 || (i > 0 && dir[i * (cols + 1) + j] == 1) {
i -= 1;
old_flags[i] = true;
} else {
j -= 1;
new_flags[j] = true;
}
}
let old_out = merge_runs(&old_tokens, &old_flags);
let new_out = merge_runs(&new_tokens, &new_flags);
(old_out, new_out)
}
fn tokenize(s: &str) -> Vec<String> {
let mut out: Vec<String> = Vec::new();
let mut current = String::new();
for ch in s.chars() {
if ch.is_alphanumeric() || ch == '_' {
current.push(ch);
} else {
if !current.is_empty() {
out.push(std::mem::take(&mut current));
}
out.push(ch.to_string());
}
}
if !current.is_empty() {
out.push(current);
}
out
}
fn merge_runs(tokens: &[String], flags: &[bool]) -> Vec<Token> {
debug_assert_eq!(tokens.len(), flags.len());
if tokens.is_empty() {
return Vec::new();
}
let mut out: Vec<Token> = Vec::new();
let mut buf = String::new();
let mut buf_flag = flags[0];
for (tok, &flag) in tokens.iter().zip(flags.iter()) {
if flag == buf_flag {
buf.push_str(tok);
} else {
out.push(Token::new(std::mem::take(&mut buf), buf_flag));
buf.push_str(tok);
buf_flag = flag;
}
}
if !buf.is_empty() {
out.push(Token::new(buf, buf_flag));
}
out
}
#[cfg(test)]
mod tests {
use super::*;
fn concat(tokens: &[Token]) -> String {
tokens.iter().map(|t| t.text.as_str()).collect()
}
#[test]
fn word_diff_identical_lines_returns_no_highlights() {
let (old, new) = highlight_line_pair("let x = 1;", "let x = 1;");
assert_eq!(old.len(), 1);
assert_eq!(new.len(), 1);
assert!(!old[0].highlight);
assert!(!new[0].highlight);
assert_eq!(concat(&old), "let x = 1;");
assert_eq!(concat(&new), "let x = 1;");
}
#[test]
fn word_diff_single_word_change_highlights_only_that_word() {
let (old, new) = highlight_line_pair("let x = 1;", "let y = 1;");
assert_eq!(concat(&old), "let x = 1;");
assert_eq!(concat(&new), "let y = 1;");
let old_hl: Vec<&str> = old
.iter()
.filter(|t| t.highlight)
.map(|t| t.text.as_str())
.collect();
let new_hl: Vec<&str> = new
.iter()
.filter(|t| t.highlight)
.map(|t| t.text.as_str())
.collect();
assert_eq!(old_hl, vec!["x"]);
assert_eq!(new_hl, vec!["y"]);
}
#[test]
fn word_diff_insertion_at_end_highlights_only_new_words() {
let (old, new) = highlight_line_pair("hello world", "hello world again");
assert_eq!(concat(&old), "hello world");
assert_eq!(concat(&new), "hello world again");
assert!(old.iter().all(|t| !t.highlight));
let new_hl: String = new
.iter()
.filter(|t| t.highlight)
.map(|t| t.text.as_str())
.collect();
assert_eq!(new_hl, " again");
}
#[test]
fn word_diff_deletion_from_middle_highlights_only_removed_words() {
let (old, new) = highlight_line_pair("one two three", "one three");
assert_eq!(concat(&old), "one two three");
assert_eq!(concat(&new), "one three");
let old_hl: String = old
.iter()
.filter(|t| t.highlight)
.map(|t| t.text.as_str())
.collect();
assert!(
old_hl == "two " || old_hl == " two",
"unexpected old-side highlight {old_hl:?}"
);
assert!(new.iter().all(|t| !t.highlight));
}
#[test]
fn word_diff_empty_inputs_are_safe() {
let (old, new) = highlight_line_pair("", "");
assert_eq!(old.len(), 1);
assert_eq!(new.len(), 1);
assert_eq!(old[0].text, "");
assert_eq!(new[0].text, "");
assert!(!old[0].highlight);
assert!(!new[0].highlight);
let (old, new) = highlight_line_pair("", "hello");
assert_eq!(concat(&old), "");
assert_eq!(concat(&new), "hello");
assert!(new.iter().all(|t| t.highlight));
let (old, new) = highlight_line_pair("goodbye", "");
assert_eq!(concat(&old), "goodbye");
assert_eq!(concat(&new), "");
assert!(old.iter().all(|t| t.highlight));
}
#[test]
fn word_diff_determinism_same_inputs_same_output() {
let a1 = highlight_line_pair("foo bar baz", "foo BAR baz");
let a2 = highlight_line_pair("foo bar baz", "foo BAR baz");
assert_eq!(a1, a2);
}
#[test]
fn tokenize_splits_on_word_boundaries() {
let tokens = tokenize("let x = 1;");
assert_eq!(
tokens,
vec!["let", " ", "x", " ", "=", " ", "1", ";"]
.into_iter()
.map(std::string::ToString::to_string)
.collect::<Vec<_>>()
);
}
#[test]
fn tokenize_keeps_underscore_as_word_char() {
let tokens = tokenize("foo_bar baz");
assert_eq!(
tokens,
vec!["foo_bar", " ", "baz"]
.into_iter()
.map(std::string::ToString::to_string)
.collect::<Vec<_>>()
);
}
#[test]
fn word_diff_unicode_tokens_reconstruct_exactly() {
let old = "旅行 agent 🧳 ready";
let new = "旅行 agent 🌴 ready";
let (old_tokens, new_tokens) = highlight_line_pair(old, new);
assert_eq!(concat(&old_tokens), old);
assert_eq!(concat(&new_tokens), new);
let old_hl: String = old_tokens
.iter()
.filter(|t| t.highlight)
.map(|t| t.text.as_str())
.collect();
let new_hl: String = new_tokens
.iter()
.filter(|t| t.highlight)
.map(|t| t.text.as_str())
.collect();
assert!(
old_hl.contains('\u{1F9F3}'),
"expected luggage emoji highlighted on old side, got {old_hl:?}"
);
assert!(
new_hl.contains('\u{1F334}'),
"expected palm emoji highlighted on new side, got {new_hl:?}"
);
}
#[test]
fn pathological_input_falls_back_to_whole_line_highlight() {
let build = |ch: char| -> String {
let mut s = String::new();
for i in 0..3000 {
s.push(ch);
s.push_str(&i.to_string());
s.push(' ');
}
s
};
let left = build('a');
let right = build('b');
let (old, new) = highlight_line_pair(&left, &right);
assert_eq!(old.len(), 1);
assert_eq!(new.len(), 1);
assert!(old[0].highlight);
assert!(new[0].highlight);
assert_eq!(old[0].text, left);
assert_eq!(new[0].text, right);
}
}