use std::collections::HashSet;
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Segment {
Static(String),
Field(String),
}
impl Segment {
pub fn text(&self) -> &str {
match self {
Segment::Static(t) | Segment::Field(t) => t,
}
}
}
pub fn diff(original: &str, corrected: &str) -> Vec<Segment> {
let orig_toks = tokenize(original);
let corr_toks = tokenize(corrected);
let orig_words: Vec<&str> = orig_toks.iter().filter_map(Tok::word).collect();
let corr_words: Vec<&str> = corr_toks.iter().filter_map(Tok::word).collect();
let matched = lcs_matched_b_indices(&orig_words, &corr_words);
let mut segments: Vec<Segment> = Vec::new();
let mut corr_word_idx = 0usize;
for tok in &corr_toks {
match tok {
Tok::Sep(s) => push_static(&mut segments, s),
Tok::Word(w) => {
let unchanged = matched.contains(&corr_word_idx);
corr_word_idx += 1;
if unchanged {
push_static(&mut segments, w);
} else {
segments.push(Segment::Field(w.clone()));
}
}
}
}
segments
}
pub fn reconstruct(segments: &[Segment]) -> String {
let mut out = String::new();
for seg in segments {
out.push_str(seg.text());
}
out
}
pub fn field_start_offset(segments: &[Segment], ordinal: usize) -> Option<usize> {
let mut offset = 0usize;
let mut seen = 0usize;
for seg in segments {
if let Segment::Field(_) = seg {
if seen == ordinal {
return Some(offset);
}
seen += 1;
}
offset += seg.text().len();
}
None
}
pub fn changed_word_ranges(text: &str, other: &str) -> Vec<(usize, usize)> {
let text_spans = word_spans(text);
let other_words: Vec<&str> = word_spans(other).into_iter().map(|s| s.0).collect();
let text_words: Vec<&str> = text_spans.iter().map(|s| s.0).collect();
let matched = lcs_matched_b_indices(&other_words, &text_words);
text_spans
.iter()
.enumerate()
.filter(|(i, _)| !matched.contains(i))
.map(|(_, &(_, s, e))| (s, e))
.collect()
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct AlignLayout {
pub col_widths: Vec<usize>,
pub orig_cols: Vec<usize>,
pub corr_cols: Vec<usize>,
}
pub fn align(original: &str, corrected: &str) -> Option<AlignLayout> {
let orig_owned = word_list(original);
let corr_owned = word_list(corrected);
if orig_owned.is_empty() && corr_owned.is_empty() {
return None;
}
let orig: Vec<&str> = orig_owned.iter().map(String::as_str).collect();
let corr: Vec<&str> = corr_owned.iter().map(String::as_str).collect();
let oe = word_extents(original);
let ce = word_extents(corrected);
let mut layout = AlignLayout {
col_widths: Vec::new(),
orig_cols: vec![0; orig.len()],
corr_cols: vec![0; corr.len()],
};
let (mut i, mut j) = (0usize, 0usize);
for (mi, mj) in lcs_matched_pairs(&orig, &corr) {
emit_run(&mut layout, &oe, &ce, &mut i, &mut j, mi, mj);
let col = layout.col_widths.len();
layout.orig_cols[i] = col;
layout.corr_cols[j] = col;
layout.col_widths.push(oe[i].max(ce[j]));
i += 1;
j += 1;
}
emit_run(
&mut layout,
&oe,
&ce,
&mut i,
&mut j,
orig.len(),
corr.len(),
);
Some(layout)
}
fn emit_run(
layout: &mut AlignLayout,
oe: &[usize],
ce: &[usize],
i: &mut usize,
j: &mut usize,
ai: usize,
aj: usize,
) {
let (da, db) = (ai - *i, aj - *j);
let sub = da.min(db);
for k in 0..sub {
let col = layout.col_widths.len();
layout.orig_cols[*i + k] = col;
layout.corr_cols[*j + k] = col;
layout.col_widths.push(oe[*i + k].max(ce[*j + k]));
}
for k in sub..da {
let col = layout.col_widths.len();
layout.orig_cols[*i + k] = col;
layout.col_widths.push(oe[*i + k]);
}
for k in sub..db {
let col = layout.col_widths.len();
layout.corr_cols[*j + k] = col;
layout.col_widths.push(ce[*j + k]);
}
*i = ai;
*j = aj;
}
pub fn split_separator(sep: &str) -> (&str, &str) {
let end = sep.find(char::is_whitespace).unwrap_or(sep.len());
sep.split_at(end)
}
fn word_extents(s: &str) -> Vec<usize> {
let toks = split_tokens(s);
let mut out = Vec::new();
for (idx, (is_word, text)) in toks.iter().enumerate() {
if *is_word {
let punct = toks
.get(idx + 1)
.filter(|(next_is_word, _)| !next_is_word)
.map(|(_, sep)| split_separator(sep).0.chars().count())
.unwrap_or(0);
out.push(text.chars().count() + punct);
}
}
out
}
pub fn word_at(s: &str, byte: usize) -> Option<(usize, usize)> {
word_spans(s)
.into_iter()
.find(|&(_, st, e)| byte >= st && byte < e)
.map(|(_, st, e)| (st, e))
}
pub fn word_byte_ranges(s: &str) -> Vec<(usize, usize)> {
word_spans(s)
.into_iter()
.map(|(_, st, e)| (st, e))
.collect()
}
pub fn word_list(s: &str) -> Vec<String> {
word_spans(s)
.into_iter()
.map(|(w, _, _)| w.to_string())
.collect()
}
pub fn split_tokens(s: &str) -> Vec<(bool, String)> {
tokenize(s)
.into_iter()
.map(|t| match t {
Tok::Word(w) => (true, w),
Tok::Sep(s) => (false, s),
})
.collect()
}
fn word_spans(s: &str) -> Vec<(&str, usize, usize)> {
let mut out = Vec::new();
let mut start = 0usize;
let mut in_word = false;
for (i, c) in s.char_indices() {
let w = is_word_char(c);
if w && !in_word {
start = i;
in_word = true;
} else if !w && in_word {
out.push((&s[start..i], start, i));
in_word = false;
}
}
if in_word {
out.push((&s[start..], start, s.len()));
}
out
}
fn push_static(segments: &mut Vec<Segment>, text: &str) {
if let Some(Segment::Static(last)) = segments.last_mut() {
last.push_str(text);
} else {
segments.push(Segment::Static(text.to_string()));
}
}
enum Tok {
Word(String),
Sep(String),
}
impl Tok {
fn word(&self) -> Option<&str> {
match self {
Tok::Word(w) => Some(w),
Tok::Sep(_) => None,
}
}
}
fn tokenize(s: &str) -> Vec<Tok> {
let mut out = Vec::new();
let mut cur = String::new();
let mut cur_is_word: Option<bool> = None;
for c in s.chars() {
let is_word = is_word_char(c);
match cur_is_word {
Some(prev) if prev == is_word => cur.push(c),
Some(prev) => {
out.push(finish(prev, std::mem::take(&mut cur)));
cur.push(c);
cur_is_word = Some(is_word);
}
None => {
cur.push(c);
cur_is_word = Some(is_word);
}
}
}
if let Some(prev) = cur_is_word {
out.push(finish(prev, cur));
}
out
}
fn finish(is_word: bool, text: String) -> Tok {
if is_word {
Tok::Word(text)
} else {
Tok::Sep(text)
}
}
fn is_word_char(c: char) -> bool {
c.is_alphanumeric() || c == '\''
}
fn lcs_matched_b_indices(a: &[&str], b: &[&str]) -> HashSet<usize> {
lcs_matched_pairs(a, b)
.into_iter()
.map(|(_, j)| j)
.collect()
}
fn lcs_matched_pairs(a: &[&str], b: &[&str]) -> Vec<(usize, usize)> {
let (n, m) = (a.len(), b.len());
let mut dp = vec![vec![0usize; m + 1]; n + 1];
for i in (0..n).rev() {
for j in (0..m).rev() {
dp[i][j] = if a[i] == b[j] {
dp[i + 1][j + 1] + 1
} else {
dp[i + 1][j].max(dp[i][j + 1])
};
}
}
let mut pairs = Vec::new();
let (mut i, mut j) = (0, 0);
while i < n && j < m {
if a[i] == b[j] {
pairs.push((i, j));
i += 1;
j += 1;
} else if dp[i + 1][j] >= dp[i][j + 1] {
i += 1;
} else {
j += 1;
}
}
pairs
}
#[cfg(test)]
mod tests {
use super::*;
fn fields(segments: &[Segment]) -> Vec<&str> {
segments
.iter()
.filter_map(|s| match s {
Segment::Field(t) => Some(t.as_str()),
Segment::Static(_) => None,
})
.collect()
}
#[test]
fn single_word_substitution_is_one_field() {
let segs = diff("recieve", "receive");
assert_eq!(segs, vec![Segment::Field("receive".into())]);
assert_eq!(reconstruct(&segs), "receive");
}
#[test]
fn only_the_changed_word_is_editable() {
let segs = diff("teh quick", "the quick");
assert_eq!(fields(&segs), vec!["the"]);
assert_eq!(
segs,
vec![
Segment::Field("the".into()),
Segment::Static(" quick".into())
]
);
assert_eq!(reconstruct(&segs), "the quick");
}
#[test]
fn an_inserted_word_becomes_a_field() {
let segs = diff("the fox", "the quick fox");
assert_eq!(fields(&segs), vec!["quick"]);
assert_eq!(reconstruct(&segs), "the quick fox");
}
#[test]
fn a_deleted_word_leaves_no_field() {
let segs = diff("the quick fox", "the fox");
assert!(fields(&segs).is_empty());
assert_eq!(reconstruct(&segs), "the fox");
}
#[test]
fn case_only_change_is_editable() {
let segs = diff("hello", "Hello");
assert_eq!(fields(&segs), vec!["Hello"]);
}
#[test]
fn separator_only_change_has_no_fields() {
let segs = diff("hello world", "hello, world");
assert!(fields(&segs).is_empty());
assert_eq!(reconstruct(&segs), "hello, world");
}
#[test]
fn multi_word_correction_marks_each_changed_word() {
let segs = diff("i went too the stor", "I went to the store");
assert_eq!(fields(&segs), vec!["I", "to", "store"]);
assert_eq!(reconstruct(&segs), "I went to the store");
}
#[test]
fn field_start_offset_points_at_each_field() {
let segs = diff("i went too the stor", "I went to the store");
assert_eq!(field_start_offset(&segs, 0), Some(0)); assert_eq!(field_start_offset(&segs, 1), Some(7)); assert_eq!(field_start_offset(&segs, 2), Some(14)); assert_eq!(field_start_offset(&segs, 3), None);
}
#[test]
fn reconstruct_round_trips_for_varied_pairs() {
let pairs = [
("teh quick brown fox", "the quick brown fox"),
("recieve", "receive"),
("its a test", "it's a test"),
("hello world", "hello, world"),
("the fox", "the quick brown fox"),
("a b c d e", "a x c y e"),
];
for (o, c) in pairs {
assert_eq!(reconstruct(&diff(o, c)), c, "round-trip failed for {c:?}");
}
}
#[test]
fn multibyte_words_round_trip() {
let segs = diff("cafe au lait", "café au lait");
assert_eq!(fields(&segs), vec!["café"]);
assert_eq!(reconstruct(&segs), "café au lait");
assert_eq!(field_start_offset(&segs, 0), Some(0));
}
#[test]
fn align_substitutions_share_columns() {
let a = align("teh quick browne fox jumpd", "the quick brown fox jumped").unwrap();
assert_eq!(a.col_widths, vec![3, 5, 6, 3, 6]);
assert_eq!(a.orig_cols, vec![0, 1, 2, 3, 4]);
assert_eq!(a.corr_cols, vec![0, 1, 2, 3, 4]);
}
#[test]
fn align_inserted_word_gets_its_own_column() {
let a = align("the fox", "the quick fox").unwrap();
assert_eq!(a.col_widths, vec![3, 5, 3]);
assert_eq!(a.orig_cols, vec![0, 2]); assert_eq!(a.corr_cols, vec![0, 1, 2]); }
#[test]
fn align_deleted_word_gets_its_own_column() {
let a = align("the quick fox", "the fox").unwrap();
assert_eq!(a.col_widths, vec![3, 5, 3]);
assert_eq!(a.orig_cols, vec![0, 1, 2]); assert_eq!(a.corr_cols, vec![0, 2]); }
#[test]
fn align_split_word_substitutes_then_inserts() {
let a = align("i eat alot of food", "I eat a lot of food").unwrap();
assert_eq!(a.col_widths, vec![1, 3, 4, 3, 2, 4]);
assert_eq!(a.orig_cols, vec![0, 1, 2, 4, 5]); assert_eq!(a.corr_cols, vec![0, 1, 2, 3, 4, 5]); }
#[test]
fn align_folds_trailing_punctuation_into_the_column() {
let a = align("well i think", "well, I think").unwrap();
assert_eq!(a.col_widths, vec![5, 1, 5]);
assert_eq!(a.orig_cols, vec![0, 1, 2]);
assert_eq!(a.corr_cols, vec![0, 1, 2]);
}
#[test]
fn split_separator_peels_leading_punctuation() {
assert_eq!(split_separator(", "), (",", " "));
assert_eq!(split_separator(" "), ("", " "));
assert_eq!(split_separator("?"), ("?", ""));
assert_eq!(split_separator(". "), (".", " "));
}
#[test]
fn align_none_only_when_wordless() {
assert!(align("", "").is_none());
assert!(align("hi", "").is_some());
assert!(align("", "hi").is_some());
}
#[test]
fn changed_word_ranges_marks_differing_words() {
let r = changed_word_ranges("teh quick browne fox", "the quick brown fox");
assert_eq!(r, vec![(0, 3), (10, 16)]); let r2 = changed_word_ranges("the quick brown fox", "teh quick browne fox");
assert_eq!(r2, vec![(0, 3), (10, 15)]); }
}