use similar::{ChangeTag, TextDiff};
const STITCH_DELIMITER: &str = "\n\n";
#[derive(Debug, Clone)]
pub struct Unit {
pub id: String,
pub position: usize,
pub text: String,
}
#[derive(Debug, Clone)]
pub struct StitchedPair {
pub previous_snippet: String,
pub current_snippet: String,
pub stitched: bool,
pub stitched_previous_ids: Vec<String>,
pub stitched_current_ids: Vec<String>,
pub stitched_previous_positions: Vec<usize>,
pub stitched_current_positions: Vec<usize>,
}
pub fn focused_pair(
previous_text: &str,
current_text: &str,
snippet_max: usize,
context_tokens: usize,
) -> (String, String) {
let previous = normalize_text(previous_text);
let current = normalize_text(current_text);
if previous == current {
return (truncate(&previous, snippet_max), truncate(¤t, snippet_max));
}
let prev_tokens: Vec<&str> = previous.split_whitespace().collect();
let curr_tokens: Vec<&str> = current.split_whitespace().collect();
let (prev_changed, curr_changed) = changed_positions(&prev_tokens, &curr_tokens);
if prev_changed.is_empty() && curr_changed.is_empty() {
return (truncate(&previous, snippet_max), truncate(¤t, snippet_max));
}
let prev_span = minmax_or_full(&prev_changed, prev_tokens.len());
let curr_span = minmax_or_full(&curr_changed, curr_tokens.len());
let prev_snippet = token_excerpt_with_delta(
&prev_tokens,
prev_span.0,
prev_span.1,
snippet_max,
context_tokens,
);
let curr_snippet = token_excerpt_with_delta(
&curr_tokens,
curr_span.0,
curr_span.1,
snippet_max,
context_tokens,
);
(prev_snippet, curr_snippet)
}
pub fn boundary_aware_stitched_pair(
previous_units: &[Unit],
current_units: &[Unit],
target_previous_id: &str,
target_current_id: &str,
snippet_max: usize,
context_tokens: usize,
stitch_radius: usize,
stitch_max_chars: usize,
boundary_threshold_chars: usize,
) -> StitchedPair {
let target_previous = previous_units.iter().find(|u| u.id == target_previous_id);
let target_current = current_units.iter().find(|u| u.id == target_current_id);
let prev_target_text = normalize_text(target_previous.map(|u| u.text.as_str()).unwrap_or(""));
let curr_target_text = normalize_text(target_current.map(|u| u.text.as_str()).unwrap_or(""));
let boundary_sensitive = is_boundary_sensitive_change(
&prev_target_text,
&curr_target_text,
boundary_threshold_chars,
);
let radius = if boundary_sensitive { stitch_radius } else { 0 };
let prev_window = window_for_units(previous_units, target_previous_id, radius);
let curr_window = window_for_units(current_units, target_current_id, radius);
let mut prev_text = stitched_text(&prev_window, stitch_max_chars);
let mut curr_text = stitched_text(&curr_window, stitch_max_chars);
if prev_text.is_empty() {
prev_text = prev_target_text.clone();
}
if curr_text.is_empty() {
curr_text = curr_target_text.clone();
}
let (prev_snippet, curr_snippet) =
focused_pair(&prev_text, &curr_text, snippet_max, context_tokens);
let stitched = boundary_sensitive && (prev_window.len() > 1 || curr_window.len() > 1);
StitchedPair {
previous_snippet: prev_snippet,
current_snippet: curr_snippet,
stitched,
stitched_previous_ids: prev_window.iter().map(|u| u.id.clone()).collect(),
stitched_current_ids: curr_window.iter().map(|u| u.id.clone()).collect(),
stitched_previous_positions: prev_window.iter().map(|u| u.position).collect(),
stitched_current_positions: curr_window.iter().map(|u| u.position).collect(),
}
}
fn changed_positions(old_tokens: &[&str], new_tokens: &[&str]) -> (Vec<usize>, Vec<usize>) {
let diff = TextDiff::from_slices(old_tokens, new_tokens);
let mut old_pos = 0usize;
let mut new_pos = 0usize;
let mut old_changed = Vec::new();
let mut new_changed = Vec::new();
for change in diff.iter_all_changes() {
match change.tag() {
ChangeTag::Equal => {
old_pos += 1;
new_pos += 1;
}
ChangeTag::Delete => {
old_changed.push(old_pos);
old_pos += 1;
}
ChangeTag::Insert => {
new_changed.push(new_pos);
new_pos += 1;
}
}
}
(old_changed, new_changed)
}
fn minmax_or_full(positions: &[usize], seq_len: usize) -> (usize, usize) {
if positions.is_empty() || seq_len == 0 {
return (0, seq_len.saturating_sub(1));
}
let min = *positions.iter().min().unwrap();
let max = *positions.iter().max().unwrap();
(min, max)
}
fn token_excerpt_with_delta(
tokens: &[&str],
changed_start: usize,
changed_end: usize,
snippet_max: usize,
context_tokens: usize,
) -> String {
if tokens.is_empty() {
return String::new();
}
let start_index = changed_start.saturating_sub(context_tokens);
let end_index = (changed_end + context_tokens).min(tokens.len() - 1);
let excerpt = &tokens[start_index..=end_index];
if excerpt.is_empty() {
return String::new();
}
let local_start = changed_start.saturating_sub(start_index);
let local_end = (changed_end - start_index).min(excerpt.len() - 1);
let local_end = local_end.max(local_start);
let prefix = &excerpt[..local_start];
let delta = &excerpt[local_start..=local_end];
let suffix = if local_end + 1 < excerpt.len() { &excerpt[local_end + 1..] } else { &[] };
let mut parts: Vec<String> = Vec::new();
if !prefix.is_empty() {
parts.push(prefix.join(" "));
}
if !delta.is_empty() {
parts.push(format!("[[{}]]", delta.join(" ")));
}
if !suffix.is_empty() {
parts.push(suffix.join(" "));
}
let mut text = parts.join(" ");
if start_index > 0 {
text = format!("...{}", text);
}
if end_index < tokens.len() - 1 {
text = format!("{}...", text);
}
truncate(&text, snippet_max)
}
fn window_for_units<'a>(units: &'a [Unit], target_id: &str, radius: usize) -> Vec<&'a Unit> {
let Some(idx) = units.iter().position(|u| u.id == target_id) else {
return Vec::new();
};
let start = idx.saturating_sub(radius);
let end = (idx + radius).min(units.len() - 1);
units[start..=end].iter().collect()
}
fn stitched_text(units: &[&Unit], max_chars: usize) -> String {
let raw: String = units
.iter()
.map(|u| normalize_text(&u.text))
.collect::<Vec<_>>()
.join(STITCH_DELIMITER);
truncate(&normalize_text(&raw), max_chars)
}
fn is_boundary_sensitive_change(prev: &str, curr: &str, threshold_chars: usize) -> bool {
if prev.is_empty() || curr.is_empty() || prev == curr {
return false;
}
let prefix = common_prefix_len(prev, curr);
let suffix = common_suffix_len(prev, curr, prefix);
prefix <= threshold_chars || suffix <= threshold_chars
}
fn common_prefix_len(a: &str, b: &str) -> usize {
a.chars()
.zip(b.chars())
.take_while(|(x, y)| x == y)
.count()
}
fn common_suffix_len(a: &str, b: &str, prefix: usize) -> usize {
let a_chars: Vec<char> = a.chars().collect();
let b_chars: Vec<char> = b.chars().collect();
let mut ai = a_chars.len();
let mut bi = b_chars.len();
let mut count = 0;
while ai > prefix && bi > prefix && a_chars[ai - 1] == b_chars[bi - 1] {
count += 1;
ai -= 1;
bi -= 1;
}
count
}
fn normalize_text(text: &str) -> String {
let s: String = text
.split_whitespace()
.collect::<Vec<_>>()
.join(" ");
s
}
fn truncate(text: &str, max: usize) -> String {
if max == 0 {
return String::new();
}
let mut end = max.min(text.len());
while !text.is_char_boundary(end) {
end -= 1;
}
text[..end].to_string()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn normalize_collapses_whitespace() {
assert_eq!(normalize_text(" hello world "), "hello world");
}
#[test]
fn normalize_empty_is_empty() {
assert_eq!(normalize_text(""), "");
}
#[test]
fn truncate_short_text_unchanged() {
assert_eq!(truncate("hello", 100), "hello");
}
#[test]
fn truncate_at_exact_length() {
assert_eq!(truncate("hello", 5), "hello");
}
#[test]
fn truncate_longer_text() {
assert_eq!(truncate("hello world", 5), "hello");
}
#[test]
fn truncate_zero_returns_empty() {
assert_eq!(truncate("anything", 0), "");
}
#[test]
fn prefix_len_identical_strings() {
assert_eq!(common_prefix_len("abc", "abc"), 3);
}
#[test]
fn prefix_len_no_common() {
assert_eq!(common_prefix_len("abc", "xyz"), 0);
}
#[test]
fn prefix_len_partial() {
assert_eq!(common_prefix_len("abcdef", "abcxyz"), 3);
}
#[test]
fn suffix_len_identical() {
assert_eq!(common_suffix_len("abcdef", "abcdef", 0), 6);
}
#[test]
fn suffix_len_partial() {
assert_eq!(common_suffix_len("prefix_abc", "other_abc", 0), 3);
}
#[test]
fn not_sensitive_when_equal() {
assert!(!is_boundary_sensitive_change("same text", "same text", 10));
}
#[test]
fn not_sensitive_when_empty() {
assert!(!is_boundary_sensitive_change("", "new text", 10));
}
#[test]
fn sensitive_when_prefix_short() {
assert!(is_boundary_sensitive_change("aXXXXXXXX", "aYYYYYYYY", 5));
}
#[test]
fn not_sensitive_when_long_common_prefix_and_suffix() {
let prev = "The quick brown fox jumps over the lazy dog";
let curr = "The quick REPLACED fox jumps over the lazy dog";
assert!(!is_boundary_sensitive_change(prev, curr, 3));
}
#[test]
fn no_changes_returns_empty_vecs() {
let tokens = vec!["hello", "world"];
let (old, new) = changed_positions(&tokens, &tokens);
assert!(old.is_empty());
assert!(new.is_empty());
}
#[test]
fn deletion_appears_in_old_positions() {
let old = vec!["a", "b", "c"];
let new = vec!["a", "c"];
let (old_changed, new_changed) = changed_positions(&old, &new);
assert!(old_changed.contains(&1)); assert!(new_changed.is_empty());
}
#[test]
fn insertion_appears_in_new_positions() {
let old = vec!["a", "c"];
let new = vec!["a", "b", "c"];
let (old_changed, new_changed) = changed_positions(&old, &new);
assert!(old_changed.is_empty());
assert!(new_changed.contains(&1)); }
#[test]
fn substitution_appears_in_both() {
let old = vec!["hello", "world"];
let new = vec!["hello", "rust"];
let (old_changed, new_changed) = changed_positions(&old, &new);
assert!(!old_changed.is_empty());
assert!(!new_changed.is_empty());
}
#[test]
fn empty_tokens_returns_empty() {
assert_eq!(token_excerpt_with_delta(&[], 0, 0, 100, 3), "");
}
#[test]
fn full_excerpt_no_context_needed() {
let tokens = vec!["a", "b", "c"];
let result = token_excerpt_with_delta(&tokens, 1, 1, 1000, 0);
assert!(result.contains("[[b]]"));
assert!(result.contains("..."));
}
#[test]
fn context_tokens_included() {
let tokens = vec!["before", "CHANGED", "after"];
let result = token_excerpt_with_delta(&tokens, 1, 1, 1000, 1);
assert!(result.contains("before"));
assert!(result.contains("[[CHANGED]]"));
assert!(result.contains("after"));
}
#[test]
fn ellipsis_added_when_truncated_at_start() {
let tokens: Vec<&str> = (0..20).map(|_| "x").collect();
let result = token_excerpt_with_delta(&tokens, 15, 15, 1000, 2);
assert!(result.starts_with("..."));
}
#[test]
fn ellipsis_added_when_truncated_at_end() {
let tokens: Vec<&str> = (0..20).map(|_| "x").collect();
let result = token_excerpt_with_delta(&tokens, 2, 2, 1000, 2);
assert!(result.ends_with("..."));
}
#[test]
fn result_truncated_to_snippet_max() {
let tokens: Vec<&str> = vec!["hello", "world", "this", "is", "a", "test"];
let result = token_excerpt_with_delta(&tokens, 1, 2, 5, 10);
assert!(result.len() <= 5);
}
#[test]
fn identical_texts_return_unchanged_pair() {
let (prev, curr) = focused_pair("hello world", "hello world", 1000, 5);
assert_eq!(prev, "hello world");
assert_eq!(curr, "hello world");
}
#[test]
fn changed_word_is_highlighted_in_each_snippet() {
let prev_text = "The quick brown fox";
let curr_text = "The quick red fox";
let (prev, curr) = focused_pair(prev_text, curr_text, 1000, 5);
assert!(prev.contains("[[brown]]"), "prev snippet: {}", prev);
assert!(curr.contains("[[red]]"), "curr snippet: {}", curr);
}
#[test]
fn addition_highlighted_in_current() {
let (prev, curr) = focused_pair("hello world", "hello beautiful world", 1000, 2);
assert!(curr.contains("[[beautiful]]"), "curr: {}", curr);
assert!(!prev.contains("[[beautiful]]"));
}
#[test]
fn deletion_highlighted_in_previous() {
let (prev, curr) = focused_pair("hello cruel world", "hello world", 1000, 2);
assert!(prev.contains("[[cruel]]"), "prev: {}", prev);
assert!(!curr.contains("[[cruel]]"));
}
#[test]
fn snippets_are_truncated_to_max() {
let long = "word ".repeat(200);
let modified = long.replacen("word", "WORD", 1);
let (prev, curr) = focused_pair(&long, &modified, 50, 2);
assert!(prev.len() <= 50);
assert!(curr.len() <= 50);
}
#[test]
fn empty_previous_returns_marked_current() {
let (prev, curr) = focused_pair("", "hello world", 1000, 3);
assert!(prev.is_empty() || prev == "");
assert!(!curr.is_empty());
}
fn units(ids: &[&str]) -> Vec<Unit> {
ids.iter()
.enumerate()
.map(|(i, id)| Unit { id: id.to_string(), position: i, text: format!("text {}", id) })
.collect()
}
#[test]
fn window_radius_zero_returns_only_target() {
let us = units(&["a", "b", "c"]);
let w = window_for_units(&us, "b", 0);
assert_eq!(w.len(), 1);
assert_eq!(w[0].id, "b");
}
#[test]
fn window_radius_one_returns_three() {
let us = units(&["a", "b", "c"]);
let w = window_for_units(&us, "b", 1);
assert_eq!(w.len(), 3);
}
#[test]
fn window_clamps_at_boundaries() {
let us = units(&["a", "b", "c"]);
let w = window_for_units(&us, "a", 5);
assert_eq!(w.len(), 3); }
#[test]
fn window_missing_id_returns_empty() {
let us = units(&["a", "b"]);
let w = window_for_units(&us, "z", 1);
assert!(w.is_empty());
}
#[test]
fn stitched_pair_not_stitched_for_stable_boundaries() {
let prev_units = vec![
Unit { id: "1".into(), position: 0, text: "The quick brown fox jumps over the lazy dog".into() },
Unit { id: "2".into(), position: 1, text: "Section two content here".into() },
];
let curr_units = vec![
Unit { id: "1".into(), position: 0, text: "The quick brown fox jumps over the FAST dog".into() },
Unit { id: "2".into(), position: 1, text: "Section two content here".into() },
];
let result = boundary_aware_stitched_pair(
&prev_units, &curr_units,
"1", "1",
1000, 5, 2, 2000, 3,
);
assert!(!result.stitched);
assert!(result.previous_snippet.contains("[[brown]]") || result.previous_snippet.contains("[["));
}
#[test]
fn stitched_pair_stitches_boundary_sensitive_change() {
let prev_units = vec![
Unit { id: "1".into(), position: 0, text: "Alpha section".into() },
Unit { id: "2".into(), position: 1, text: "Beta section text".into() },
Unit { id: "3".into(), position: 2, text: "Gamma section".into() },
];
let curr_units = vec![
Unit { id: "1".into(), position: 0, text: "Alpha section".into() },
Unit { id: "2".into(), position: 1, text: "Changed section text".into() },
Unit { id: "3".into(), position: 2, text: "Gamma section".into() },
];
let result = boundary_aware_stitched_pair(
&prev_units, &curr_units,
"2", "2",
2000, 3, 1, 5000, 100,
);
assert!(result.stitched);
assert_eq!(result.stitched_previous_ids.len(), 3);
}
#[test]
fn stitched_pair_ids_and_positions_populated() {
let prev_units = vec![
Unit { id: "sec-1".into(), position: 1, text: "First section.".into() },
Unit { id: "sec-2".into(), position: 2, text: "Second section.".into() },
];
let curr_units = prev_units.clone();
let result = boundary_aware_stitched_pair(
&prev_units, &curr_units,
"sec-1", "sec-1",
1000, 3, 1, 5000, 5,
);
assert!(!result.stitched_previous_ids.is_empty());
assert!(!result.stitched_previous_positions.is_empty());
}
}