//! shuire 0.1.1
//!
//! Vim-like TUI git diff viewer.
//! Word-level diff highlighting.
//!
//! Given a pair of removed/added lines, compute which character ranges
//! within each line actually changed.

/// A highlighted range within a line: `[start, end)` as byte offsets into the
/// original line string, suitable for direct slicing (`&line[h.start..h.end]`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct WordHighlight {
    /// Inclusive start byte offset.
    pub start: usize,
    /// Exclusive end byte offset.
    pub end: usize,
}

/// Hard cap on tokens per line before word-level diffing is skipped.
/// The LCS table is O(n*m) in both time and memory, so pathologically long
/// lines (minified code, embedded data blobs) would otherwise stall rendering.
const MAX_TOKENS: usize = 200;

/// Minimum fraction of characters that must be unchanged for two lines to
/// be considered a "pair" worth highlighting at the word level. Below this,
/// the lines are so different that they are treated as unrelated delete +
/// insert rather than an edit of one line into another. Mirrors delta's
/// `max_line_distance` (default 0.6, i.e. similarity >= 0.4); we set 0.5
/// to be slightly stricter and reduce noise further.
/// See delta `src/edits.rs::infer_edits`.
const MIN_SIMILARITY: f32 = 0.5;

/// Compute word-level highlights for a removed/added pair.
/// Returns (removed_highlights, added_highlights); both vectors are empty
/// when the pair is not similar enough to be considered a rewrite of the
/// same logical line.
#[cfg(test)]
pub fn compute_word_highlights(
    removed: &str,
    added: &str,
) -> (Vec<WordHighlight>, Vec<WordHighlight>) {
    // Drop the similarity score; callers here only want the highlight ranges.
    score_pair(removed, added)
        .map(|(_similarity, removed_hl, added_hl)| (removed_hl, added_hl))
        .unwrap_or_default()
}

/// Score a removed/added line pair and compute word-level highlights in a
/// single LCS pass. Returns `Some((similarity, removed_hl, added_hl))` when
/// the pair clears `MIN_SIMILARITY`, `None` otherwise.
///
/// `similarity` is the fraction of characters (across both lines) that are
/// matched by the LCS — higher means the lines are more alike. Use this to
/// rank competing pair candidates during greedy assignment.
///
/// Returns `None` when:
/// - either side is empty/whitespace-only
/// - tokenization exceeds `MAX_TOKENS` on either side (O(n*m) guard)
/// - similarity is below `MIN_SIMILARITY`
pub fn score_pair(
    removed: &str,
    added: &str,
) -> Option<(f32, Vec<WordHighlight>, Vec<WordHighlight>)> {
    if removed.trim().is_empty() || added.trim().is_empty() {
        return None;
    }

    let old_words = tokenize(removed);
    let new_words = tokenize(added);

    if old_words.len() > MAX_TOKENS || new_words.len() > MAX_TOKENS {
        return None;
    }

    let lcs = lcs_table(&old_words, &new_words);
    let mut old_matched = vec![false; old_words.len()];
    let mut new_matched = vec![false; new_words.len()];

    // Backtrack LCS
    let mut i = old_words.len();
    let mut j = new_words.len();
    while i > 0 && j > 0 {
        if old_words[i - 1].text == new_words[j - 1].text {
            old_matched[i - 1] = true;
            new_matched[j - 1] = true;
            i -= 1;
            j -= 1;
        } else if lcs[i - 1][j] > lcs[i][j - 1] {
            i -= 1;
        } else {
            j -= 1;
        }
    }

    // Matched token bytes appear on both sides, so count them once per side
    // against the total characters in both lines.
    let unchanged: usize = old_words
        .iter()
        .zip(&old_matched)
        .filter(|(_, m)| **m)
        .map(|(t, _)| t.end - t.start)
        .sum::<usize>()
        + new_words
            .iter()
            .zip(&new_matched)
            .filter(|(_, m)| **m)
            .map(|(t, _)| t.end - t.start)
            .sum::<usize>();
    let total = removed.len() + added.len();
    if total == 0 {
        return None;
    }
    let similarity = unchanged as f32 / total as f32;
    if similarity < MIN_SIMILARITY {
        return None;
    }

    let removed_hl = merge_unmatched(&old_words, &old_matched);
    let added_hl = merge_unmatched(&new_words, &new_matched);

    Some((similarity, removed_hl, added_hl))
}

/// A token within a line: its text plus `[start, end)` byte offsets.
#[derive(Debug)]
struct Token {
    text: String,
    start: usize,
    end: usize,
}

/// Split a line into "word" runs (alphanumerics plus `_`) and single
/// non-word characters, recording byte offsets for each token. Whitespace
/// and punctuation are kept as one-character tokens so they participate in
/// the LCS like any other token.
fn tokenize(s: &str) -> Vec<Token> {
    fn is_word(c: char) -> bool {
        c.is_alphanumeric() || c == '_'
    }

    let mut tokens = Vec::new();
    let mut it = s.char_indices().peekable();

    while let Some((start, first)) = it.next() {
        let mut end = start + first.len_utf8();
        if is_word(first) {
            // Extend across the whole run of word characters.
            while let Some(&(i, c)) = it.peek() {
                if !is_word(c) {
                    break;
                }
                end = i + c.len_utf8();
                it.next();
            }
        }
        tokens.push(Token {
            text: s[start..end].to_string(),
            start,
            end,
        });
    }

    tokens
}

/// Build the classic dynamic-programming LCS length table for two token
/// sequences, comparing tokens by text. `table[i][j]` holds the LCS length
/// of `a[..i]` and `b[..j]`; the extra row/column 0 is the empty-prefix base
/// case.
fn lcs_table(a: &[Token], b: &[Token]) -> Vec<Vec<usize>> {
    let mut table = vec![vec![0usize; b.len() + 1]; a.len() + 1];
    for (i, left) in a.iter().enumerate() {
        for (j, right) in b.iter().enumerate() {
            table[i + 1][j + 1] = if left.text == right.text {
                table[i][j] + 1
            } else {
                table[i][j + 1].max(table[i + 1][j])
            };
        }
    }
    table
}

/// Collapse runs of consecutive unmatched tokens into single highlight
/// ranges: each run contributes one `[first.start, last.end)` span.
fn merge_unmatched(tokens: &[Token], matched: &[bool]) -> Vec<WordHighlight> {
    let mut highlights = Vec::new();
    // The currently open (start, end) run of unmatched tokens, if any.
    let mut open: Option<(usize, usize)> = None;

    for (token, &is_matched) in tokens.iter().zip(matched) {
        if is_matched {
            // A matched token terminates any open run.
            if let Some((start, end)) = open.take() {
                highlights.push(WordHighlight { start, end });
            }
        } else {
            match &mut open {
                Some((_, end)) => *end = token.end,
                None => open = Some((token.start, token.end)),
            }
        }
    }
    // Flush a run that extends to the end of the line.
    if let Some((start, end)) = open {
        highlights.push(WordHighlight { start, end });
    }

    highlights
}

#[cfg(test)]
mod tests {
    use super::*;

    // One token differs, the rest match: exactly that token is highlighted
    // on each side.
    #[test]
    fn simple_word_change() {
        let (rem, add) = compute_word_highlights("fn foo() {}", "fn bar() {}");
        assert_eq!(rem.len(), 1);
        assert_eq!(&"fn foo() {}"[rem[0].start..rem[0].end], "foo");
        assert_eq!(add.len(), 1);
        assert_eq!(&"fn bar() {}"[add[0].start..add[0].end], "bar");
    }

    // Identical lines produce no highlights at all.
    #[test]
    fn no_change() {
        let (rem, add) = compute_word_highlights("hello world", "hello world");
        assert!(rem.is_empty());
        assert!(add.is_empty());
    }

    #[test]
    fn full_change_skips_word_diff() {
        // Lines share no tokens — similarity is 0, so we skip word-level
        // highlighting to avoid marking every token as changed.
        let (rem, add) = compute_word_highlights("aaa", "bbb");
        assert!(rem.is_empty());
        assert!(add.is_empty());
    }

    #[test]
    fn below_threshold_skips() {
        // Shared tokens ("fn", spaces, punctuation) cover well under half
        // the characters, so similarity falls below MIN_SIMILARITY (0.5).
        let (rem, add) = compute_word_highlights(
            "fn totally_different_function_name_here(x: i32)",
            "fn some_other_completely_unrelated_name(y: String, z: bool)",
        );
        assert!(rem.is_empty());
        assert!(add.is_empty());
    }
}