travelagent-core 1.10.2

Core library for travelagent code review tool
Documentation
//! Word-level diff highlighting within modified lines.
//!
//! Given an `old` line and a `new` line, [`highlight_line_pair`] returns two
//! parallel token streams such that tokens marked `highlight = true` are the
//! words that actually changed. The pair-wise coloring in the TUI can then use
//! a bolder background for those tokens instead of painting the entire line.
//!
//! Implementation notes
//! --------------------
//! We avoid pulling in the `similar` crate (not in the workspace) and instead
//! run a simple LCS (longest common subsequence) over word tokens. Tokens are
//! produced by [`tokenize`]: consecutive word characters (`is_alphanumeric` or
//! `_`) form one token and every other non-word character forms its own
//! single-character token. This keeps punctuation / whitespace aligned with
//! the original text so reconstructing the line from tokens yields the exact
//! input back.
//!
//! The LCS itself is `O(n * m)` in both time and memory. To keep the
//! allocation bounded for pathological inputs we hard-cap the product of the
//! token counts; on overflow we fall back to a single "whole line changed"
//! token so the caller still gets something useful (and deterministic).
//!
//! This module is deterministic: identical inputs always produce identical
//! outputs.
//!
//! [`tokenize`]: fn@tokenize

/// A styled segment of a rendered diff line.
///
/// `text` is kept as a plain `String` so the caller can re-use it when
/// building `ratatui` spans. Concatenating every `text` in the returned
/// `Vec<Token>` reconstructs the original input.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
    /// The raw text of this token (word, punctuation, or whitespace run).
    pub text: String,
    /// `true` if this token was added/removed relative to the other side.
    pub highlight: bool,
}

impl Token {
    fn new(text: impl Into<String>, highlight: bool) -> Self {
        Self {
            text: text.into(),
            highlight,
        }
    }
}

/// Upper bound on the LCS table size (token_old.len() * token_new.len()).
/// ~4 million cells keeps worst-case memory under a few MB and keeps the
/// function interactive even on pathological input.
const LCS_CELL_BUDGET: usize = 4_000_000;

/// Compute word-level highlights for a pair of (old, new) lines.
///
/// Returns `(old_tokens, new_tokens)`. Tokens whose text survived on both
/// sides have `highlight = false`; tokens unique to one side are marked
/// `highlight = true`.
///
/// Special cases:
/// - Identical inputs (including both empty) return a single unhighlighted
///   token on each side containing the original text.
/// - If the LCS table would exceed [`LCS_CELL_BUDGET`] we bail out and return
///   the whole line as a single highlighted token on each side; this keeps
///   the function allocation-bounded.
pub fn highlight_line_pair(old: &str, new: &str) -> (Vec<Token>, Vec<Token>) {
    // Fast path: identical lines (covers empty/empty).
    if old == new {
        return (vec![Token::new(old, false)], vec![Token::new(new, false)]);
    }

    let old_tokens = tokenize(old);
    let new_tokens = tokenize(new);

    // Handle one-sided empty input: everything on the non-empty side is new.
    if old_tokens.is_empty() {
        let new_out = new_tokens
            .into_iter()
            .map(|t| Token::new(t, true))
            .collect();
        return (vec![Token::new(old, false)], new_out);
    }
    if new_tokens.is_empty() {
        let old_out = old_tokens
            .into_iter()
            .map(|t| Token::new(t, true))
            .collect();
        return (old_out, vec![Token::new(new, false)]);
    }

    // Pathological-input guard.
    if old_tokens
        .len()
        .checked_mul(new_tokens.len())
        .is_none_or(|n| n > LCS_CELL_BUDGET)
    {
        return (vec![Token::new(old, true)], vec![Token::new(new, true)]);
    }

    // Standard LCS DP table with direction tracking.
    let rows = old_tokens.len();
    let cols = new_tokens.len();

    // `lcs` holds the length of the longest common subsequence for prefix (i, j);
    // `dir` holds 0 = match (diagonal), 1 = up (skip old), 2 = left (skip new).
    let mut lcs = vec![0u32; (rows + 1) * (cols + 1)];
    let mut dir = vec![0u8; (rows + 1) * (cols + 1)];

    for i in 1..=rows {
        for j in 1..=cols {
            let idx = i * (cols + 1) + j;
            if old_tokens[i - 1] == new_tokens[j - 1] {
                lcs[idx] = lcs[(i - 1) * (cols + 1) + (j - 1)] + 1;
                dir[idx] = 0;
            } else {
                let up = lcs[(i - 1) * (cols + 1) + j];
                let left = lcs[i * (cols + 1) + (j - 1)];
                if up >= left {
                    lcs[idx] = up;
                    dir[idx] = 1;
                } else {
                    lcs[idx] = left;
                    dir[idx] = 2;
                }
            }
        }
    }

    // Walk the DP table from (rows, cols) to (0, 0) collecting flags.
    let mut old_flags: Vec<bool> = vec![false; rows];
    let mut new_flags: Vec<bool> = vec![false; cols];
    let mut i = rows;
    let mut j = cols;
    while i > 0 || j > 0 {
        if i > 0 && j > 0 && dir[i * (cols + 1) + j] == 0 {
            i -= 1;
            j -= 1;
        } else if j == 0 || (i > 0 && dir[i * (cols + 1) + j] == 1) {
            // Token on old side only -> highlight.
            i -= 1;
            old_flags[i] = true;
        } else {
            // Token on new side only -> highlight.
            j -= 1;
            new_flags[j] = true;
        }
    }

    let old_out = merge_runs(&old_tokens, &old_flags);
    let new_out = merge_runs(&new_tokens, &new_flags);

    (old_out, new_out)
}

/// Split a line into word / non-word tokens.
///
/// Rules:
/// - A run of `is_alphanumeric() || '_'` characters is one token.
/// - Every other character is its own single-character token (so whitespace
///   and punctuation align with themselves across sides).
fn tokenize(s: &str) -> Vec<String> {
    let mut out: Vec<String> = Vec::new();
    let mut current = String::new();
    for ch in s.chars() {
        if ch.is_alphanumeric() || ch == '_' {
            current.push(ch);
        } else {
            if !current.is_empty() {
                out.push(std::mem::take(&mut current));
            }
            out.push(ch.to_string());
        }
    }
    if !current.is_empty() {
        out.push(current);
    }
    out
}

/// Merge consecutive tokens with the same `highlight` flag into a single
/// `Token` so the caller emits fewer `Span`s.
fn merge_runs(tokens: &[String], flags: &[bool]) -> Vec<Token> {
    debug_assert_eq!(tokens.len(), flags.len());
    if tokens.is_empty() {
        return Vec::new();
    }
    let mut out: Vec<Token> = Vec::new();
    let mut buf = String::new();
    let mut buf_flag = flags[0];
    for (tok, &flag) in tokens.iter().zip(flags.iter()) {
        if flag == buf_flag {
            buf.push_str(tok);
        } else {
            out.push(Token::new(std::mem::take(&mut buf), buf_flag));
            buf.push_str(tok);
            buf_flag = flag;
        }
    }
    if !buf.is_empty() {
        out.push(Token::new(buf, buf_flag));
    }
    out
}

#[cfg(test)]
mod tests {
    use super::*;

    fn concat(tokens: &[Token]) -> String {
        tokens.iter().map(|t| t.text.as_str()).collect()
    }

    #[test]
    fn word_diff_identical_lines_returns_no_highlights() {
        let (old, new) = highlight_line_pair("let x = 1;", "let x = 1;");
        assert_eq!(old.len(), 1);
        assert_eq!(new.len(), 1);
        assert!(!old[0].highlight);
        assert!(!new[0].highlight);
        assert_eq!(concat(&old), "let x = 1;");
        assert_eq!(concat(&new), "let x = 1;");
    }

    #[test]
    fn word_diff_single_word_change_highlights_only_that_word() {
        let (old, new) = highlight_line_pair("let x = 1;", "let y = 1;");
        assert_eq!(concat(&old), "let x = 1;");
        assert_eq!(concat(&new), "let y = 1;");
        // Only the `x` / `y` tokens should be highlighted.
        let old_hl: Vec<&str> = old
            .iter()
            .filter(|t| t.highlight)
            .map(|t| t.text.as_str())
            .collect();
        let new_hl: Vec<&str> = new
            .iter()
            .filter(|t| t.highlight)
            .map(|t| t.text.as_str())
            .collect();
        assert_eq!(old_hl, vec!["x"]);
        assert_eq!(new_hl, vec!["y"]);
    }

    #[test]
    fn word_diff_insertion_at_end_highlights_only_new_words() {
        let (old, new) = highlight_line_pair("hello world", "hello world again");
        assert_eq!(concat(&old), "hello world");
        assert_eq!(concat(&new), "hello world again");
        assert!(old.iter().all(|t| !t.highlight));
        // On the new side, only the trailing " again" should be highlighted.
        let new_hl: String = new
            .iter()
            .filter(|t| t.highlight)
            .map(|t| t.text.as_str())
            .collect();
        assert_eq!(new_hl, " again");
    }

    #[test]
    fn word_diff_deletion_from_middle_highlights_only_removed_words() {
        let (old, new) = highlight_line_pair("one two three", "one three");
        assert_eq!(concat(&old), "one two three");
        assert_eq!(concat(&new), "one three");
        let old_hl: String = old
            .iter()
            .filter(|t| t.highlight)
            .map(|t| t.text.as_str())
            .collect();
        // LCS may consume either the leading or trailing space alongside the
        // removed word -- both are valid. The important thing is that "two"
        // is the only word touched and exactly one adjacent space goes with it.
        assert!(
            old_hl == "two " || old_hl == " two",
            "unexpected old-side highlight {old_hl:?}"
        );
        assert!(new.iter().all(|t| !t.highlight));
    }

    #[test]
    fn word_diff_empty_inputs_are_safe() {
        let (old, new) = highlight_line_pair("", "");
        assert_eq!(old.len(), 1);
        assert_eq!(new.len(), 1);
        assert_eq!(old[0].text, "");
        assert_eq!(new[0].text, "");
        assert!(!old[0].highlight);
        assert!(!new[0].highlight);

        // One side empty: every token on the other side is highlighted.
        let (old, new) = highlight_line_pair("", "hello");
        assert_eq!(concat(&old), "");
        assert_eq!(concat(&new), "hello");
        assert!(new.iter().all(|t| t.highlight));

        let (old, new) = highlight_line_pair("goodbye", "");
        assert_eq!(concat(&old), "goodbye");
        assert_eq!(concat(&new), "");
        assert!(old.iter().all(|t| t.highlight));
    }

    #[test]
    fn word_diff_determinism_same_inputs_same_output() {
        let a1 = highlight_line_pair("foo bar baz", "foo BAR baz");
        let a2 = highlight_line_pair("foo bar baz", "foo BAR baz");
        assert_eq!(a1, a2);
    }

    #[test]
    fn tokenize_splits_on_word_boundaries() {
        let tokens = tokenize("let x = 1;");
        assert_eq!(
            tokens,
            vec!["let", " ", "x", " ", "=", " ", "1", ";"]
                .into_iter()
                .map(std::string::ToString::to_string)
                .collect::<Vec<_>>()
        );
    }

    #[test]
    fn tokenize_keeps_underscore_as_word_char() {
        let tokens = tokenize("foo_bar baz");
        assert_eq!(
            tokens,
            vec!["foo_bar", " ", "baz"]
                .into_iter()
                .map(std::string::ToString::to_string)
                .collect::<Vec<_>>()
        );
    }

    #[test]
    fn word_diff_unicode_tokens_reconstruct_exactly() {
        // CJK + emoji round-trip: concatenating the returned tokens must yield
        // the exact input on each side, and the non-shared characters must be
        // the ones highlighted.
        let old = "旅行 agent 🧳 ready";
        let new = "旅行 agent 🌴 ready";
        let (old_tokens, new_tokens) = highlight_line_pair(old, new);
        assert_eq!(concat(&old_tokens), old);
        assert_eq!(concat(&new_tokens), new);

        // The luggage / palm emojis differ but everything else is shared.
        let old_hl: String = old_tokens
            .iter()
            .filter(|t| t.highlight)
            .map(|t| t.text.as_str())
            .collect();
        let new_hl: String = new_tokens
            .iter()
            .filter(|t| t.highlight)
            .map(|t| t.text.as_str())
            .collect();
        assert!(
            old_hl.contains('\u{1F9F3}'),
            "expected luggage emoji highlighted on old side, got {old_hl:?}"
        );
        assert!(
            new_hl.contains('\u{1F334}'),
            "expected palm emoji highlighted on new side, got {new_hl:?}"
        );
    }

    #[test]
    fn pathological_input_falls_back_to_whole_line_highlight() {
        // Construct two huge different token streams to blow the budget.
        // Use 3000 words each side -> 9_000_000 cells, well above 4M budget.
        let build = |ch: char| -> String {
            let mut s = String::new();
            for i in 0..3000 {
                s.push(ch);
                s.push_str(&i.to_string());
                s.push(' ');
            }
            s
        };
        let left = build('a');
        let right = build('b');
        let (old, new) = highlight_line_pair(&left, &right);
        assert_eq!(old.len(), 1);
        assert_eq!(new.len(), 1);
        assert!(old[0].highlight);
        assert!(new[0].highlight);
        assert_eq!(old[0].text, left);
        assert_eq!(new[0].text, right);
    }
}