oyo-core 0.1.29

//! Diff computation engine

use crate::change::{Change, ChangeKind, ChangeSpan};
use imara_diff::{Algorithm, Diff, InternedInput, TokenSource};
use rustc_hash::{FxHashMap, FxHashSet};
use std::hash::Hash;
use std::ops::Range;
use std::path::Path;
use thiserror::Error;

#[derive(Error, Debug)]
pub enum DiffError {
    #[error("Failed to read file: {0}")]
    FileRead(#[from] std::io::Error),
    #[error("Diff computation failed: {0}")]
    ComputeFailed(String),
}

/// A hunk is a group of related changes that are close together
#[derive(Debug, Clone)]
pub struct Hunk {
    /// Unique ID for this hunk
    pub id: usize,
    /// IDs of changes in this hunk (in order)
    pub change_ids: Vec<usize>,
    /// Starting line number in old file
    pub old_start: Option<usize>,
    /// Starting line number in new file
    pub new_start: Option<usize>,
    /// Number of insertions in this hunk
    pub insertions: usize,
    /// Number of deletions in this hunk
    pub deletions: usize,
}

impl Hunk {
    /// Get the number of changes in this hunk
    pub fn len(&self) -> usize {
        self.change_ids.len()
    }

    /// Check if hunk is empty
    pub fn is_empty(&self) -> bool {
        self.change_ids.is_empty()
    }
}

/// Result of a diff operation
#[derive(Debug, Clone)]
pub struct DiffResult {
    /// All changes in order
    pub changes: Vec<Change>,
    /// Only the actual changes (excluding context)
    pub significant_changes: Vec<usize>,
    /// Hunks (groups of related changes)
    pub hunks: Vec<Hunk>,
    /// Total number of insertions
    pub insertions: usize,
    /// Total number of deletions
    pub deletions: usize,
}

impl DiffResult {
    /// Get only the significant (non-context) changes
    pub fn get_significant_changes(&self) -> Vec<&Change> {
        self.significant_changes
            .iter()
            .filter_map(|&id| self.changes.iter().find(|c| c.id == id))
            .collect()
    }

    /// Get a hunk by ID
    pub fn get_hunk(&self, hunk_id: usize) -> Option<&Hunk> {
        self.hunks.iter().find(|h| h.id == hunk_id)
    }

    /// Find which hunk a change belongs to
    pub fn hunk_for_change(&self, change_id: usize) -> Option<&Hunk> {
        self.hunks
            .iter()
            .find(|h| h.change_ids.contains(&change_id))
    }
}

/// A diff for a single file
#[derive(Debug, Clone)]
pub struct FileDiff {
    pub old_path: Option<String>,
    pub new_path: Option<String>,
    pub result: DiffResult,
}

/// The main diff engine
pub struct DiffEngine {
    /// Number of context lines to include
    context_lines: usize,
    /// Whether to do word-level diffing within changed lines
    word_level: bool,
}

fn diff_ranges<I, T>(algorithm: Algorithm, before: I, after: I) -> Vec<(Range<usize>, Range<usize>)>
where
    I: TokenSource<Token = T>,
    T: Eq + Hash,
{
    let input = InternedInput::new(before, after);
    let diff = Diff::compute(algorithm, &input);
    diff.hunks()
        .map(|hunk| {
            (
                hunk.before.start as usize..hunk.before.end as usize,
                hunk.after.start as usize..hunk.after.end as usize,
            )
        })
        .collect()
}

impl Default for DiffEngine {
    fn default() -> Self {
        Self {
            context_lines: 3,
            word_level: true,
        }
    }
}

impl DiffEngine {
    pub fn new() -> Self {
        Self::default()
    }

    pub fn with_context(mut self, lines: usize) -> Self {
        self.context_lines = lines;
        self
    }

    pub fn with_word_level(mut self, enabled: bool) -> Self {
        self.word_level = enabled;
        self
    }

    /// Compute diff between two strings
    pub fn diff_strings(&self, old: &str, new: &str) -> DiffResult {
        let mut changes = Vec::new();
        let mut significant_changes = Vec::new();
        let mut insertions = 0;
        let mut deletions = 0;
        let mut change_id = 0;

        let mut old_line_num = 1usize;
        let mut new_line_num = 1usize;

        // Group consecutive changes together for word-level diffing
        let mut pending_deletes: Vec<(String, usize)> = Vec::new();
        let mut pending_inserts: Vec<(String, usize)> = Vec::new();

        let old_lines: Vec<&str> = old.lines().collect();
        let new_lines: Vec<&str> = new.lines().collect();
        let ranges = diff_ranges(Algorithm::Histogram, old, new);

        let mut old_idx = 0usize;

        for (before, after) in ranges {
            if old_idx < before.start {
                self.flush_pending_changes(
                    &mut pending_deletes,
                    &mut pending_inserts,
                    &mut changes,
                    &mut significant_changes,
                    &mut change_id,
                    &mut insertions,
                    &mut deletions,
                );

                while old_idx < before.start {
                    let line = old_lines.get(old_idx).copied().unwrap_or("");
                    let span =
                        ChangeSpan::equal(line).with_lines(Some(old_line_num), Some(new_line_num));
                    changes.push(Change::single(change_id, span));
                    change_id += 1;
                    old_idx += 1;
                    old_line_num += 1;
                    new_line_num += 1;
                }
            }

            for idx in before.start..before.end {
                let line = old_lines.get(idx).copied().unwrap_or("");
                pending_deletes.push((line.to_string(), old_line_num));
                old_line_num += 1;
            }

            for idx in after.start..after.end {
                let line = new_lines.get(idx).copied().unwrap_or("");
                pending_inserts.push((line.to_string(), new_line_num));
                new_line_num += 1;
            }

            old_idx = before.end;
        }

        if old_idx < old_lines.len() {
            self.flush_pending_changes(
                &mut pending_deletes,
                &mut pending_inserts,
                &mut changes,
                &mut significant_changes,
                &mut change_id,
                &mut insertions,
                &mut deletions,
            );

            while old_idx < old_lines.len() {
                let line = old_lines.get(old_idx).copied().unwrap_or("");
                let span =
                    ChangeSpan::equal(line).with_lines(Some(old_line_num), Some(new_line_num));
                changes.push(Change::single(change_id, span));
                change_id += 1;
                old_idx += 1;
                old_line_num += 1;
                new_line_num += 1;
            }
        }

        self.flush_pending_changes(
            &mut pending_deletes,
            &mut pending_inserts,
            &mut changes,
            &mut significant_changes,
            &mut change_id,
            &mut insertions,
            &mut deletions,
        );

        let (changes, significant_changes) = if self.context_lines != usize::MAX {
            let mut id_to_idx = FxHashMap::default();
            for (idx, change) in changes.iter().enumerate() {
                id_to_idx.insert(change.id, idx);
            }
            let mut include = vec![false; changes.len()];
            for &change_id in &significant_changes {
                if let Some(&idx) = id_to_idx.get(&change_id) {
                    let start = idx.saturating_sub(self.context_lines);
                    let end = (idx + self.context_lines).min(changes.len().saturating_sub(1));
                    for slot in include.iter_mut().take(end + 1).skip(start) {
                        *slot = true;
                    }
                }
            }
            let significant_set: FxHashSet<usize> = significant_changes.iter().copied().collect();
            let mut filtered_changes = Vec::new();
            let mut filtered_significant = Vec::new();
            for (idx, change) in changes.into_iter().enumerate() {
                if include.get(idx).copied().unwrap_or(false) {
                    if significant_set.contains(&change.id) {
                        filtered_significant.push(change.id);
                    }
                    filtered_changes.push(change);
                }
            }
            (filtered_changes, filtered_significant)
        } else {
            (changes, significant_changes)
        };

        // Compute hunks by grouping nearby changes
        let hunks = Self::compute_hunks(&significant_changes, &changes);

        DiffResult {
            changes,
            significant_changes,
            hunks,
            insertions,
            deletions,
        }
    }

    /// Compute hunks by grouping consecutive changes that are close together
    /// Changes within PROXIMITY_THRESHOLD lines are grouped into the same hunk
    fn compute_hunks(significant_changes: &[usize], changes: &[Change]) -> Vec<Hunk> {
        const PROXIMITY_THRESHOLD: usize = 3;

        let mut hunks = Vec::new();
        if significant_changes.is_empty() {
            return hunks;
        }

        let mut id_to_index =
            FxHashMap::with_capacity_and_hasher(changes.len(), Default::default());
        for (idx, change) in changes.iter().enumerate() {
            id_to_index.insert(change.id, idx);
        }

        let mut current_hunk_changes: Vec<usize> = Vec::new();
        let mut current_hunk_old_start: Option<usize> = None;
        let mut current_hunk_new_start: Option<usize> = None;
        let mut last_old_line: Option<usize> = None;
        let mut last_new_line: Option<usize> = None;
        let mut current_insertions = 0;
        let mut current_deletions = 0;
        let mut hunk_id = 0;

        for &change_id in significant_changes {
            let change = match id_to_index
                .get(&change_id)
                .and_then(|idx| changes.get(*idx))
            {
                Some(c) => c,
                None => continue,
            };

            // Get line numbers from first span
            let (old_line, new_line) = change
                .spans
                .first()
                .map(|s| (s.old_line, s.new_line))
                .unwrap_or((None, None));

            // Determine if this change is close to the previous one
            let is_close = match (last_old_line, last_new_line, old_line, new_line) {
                (Some(lo), _, Some(co), _) => co.saturating_sub(lo) <= PROXIMITY_THRESHOLD,
                (_, Some(ln), _, Some(cn)) => cn.saturating_sub(ln) <= PROXIMITY_THRESHOLD,
                _ => current_hunk_changes.is_empty(), // First change always starts a hunk
            };

            if is_close {
                // Add to current hunk
                current_hunk_changes.push(change_id);
                if current_hunk_old_start.is_none() {
                    current_hunk_old_start = old_line;
                }
                if current_hunk_new_start.is_none() {
                    current_hunk_new_start = new_line;
                }
            } else {
                // Save current hunk and start a new one
                if !current_hunk_changes.is_empty() {
                    hunks.push(Hunk {
                        id: hunk_id,
                        change_ids: current_hunk_changes.clone(),
                        old_start: current_hunk_old_start,
                        new_start: current_hunk_new_start,
                        insertions: current_insertions,
                        deletions: current_deletions,
                    });
                    hunk_id += 1;
                }

                // Start new hunk
                current_hunk_changes = vec![change_id];
                current_hunk_old_start = old_line;
                current_hunk_new_start = new_line;
                current_insertions = 0;
                current_deletions = 0;
            }

            // Update last line numbers
            if old_line.is_some() {
                last_old_line = old_line;
            }
            if new_line.is_some() {
                last_new_line = new_line;
            }

            // Count insertions/deletions in this change
            for span in &change.spans {
                match span.kind {
                    ChangeKind::Insert => current_insertions += 1,
                    ChangeKind::Delete => current_deletions += 1,
                    ChangeKind::Replace => {
                        current_insertions += 1;
                        current_deletions += 1;
                    }
                    ChangeKind::Equal => {}
                }
            }
        }

        // Don't forget the last hunk
        if !current_hunk_changes.is_empty() {
            hunks.push(Hunk {
                id: hunk_id,
                change_ids: current_hunk_changes,
                old_start: current_hunk_old_start,
                new_start: current_hunk_new_start,
                insertions: current_insertions,
                deletions: current_deletions,
            });
        }

        hunks
    }

    #[allow(clippy::too_many_arguments)]
    fn flush_pending_changes(
        &self,
        pending_deletes: &mut Vec<(String, usize)>,
        pending_inserts: &mut Vec<(String, usize)>,
        changes: &mut Vec<Change>,
        significant_changes: &mut Vec<usize>,
        change_id: &mut usize,
        insertions: &mut usize,
        deletions: &mut usize,
    ) {
        if pending_deletes.is_empty() && pending_inserts.is_empty() {
            return;
        }

        // Try to match deletes with inserts for replace operations
        if self.word_level && pending_deletes.len() == pending_inserts.len() {
            for ((old_text, old_line), (new_text, new_line)) in
                pending_deletes.iter().zip(pending_inserts.iter())
            {
                let spans = self.compute_word_diff(old_text, new_text, *old_line, *new_line);
                let change = Change::new(*change_id, spans);
                significant_changes.push(*change_id);
                changes.push(change);
                *change_id += 1;
                *insertions += 1;
                *deletions += 1;
            }
        } else {
            // Output as separate deletes and inserts
            for (text, line) in pending_deletes.iter() {
                let span = ChangeSpan::delete(text.clone()).with_lines(Some(*line), None);
                significant_changes.push(*change_id);
                changes.push(Change::single(*change_id, span));
                *change_id += 1;
                *deletions += 1;
            }
            for (text, line) in pending_inserts.iter() {
                let span = ChangeSpan::insert(text.clone()).with_lines(None, Some(*line));
                significant_changes.push(*change_id);
                changes.push(Change::single(*change_id, span));
                *change_id += 1;
                *insertions += 1;
            }
        }

        pending_deletes.clear();
        pending_inserts.clear();
    }
}

/// Tokenize code for word-level diffing
/// Separates identifiers from punctuation for accurate diffs
fn tokenize_code(line: &str) -> Vec<String> {
    let mut tokens = Vec::new();
    let mut buf = String::new();
    let mut in_word = false;

    for ch in line.chars() {
        let is_word = ch.is_alphanumeric() || ch == '_';
        if is_word {
            if !in_word {
                if !buf.is_empty() {
                    tokens.push(std::mem::take(&mut buf));
                }
                in_word = true;
            }
            buf.push(ch);
        } else {
            if in_word {
                if !buf.is_empty() {
                    tokens.push(std::mem::take(&mut buf));
                }
                in_word = false;
            }
            if ch.is_whitespace() {
                // Group consecutive whitespace
                if !buf.is_empty() && !buf.chars().all(char::is_whitespace) {
                    tokens.push(std::mem::take(&mut buf));
                }
                buf.push(ch);
            } else {
                // Each punctuation char is its own token
                if !buf.is_empty() {
                    tokens.push(std::mem::take(&mut buf));
                }
                tokens.push(ch.to_string());
            }
        }
    }
    if !buf.is_empty() {
        tokens.push(buf);
    }
    tokens
}

#[derive(Clone, Copy)]
struct TokenSlice<'a> {
    tokens: &'a [&'a str],
}

impl<'a> TokenSource for TokenSlice<'a> {
    type Token = &'a str;
    type Tokenizer = std::iter::Copied<std::slice::Iter<'a, &'a str>>;

    fn tokenize(&self) -> Self::Tokenizer {
        self.tokens.iter().copied()
    }

    fn estimate_tokens(&self) -> u32 {
        self.tokens.len() as u32
    }
}

impl DiffEngine {
    /// Compute word-level diff within a line
    fn compute_word_diff(
        &self,
        old: &str,
        new: &str,
        old_line: usize,
        new_line: usize,
    ) -> Vec<ChangeSpan> {
        let old_tokens = tokenize_code(old);
        let new_tokens = tokenize_code(new);
        let old_refs: Vec<&str> = old_tokens.iter().map(|s| s.as_str()).collect();
        let new_refs: Vec<&str> = new_tokens.iter().map(|s| s.as_str()).collect();
        let ranges = diff_ranges(
            Algorithm::Histogram,
            TokenSlice { tokens: &old_refs },
            TokenSlice { tokens: &new_refs },
        );
        let mut spans = Vec::new();
        let mut old_idx = 0usize;

        for (before, after) in ranges {
            while old_idx < before.start {
                let token = old_refs.get(old_idx).copied().unwrap_or("");
                spans.push(
                    ChangeSpan::equal(token.to_string()).with_lines(Some(old_line), Some(new_line)),
                );
                old_idx += 1;
            }

            for idx in before.start..before.end {
                let token = old_refs.get(idx).copied().unwrap_or("");
                spans.push(
                    ChangeSpan::delete(token.to_string())
                        .with_lines(Some(old_line), Some(new_line)),
                );
            }

            for idx in after.start..after.end {
                let token = new_refs.get(idx).copied().unwrap_or("");
                spans.push(
                    ChangeSpan::insert(token.to_string())
                        .with_lines(Some(old_line), Some(new_line)),
                );
            }

            old_idx = before.end;
        }

        while old_idx < old_refs.len() {
            let token = old_refs.get(old_idx).copied().unwrap_or("");
            spans.push(
                ChangeSpan::equal(token.to_string()).with_lines(Some(old_line), Some(new_line)),
            );
            old_idx += 1;
        }

        spans
    }

    /// Compute diff between two files
    pub fn diff_files(&self, old_path: &Path, new_path: &Path) -> Result<FileDiff, DiffError> {
        let old_content = std::fs::read_to_string(old_path)?;
        let new_content = std::fs::read_to_string(new_path)?;

        let result = self.diff_strings(&old_content, &new_content);

        Ok(FileDiff {
            old_path: Some(old_path.to_string_lossy().to_string()),
            new_path: Some(new_path.to_string_lossy().to_string()),
            result,
        })
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_simple_diff() {
        let engine = DiffEngine::new();
        let old = "foo\nbar\nbaz";
        let new = "foo\nqux\nbaz";

        let result = engine.diff_strings(old, new);

        assert_eq!(result.insertions, 1);
        assert_eq!(result.deletions, 1);
        assert!(!result.significant_changes.is_empty());
    }

    #[test]
    fn test_no_changes() {
        let engine = DiffEngine::new();
        let text = "foo\nbar\nbaz";

        let result = engine.diff_strings(text, text);

        assert_eq!(result.insertions, 0);
        assert_eq!(result.deletions, 0);
        assert!(result.significant_changes.is_empty());
    }

    #[test]
    fn test_word_level_diff() {
        let engine = DiffEngine::new().with_word_level(true);
        let old = "const foo = 4";
        let new = "const bar = 4";

        let result = engine.diff_strings(old, new);

        // Should have a single change with word-level spans
        assert_eq!(result.significant_changes.len(), 1);
    }

    #[test]
    fn test_tokenize_code_basic() {
        let tokens = tokenize_code("KeyModifiers, MouseEventKind}");
        assert_eq!(
            tokens,
            vec!["KeyModifiers", ",", " ", "MouseEventKind", "}"]
        );
    }

    #[test]
    fn test_tokenize_code_identifiers() {
        let tokens = tokenize_code("foo_bar baz123");
        assert_eq!(tokens, vec!["foo_bar", " ", "baz123"]);
    }

    #[test]
    fn test_tokenize_code_punctuation() {
        let tokens = tokenize_code("use foo::{A, B};");
        assert_eq!(
            tokens,
            vec!["use", " ", "foo", ":", ":", "{", "A", ",", " ", "B", "}", ";"]
        );
    }

    #[test]
    fn test_word_diff_punctuation_separation() {
        use crate::change::ChangeKind;

        // This is the exact bug case: adding MouseEventKind to an import list
        let engine = DiffEngine::new().with_word_level(true);
        let old = "use foo::{KeyModifiers};";
        let new = "use foo::{KeyModifiers, MouseEventKind};";

        let result = engine.diff_strings(old, new);

        // Should have one change
        assert_eq!(result.significant_changes.len(), 1);

        let change = &result.changes[result.significant_changes[0]];

        // Find spans by kind
        let equal_content: String = change
            .spans
            .iter()
            .filter(|s| s.kind == ChangeKind::Equal)
            .map(|s| s.text.as_str())
            .collect();
        let insert_content: String = change
            .spans
            .iter()
            .filter(|s| s.kind == ChangeKind::Insert)
            .map(|s| s.text.as_str())
            .collect();

        // KeyModifiers should be in equal spans (unchanged)
        assert!(
            equal_content.contains("KeyModifiers"),
            "KeyModifiers should be equal, got equal: '{}', insert: '{}'",
            equal_content,
            insert_content
        );

        // MouseEventKind should be in insert spans (new)
        assert!(
            insert_content.contains("MouseEventKind"),
            "MouseEventKind should be inserted, got equal: '{}', insert: '{}'",
            equal_content,
            insert_content
        );

        // KeyModifiers should NOT be in insert spans
        assert!(
            !insert_content.contains("KeyModifiers"),
            "KeyModifiers should not be inserted, got insert: '{}'",
            insert_content
        );
    }

    #[test]
    fn test_hunks_are_contiguous_in_significant_changes() {
        let engine = DiffEngine::new();
        let old = "a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\n";
        let new = "a\nB\nc\nd\ne\nf\ng\nh\nI\nj\nk\nL\nm\nn\n";

        let result = engine.diff_strings(old, new);

        assert!(
            result.hunks.len() >= 2,
            "expected multiple hunks for contiguity test"
        );

        for hunk in &result.hunks {
            if hunk.change_ids.is_empty() {
                continue;
            }
            let mut positions = Vec::new();
            for id in &hunk.change_ids {
                let pos = result
                    .significant_changes
                    .iter()
                    .position(|sid| sid == id)
                    .expect("hunk change id should exist in significant_changes");
                positions.push(pos);
            }
            for pair in positions.windows(2) {
                assert_eq!(
                    pair[1],
                    pair[0] + 1,
                    "hunk change ids should be contiguous in significant_changes"
                );
            }
            let start = positions[0];
            for (offset, id) in hunk.change_ids.iter().enumerate() {
                assert_eq!(
                    result.significant_changes[start + offset],
                    *id,
                    "hunk change order should match significant_changes"
                );
            }
        }
    }
}