oyo_core/
diff.rs

1//! Diff computation engine
2
3use crate::change::{Change, ChangeKind, ChangeSpan};
4use similar::{ChangeTag, TextDiff};
5use std::path::Path;
6use thiserror::Error;
7
8#[derive(Error, Debug)]
9pub enum DiffError {
10    #[error("Failed to read file: {0}")]
11    FileRead(#[from] std::io::Error),
12    #[error("Diff computation failed: {0}")]
13    ComputeFailed(String),
14}
15
16/// A hunk is a group of related changes that are close together
17#[derive(Debug, Clone)]
18pub struct Hunk {
19    /// Unique ID for this hunk
20    pub id: usize,
21    /// IDs of changes in this hunk (in order)
22    pub change_ids: Vec<usize>,
23    /// Starting line number in old file
24    pub old_start: Option<usize>,
25    /// Starting line number in new file
26    pub new_start: Option<usize>,
27    /// Number of insertions in this hunk
28    pub insertions: usize,
29    /// Number of deletions in this hunk
30    pub deletions: usize,
31}
32
33impl Hunk {
34    /// Get the number of changes in this hunk
35    pub fn len(&self) -> usize {
36        self.change_ids.len()
37    }
38
39    /// Check if hunk is empty
40    pub fn is_empty(&self) -> bool {
41        self.change_ids.is_empty()
42    }
43}
44
45/// Result of a diff operation
46#[derive(Debug, Clone)]
47pub struct DiffResult {
48    /// All changes in order
49    pub changes: Vec<Change>,
50    /// Only the actual changes (excluding context)
51    pub significant_changes: Vec<usize>,
52    /// Hunks (groups of related changes)
53    pub hunks: Vec<Hunk>,
54    /// Total number of insertions
55    pub insertions: usize,
56    /// Total number of deletions
57    pub deletions: usize,
58}
59
60impl DiffResult {
61    /// Get only the significant (non-context) changes
62    pub fn get_significant_changes(&self) -> Vec<&Change> {
63        self.significant_changes
64            .iter()
65            .filter_map(|&id| self.changes.iter().find(|c| c.id == id))
66            .collect()
67    }
68
69    /// Get a hunk by ID
70    pub fn get_hunk(&self, hunk_id: usize) -> Option<&Hunk> {
71        self.hunks.iter().find(|h| h.id == hunk_id)
72    }
73
74    /// Find which hunk a change belongs to
75    pub fn hunk_for_change(&self, change_id: usize) -> Option<&Hunk> {
76        self.hunks
77            .iter()
78            .find(|h| h.change_ids.contains(&change_id))
79    }
80}
81
82/// A diff for a single file
83#[derive(Debug, Clone)]
84pub struct FileDiff {
85    pub old_path: Option<String>,
86    pub new_path: Option<String>,
87    pub result: DiffResult,
88}
89
90/// The main diff engine
91pub struct DiffEngine {
92    /// Number of context lines to include
93    context_lines: usize,
94    /// Whether to do word-level diffing within changed lines
95    word_level: bool,
96}
97
98impl Default for DiffEngine {
99    fn default() -> Self {
100        Self {
101            context_lines: 3,
102            word_level: true,
103        }
104    }
105}
106
107impl DiffEngine {
108    pub fn new() -> Self {
109        Self::default()
110    }
111
112    pub fn with_context(mut self, lines: usize) -> Self {
113        self.context_lines = lines;
114        self
115    }
116
117    pub fn with_word_level(mut self, enabled: bool) -> Self {
118        self.word_level = enabled;
119        self
120    }
121
122    /// Compute diff between two strings
123    pub fn diff_strings(&self, old: &str, new: &str) -> DiffResult {
124        let text_diff = TextDiff::from_lines(old, new);
125        let mut changes = Vec::new();
126        let mut significant_changes = Vec::new();
127        let mut insertions = 0;
128        let mut deletions = 0;
129        let mut change_id = 0;
130
131        let mut old_line_num = 1usize;
132        let mut new_line_num = 1usize;
133
134        // Group consecutive changes together for word-level diffing
135        let mut pending_deletes: Vec<(String, usize)> = Vec::new();
136        let mut pending_inserts: Vec<(String, usize)> = Vec::new();
137
138        let ops: Vec<_> = text_diff.iter_all_changes().collect();
139
140        for change in ops.iter() {
141            match change.tag() {
142                ChangeTag::Equal => {
143                    // Flush any pending changes before processing equal
144                    self.flush_pending_changes(
145                        &mut pending_deletes,
146                        &mut pending_inserts,
147                        &mut changes,
148                        &mut significant_changes,
149                        &mut change_id,
150                        &mut insertions,
151                        &mut deletions,
152                    );
153
154                    let span = ChangeSpan::equal(change.value().trim_end_matches('\n'))
155                        .with_lines(Some(old_line_num), Some(new_line_num));
156                    changes.push(Change::single(change_id, span));
157                    change_id += 1;
158                    old_line_num += 1;
159                    new_line_num += 1;
160                }
161                ChangeTag::Delete => {
162                    pending_deletes.push((
163                        change.value().trim_end_matches('\n').to_string(),
164                        old_line_num,
165                    ));
166                    old_line_num += 1;
167                }
168                ChangeTag::Insert => {
169                    pending_inserts.push((
170                        change.value().trim_end_matches('\n').to_string(),
171                        new_line_num,
172                    ));
173                    new_line_num += 1;
174                }
175            }
176        }
177
178        // Flush remaining changes
179        self.flush_pending_changes(
180            &mut pending_deletes,
181            &mut pending_inserts,
182            &mut changes,
183            &mut significant_changes,
184            &mut change_id,
185            &mut insertions,
186            &mut deletions,
187        );
188
189        // Compute hunks by grouping nearby changes
190        let hunks = Self::compute_hunks(&significant_changes, &changes);
191
192        DiffResult {
193            changes,
194            significant_changes,
195            hunks,
196            insertions,
197            deletions,
198        }
199    }
200
201    /// Compute hunks by grouping consecutive changes that are close together
202    /// Changes within PROXIMITY_THRESHOLD lines are grouped into the same hunk
203    fn compute_hunks(significant_changes: &[usize], changes: &[Change]) -> Vec<Hunk> {
204        const PROXIMITY_THRESHOLD: usize = 3;
205
206        let mut hunks = Vec::new();
207        if significant_changes.is_empty() {
208            return hunks;
209        }
210
211        let mut current_hunk_changes: Vec<usize> = Vec::new();
212        let mut current_hunk_old_start: Option<usize> = None;
213        let mut current_hunk_new_start: Option<usize> = None;
214        let mut last_old_line: Option<usize> = None;
215        let mut last_new_line: Option<usize> = None;
216        let mut current_insertions = 0;
217        let mut current_deletions = 0;
218        let mut hunk_id = 0;
219
220        for &change_id in significant_changes {
221            let change = match changes.iter().find(|c| c.id == change_id) {
222                Some(c) => c,
223                None => continue,
224            };
225
226            // Get line numbers from first span
227            let (old_line, new_line) = change
228                .spans
229                .first()
230                .map(|s| (s.old_line, s.new_line))
231                .unwrap_or((None, None));
232
233            // Determine if this change is close to the previous one
234            let is_close = match (last_old_line, last_new_line, old_line, new_line) {
235                (Some(lo), _, Some(co), _) => co.saturating_sub(lo) <= PROXIMITY_THRESHOLD,
236                (_, Some(ln), _, Some(cn)) => cn.saturating_sub(ln) <= PROXIMITY_THRESHOLD,
237                _ => current_hunk_changes.is_empty(), // First change always starts a hunk
238            };
239
240            if is_close {
241                // Add to current hunk
242                current_hunk_changes.push(change_id);
243                if current_hunk_old_start.is_none() {
244                    current_hunk_old_start = old_line;
245                }
246                if current_hunk_new_start.is_none() {
247                    current_hunk_new_start = new_line;
248                }
249            } else {
250                // Save current hunk and start a new one
251                if !current_hunk_changes.is_empty() {
252                    hunks.push(Hunk {
253                        id: hunk_id,
254                        change_ids: current_hunk_changes.clone(),
255                        old_start: current_hunk_old_start,
256                        new_start: current_hunk_new_start,
257                        insertions: current_insertions,
258                        deletions: current_deletions,
259                    });
260                    hunk_id += 1;
261                }
262
263                // Start new hunk
264                current_hunk_changes = vec![change_id];
265                current_hunk_old_start = old_line;
266                current_hunk_new_start = new_line;
267                current_insertions = 0;
268                current_deletions = 0;
269            }
270
271            // Update last line numbers
272            if old_line.is_some() {
273                last_old_line = old_line;
274            }
275            if new_line.is_some() {
276                last_new_line = new_line;
277            }
278
279            // Count insertions/deletions in this change
280            for span in &change.spans {
281                match span.kind {
282                    ChangeKind::Insert => current_insertions += 1,
283                    ChangeKind::Delete => current_deletions += 1,
284                    ChangeKind::Replace => {
285                        current_insertions += 1;
286                        current_deletions += 1;
287                    }
288                    ChangeKind::Equal => {}
289                }
290            }
291        }
292
293        // Don't forget the last hunk
294        if !current_hunk_changes.is_empty() {
295            hunks.push(Hunk {
296                id: hunk_id,
297                change_ids: current_hunk_changes,
298                old_start: current_hunk_old_start,
299                new_start: current_hunk_new_start,
300                insertions: current_insertions,
301                deletions: current_deletions,
302            });
303        }
304
305        hunks
306    }
307
308    #[allow(clippy::too_many_arguments)]
309    fn flush_pending_changes(
310        &self,
311        pending_deletes: &mut Vec<(String, usize)>,
312        pending_inserts: &mut Vec<(String, usize)>,
313        changes: &mut Vec<Change>,
314        significant_changes: &mut Vec<usize>,
315        change_id: &mut usize,
316        insertions: &mut usize,
317        deletions: &mut usize,
318    ) {
319        if pending_deletes.is_empty() && pending_inserts.is_empty() {
320            return;
321        }
322
323        // Try to match deletes with inserts for replace operations
324        if self.word_level && pending_deletes.len() == pending_inserts.len() {
325            for ((old_text, old_line), (new_text, new_line)) in
326                pending_deletes.iter().zip(pending_inserts.iter())
327            {
328                let spans = self.compute_word_diff(old_text, new_text, *old_line, *new_line);
329                let change = Change::new(*change_id, spans);
330                significant_changes.push(*change_id);
331                changes.push(change);
332                *change_id += 1;
333                *insertions += 1;
334                *deletions += 1;
335            }
336        } else {
337            // Output as separate deletes and inserts
338            for (text, line) in pending_deletes.iter() {
339                let span = ChangeSpan::delete(text.clone()).with_lines(Some(*line), None);
340                significant_changes.push(*change_id);
341                changes.push(Change::single(*change_id, span));
342                *change_id += 1;
343                *deletions += 1;
344            }
345            for (text, line) in pending_inserts.iter() {
346                let span = ChangeSpan::insert(text.clone()).with_lines(None, Some(*line));
347                significant_changes.push(*change_id);
348                changes.push(Change::single(*change_id, span));
349                *change_id += 1;
350                *insertions += 1;
351            }
352        }
353
354        pending_deletes.clear();
355        pending_inserts.clear();
356    }
357}
358
359/// Tokenize code for word-level diffing
360/// Separates identifiers from punctuation for accurate diffs
361fn tokenize_code(line: &str) -> Vec<String> {
362    let mut tokens = Vec::new();
363    let mut buf = String::new();
364    let mut in_word = false;
365
366    for ch in line.chars() {
367        let is_word = ch.is_alphanumeric() || ch == '_';
368        if is_word {
369            if !in_word {
370                if !buf.is_empty() {
371                    tokens.push(std::mem::take(&mut buf));
372                }
373                in_word = true;
374            }
375            buf.push(ch);
376        } else {
377            if in_word {
378                if !buf.is_empty() {
379                    tokens.push(std::mem::take(&mut buf));
380                }
381                in_word = false;
382            }
383            if ch.is_whitespace() {
384                // Group consecutive whitespace
385                if !buf.is_empty() && !buf.chars().all(char::is_whitespace) {
386                    tokens.push(std::mem::take(&mut buf));
387                }
388                buf.push(ch);
389            } else {
390                // Each punctuation char is its own token
391                if !buf.is_empty() {
392                    tokens.push(std::mem::take(&mut buf));
393                }
394                tokens.push(ch.to_string());
395            }
396        }
397    }
398    if !buf.is_empty() {
399        tokens.push(buf);
400    }
401    tokens
402}
403
404impl DiffEngine {
405    /// Compute word-level diff within a line
406    fn compute_word_diff(
407        &self,
408        old: &str,
409        new: &str,
410        old_line: usize,
411        new_line: usize,
412    ) -> Vec<ChangeSpan> {
413        let old_tokens = tokenize_code(old);
414        let new_tokens = tokenize_code(new);
415        let old_refs: Vec<&str> = old_tokens.iter().map(|s| s.as_str()).collect();
416        let new_refs: Vec<&str> = new_tokens.iter().map(|s| s.as_str()).collect();
417        let word_diff = TextDiff::from_slices(&old_refs, &new_refs);
418        let mut spans = Vec::new();
419
420        for change in word_diff.iter_all_changes() {
421            let text = change.value().to_string();
422            let span = match change.tag() {
423                ChangeTag::Equal => ChangeSpan::equal(text),
424                ChangeTag::Delete => ChangeSpan::delete(text),
425                ChangeTag::Insert => ChangeSpan::insert(text),
426            }
427            .with_lines(Some(old_line), Some(new_line));
428            spans.push(span);
429        }
430
431        spans
432    }
433
434    /// Compute diff between two files
435    pub fn diff_files(&self, old_path: &Path, new_path: &Path) -> Result<FileDiff, DiffError> {
436        let old_content = std::fs::read_to_string(old_path)?;
437        let new_content = std::fs::read_to_string(new_path)?;
438
439        let result = self.diff_strings(&old_content, &new_content);
440
441        Ok(FileDiff {
442            old_path: Some(old_path.to_string_lossy().to_string()),
443            new_path: Some(new_path.to_string_lossy().to_string()),
444            result,
445        })
446    }
447}
448
449#[cfg(test)]
450mod tests {
451    use super::*;
452
453    #[test]
454    fn test_simple_diff() {
455        let engine = DiffEngine::new();
456        let old = "foo\nbar\nbaz";
457        let new = "foo\nqux\nbaz";
458
459        let result = engine.diff_strings(old, new);
460
461        assert_eq!(result.insertions, 1);
462        assert_eq!(result.deletions, 1);
463        assert!(!result.significant_changes.is_empty());
464    }
465
466    #[test]
467    fn test_no_changes() {
468        let engine = DiffEngine::new();
469        let text = "foo\nbar\nbaz";
470
471        let result = engine.diff_strings(text, text);
472
473        assert_eq!(result.insertions, 0);
474        assert_eq!(result.deletions, 0);
475        assert!(result.significant_changes.is_empty());
476    }
477
478    #[test]
479    fn test_word_level_diff() {
480        let engine = DiffEngine::new().with_word_level(true);
481        let old = "const foo = 4";
482        let new = "const bar = 4";
483
484        let result = engine.diff_strings(old, new);
485
486        // Should have a single change with word-level spans
487        assert_eq!(result.significant_changes.len(), 1);
488    }
489
490    #[test]
491    fn test_tokenize_code_basic() {
492        let tokens = tokenize_code("KeyModifiers, MouseEventKind}");
493        assert_eq!(
494            tokens,
495            vec!["KeyModifiers", ",", " ", "MouseEventKind", "}"]
496        );
497    }
498
499    #[test]
500    fn test_tokenize_code_identifiers() {
501        let tokens = tokenize_code("foo_bar baz123");
502        assert_eq!(tokens, vec!["foo_bar", " ", "baz123"]);
503    }
504
505    #[test]
506    fn test_tokenize_code_punctuation() {
507        let tokens = tokenize_code("use foo::{A, B};");
508        assert_eq!(
509            tokens,
510            vec!["use", " ", "foo", ":", ":", "{", "A", ",", " ", "B", "}", ";"]
511        );
512    }
513
514    #[test]
515    fn test_word_diff_punctuation_separation() {
516        use crate::change::ChangeKind;
517
518        // This is the exact bug case: adding MouseEventKind to an import list
519        let engine = DiffEngine::new().with_word_level(true);
520        let old = "use foo::{KeyModifiers};";
521        let new = "use foo::{KeyModifiers, MouseEventKind};";
522
523        let result = engine.diff_strings(old, new);
524
525        // Should have one change
526        assert_eq!(result.significant_changes.len(), 1);
527
528        let change = &result.changes[result.significant_changes[0]];
529
530        // Find spans by kind
531        let equal_content: String = change
532            .spans
533            .iter()
534            .filter(|s| s.kind == ChangeKind::Equal)
535            .map(|s| s.text.as_str())
536            .collect();
537        let insert_content: String = change
538            .spans
539            .iter()
540            .filter(|s| s.kind == ChangeKind::Insert)
541            .map(|s| s.text.as_str())
542            .collect();
543
544        // KeyModifiers should be in equal spans (unchanged)
545        assert!(
546            equal_content.contains("KeyModifiers"),
547            "KeyModifiers should be equal, got equal: '{}', insert: '{}'",
548            equal_content,
549            insert_content
550        );
551
552        // MouseEventKind should be in insert spans (new)
553        assert!(
554            insert_content.contains("MouseEventKind"),
555            "MouseEventKind should be inserted, got equal: '{}', insert: '{}'",
556            equal_content,
557            insert_content
558        );
559
560        // KeyModifiers should NOT be in insert spans
561        assert!(
562            !insert_content.contains("KeyModifiers"),
563            "KeyModifiers should not be inserted, got insert: '{}'",
564            insert_content
565        );
566    }
567}