// oyo_core/diff.rs
1//! Diff computation engine
2
3use crate::change::{Change, ChangeKind, ChangeSpan};
4use imara_diff::{Algorithm, Diff, InternedInput, TokenSource};
5use rustc_hash::{FxHashMap, FxHashSet};
6use std::hash::Hash;
7use std::ops::Range;
8use std::path::Path;
9use thiserror::Error;
10
/// Errors that can occur while computing a diff.
#[derive(Error, Debug)]
pub enum DiffError {
    /// Reading one of the input files from disk failed.
    #[error("Failed to read file: {0}")]
    FileRead(#[from] std::io::Error),
    /// The diff computation itself failed.
    // NOTE(review): never constructed in this module — presumably built by
    // callers elsewhere; confirm before removing.
    #[error("Diff computation failed: {0}")]
    ComputeFailed(String),
}
18
/// A hunk is a group of related changes that are close together
/// (grouped by line proximity; see `DiffEngine::compute_hunks`).
#[derive(Debug, Clone)]
pub struct Hunk {
    /// Unique ID for this hunk (sequential, starting at 0)
    pub id: usize,
    /// IDs of changes in this hunk (in order)
    pub change_ids: Vec<usize>,
    /// Starting line number in old file
    /// (None when no change in the hunk carries an old-side line)
    pub old_start: Option<usize>,
    /// Starting line number in new file
    /// (None when no change in the hunk carries a new-side line)
    pub new_start: Option<usize>,
    /// Number of insertions in this hunk
    pub insertions: usize,
    /// Number of deletions in this hunk
    pub deletions: usize,
}
35
36impl Hunk {
37    /// Get the number of changes in this hunk
38    pub fn len(&self) -> usize {
39        self.change_ids.len()
40    }
41
42    /// Check if hunk is empty
43    pub fn is_empty(&self) -> bool {
44        self.change_ids.is_empty()
45    }
46}
47
/// Result of a diff operation
#[derive(Debug, Clone)]
pub struct DiffResult {
    /// All changes in order (changed lines plus surrounding context)
    pub changes: Vec<Change>,
    /// Only the actual changes (excluding context).
    /// These are change *IDs*, not indices into `changes`.
    pub significant_changes: Vec<usize>,
    /// Hunks (groups of related changes)
    pub hunks: Vec<Hunk>,
    /// Total number of insertions
    pub insertions: usize,
    /// Total number of deletions
    pub deletions: usize,
}
62
63impl DiffResult {
64    /// Get only the significant (non-context) changes
65    pub fn get_significant_changes(&self) -> Vec<&Change> {
66        self.significant_changes
67            .iter()
68            .filter_map(|&id| self.changes.iter().find(|c| c.id == id))
69            .collect()
70    }
71
72    /// Get a hunk by ID
73    pub fn get_hunk(&self, hunk_id: usize) -> Option<&Hunk> {
74        self.hunks.iter().find(|h| h.id == hunk_id)
75    }
76
77    /// Find which hunk a change belongs to
78    pub fn hunk_for_change(&self, change_id: usize) -> Option<&Hunk> {
79        self.hunks
80            .iter()
81            .find(|h| h.change_ids.contains(&change_id))
82    }
83}
84
/// A diff for a single file
#[derive(Debug, Clone)]
pub struct FileDiff {
    /// Path of the old version (lossy UTF-8 string of the input path)
    pub old_path: Option<String>,
    /// Path of the new version (lossy UTF-8 string of the input path)
    pub new_path: Option<String>,
    /// The computed diff between the two versions
    pub result: DiffResult,
}
92
/// The main diff engine
///
/// Configure with the builder-style `with_context` / `with_word_level`
/// methods; the defaults are 3 context lines and word-level diffing on.
pub struct DiffEngine {
    /// Number of context lines to include
    /// (`usize::MAX` disables context trimming entirely)
    context_lines: usize,
    /// Whether to do word-level diffing within changed lines
    word_level: bool,
}
100
101fn diff_ranges<I, T>(algorithm: Algorithm, before: I, after: I) -> Vec<(Range<usize>, Range<usize>)>
102where
103    I: TokenSource<Token = T>,
104    T: Eq + Hash,
105{
106    let input = InternedInput::new(before, after);
107    let diff = Diff::compute(algorithm, &input);
108    diff.hunks()
109        .map(|hunk| {
110            (
111                hunk.before.start as usize..hunk.before.end as usize,
112                hunk.after.start as usize..hunk.after.end as usize,
113            )
114        })
115        .collect()
116}
117
impl Default for DiffEngine {
    /// Default configuration: 3 lines of context, word-level diffing on.
    fn default() -> Self {
        Self {
            context_lines: 3,
            word_level: true,
        }
    }
}
126
127impl DiffEngine {
128    pub fn new() -> Self {
129        Self::default()
130    }
131
132    pub fn with_context(mut self, lines: usize) -> Self {
133        self.context_lines = lines;
134        self
135    }
136
137    pub fn with_word_level(mut self, enabled: bool) -> Self {
138        self.word_level = enabled;
139        self
140    }
141
    /// Compute diff between two strings
    ///
    /// Pipeline:
    /// 1. Line-level diff via imara-diff (Histogram algorithm).
    /// 2. Unchanged lines become Equal changes; runs of removed/added lines
    ///    are buffered and flushed through `flush_pending_changes`, which
    ///    may pair them up for word-level Replace spans.
    /// 3. Unless `context_lines == usize::MAX`, changes are trimmed to the
    ///    configured number of context lines around each significant change.
    /// 4. Nearby significant changes are grouped into hunks.
    pub fn diff_strings(&self, old: &str, new: &str) -> DiffResult {
        let mut changes = Vec::new();
        let mut significant_changes = Vec::new();
        let mut insertions = 0;
        let mut deletions = 0;
        let mut change_id = 0;

        // Line numbers are 1-based and tracked independently per side.
        let mut old_line_num = 1usize;
        let mut new_line_num = 1usize;

        // Group consecutive changes together for word-level diffing
        let mut pending_deletes: Vec<(String, usize)> = Vec::new();
        let mut pending_inserts: Vec<(String, usize)> = Vec::new();

        let old_lines: Vec<&str> = old.lines().collect();
        let new_lines: Vec<&str> = new.lines().collect();
        let ranges = diff_ranges(Algorithm::Histogram, old, new);

        // Index of the next old-side line not yet emitted.
        let mut old_idx = 0usize;

        for (before, after) in ranges {
            // Lines between the previous changed region and this one are
            // unchanged: flush buffered edits first so change ordering is
            // preserved, then emit the gap as Equal changes.
            if old_idx < before.start {
                self.flush_pending_changes(
                    &mut pending_deletes,
                    &mut pending_inserts,
                    &mut changes,
                    &mut significant_changes,
                    &mut change_id,
                    &mut insertions,
                    &mut deletions,
                );

                while old_idx < before.start {
                    let line = old_lines.get(old_idx).copied().unwrap_or("");
                    let span =
                        ChangeSpan::equal(line).with_lines(Some(old_line_num), Some(new_line_num));
                    changes.push(Change::single(change_id, span));
                    change_id += 1;
                    old_idx += 1;
                    old_line_num += 1;
                    new_line_num += 1;
                }
            }

            // Buffer removed lines; flushing is deferred so deletes and
            // inserts from the same region can be paired for word-level diff.
            for idx in before.start..before.end {
                let line = old_lines.get(idx).copied().unwrap_or("");
                pending_deletes.push((line.to_string(), old_line_num));
                old_line_num += 1;
            }

            // Buffer added lines from the same region.
            for idx in after.start..after.end {
                let line = new_lines.get(idx).copied().unwrap_or("");
                pending_inserts.push((line.to_string(), new_line_num));
                new_line_num += 1;
            }

            old_idx = before.end;
        }

        // Trailing unchanged lines after the last changed region.
        if old_idx < old_lines.len() {
            self.flush_pending_changes(
                &mut pending_deletes,
                &mut pending_inserts,
                &mut changes,
                &mut significant_changes,
                &mut change_id,
                &mut insertions,
                &mut deletions,
            );

            while old_idx < old_lines.len() {
                let line = old_lines.get(old_idx).copied().unwrap_or("");
                let span =
                    ChangeSpan::equal(line).with_lines(Some(old_line_num), Some(new_line_num));
                changes.push(Change::single(change_id, span));
                change_id += 1;
                old_idx += 1;
                old_line_num += 1;
                new_line_num += 1;
            }
        }

        // Final flush covers a changed region that ran to end-of-file.
        self.flush_pending_changes(
            &mut pending_deletes,
            &mut pending_inserts,
            &mut changes,
            &mut significant_changes,
            &mut change_id,
            &mut insertions,
            &mut deletions,
        );

        // Context trimming: keep only changes within `context_lines` slots
        // of a significant change; `usize::MAX` means unlimited context.
        let (changes, significant_changes) = if self.context_lines != usize::MAX {
            let mut id_to_idx = FxHashMap::default();
            for (idx, change) in changes.iter().enumerate() {
                id_to_idx.insert(change.id, idx);
            }
            // Mark the index window around each significant change.
            let mut include = vec![false; changes.len()];
            for &change_id in &significant_changes {
                if let Some(&idx) = id_to_idx.get(&change_id) {
                    let start = idx.saturating_sub(self.context_lines);
                    let end = (idx + self.context_lines).min(changes.len().saturating_sub(1));
                    for slot in include.iter_mut().take(end + 1).skip(start) {
                        *slot = true;
                    }
                }
            }
            let significant_set: FxHashSet<usize> = significant_changes.iter().copied().collect();
            let mut filtered_changes = Vec::new();
            let mut filtered_significant = Vec::new();
            for (idx, change) in changes.into_iter().enumerate() {
                if include.get(idx).copied().unwrap_or(false) {
                    if significant_set.contains(&change.id) {
                        filtered_significant.push(change.id);
                    }
                    filtered_changes.push(change);
                }
            }
            (filtered_changes, filtered_significant)
        } else {
            (changes, significant_changes)
        };

        // Compute hunks by grouping nearby changes
        let hunks = Self::compute_hunks(&significant_changes, &changes);

        DiffResult {
            changes,
            significant_changes,
            hunks,
            insertions,
            deletions,
        }
    }
277
    /// Compute hunks by grouping consecutive changes that are close together
    /// Changes within PROXIMITY_THRESHOLD lines are grouped into the same hunk
    ///
    /// Returns hunks in encounter order, with sequential ids starting at 0.
    fn compute_hunks(significant_changes: &[usize], changes: &[Change]) -> Vec<Hunk> {
        const PROXIMITY_THRESHOLD: usize = 3;

        let mut hunks = Vec::new();
        if significant_changes.is_empty() {
            return hunks;
        }

        // Map change id -> index for O(1) change lookup per id.
        let mut id_to_index =
            FxHashMap::with_capacity_and_hasher(changes.len(), Default::default());
        for (idx, change) in changes.iter().enumerate() {
            id_to_index.insert(change.id, idx);
        }

        // Accumulator state for the hunk currently being assembled.
        let mut current_hunk_changes: Vec<usize> = Vec::new();
        let mut current_hunk_old_start: Option<usize> = None;
        let mut current_hunk_new_start: Option<usize> = None;
        let mut last_old_line: Option<usize> = None;
        let mut last_new_line: Option<usize> = None;
        let mut current_insertions = 0;
        let mut current_deletions = 0;
        let mut hunk_id = 0;

        for &change_id in significant_changes {
            // Silently skip ids not present in `changes`.
            let change = match id_to_index
                .get(&change_id)
                .and_then(|idx| changes.get(*idx))
            {
                Some(c) => c,
                None => continue,
            };

            // Get line numbers from first span
            let (old_line, new_line) = change
                .spans
                .first()
                .map(|s| (s.old_line, s.new_line))
                .unwrap_or((None, None));

            // Determine if this change is close to the previous one.
            // Old-side distance takes precedence; the new-side arm only
            // applies when an old-side comparison is impossible.
            // NOTE(review): a delete-only change (old line only) followed by
            // an insert-only change (new line only) matches neither arm and
            // therefore always starts a new hunk, even when the lines are
            // adjacent — confirm this split is intended.
            let is_close = match (last_old_line, last_new_line, old_line, new_line) {
                (Some(lo), _, Some(co), _) => co.saturating_sub(lo) <= PROXIMITY_THRESHOLD,
                (_, Some(ln), _, Some(cn)) => cn.saturating_sub(ln) <= PROXIMITY_THRESHOLD,
                _ => current_hunk_changes.is_empty(), // First change always starts a hunk
            };

            if is_close {
                // Add to current hunk
                current_hunk_changes.push(change_id);
                if current_hunk_old_start.is_none() {
                    current_hunk_old_start = old_line;
                }
                if current_hunk_new_start.is_none() {
                    current_hunk_new_start = new_line;
                }
            } else {
                // Save current hunk and start a new one
                if !current_hunk_changes.is_empty() {
                    hunks.push(Hunk {
                        id: hunk_id,
                        change_ids: current_hunk_changes.clone(),
                        old_start: current_hunk_old_start,
                        new_start: current_hunk_new_start,
                        insertions: current_insertions,
                        deletions: current_deletions,
                    });
                    hunk_id += 1;
                }

                // Start new hunk
                current_hunk_changes = vec![change_id];
                current_hunk_old_start = old_line;
                current_hunk_new_start = new_line;
                current_insertions = 0;
                current_deletions = 0;
            }

            // Update last line numbers (only when the side is present, so a
            // one-sided change does not erase the other side's position)
            if old_line.is_some() {
                last_old_line = old_line;
            }
            if new_line.is_some() {
                last_new_line = new_line;
            }

            // Count insertions/deletions in this change
            for span in &change.spans {
                match span.kind {
                    ChangeKind::Insert => current_insertions += 1,
                    ChangeKind::Delete => current_deletions += 1,
                    ChangeKind::Replace => {
                        current_insertions += 1;
                        current_deletions += 1;
                    }
                    ChangeKind::Equal => {}
                }
            }
        }

        // Don't forget the last hunk
        if !current_hunk_changes.is_empty() {
            hunks.push(Hunk {
                id: hunk_id,
                change_ids: current_hunk_changes,
                old_start: current_hunk_old_start,
                new_start: current_hunk_new_start,
                insertions: current_insertions,
                deletions: current_deletions,
            });
        }

        hunks
    }
393
    /// Emit the buffered delete/insert lines as changes, then clear the
    /// buffers.
    ///
    /// With word-level diffing enabled and an equal number of buffered
    /// deletes and inserts, the lines are paired positionally and each pair
    /// becomes one change carrying word-level spans. Otherwise plain
    /// line-level Delete and Insert changes are emitted. Every change
    /// produced here is recorded in `significant_changes`, and the running
    /// insertion/deletion totals are updated.
    #[allow(clippy::too_many_arguments)]
    fn flush_pending_changes(
        &self,
        pending_deletes: &mut Vec<(String, usize)>,
        pending_inserts: &mut Vec<(String, usize)>,
        changes: &mut Vec<Change>,
        significant_changes: &mut Vec<usize>,
        change_id: &mut usize,
        insertions: &mut usize,
        deletions: &mut usize,
    ) {
        if pending_deletes.is_empty() && pending_inserts.is_empty() {
            return;
        }

        // Try to match deletes with inserts for replace operations
        if self.word_level && pending_deletes.len() == pending_inserts.len() {
            for ((old_text, old_line), (new_text, new_line)) in
                pending_deletes.iter().zip(pending_inserts.iter())
            {
                let spans = self.compute_word_diff(old_text, new_text, *old_line, *new_line);
                let change = Change::new(*change_id, spans);
                significant_changes.push(*change_id);
                changes.push(change);
                *change_id += 1;
                // A replaced line counts as one insertion plus one deletion.
                *insertions += 1;
                *deletions += 1;
            }
        } else {
            // Output as separate deletes and inserts
            for (text, line) in pending_deletes.iter() {
                let span = ChangeSpan::delete(text.clone()).with_lines(Some(*line), None);
                significant_changes.push(*change_id);
                changes.push(Change::single(*change_id, span));
                *change_id += 1;
                *deletions += 1;
            }
            for (text, line) in pending_inserts.iter() {
                let span = ChangeSpan::insert(text.clone()).with_lines(None, Some(*line));
                significant_changes.push(*change_id);
                changes.push(Change::single(*change_id, span));
                *change_id += 1;
                *insertions += 1;
            }
        }

        pending_deletes.clear();
        pending_inserts.clear();
    }
443}
444
/// Tokenize code for word-level diffing.
///
/// Runs of identifier characters (alphanumeric or `_`) form one token,
/// runs of whitespace form one token, and every other character becomes
/// its own single-character token — so punctuation never glues itself to
/// the identifiers around it.
fn tokenize_code(line: &str) -> Vec<String> {
    // Coarse character class that decides token boundaries:
    // 0 = identifier char, 1 = whitespace, 2 = punctuation.
    fn class_of(ch: char) -> u8 {
        if ch.is_alphanumeric() || ch == '_' {
            0
        } else if ch.is_whitespace() {
            1
        } else {
            2
        }
    }

    let mut tokens: Vec<String> = Vec::new();
    for ch in line.chars() {
        let cls = class_of(ch);
        // Identifier and whitespace characters merge with a same-class
        // predecessor; punctuation never merges, so each such character
        // always stands alone (and ends any run before it).
        let extends_last = cls != 2
            && tokens
                .last()
                .and_then(|tok| tok.chars().next_back())
                .map_or(false, |prev| class_of(prev) == cls);
        if extends_last {
            if let Some(last) = tokens.last_mut() {
                last.push(ch);
            }
        } else {
            tokens.push(ch.to_string());
        }
    }
    tokens
}
489
/// Borrowed slice of pre-tokenized text, used to feed word-level tokens
/// into imara-diff (which would otherwise tokenize by line).
#[derive(Clone, Copy)]
struct TokenSlice<'a> {
    // Tokens produced by `tokenize_code`, borrowed for the diff's duration.
    tokens: &'a [&'a str],
}
494
impl<'a> TokenSource for TokenSlice<'a> {
    type Token = &'a str;
    type Tokenizer = std::iter::Copied<std::slice::Iter<'a, &'a str>>;

    /// Yield each token by copying the `&str` references out of the slice.
    fn tokenize(&self) -> Self::Tokenizer {
        self.tokens.iter().copied()
    }

    /// Exact token count (the slice length).
    // NOTE(review): `as u32` truncates silently for slices longer than
    // u32::MAX tokens — presumably unreachable for single lines; confirm.
    fn estimate_tokens(&self) -> u32 {
        self.tokens.len() as u32
    }
}
507
508impl DiffEngine {
    /// Compute word-level diff within a line
    ///
    /// Both lines are split with `tokenize_code` and diffed token-by-token.
    /// Unchanged, removed and added tokens each become a `ChangeSpan`
    /// tagged with both the old and new line numbers. Spans follow old-side
    /// token order, with inserted tokens emitted right after the deletions
    /// of the same diff region.
    fn compute_word_diff(
        &self,
        old: &str,
        new: &str,
        old_line: usize,
        new_line: usize,
    ) -> Vec<ChangeSpan> {
        let old_tokens = tokenize_code(old);
        let new_tokens = tokenize_code(new);
        // The diff consumes &str tokens, so borrow the owned Strings.
        let old_refs: Vec<&str> = old_tokens.iter().map(|s| s.as_str()).collect();
        let new_refs: Vec<&str> = new_tokens.iter().map(|s| s.as_str()).collect();
        let ranges = diff_ranges(
            Algorithm::Histogram,
            TokenSlice { tokens: &old_refs },
            TokenSlice { tokens: &new_refs },
        );
        let mut spans = Vec::new();
        let mut old_idx = 0usize;

        for (before, after) in ranges {
            // Tokens before this changed region are equal on both sides.
            while old_idx < before.start {
                let token = old_refs.get(old_idx).copied().unwrap_or("");
                spans.push(
                    ChangeSpan::equal(token.to_string()).with_lines(Some(old_line), Some(new_line)),
                );
                old_idx += 1;
            }

            // Tokens removed from the old line.
            for idx in before.start..before.end {
                let token = old_refs.get(idx).copied().unwrap_or("");
                spans.push(
                    ChangeSpan::delete(token.to_string())
                        .with_lines(Some(old_line), Some(new_line)),
                );
            }

            // Tokens added in the new line.
            for idx in after.start..after.end {
                let token = new_refs.get(idx).copied().unwrap_or("");
                spans.push(
                    ChangeSpan::insert(token.to_string())
                        .with_lines(Some(old_line), Some(new_line)),
                );
            }

            old_idx = before.end;
        }

        // Trailing tokens after the last changed region are equal.
        while old_idx < old_refs.len() {
            let token = old_refs.get(old_idx).copied().unwrap_or("");
            spans.push(
                ChangeSpan::equal(token.to_string()).with_lines(Some(old_line), Some(new_line)),
            );
            old_idx += 1;
        }

        spans
    }
567
568    /// Compute diff between two files
569    pub fn diff_files(&self, old_path: &Path, new_path: &Path) -> Result<FileDiff, DiffError> {
570        let old_content = std::fs::read_to_string(old_path)?;
571        let new_content = std::fs::read_to_string(new_path)?;
572
573        let result = self.diff_strings(&old_content, &new_content);
574
575        Ok(FileDiff {
576            old_path: Some(old_path.to_string_lossy().to_string()),
577            new_path: Some(new_path.to_string_lossy().to_string()),
578            result,
579        })
580    }
581}
582
#[cfg(test)]
mod tests {
    use super::*;

    // A single replaced line counts as one insertion plus one deletion.
    #[test]
    fn test_simple_diff() {
        let engine = DiffEngine::new();
        let old = "foo\nbar\nbaz";
        let new = "foo\nqux\nbaz";

        let result = engine.diff_strings(old, new);

        assert_eq!(result.insertions, 1);
        assert_eq!(result.deletions, 1);
        assert!(!result.significant_changes.is_empty());
    }

    // Identical inputs must produce no significant changes at all.
    #[test]
    fn test_no_changes() {
        let engine = DiffEngine::new();
        let text = "foo\nbar\nbaz";

        let result = engine.diff_strings(text, text);

        assert_eq!(result.insertions, 0);
        assert_eq!(result.deletions, 0);
        assert!(result.significant_changes.is_empty());
    }

    // A one-token rename inside a line collapses to a single change.
    #[test]
    fn test_word_level_diff() {
        let engine = DiffEngine::new().with_word_level(true);
        let old = "const foo = 4";
        let new = "const bar = 4";

        let result = engine.diff_strings(old, new);

        // Should have a single change with word-level spans
        assert_eq!(result.significant_changes.len(), 1);
    }

    #[test]
    fn test_tokenize_code_basic() {
        let tokens = tokenize_code("KeyModifiers, MouseEventKind}");
        assert_eq!(
            tokens,
            vec!["KeyModifiers", ",", " ", "MouseEventKind", "}"]
        );
    }

    // Underscores and digits belong to identifier tokens.
    #[test]
    fn test_tokenize_code_identifiers() {
        let tokens = tokenize_code("foo_bar baz123");
        assert_eq!(tokens, vec!["foo_bar", " ", "baz123"]);
    }

    // Every punctuation character must stand alone (even `::` splits).
    #[test]
    fn test_tokenize_code_punctuation() {
        let tokens = tokenize_code("use foo::{A, B};");
        assert_eq!(
            tokens,
            vec!["use", " ", "foo", ":", ":", "{", "A", ",", " ", "B", "}", ";"]
        );
    }

    #[test]
    fn test_word_diff_punctuation_separation() {
        use crate::change::ChangeKind;

        // This is the exact bug case: adding MouseEventKind to an import list
        let engine = DiffEngine::new().with_word_level(true);
        let old = "use foo::{KeyModifiers};";
        let new = "use foo::{KeyModifiers, MouseEventKind};";

        let result = engine.diff_strings(old, new);

        // Should have one change
        assert_eq!(result.significant_changes.len(), 1);

        // NOTE(review): this indexes `changes` with a change *id*; that only
        // works while ids coincide with positions (true here: one change,
        // id 0). Prefer a lookup by id if this test ever grows.
        let change = &result.changes[result.significant_changes[0]];

        // Find spans by kind
        let equal_content: String = change
            .spans
            .iter()
            .filter(|s| s.kind == ChangeKind::Equal)
            .map(|s| s.text.as_str())
            .collect();
        let insert_content: String = change
            .spans
            .iter()
            .filter(|s| s.kind == ChangeKind::Insert)
            .map(|s| s.text.as_str())
            .collect();

        // KeyModifiers should be in equal spans (unchanged)
        assert!(
            equal_content.contains("KeyModifiers"),
            "KeyModifiers should be equal, got equal: '{}', insert: '{}'",
            equal_content,
            insert_content
        );

        // MouseEventKind should be in insert spans (new)
        assert!(
            insert_content.contains("MouseEventKind"),
            "MouseEventKind should be inserted, got equal: '{}', insert: '{}'",
            equal_content,
            insert_content
        );

        // KeyModifiers should NOT be in insert spans
        assert!(
            !insert_content.contains("KeyModifiers"),
            "KeyModifiers should not be inserted, got insert: '{}'",
            insert_content
        );
    }

    // Each hunk's change ids must appear as one contiguous, in-order run
    // inside `significant_changes`.
    #[test]
    fn test_hunks_are_contiguous_in_significant_changes() {
        let engine = DiffEngine::new();
        let old = "a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\n";
        let new = "a\nB\nc\nd\ne\nf\ng\nh\nI\nj\nk\nL\nm\nn\n";

        let result = engine.diff_strings(old, new);

        assert!(
            result.hunks.len() >= 2,
            "expected multiple hunks for contiguity test"
        );

        for hunk in &result.hunks {
            if hunk.change_ids.is_empty() {
                continue;
            }
            let mut positions = Vec::new();
            for id in &hunk.change_ids {
                let pos = result
                    .significant_changes
                    .iter()
                    .position(|sid| sid == id)
                    .expect("hunk change id should exist in significant_changes");
                positions.push(pos);
            }
            for pair in positions.windows(2) {
                assert_eq!(
                    pair[1],
                    pair[0] + 1,
                    "hunk change ids should be contiguous in significant_changes"
                );
            }
            let start = positions[0];
            for (offset, id) in hunk.change_ids.iter().enumerate() {
                assert_eq!(
                    result.significant_changes[start + offset],
                    *id,
                    "hunk change order should match significant_changes"
                );
            }
        }
    }
}