1use crate::change::{Change, ChangeKind, ChangeSpan};
4use imara_diff::{Algorithm, Diff, InternedInput, TokenSource};
5use rustc_hash::{FxHashMap, FxHashSet};
6use std::hash::Hash;
7use std::ops::Range;
8use std::path::Path;
9use thiserror::Error;
10
/// Errors that can occur while computing a diff.
#[derive(Error, Debug)]
pub enum DiffError {
    /// An input file could not be read; wraps the underlying I/O error.
    #[error("Failed to read file: {0}")]
    FileRead(#[from] std::io::Error),
    /// The diff computation itself failed; carries a human-readable reason.
    // NOTE(review): nothing in this file constructs this variant — presumably
    // built by callers elsewhere; confirm before removing.
    #[error("Diff computation failed: {0}")]
    ComputeFailed(String),
}
18
/// A group of nearby significant changes, analogous to a unified-diff hunk.
#[derive(Debug, Clone)]
pub struct Hunk {
    /// Sequential hunk identifier, unique within one `DiffResult`.
    pub id: usize,
    /// Ids (not positional indices) of the `Change`s in this hunk, in order.
    pub change_ids: Vec<usize>,
    /// Line number in the old text where the hunk starts, if known.
    pub old_start: Option<usize>,
    /// Line number in the new text where the hunk starts, if known.
    pub new_start: Option<usize>,
    /// Count of inserted spans across the hunk's changes.
    pub insertions: usize,
    /// Count of deleted spans across the hunk's changes.
    pub deletions: usize,
}
35
36impl Hunk {
37 pub fn len(&self) -> usize {
39 self.change_ids.len()
40 }
41
42 pub fn is_empty(&self) -> bool {
44 self.change_ids.is_empty()
45 }
46}
47
/// Complete output of a diff: every change plus derived summaries.
#[derive(Debug, Clone)]
pub struct DiffResult {
    /// All changes in document order (may include equal/context entries).
    pub changes: Vec<Change>,
    /// Ids of the changes that represent actual edits.
    pub significant_changes: Vec<usize>,
    /// Significant changes grouped by line-number proximity.
    pub hunks: Vec<Hunk>,
    /// Total number of inserted lines.
    pub insertions: usize,
    /// Total number of deleted lines.
    pub deletions: usize,
}
62
63impl DiffResult {
64 pub fn get_significant_changes(&self) -> Vec<&Change> {
66 self.significant_changes
67 .iter()
68 .filter_map(|&id| self.changes.iter().find(|c| c.id == id))
69 .collect()
70 }
71
72 pub fn get_hunk(&self, hunk_id: usize) -> Option<&Hunk> {
74 self.hunks.iter().find(|h| h.id == hunk_id)
75 }
76
77 pub fn hunk_for_change(&self, change_id: usize) -> Option<&Hunk> {
79 self.hunks
80 .iter()
81 .find(|h| h.change_ids.contains(&change_id))
82 }
83}
84
/// A diff between two files on disk, tagged with their paths.
#[derive(Debug, Clone)]
pub struct FileDiff {
    /// Path of the old file, lossily converted to a `String`.
    pub old_path: Option<String>,
    /// Path of the new file, lossily converted to a `String`.
    pub new_path: Option<String>,
    /// The computed diff between the two files' contents.
    pub result: DiffResult,
}
92
/// Configurable line/word diff engine built on imara-diff's histogram algorithm.
pub struct DiffEngine {
    /// How many equal changes to keep around each significant change;
    /// `usize::MAX` disables context filtering entirely.
    context_lines: usize,
    /// When true, paired delete/insert lines are refined into word-level spans.
    word_level: bool,
}
100
101fn diff_ranges<I, T>(algorithm: Algorithm, before: I, after: I) -> Vec<(Range<usize>, Range<usize>)>
102where
103 I: TokenSource<Token = T>,
104 T: Eq + Hash,
105{
106 let input = InternedInput::new(before, after);
107 let diff = Diff::compute(algorithm, &input);
108 diff.hunks()
109 .map(|hunk| {
110 (
111 hunk.before.start as usize..hunk.before.end as usize,
112 hunk.after.start as usize..hunk.after.end as usize,
113 )
114 })
115 .collect()
116}
117
impl Default for DiffEngine {
    /// Default engine: 3 context lines (the conventional unified-diff width)
    /// with word-level refinement enabled.
    fn default() -> Self {
        Self {
            context_lines: 3,
            word_level: true,
        }
    }
}
126
127impl DiffEngine {
    /// Create an engine with default settings (see [`Default`]).
    pub fn new() -> Self {
        Self::default()
    }
131
    /// Builder-style setter: keep `lines` equal changes of context around
    /// each significant change. Pass `usize::MAX` to keep everything.
    pub fn with_context(mut self, lines: usize) -> Self {
        self.context_lines = lines;
        self
    }
136
    /// Builder-style setter: enable or disable word-level refinement of
    /// paired changed lines.
    pub fn with_word_level(mut self, enabled: bool) -> Self {
        self.word_level = enabled;
        self
    }
141
    /// Compute a line-oriented diff of `old` against `new`.
    ///
    /// Unchanged lines become equal changes; runs of changed lines are
    /// buffered and flushed via `flush_pending_changes`, which either pairs
    /// them into word-level replacements (when `word_level` is on and the
    /// delete/insert counts match) or emits plain deletes then inserts.
    /// When `context_lines != usize::MAX`, equal changes further than
    /// `context_lines` entries (measured in change indices, not line
    /// numbers) from every significant change are dropped from the result.
    pub fn diff_strings(&self, old: &str, new: &str) -> DiffResult {
        let mut changes = Vec::new();
        let mut significant_changes = Vec::new();
        let mut insertions = 0;
        let mut deletions = 0;
        let mut change_id = 0;

        // 1-based line counters for the old and new sides.
        let mut old_line_num = 1usize;
        let mut new_line_num = 1usize;

        // Changed lines as (text, line number), buffered until the next flush.
        let mut pending_deletes: Vec<(String, usize)> = Vec::new();
        let mut pending_inserts: Vec<(String, usize)> = Vec::new();

        let old_lines: Vec<&str> = old.lines().collect();
        let new_lines: Vec<&str> = new.lines().collect();
        let ranges = diff_ranges(Algorithm::Histogram, old, new);

        // Index of the next unconsumed line on the old side.
        let mut old_idx = 0usize;

        for (before, after) in ranges {
            if old_idx < before.start {
                // Unchanged lines precede this hunk: flush buffered edits
                // first so change ids stay in document order.
                self.flush_pending_changes(
                    &mut pending_deletes,
                    &mut pending_inserts,
                    &mut changes,
                    &mut significant_changes,
                    &mut change_id,
                    &mut insertions,
                    &mut deletions,
                );

                while old_idx < before.start {
                    let line = old_lines.get(old_idx).copied().unwrap_or("");
                    let span =
                        ChangeSpan::equal(line).with_lines(Some(old_line_num), Some(new_line_num));
                    changes.push(Change::single(change_id, span));
                    change_id += 1;
                    old_idx += 1;
                    old_line_num += 1;
                    new_line_num += 1;
                }
            }

            // Buffer the hunk's removed lines...
            for idx in before.start..before.end {
                let line = old_lines.get(idx).copied().unwrap_or("");
                pending_deletes.push((line.to_string(), old_line_num));
                old_line_num += 1;
            }

            // ...and its added lines; the flush decides how they pair up.
            for idx in after.start..after.end {
                let line = new_lines.get(idx).copied().unwrap_or("");
                pending_inserts.push((line.to_string(), new_line_num));
                new_line_num += 1;
            }

            old_idx = before.end;
        }

        if old_idx < old_lines.len() {
            // Trailing unchanged lines after the final hunk.
            self.flush_pending_changes(
                &mut pending_deletes,
                &mut pending_inserts,
                &mut changes,
                &mut significant_changes,
                &mut change_id,
                &mut insertions,
                &mut deletions,
            );

            while old_idx < old_lines.len() {
                let line = old_lines.get(old_idx).copied().unwrap_or("");
                let span =
                    ChangeSpan::equal(line).with_lines(Some(old_line_num), Some(new_line_num));
                changes.push(Change::single(change_id, span));
                change_id += 1;
                old_idx += 1;
                old_line_num += 1;
                new_line_num += 1;
            }
        }

        // Flush anything still buffered (e.g. a hunk at end of input).
        self.flush_pending_changes(
            &mut pending_deletes,
            &mut pending_inserts,
            &mut changes,
            &mut significant_changes,
            &mut change_id,
            &mut insertions,
            &mut deletions,
        );

        // Context filtering: `usize::MAX` is the "keep everything" sentinel.
        let (changes, significant_changes) = if self.context_lines != usize::MAX {
            let mut id_to_idx = FxHashMap::default();
            for (idx, change) in changes.iter().enumerate() {
                id_to_idx.insert(change.id, idx);
            }
            // Mark a window of `context_lines` changes on each side of every
            // significant change for retention.
            let mut include = vec![false; changes.len()];
            for &change_id in &significant_changes {
                if let Some(&idx) = id_to_idx.get(&change_id) {
                    let start = idx.saturating_sub(self.context_lines);
                    let end = (idx + self.context_lines).min(changes.len().saturating_sub(1));
                    for slot in include.iter_mut().take(end + 1).skip(start) {
                        *slot = true;
                    }
                }
            }
            let significant_set: FxHashSet<usize> = significant_changes.iter().copied().collect();
            let mut filtered_changes = Vec::new();
            let mut filtered_significant = Vec::new();
            for (idx, change) in changes.into_iter().enumerate() {
                if include.get(idx).copied().unwrap_or(false) {
                    if significant_set.contains(&change.id) {
                        filtered_significant.push(change.id);
                    }
                    filtered_changes.push(change);
                }
            }
            (filtered_changes, filtered_significant)
        } else {
            (changes, significant_changes)
        };

        // Hunks are derived from the (possibly filtered) change list.
        let hunks = Self::compute_hunks(&significant_changes, &changes);

        DiffResult {
            changes,
            significant_changes,
            hunks,
            insertions,
            deletions,
        }
    }
277
    /// Group significant changes into hunks by line-number proximity.
    ///
    /// Walks `significant_changes` in order. A change joins the current hunk
    /// when its first span's line number is within `PROXIMITY_THRESHOLD`
    /// lines of the previous change (old side preferred, new side as a
    /// fallback); otherwise the current hunk is closed and a new one begins.
    fn compute_hunks(significant_changes: &[usize], changes: &[Change]) -> Vec<Hunk> {
        // Maximum line gap between consecutive changes in the same hunk.
        const PROXIMITY_THRESHOLD: usize = 3;

        let mut hunks = Vec::new();
        if significant_changes.is_empty() {
            return hunks;
        }

        // Map change id -> index into `changes` for O(1) lookups.
        let mut id_to_index =
            FxHashMap::with_capacity_and_hasher(changes.len(), Default::default());
        for (idx, change) in changes.iter().enumerate() {
            id_to_index.insert(change.id, idx);
        }

        // Accumulator state for the hunk currently being built.
        let mut current_hunk_changes: Vec<usize> = Vec::new();
        let mut current_hunk_old_start: Option<usize> = None;
        let mut current_hunk_new_start: Option<usize> = None;
        let mut last_old_line: Option<usize> = None;
        let mut last_new_line: Option<usize> = None;
        let mut current_insertions = 0;
        let mut current_deletions = 0;
        let mut hunk_id = 0;

        for &change_id in significant_changes {
            // Ids that don't resolve (e.g. filtered out upstream) are skipped.
            let change = match id_to_index
                .get(&change_id)
                .and_then(|idx| changes.get(*idx))
            {
                Some(c) => c,
                None => continue,
            };

            // A change's position is taken from its first span.
            let (old_line, new_line) = change
                .spans
                .first()
                .map(|s| (s.old_line, s.new_line))
                .unwrap_or((None, None));

            // Proximity test: old-side distance wins when both old lines are
            // known; `saturating_sub` treats out-of-order lines as distance 0.
            // With no usable line numbers, only an empty hunk accepts the change.
            let is_close = match (last_old_line, last_new_line, old_line, new_line) {
                (Some(lo), _, Some(co), _) => co.saturating_sub(lo) <= PROXIMITY_THRESHOLD,
                (_, Some(ln), _, Some(cn)) => cn.saturating_sub(ln) <= PROXIMITY_THRESHOLD,
                _ => current_hunk_changes.is_empty(), };

            if is_close {
                current_hunk_changes.push(change_id);
                // The hunk's start lines come from its first located change.
                if current_hunk_old_start.is_none() {
                    current_hunk_old_start = old_line;
                }
                if current_hunk_new_start.is_none() {
                    current_hunk_new_start = new_line;
                }
            } else {
                // Too far away: emit the accumulated hunk and start a new one.
                if !current_hunk_changes.is_empty() {
                    hunks.push(Hunk {
                        id: hunk_id,
                        change_ids: current_hunk_changes.clone(),
                        old_start: current_hunk_old_start,
                        new_start: current_hunk_new_start,
                        insertions: current_insertions,
                        deletions: current_deletions,
                    });
                    hunk_id += 1;
                }

                current_hunk_changes = vec![change_id];
                current_hunk_old_start = old_line;
                current_hunk_new_start = new_line;
                current_insertions = 0;
                current_deletions = 0;
            }

            // Remember the last known position on each side independently.
            if old_line.is_some() {
                last_old_line = old_line;
            }
            if new_line.is_some() {
                last_new_line = new_line;
            }

            // Tally this change's spans into the current hunk's counters
            // (runs after the reset above, so the counts land in the right hunk).
            for span in &change.spans {
                match span.kind {
                    ChangeKind::Insert => current_insertions += 1,
                    ChangeKind::Delete => current_deletions += 1,
                    ChangeKind::Replace => {
                        current_insertions += 1;
                        current_deletions += 1;
                    }
                    ChangeKind::Equal => {}
                }
            }
        }

        // Emit the final, still-open hunk.
        if !current_hunk_changes.is_empty() {
            hunks.push(Hunk {
                id: hunk_id,
                change_ids: current_hunk_changes,
                old_start: current_hunk_old_start,
                new_start: current_hunk_new_start,
                insertions: current_insertions,
                deletions: current_deletions,
            });
        }

        hunks
    }
393
    /// Convert buffered deleted/inserted lines into `Change` records.
    ///
    /// When word-level diffing is enabled and the two buffers have equal
    /// length, lines are paired positionally and refined into word-level
    /// replace changes; otherwise all deletes are emitted first, then all
    /// inserts. Every emitted change is marked significant, and both buffers
    /// are cleared on return.
    #[allow(clippy::too_many_arguments)]
    fn flush_pending_changes(
        &self,
        pending_deletes: &mut Vec<(String, usize)>,
        pending_inserts: &mut Vec<(String, usize)>,
        changes: &mut Vec<Change>,
        significant_changes: &mut Vec<usize>,
        change_id: &mut usize,
        insertions: &mut usize,
        deletions: &mut usize,
    ) {
        if pending_deletes.is_empty() && pending_inserts.is_empty() {
            return;
        }

        // Equal buffer lengths are taken as a hint that the lines correspond
        // one-to-one. NOTE(review): positional pairing is a heuristic — it
        // may mis-pair reordered lines within a hunk; confirm acceptable.
        if self.word_level && pending_deletes.len() == pending_inserts.len() {
            for ((old_text, old_line), (new_text, new_line)) in
                pending_deletes.iter().zip(pending_inserts.iter())
            {
                let spans = self.compute_word_diff(old_text, new_text, *old_line, *new_line);
                let change = Change::new(*change_id, spans);
                significant_changes.push(*change_id);
                changes.push(change);
                *change_id += 1;
                // Each paired line counts as one insertion and one deletion.
                *insertions += 1;
                *deletions += 1;
            }
        } else {
            for (text, line) in pending_deletes.iter() {
                let span = ChangeSpan::delete(text.clone()).with_lines(Some(*line), None);
                significant_changes.push(*change_id);
                changes.push(Change::single(*change_id, span));
                *change_id += 1;
                *deletions += 1;
            }
            for (text, line) in pending_inserts.iter() {
                let span = ChangeSpan::insert(text.clone()).with_lines(None, Some(*line));
                significant_changes.push(*change_id);
                changes.push(Change::single(*change_id, span));
                *change_id += 1;
                *insertions += 1;
            }
        }

        pending_deletes.clear();
        pending_inserts.clear();
    }
443}
444
/// Split a source line into diff tokens.
///
/// Identifier characters (alphanumerics and `_`) merge into word tokens,
/// consecutive whitespace merges into a single whitespace token, and every
/// other character becomes its own single-character token.
fn tokenize_code(line: &str) -> Vec<String> {
    // Character classes that drive token grouping.
    #[derive(Clone, Copy, PartialEq)]
    enum Class {
        Word,
        Space,
        Other,
    }

    fn classify(ch: char) -> Class {
        if ch.is_alphanumeric() || ch == '_' {
            Class::Word
        } else if ch.is_whitespace() {
            Class::Space
        } else {
            Class::Other
        }
    }

    let mut tokens: Vec<String> = Vec::new();
    let mut prev: Option<Class> = None;

    for ch in line.chars() {
        let class = classify(ch);
        // Word and whitespace runs extend the previous token of the same
        // class; punctuation always starts (and ends) its own token.
        let extend = class != Class::Other && prev == Some(class);
        match tokens.last_mut() {
            Some(last) if extend => last.push(ch),
            _ => tokens.push(ch.to_string()),
        }
        prev = Some(class);
    }

    tokens
}
489
/// Borrowed token sequence adapting `&[&str]` to imara-diff's `TokenSource`.
#[derive(Clone, Copy)]
struct TokenSlice<'a> {
    // The pre-tokenized line, each element one token.
    tokens: &'a [&'a str],
}
494
impl<'a> TokenSource for TokenSlice<'a> {
    type Token = &'a str;
    type Tokenizer = std::iter::Copied<std::slice::Iter<'a, &'a str>>;

    /// Yield the tokens by copying the `&str` references out of the slice.
    fn tokenize(&self) -> Self::Tokenizer {
        self.tokens.iter().copied()
    }

    /// Exact token count. The `as u32` cast would truncate above
    /// `u32::MAX` tokens — not expected for single source lines.
    fn estimate_tokens(&self) -> u32 {
        self.tokens.len() as u32
    }
}
507
508impl DiffEngine {
509 fn compute_word_diff(
511 &self,
512 old: &str,
513 new: &str,
514 old_line: usize,
515 new_line: usize,
516 ) -> Vec<ChangeSpan> {
517 let old_tokens = tokenize_code(old);
518 let new_tokens = tokenize_code(new);
519 let old_refs: Vec<&str> = old_tokens.iter().map(|s| s.as_str()).collect();
520 let new_refs: Vec<&str> = new_tokens.iter().map(|s| s.as_str()).collect();
521 let ranges = diff_ranges(
522 Algorithm::Histogram,
523 TokenSlice { tokens: &old_refs },
524 TokenSlice { tokens: &new_refs },
525 );
526 let mut spans = Vec::new();
527 let mut old_idx = 0usize;
528
529 for (before, after) in ranges {
530 while old_idx < before.start {
531 let token = old_refs.get(old_idx).copied().unwrap_or("");
532 spans.push(
533 ChangeSpan::equal(token.to_string()).with_lines(Some(old_line), Some(new_line)),
534 );
535 old_idx += 1;
536 }
537
538 for idx in before.start..before.end {
539 let token = old_refs.get(idx).copied().unwrap_or("");
540 spans.push(
541 ChangeSpan::delete(token.to_string())
542 .with_lines(Some(old_line), Some(new_line)),
543 );
544 }
545
546 for idx in after.start..after.end {
547 let token = new_refs.get(idx).copied().unwrap_or("");
548 spans.push(
549 ChangeSpan::insert(token.to_string())
550 .with_lines(Some(old_line), Some(new_line)),
551 );
552 }
553
554 old_idx = before.end;
555 }
556
557 while old_idx < old_refs.len() {
558 let token = old_refs.get(old_idx).copied().unwrap_or("");
559 spans.push(
560 ChangeSpan::equal(token.to_string()).with_lines(Some(old_line), Some(new_line)),
561 );
562 old_idx += 1;
563 }
564
565 spans
566 }
567
568 pub fn diff_files(&self, old_path: &Path, new_path: &Path) -> Result<FileDiff, DiffError> {
570 let old_content = std::fs::read_to_string(old_path)?;
571 let new_content = std::fs::read_to_string(new_path)?;
572
573 let result = self.diff_strings(&old_content, &new_content);
574
575 Ok(FileDiff {
576 old_path: Some(old_path.to_string_lossy().to_string()),
577 new_path: Some(new_path.to_string_lossy().to_string()),
578 result,
579 })
580 }
581}
582
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_simple_diff() {
        let engine = DiffEngine::new();
        let old = "foo\nbar\nbaz";
        let new = "foo\nqux\nbaz";

        let result = engine.diff_strings(old, new);

        assert_eq!(result.insertions, 1);
        assert_eq!(result.deletions, 1);
        assert!(!result.significant_changes.is_empty());
    }

    #[test]
    fn test_no_changes() {
        let engine = DiffEngine::new();
        let text = "foo\nbar\nbaz";

        let result = engine.diff_strings(text, text);

        assert_eq!(result.insertions, 0);
        assert_eq!(result.deletions, 0);
        assert!(result.significant_changes.is_empty());
    }

    #[test]
    fn test_word_level_diff() {
        let engine = DiffEngine::new().with_word_level(true);
        let old = "const foo = 4";
        let new = "const bar = 4";

        let result = engine.diff_strings(old, new);

        // One paired line change, refined at word level into a single change.
        assert_eq!(result.significant_changes.len(), 1);
    }

    #[test]
    fn test_tokenize_code_basic() {
        let tokens = tokenize_code("KeyModifiers, MouseEventKind}");
        assert_eq!(
            tokens,
            vec!["KeyModifiers", ",", " ", "MouseEventKind", "}"]
        );
    }

    #[test]
    fn test_tokenize_code_identifiers() {
        let tokens = tokenize_code("foo_bar baz123");
        assert_eq!(tokens, vec!["foo_bar", " ", "baz123"]);
    }

    #[test]
    fn test_tokenize_code_punctuation() {
        let tokens = tokenize_code("use foo::{A, B};");
        assert_eq!(
            tokens,
            vec!["use", " ", "foo", ":", ":", "{", "A", ",", " ", "B", "}", ";"]
        );
    }

    #[test]
    fn test_word_diff_punctuation_separation() {
        use crate::change::ChangeKind;

        let engine = DiffEngine::new().with_word_level(true);
        let old = "use foo::{KeyModifiers};";
        let new = "use foo::{KeyModifiers, MouseEventKind};";

        let result = engine.diff_strings(old, new);

        assert_eq!(result.significant_changes.len(), 1);

        // `significant_changes` stores change *ids*, which only coincide with
        // positional indices into `changes` by accident (e.g. when context
        // filtering drops no leading changes) — resolve the id explicitly.
        let change = result
            .changes
            .iter()
            .find(|c| c.id == result.significant_changes[0])
            .expect("significant change id should resolve to a change");

        let equal_content: String = change
            .spans
            .iter()
            .filter(|s| s.kind == ChangeKind::Equal)
            .map(|s| s.text.as_str())
            .collect();
        let insert_content: String = change
            .spans
            .iter()
            .filter(|s| s.kind == ChangeKind::Insert)
            .map(|s| s.text.as_str())
            .collect();

        assert!(
            equal_content.contains("KeyModifiers"),
            "KeyModifiers should be equal, got equal: '{}', insert: '{}'",
            equal_content,
            insert_content
        );

        assert!(
            insert_content.contains("MouseEventKind"),
            "MouseEventKind should be inserted, got equal: '{}', insert: '{}'",
            equal_content,
            insert_content
        );

        assert!(
            !insert_content.contains("KeyModifiers"),
            "KeyModifiers should not be inserted, got insert: '{}'",
            insert_content
        );
    }

    #[test]
    fn test_hunks_are_contiguous_in_significant_changes() {
        let engine = DiffEngine::new();
        let old = "a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\n";
        let new = "a\nB\nc\nd\ne\nf\ng\nh\nI\nj\nk\nL\nm\nn\n";

        let result = engine.diff_strings(old, new);

        assert!(
            result.hunks.len() >= 2,
            "expected multiple hunks for contiguity test"
        );

        for hunk in &result.hunks {
            if hunk.change_ids.is_empty() {
                continue;
            }
            let mut positions = Vec::new();
            for id in &hunk.change_ids {
                let pos = result
                    .significant_changes
                    .iter()
                    .position(|sid| sid == id)
                    .expect("hunk change id should exist in significant_changes");
                positions.push(pos);
            }
            for pair in positions.windows(2) {
                assert_eq!(
                    pair[1],
                    pair[0] + 1,
                    "hunk change ids should be contiguous in significant_changes"
                );
            }
            let start = positions[0];
            for (offset, id) in hunk.change_ids.iter().enumerate() {
                assert_eq!(
                    result.significant_changes[start + offset],
                    *id,
                    "hunk change order should match significant_changes"
                );
            }
        }
    }
}