use crate::change::{Change, ChangeKind, ChangeSpan};
use imara_diff::{Algorithm, Diff, InternedInput, TokenSource};
use rustc_hash::{FxHashMap, FxHashSet};
use std::hash::Hash;
use std::ops::Range;
use std::path::Path;
use thiserror::Error;
#[derive(Error, Debug)]
pub enum DiffError {
#[error("Failed to read file: {0}")]
FileRead(#[from] std::io::Error),
#[error("Diff computation failed: {0}")]
ComputeFailed(String),
}
#[derive(Debug, Clone)]
pub struct Hunk {
pub id: usize,
pub change_ids: Vec<usize>,
pub old_start: Option<usize>,
pub new_start: Option<usize>,
pub insertions: usize,
pub deletions: usize,
}
impl Hunk {
pub fn len(&self) -> usize {
self.change_ids.len()
}
pub fn is_empty(&self) -> bool {
self.change_ids.is_empty()
}
}
#[derive(Debug, Clone)]
pub struct DiffResult {
pub changes: Vec<Change>,
pub significant_changes: Vec<usize>,
pub hunks: Vec<Hunk>,
pub insertions: usize,
pub deletions: usize,
}
impl DiffResult {
pub fn get_significant_changes(&self) -> Vec<&Change> {
self.significant_changes
.iter()
.filter_map(|&id| self.changes.iter().find(|c| c.id == id))
.collect()
}
pub fn get_hunk(&self, hunk_id: usize) -> Option<&Hunk> {
self.hunks.iter().find(|h| h.id == hunk_id)
}
pub fn hunk_for_change(&self, change_id: usize) -> Option<&Hunk> {
self.hunks
.iter()
.find(|h| h.change_ids.contains(&change_id))
}
}
#[derive(Debug, Clone)]
pub struct FileDiff {
pub old_path: Option<String>,
pub new_path: Option<String>,
pub result: DiffResult,
}
pub struct DiffEngine {
context_lines: usize,
word_level: bool,
}
fn diff_ranges<I, T>(algorithm: Algorithm, before: I, after: I) -> Vec<(Range<usize>, Range<usize>)>
where
I: TokenSource<Token = T>,
T: Eq + Hash,
{
let input = InternedInput::new(before, after);
let diff = Diff::compute(algorithm, &input);
diff.hunks()
.map(|hunk| {
(
hunk.before.start as usize..hunk.before.end as usize,
hunk.after.start as usize..hunk.after.end as usize,
)
})
.collect()
}
impl Default for DiffEngine {
fn default() -> Self {
Self {
context_lines: 3,
word_level: true,
}
}
}
impl DiffEngine {
pub fn new() -> Self {
Self::default()
}
pub fn with_context(mut self, lines: usize) -> Self {
self.context_lines = lines;
self
}
pub fn with_word_level(mut self, enabled: bool) -> Self {
self.word_level = enabled;
self
}
pub fn diff_strings(&self, old: &str, new: &str) -> DiffResult {
let mut changes = Vec::new();
let mut significant_changes = Vec::new();
let mut insertions = 0;
let mut deletions = 0;
let mut change_id = 0;
let mut old_line_num = 1usize;
let mut new_line_num = 1usize;
let mut pending_deletes: Vec<(String, usize)> = Vec::new();
let mut pending_inserts: Vec<(String, usize)> = Vec::new();
let old_lines: Vec<&str> = old.lines().collect();
let new_lines: Vec<&str> = new.lines().collect();
let ranges = diff_ranges(Algorithm::Histogram, old, new);
let mut old_idx = 0usize;
for (before, after) in ranges {
if old_idx < before.start {
self.flush_pending_changes(
&mut pending_deletes,
&mut pending_inserts,
&mut changes,
&mut significant_changes,
&mut change_id,
&mut insertions,
&mut deletions,
);
while old_idx < before.start {
let line = old_lines.get(old_idx).copied().unwrap_or("");
let span =
ChangeSpan::equal(line).with_lines(Some(old_line_num), Some(new_line_num));
changes.push(Change::single(change_id, span));
change_id += 1;
old_idx += 1;
old_line_num += 1;
new_line_num += 1;
}
}
for idx in before.start..before.end {
let line = old_lines.get(idx).copied().unwrap_or("");
pending_deletes.push((line.to_string(), old_line_num));
old_line_num += 1;
}
for idx in after.start..after.end {
let line = new_lines.get(idx).copied().unwrap_or("");
pending_inserts.push((line.to_string(), new_line_num));
new_line_num += 1;
}
old_idx = before.end;
}
if old_idx < old_lines.len() {
self.flush_pending_changes(
&mut pending_deletes,
&mut pending_inserts,
&mut changes,
&mut significant_changes,
&mut change_id,
&mut insertions,
&mut deletions,
);
while old_idx < old_lines.len() {
let line = old_lines.get(old_idx).copied().unwrap_or("");
let span =
ChangeSpan::equal(line).with_lines(Some(old_line_num), Some(new_line_num));
changes.push(Change::single(change_id, span));
change_id += 1;
old_idx += 1;
old_line_num += 1;
new_line_num += 1;
}
}
self.flush_pending_changes(
&mut pending_deletes,
&mut pending_inserts,
&mut changes,
&mut significant_changes,
&mut change_id,
&mut insertions,
&mut deletions,
);
let (changes, significant_changes) = if self.context_lines != usize::MAX {
let mut id_to_idx = FxHashMap::default();
for (idx, change) in changes.iter().enumerate() {
id_to_idx.insert(change.id, idx);
}
let mut include = vec![false; changes.len()];
for &change_id in &significant_changes {
if let Some(&idx) = id_to_idx.get(&change_id) {
let start = idx.saturating_sub(self.context_lines);
let end = (idx + self.context_lines).min(changes.len().saturating_sub(1));
for slot in include.iter_mut().take(end + 1).skip(start) {
*slot = true;
}
}
}
let significant_set: FxHashSet<usize> = significant_changes.iter().copied().collect();
let mut filtered_changes = Vec::new();
let mut filtered_significant = Vec::new();
for (idx, change) in changes.into_iter().enumerate() {
if include.get(idx).copied().unwrap_or(false) {
if significant_set.contains(&change.id) {
filtered_significant.push(change.id);
}
filtered_changes.push(change);
}
}
(filtered_changes, filtered_significant)
} else {
(changes, significant_changes)
};
let hunks = Self::compute_hunks(&significant_changes, &changes);
DiffResult {
changes,
significant_changes,
hunks,
insertions,
deletions,
}
}
fn compute_hunks(significant_changes: &[usize], changes: &[Change]) -> Vec<Hunk> {
const PROXIMITY_THRESHOLD: usize = 3;
let mut hunks = Vec::new();
if significant_changes.is_empty() {
return hunks;
}
let mut id_to_index =
FxHashMap::with_capacity_and_hasher(changes.len(), Default::default());
for (idx, change) in changes.iter().enumerate() {
id_to_index.insert(change.id, idx);
}
let mut current_hunk_changes: Vec<usize> = Vec::new();
let mut current_hunk_old_start: Option<usize> = None;
let mut current_hunk_new_start: Option<usize> = None;
let mut last_old_line: Option<usize> = None;
let mut last_new_line: Option<usize> = None;
let mut current_insertions = 0;
let mut current_deletions = 0;
let mut hunk_id = 0;
for &change_id in significant_changes {
let change = match id_to_index
.get(&change_id)
.and_then(|idx| changes.get(*idx))
{
Some(c) => c,
None => continue,
};
let (old_line, new_line) = change
.spans
.first()
.map(|s| (s.old_line, s.new_line))
.unwrap_or((None, None));
let is_close = match (last_old_line, last_new_line, old_line, new_line) {
(Some(lo), _, Some(co), _) => co.saturating_sub(lo) <= PROXIMITY_THRESHOLD,
(_, Some(ln), _, Some(cn)) => cn.saturating_sub(ln) <= PROXIMITY_THRESHOLD,
_ => current_hunk_changes.is_empty(), };
if is_close {
current_hunk_changes.push(change_id);
if current_hunk_old_start.is_none() {
current_hunk_old_start = old_line;
}
if current_hunk_new_start.is_none() {
current_hunk_new_start = new_line;
}
} else {
if !current_hunk_changes.is_empty() {
hunks.push(Hunk {
id: hunk_id,
change_ids: current_hunk_changes.clone(),
old_start: current_hunk_old_start,
new_start: current_hunk_new_start,
insertions: current_insertions,
deletions: current_deletions,
});
hunk_id += 1;
}
current_hunk_changes = vec![change_id];
current_hunk_old_start = old_line;
current_hunk_new_start = new_line;
current_insertions = 0;
current_deletions = 0;
}
if old_line.is_some() {
last_old_line = old_line;
}
if new_line.is_some() {
last_new_line = new_line;
}
for span in &change.spans {
match span.kind {
ChangeKind::Insert => current_insertions += 1,
ChangeKind::Delete => current_deletions += 1,
ChangeKind::Replace => {
current_insertions += 1;
current_deletions += 1;
}
ChangeKind::Equal => {}
}
}
}
if !current_hunk_changes.is_empty() {
hunks.push(Hunk {
id: hunk_id,
change_ids: current_hunk_changes,
old_start: current_hunk_old_start,
new_start: current_hunk_new_start,
insertions: current_insertions,
deletions: current_deletions,
});
}
hunks
}
#[allow(clippy::too_many_arguments)]
fn flush_pending_changes(
&self,
pending_deletes: &mut Vec<(String, usize)>,
pending_inserts: &mut Vec<(String, usize)>,
changes: &mut Vec<Change>,
significant_changes: &mut Vec<usize>,
change_id: &mut usize,
insertions: &mut usize,
deletions: &mut usize,
) {
if pending_deletes.is_empty() && pending_inserts.is_empty() {
return;
}
if self.word_level && pending_deletes.len() == pending_inserts.len() {
for ((old_text, old_line), (new_text, new_line)) in
pending_deletes.iter().zip(pending_inserts.iter())
{
let spans = self.compute_word_diff(old_text, new_text, *old_line, *new_line);
let change = Change::new(*change_id, spans);
significant_changes.push(*change_id);
changes.push(change);
*change_id += 1;
*insertions += 1;
*deletions += 1;
}
} else {
for (text, line) in pending_deletes.iter() {
let span = ChangeSpan::delete(text.clone()).with_lines(Some(*line), None);
significant_changes.push(*change_id);
changes.push(Change::single(*change_id, span));
*change_id += 1;
*deletions += 1;
}
for (text, line) in pending_inserts.iter() {
let span = ChangeSpan::insert(text.clone()).with_lines(None, Some(*line));
significant_changes.push(*change_id);
changes.push(Change::single(*change_id, span));
*change_id += 1;
*insertions += 1;
}
}
pending_deletes.clear();
pending_inserts.clear();
}
}
fn tokenize_code(line: &str) -> Vec<String> {
let mut tokens = Vec::new();
let mut buf = String::new();
let mut in_word = false;
for ch in line.chars() {
let is_word = ch.is_alphanumeric() || ch == '_';
if is_word {
if !in_word {
if !buf.is_empty() {
tokens.push(std::mem::take(&mut buf));
}
in_word = true;
}
buf.push(ch);
} else {
if in_word {
if !buf.is_empty() {
tokens.push(std::mem::take(&mut buf));
}
in_word = false;
}
if ch.is_whitespace() {
if !buf.is_empty() && !buf.chars().all(char::is_whitespace) {
tokens.push(std::mem::take(&mut buf));
}
buf.push(ch);
} else {
if !buf.is_empty() {
tokens.push(std::mem::take(&mut buf));
}
tokens.push(ch.to_string());
}
}
}
if !buf.is_empty() {
tokens.push(buf);
}
tokens
}
#[derive(Clone, Copy)]
struct TokenSlice<'a> {
tokens: &'a [&'a str],
}
impl<'a> TokenSource for TokenSlice<'a> {
type Token = &'a str;
type Tokenizer = std::iter::Copied<std::slice::Iter<'a, &'a str>>;
fn tokenize(&self) -> Self::Tokenizer {
self.tokens.iter().copied()
}
fn estimate_tokens(&self) -> u32 {
self.tokens.len() as u32
}
}
impl DiffEngine {
fn compute_word_diff(
&self,
old: &str,
new: &str,
old_line: usize,
new_line: usize,
) -> Vec<ChangeSpan> {
let old_tokens = tokenize_code(old);
let new_tokens = tokenize_code(new);
let old_refs: Vec<&str> = old_tokens.iter().map(|s| s.as_str()).collect();
let new_refs: Vec<&str> = new_tokens.iter().map(|s| s.as_str()).collect();
let ranges = diff_ranges(
Algorithm::Histogram,
TokenSlice { tokens: &old_refs },
TokenSlice { tokens: &new_refs },
);
let mut spans = Vec::new();
let mut old_idx = 0usize;
for (before, after) in ranges {
while old_idx < before.start {
let token = old_refs.get(old_idx).copied().unwrap_or("");
spans.push(
ChangeSpan::equal(token.to_string()).with_lines(Some(old_line), Some(new_line)),
);
old_idx += 1;
}
for idx in before.start..before.end {
let token = old_refs.get(idx).copied().unwrap_or("");
spans.push(
ChangeSpan::delete(token.to_string())
.with_lines(Some(old_line), Some(new_line)),
);
}
for idx in after.start..after.end {
let token = new_refs.get(idx).copied().unwrap_or("");
spans.push(
ChangeSpan::insert(token.to_string())
.with_lines(Some(old_line), Some(new_line)),
);
}
old_idx = before.end;
}
while old_idx < old_refs.len() {
let token = old_refs.get(old_idx).copied().unwrap_or("");
spans.push(
ChangeSpan::equal(token.to_string()).with_lines(Some(old_line), Some(new_line)),
);
old_idx += 1;
}
spans
}
pub fn diff_files(&self, old_path: &Path, new_path: &Path) -> Result<FileDiff, DiffError> {
let old_content = std::fs::read_to_string(old_path)?;
let new_content = std::fs::read_to_string(new_path)?;
let result = self.diff_strings(&old_content, &new_content);
Ok(FileDiff {
old_path: Some(old_path.to_string_lossy().to_string()),
new_path: Some(new_path.to_string_lossy().to_string()),
result,
})
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_simple_diff() {
let engine = DiffEngine::new();
let old = "foo\nbar\nbaz";
let new = "foo\nqux\nbaz";
let result = engine.diff_strings(old, new);
assert_eq!(result.insertions, 1);
assert_eq!(result.deletions, 1);
assert!(!result.significant_changes.is_empty());
}
#[test]
fn test_no_changes() {
let engine = DiffEngine::new();
let text = "foo\nbar\nbaz";
let result = engine.diff_strings(text, text);
assert_eq!(result.insertions, 0);
assert_eq!(result.deletions, 0);
assert!(result.significant_changes.is_empty());
}
#[test]
fn test_word_level_diff() {
let engine = DiffEngine::new().with_word_level(true);
let old = "const foo = 4";
let new = "const bar = 4";
let result = engine.diff_strings(old, new);
assert_eq!(result.significant_changes.len(), 1);
}
#[test]
fn test_tokenize_code_basic() {
let tokens = tokenize_code("KeyModifiers, MouseEventKind}");
assert_eq!(
tokens,
vec!["KeyModifiers", ",", " ", "MouseEventKind", "}"]
);
}
#[test]
fn test_tokenize_code_identifiers() {
let tokens = tokenize_code("foo_bar baz123");
assert_eq!(tokens, vec!["foo_bar", " ", "baz123"]);
}
#[test]
fn test_tokenize_code_punctuation() {
let tokens = tokenize_code("use foo::{A, B};");
assert_eq!(
tokens,
vec!["use", " ", "foo", ":", ":", "{", "A", ",", " ", "B", "}", ";"]
);
}
#[test]
fn test_word_diff_punctuation_separation() {
use crate::change::ChangeKind;
let engine = DiffEngine::new().with_word_level(true);
let old = "use foo::{KeyModifiers};";
let new = "use foo::{KeyModifiers, MouseEventKind};";
let result = engine.diff_strings(old, new);
assert_eq!(result.significant_changes.len(), 1);
let change = &result.changes[result.significant_changes[0]];
let equal_content: String = change
.spans
.iter()
.filter(|s| s.kind == ChangeKind::Equal)
.map(|s| s.text.as_str())
.collect();
let insert_content: String = change
.spans
.iter()
.filter(|s| s.kind == ChangeKind::Insert)
.map(|s| s.text.as_str())
.collect();
assert!(
equal_content.contains("KeyModifiers"),
"KeyModifiers should be equal, got equal: '{}', insert: '{}'",
equal_content,
insert_content
);
assert!(
insert_content.contains("MouseEventKind"),
"MouseEventKind should be inserted, got equal: '{}', insert: '{}'",
equal_content,
insert_content
);
assert!(
!insert_content.contains("KeyModifiers"),
"KeyModifiers should not be inserted, got insert: '{}'",
insert_content
);
}
#[test]
fn test_hunks_are_contiguous_in_significant_changes() {
let engine = DiffEngine::new();
let old = "a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\n";
let new = "a\nB\nc\nd\ne\nf\ng\nh\nI\nj\nk\nL\nm\nn\n";
let result = engine.diff_strings(old, new);
assert!(
result.hunks.len() >= 2,
"expected multiple hunks for contiguity test"
);
for hunk in &result.hunks {
if hunk.change_ids.is_empty() {
continue;
}
let mut positions = Vec::new();
for id in &hunk.change_ids {
let pos = result
.significant_changes
.iter()
.position(|sid| sid == id)
.expect("hunk change id should exist in significant_changes");
positions.push(pos);
}
for pair in positions.windows(2) {
assert_eq!(
pair[1],
pair[0] + 1,
"hunk change ids should be contiguous in significant_changes"
);
}
let start = positions[0];
for (offset, id) in hunk.change_ids.iter().enumerate() {
assert_eq!(
result.significant_changes[start + offset],
*id,
"hunk change order should match significant_changes"
);
}
}
}
}