Skip to main content

vtcode_commons/
diff.rs

1//! Diff utilities for generating structured diffs.
2
3use hashbrown::HashMap;
4use serde::Serialize;
5use std::cmp::min;
6
7/// Represents a chunk of text in a diff (Equal, Delete, or Insert).
8#[derive(Debug, Clone, Copy, PartialEq, Eq)]
9pub enum Chunk<'a> {
10    Equal(&'a str),
11    Delete(&'a str),
12    Insert(&'a str),
13}
14
15/// Compute an optimal diff between two strings using Myers algorithm.
16#[inline]
17pub fn compute_diff_chunks<'a>(old: &'a str, new: &'a str) -> Vec<Chunk<'a>> {
18    if old.is_empty() && new.is_empty() {
19        return Vec::with_capacity(0);
20    }
21    if old.is_empty() {
22        return vec![Chunk::Insert(new)];
23    }
24    if new.is_empty() {
25        return vec![Chunk::Delete(old)];
26    }
27
28    // Strip common prefix first (optimisation).
29    let prefix_byte_len: usize = old
30        .chars()
31        .zip(new.chars())
32        .take_while(|(o, n)| o == n)
33        .map(|(c, _)| c.len_utf8())
34        .sum();
35
36    // Strip common suffix on the remaining text.
37    let old_rest = &old[prefix_byte_len..];
38    let new_rest = &new[prefix_byte_len..];
39
40    let suffix_byte_len: usize = old_rest
41        .chars()
42        .rev()
43        .zip(new_rest.chars().rev())
44        .take_while(|(o, n)| o == n)
45        .map(|(c, _)| c.len_utf8())
46        .sum();
47
48    let old_middle_end = old_rest.len() - suffix_byte_len;
49    let new_middle_end = new_rest.len() - suffix_byte_len;
50
51    let old_middle = &old_rest[..old_middle_end];
52    let new_middle = &new_rest[..new_middle_end];
53
54    let mut result = Vec::with_capacity(old_middle.len() + new_middle.len());
55
56    // Add common prefix
57    if prefix_byte_len > 0 {
58        result.push(Chunk::Equal(&old[..prefix_byte_len]));
59    }
60
61    // Compute optimal diff for the middle section
62    if !old_middle.is_empty() || !new_middle.is_empty() {
63        let old_chars: Vec<char> = old_middle.chars().collect();
64        let new_chars: Vec<char> = new_middle.chars().collect();
65        let old_byte_starts: Vec<usize> = old_middle.char_indices().map(|(idx, _)| idx).collect();
66        let new_byte_starts: Vec<usize> = new_middle.char_indices().map(|(idx, _)| idx).collect();
67        let edits = myers_diff(&old_chars, &new_chars);
68
69        let mut old_pos = 0;
70        let mut new_pos = 0;
71
72        for edit in edits {
73            match edit {
74                Edit::Equal => {
75                    old_pos += 1;
76                    new_pos += 1;
77                }
78                Edit::Delete => {
79                    let Some(ch) = old_chars.get(old_pos).copied() else {
80                        break;
81                    };
82                    let Some(byte_start) = old_byte_starts.get(old_pos).copied() else {
83                        break;
84                    };
85                    let byte_end = byte_start + ch.len_utf8();
86                    result.push(Chunk::Delete(&old_middle[byte_start..byte_end]));
87                    old_pos += 1;
88                }
89                Edit::Insert => {
90                    let Some(ch) = new_chars.get(new_pos).copied() else {
91                        break;
92                    };
93                    let Some(byte_start) = new_byte_starts.get(new_pos).copied() else {
94                        break;
95                    };
96                    let byte_end = byte_start + ch.len_utf8();
97                    result.push(Chunk::Insert(&new_middle[byte_start..byte_end]));
98                    new_pos += 1;
99                }
100            }
101        }
102    }
103
104    // Add common suffix
105    if suffix_byte_len > 0 {
106        result.push(Chunk::Equal(&old[old.len() - suffix_byte_len..]));
107    }
108
109    result
110}
111
112#[derive(Debug, Clone, Copy, PartialEq, Eq)]
113enum Edit {
114    Equal,
115    Delete,
116    Insert,
117}
118
119/// Advance along matching characters. Extracted from `myers_diff` so the
120/// compiler sees a tight leaf loop with no surrounding state, enabling better
121/// register allocation and (in some cases) auto-vectorization heuristics.
122#[inline]
123fn advance_matching(old: &[char], new: &[char], mut x: usize, mut y: usize) -> (usize, usize) {
124    while x < old.len() && y < new.len() && old[x] == new[y] {
125        x += 1;
126        y += 1;
127    }
128    (x, y)
129}
130
131/// Erase the trailing equal run during backtracking. Same rationale as
132/// `advance_matching` — a focused leaf function that the compiler can
133/// optimise in isolation.
134/// Returns the final `(x, y)` position after removing equal edits.
135#[inline]
136fn backtrack_equal_run(
137    mut x: usize,
138    mut y: usize,
139    move_x: usize,
140    move_y: usize,
141    edits: &mut Vec<Edit>,
142) -> (usize, usize) {
143    while x > move_x && y > move_y {
144        edits.push(Edit::Equal);
145        x -= 1;
146        y -= 1;
147    }
148    (x, y)
149}
150
151#[allow(clippy::cast_sign_loss)]
152fn myers_diff(old: &[char], new: &[char]) -> Vec<Edit> {
153    let n = old.len();
154    let m = new.len();
155
156    if n == 0 {
157        return vec![Edit::Insert; m];
158    }
159    if m == 0 {
160        return vec![Edit::Delete; n];
161    }
162
163    let max_d = n.saturating_add(m).min(i32::MAX as usize);
164    let max_d_i32 = max_d as i32;
165    let mut v = vec![0; 2 * max_d + 1];
166    let mut v_index = vec![0usize; (max_d + 1) * (2 * max_d + 1)];
167    let row_len = 2 * max_d + 1;
168
169    v[max_d] = 0;
170
171    for d in 0..=max_d {
172        let d_i32 = d as i32;
173        let row_start = d * row_len;
174        for k in (-d_i32..=d_i32).step_by(2) {
175            let k_idx = (k + max_d_i32) as usize;
176
177            let x = if k == -d_i32 || (k != d_i32 && v[k_idx - 1] < v[k_idx + 1]) {
178                v[k_idx + 1]
179            } else {
180                v[k_idx - 1] + 1
181            };
182
183            let mut x = x;
184            let mut y = (x as i32 - k) as usize;
185
186            (x, y) = advance_matching(old, new, x, y);
187
188            v[k_idx] = x;
189            v_index[row_start + k_idx] = x;
190
191            if x >= n && y >= m {
192                return backtrack_myers(old, new, &v_index, d, k, max_d);
193            }
194        }
195    }
196
197    vec![]
198}
199
200#[allow(clippy::cast_sign_loss)]
201fn backtrack_myers(
202    old: &[char],
203    new: &[char],
204    v_index: &[usize],
205    d: usize,
206    mut k: i32,
207    max_d: usize,
208) -> Vec<Edit> {
209    let mut edits = Vec::with_capacity(old.len() + new.len());
210    let mut x = old.len();
211    let mut y = new.len();
212    let max_d_i32 = max_d as i32;
213    let row_len = 2 * max_d + 1;
214
215    for cur_d in (0..=d).rev() {
216        if cur_d == 0 {
217            while x > 0 && y > 0 {
218                edits.push(Edit::Equal);
219                x -= 1;
220                y -= 1;
221            }
222            break;
223        }
224
225        let k_idx = (k + max_d_i32) as usize;
226        let prev_row_start = (cur_d - 1) * row_len;
227
228        let cur_d_i32 = cur_d as i32;
229        let prev_k = if k == cur_d_i32.wrapping_neg()
230            || (k != cur_d_i32
231                && v_index[prev_row_start + k_idx - 1] < v_index[prev_row_start + k_idx + 1])
232        {
233            k + 1
234        } else {
235            k - 1
236        };
237
238        let prev_k_idx = (prev_k + max_d_i32) as usize;
239        let prev_x_val = v_index[prev_row_start + prev_k_idx];
240        let prev_y = (prev_x_val as i32 - prev_k) as usize;
241
242        let (move_x, move_y) = if prev_k == k + 1 {
243            (prev_x_val, prev_y + 1)
244        } else {
245            (prev_x_val + 1, prev_y)
246        };
247
248        (x, y) = backtrack_equal_run(x, y, move_x, move_y, &mut edits);
249
250        if prev_k == k + 1 {
251            edits.push(Edit::Insert);
252            y -= 1;
253        } else {
254            edits.push(Edit::Delete);
255            x -= 1;
256        }
257
258        k = prev_k;
259    }
260
261    edits.reverse();
262    edits
263}
264
265/// Options for diff generation.
266#[derive(Debug, Clone)]
267pub struct DiffOptions<'a> {
268    pub context_lines: usize,
269    pub old_label: Option<&'a str>,
270    pub new_label: Option<&'a str>,
271    pub missing_newline_hint: bool,
272}
273
274impl Default for DiffOptions<'_> {
275    fn default() -> Self {
276        Self {
277            context_lines: 3,
278            old_label: None,
279            new_label: None,
280            missing_newline_hint: true,
281        }
282    }
283}
284
285/// A diff rendered with both structured hunks and formatted text.
286#[derive(Debug, Clone, Serialize)]
287pub struct DiffBundle {
288    pub hunks: Vec<DiffHunk>,
289    pub formatted: String,
290    pub is_empty: bool,
291}
292
293/// A diff hunk with metadata for old/new ranges.
294#[derive(Debug, Clone, Serialize)]
295pub struct DiffHunk {
296    pub old_start: usize,
297    pub old_lines: usize,
298    pub new_start: usize,
299    pub new_lines: usize,
300    pub lines: Vec<DiffLine>,
301}
302
303/// A single diff line annotated with metadata and type.
304#[derive(Debug, Clone, Copy, Serialize, PartialEq, Eq)]
305#[serde(rename_all = "snake_case")]
306pub enum DiffLineKind {
307    Context,
308    Addition,
309    Deletion,
310}
311
312/// Metadata for a single line inside a diff hunk.
313#[derive(Debug, Clone, Serialize)]
314pub struct DiffLine {
315    pub kind: DiffLineKind,
316    pub old_line: Option<u32>,
317    pub new_line: Option<u32>,
318    pub text: String,
319}
320
321/// Compute a structured diff bundle.
322pub fn compute_diff<F>(old: &str, new: &str, options: DiffOptions<'_>, formatter: F) -> DiffBundle
323where
324    F: FnOnce(&[DiffHunk], &DiffOptions<'_>) -> String,
325{
326    let old_lines_owned = split_lines_with_terminator(old);
327    let new_lines_owned = split_lines_with_terminator(new);
328
329    let old_refs: Vec<&str> = old_lines_owned.iter().map(|s| s.as_str()).collect();
330    let new_refs: Vec<&str> = new_lines_owned.iter().map(|s| s.as_str()).collect();
331
332    let records = collect_line_records(&old_refs, &new_refs);
333    let has_changes = records
334        .iter()
335        .any(|record| matches!(record.kind, DiffLineKind::Addition | DiffLineKind::Deletion));
336
337    let hunks = if has_changes {
338        build_hunks(&records, options.context_lines)
339    } else {
340        Vec::new()
341    };
342
343    let formatted = if hunks.is_empty() {
344        String::new()
345    } else {
346        formatter(&hunks, &options)
347    };
348
349    DiffBundle {
350        hunks,
351        formatted,
352        is_empty: !has_changes,
353    }
354}
355
356fn split_lines_with_terminator(text: &str) -> Vec<String> {
357    if text.is_empty() {
358        return Vec::with_capacity(0);
359    }
360
361    let mut lines: Vec<String> = text
362        .split_inclusive('\n')
363        .map(|line| line.to_string())
364        .collect();
365
366    if lines.is_empty() {
367        lines.push(text.to_string());
368    }
369
370    lines
371}
372
373#[inline]
374fn collect_line_records<'a>(
375    old_lines: &'a [&'a str],
376    new_lines: &'a [&'a str],
377) -> Vec<LineRecord<'a>> {
378    let (old_encoded, new_encoded) = encode_line_sequences(old_lines, new_lines);
379    let mut records = Vec::with_capacity(old_lines.len() + new_lines.len());
380    let mut old_index = 0u32;
381    let mut new_index = 0u32;
382
383    for chunk in compute_diff_chunks(old_encoded.as_str(), new_encoded.as_str()) {
384        match chunk {
385            Chunk::Equal(text) => {
386                for _ in text.chars() {
387                    let old_line = old_index + 1;
388                    let new_line = new_index + 1;
389                    let line = old_lines[old_index as usize];
390                    records.push(LineRecord {
391                        kind: DiffLineKind::Context,
392                        old_line: Some(old_line),
393                        new_line: Some(new_line),
394                        text: line,
395                        anchor_old: old_line,
396                        anchor_new: new_line,
397                    });
398                    old_index += 1;
399                    new_index += 1;
400                }
401            }
402            Chunk::Delete(text) => {
403                for _ in text.chars() {
404                    let old_line = old_index + 1;
405                    let anchor_new = new_index + 1;
406                    let line = old_lines[old_index as usize];
407                    records.push(LineRecord {
408                        kind: DiffLineKind::Deletion,
409                        old_line: Some(old_line),
410                        new_line: None,
411                        text: line,
412                        anchor_old: old_line,
413                        anchor_new,
414                    });
415                    old_index += 1;
416                }
417            }
418            Chunk::Insert(text) => {
419                for _ in text.chars() {
420                    let new_line = new_index + 1;
421                    let anchor_old = old_index + 1;
422                    let line = new_lines[new_index as usize];
423                    records.push(LineRecord {
424                        kind: DiffLineKind::Addition,
425                        old_line: None,
426                        new_line: Some(new_line),
427                        text: line,
428                        anchor_old,
429                        anchor_new: new_line,
430                    });
431                    new_index += 1;
432                }
433            }
434        }
435    }
436
437    records
438}
439
440fn encode_line_sequences<'a>(
441    old_lines: &'a [&'a str],
442    new_lines: &'a [&'a str],
443) -> (String, String) {
444    let mut token_map: HashMap<&'a str, char> = HashMap::new();
445    let mut next_codepoint: u32 = 0;
446
447    let old_encoded = encode_line_list(old_lines, &mut token_map, &mut next_codepoint);
448    let new_encoded = encode_line_list(new_lines, &mut token_map, &mut next_codepoint);
449
450    (old_encoded, new_encoded)
451}
452
453fn encode_line_list<'a>(
454    lines: &'a [&'a str],
455    map: &mut HashMap<&'a str, char>,
456    next_codepoint: &mut u32,
457) -> String {
458    let mut encoded = String::with_capacity(lines.len());
459    for &line in lines {
460        let token = if let Some(&value) = map.get(line) {
461            value
462        } else {
463            let Some(ch) = next_token_char(next_codepoint) else {
464                break;
465            };
466            map.insert(line, ch);
467            ch
468        };
469        encoded.push(token);
470    }
471    encoded
472}
473
474fn next_token_char(counter: &mut u32) -> Option<char> {
475    while *counter <= 0x10FFFF {
476        let candidate = *counter;
477        *counter += 1;
478        if (0xD800..=0xDFFF).contains(&candidate) {
479            continue;
480        }
481        if let Some(ch) = char::from_u32(candidate) {
482            return Some(ch);
483        }
484    }
485    None
486}
487
488#[derive(Debug)]
489struct LineRecord<'a> {
490    kind: DiffLineKind,
491    old_line: Option<u32>,
492    new_line: Option<u32>,
493    text: &'a str,
494    anchor_old: u32,
495    anchor_new: u32,
496}
497
498fn build_hunks(records: &[LineRecord<'_>], context: usize) -> Vec<DiffHunk> {
499    if records.is_empty() {
500        return Vec::new();
501    }
502
503    let ranges = compute_hunk_ranges(records, context);
504    let mut hunks = Vec::with_capacity(ranges.len());
505
506    for (start, end) in ranges {
507        let slice = &records[start..=end];
508
509        let old_start = slice
510            .iter()
511            .filter_map(|r| r.old_line)
512            .min()
513            .or_else(|| slice.iter().map(|r| r.anchor_old).min())
514            .unwrap_or(1) as usize;
515
516        let new_start = slice
517            .iter()
518            .filter_map(|r| r.new_line)
519            .min()
520            .or_else(|| slice.iter().map(|r| r.anchor_new).min())
521            .unwrap_or(1) as usize;
522
523        let old_lines = slice
524            .iter()
525            .filter(|r| matches!(r.kind, DiffLineKind::Context | DiffLineKind::Deletion))
526            .count();
527        let new_lines = slice
528            .iter()
529            .filter(|r| matches!(r.kind, DiffLineKind::Context | DiffLineKind::Addition))
530            .count();
531
532        let lines = slice
533            .iter()
534            .map(|record| DiffLine {
535                kind: record.kind,
536                old_line: record.old_line,
537                new_line: record.new_line,
538                text: record.text.to_string(),
539            })
540            .collect();
541
542        hunks.push(DiffHunk {
543            old_start,
544            old_lines,
545            new_start,
546            new_lines,
547            lines,
548        });
549    }
550
551    hunks
552}
553
554fn compute_hunk_ranges(records: &[LineRecord<'_>], context: usize) -> Vec<(usize, usize)> {
555    let mut ranges = Vec::with_capacity(4);
556    let mut current_start: Option<usize> = None;
557    let mut current_end: usize = 0;
558
559    for (idx, record) in records.iter().enumerate() {
560        if record.kind != DiffLineKind::Context {
561            let start = idx.saturating_sub(context);
562            let end = min(idx + context, records.len().saturating_sub(1));
563
564            if let Some(existing_start) = current_start {
565                if start < existing_start {
566                    current_start = Some(start);
567                }
568                if end > current_end {
569                    current_end = end;
570                }
571            } else {
572                current_start = Some(start);
573                current_end = end;
574            }
575        } else if let Some(start) = current_start
576            && idx > current_end
577        {
578            ranges.push((start, current_end));
579            current_start = None;
580        }
581    }
582
583    if let Some(start) = current_start {
584        ranges.push((start, current_end));
585    }
586
587    ranges
588}