Skip to main content

vcs_diff/
diff.rs

1//! The unified-diff model and parser, shared by `vcs-git` and `vcs-jj`.
2//!
3//! `git diff` and `jj diff --git` emit byte-identical git-format unified diffs,
4//! so a single parser serves both. Pure functions over arbitrary text — no
5//! process execution.
6
/// Aggregate line/file counts from a diff stat (`git diff --shortstat`,
/// `jj diff --stat`).
///
/// `Default` yields all-zero counts.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[non_exhaustive]
pub struct DiffStat {
    /// Number of files changed.
    pub files_changed: usize,
    /// Lines added (`insertions(+)`).
    pub insertions: usize,
    /// Lines removed (`deletions(-)`).
    pub deletions: usize,
}

impl DiffStat {
    /// Build a [`DiffStat`] from its three counts.
    ///
    /// A constructor is needed because the struct is `#[non_exhaustive]`: code
    /// outside this crate (the parser crates, their tests) cannot use
    /// struct-literal syntax. It is a `const fn` so callers can also use it to
    /// define constants.
    #[must_use]
    pub const fn new(files_changed: usize, insertions: usize, deletions: usize) -> Self {
        Self {
            files_changed,
            insertions,
            deletions,
        }
    }
}
33
/// How a file changed in a unified diff, as classified from the header lines
/// of its `diff --git` section (the lines before the first `@@` hunk).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[non_exhaustive]
pub enum ChangeKind {
    /// A new file (`new file mode …`).
    Added,
    /// An existing file's contents changed. This is also the classification a
    /// section gets when none of the other markers appear in its header.
    Modified,
    /// The file was removed (`deleted file mode …`).
    Deleted,
    /// The file was renamed (`rename from …` / `rename to …`). A `rename to`
    /// line wins over any `new file` / `deleted file` marker in the same section.
    Renamed,
}
48
/// One line inside a [`Hunk`], tagged by its role. The stored text excludes the
/// leading ` `/`+`/`-` marker **and the line terminator** — a CRLF-origin diff's
/// trailing `\r` is stripped along with the `\n`, so reconstruct exact bytes
/// from [`FileDiff::raw`], not from these lines.
///
/// Hunk-body lines that start with anything else — the `\ No newline at end of
/// file` annotation, or a completely empty line — are not represented at all.
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[non_exhaustive]
pub enum DiffLine {
    /// Unchanged context line (leading ` `).
    Context(String),
    /// Added line (leading `+`).
    Added(String),
    /// Removed line (leading `-`).
    Removed(String),
}
64
/// A single `@@ … @@` hunk within a [`FileDiff`].
///
/// The four range fields are taken from the `@@ -<start>[,<count>]
/// +<start>[,<count>] @@` header as written; a number that fails to parse is
/// stored as `0`. The parser does not check `lines` against the declared counts.
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[non_exhaustive]
pub struct Hunk {
    /// Start line in the old file (the `-<start>` of the `@@` header).
    pub old_start: usize,
    /// Line count in the old file (defaults to 1 when the `,<count>` is omitted).
    pub old_lines: usize,
    /// Start line in the new file (the `+<start>` of the `@@` header).
    pub new_start: usize,
    /// Line count in the new file (defaults to 1 when the `,<count>` is omitted).
    pub new_lines: usize,
    /// Text after the closing `@@` (the function/section heading), without the
    /// single separating space; empty when none.
    pub section: String,
    /// The hunk body, one entry per `+`/`-`/` ` line, in diff order.
    pub lines: Vec<DiffLine>,
}
83
/// One file's entry in a parsed git-format unified diff (`git diff` or
/// `jj diff --git`). One value is produced per `diff --git` section whose path
/// could be determined.
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[non_exhaustive]
pub struct FileDiff {
    /// How the file changed.
    pub change: ChangeKind,
    /// The file's path — the *new* path for a rename — forward-slash normalised
    /// (every `\` is replaced by `/`). Taken from the first of these that is
    /// present: `rename to …`, `+++ b/…`, `--- a/…`, then the `b/` side of the
    /// `diff --git` header.
    pub path: String,
    /// For a rename, the original path (forward-slash normalised); `None` otherwise.
    pub old_path: Option<String>,
    /// The `@@` hunks; empty for a binary file or a pure rename with no edits.
    pub hunks: Vec<Hunk>,
    /// The verbatim diff section for this file (the `diff --git …` block through
    /// to the next file), for callers that display the raw text.
    pub raw: String,
}
102
103/// Parse a git-format unified diff into one [`FileDiff`] per file. Works on
104/// `git diff` and `jj diff --git` output alike. Public so a consumer can parse
105/// diff text it obtained by other means.
106///
107/// Paths are read from the unambiguous single-path lines (`+++ b/…`, `--- a/…`,
108/// `rename to …`) rather than the space-ambiguous `diff --git a/… b/…` header,
109/// and normalised to forward slashes. Ported from the `vcs-flow-commit` parser.
110pub fn parse_diff(diff: &str) -> Vec<FileDiff> {
111    diff_sections(diff).filter_map(parse_section).collect()
112}
113
/// Slice a git-format diff into per-file sections.
///
/// A section starts at a line beginning with `diff --git ` and runs up to (but
/// not including) the next such line, or to the end of the text. Anything before
/// the first `diff --git ` line is discarded, and text without one yields no
/// sections. Every section is a verbatim sub-slice of `full`, line terminators
/// included.
fn diff_sections(full: &str) -> impl Iterator<Item = &str> {
    let mut sections = Vec::new();
    // Byte offset at which the section currently being scanned began, if any.
    let mut open: Option<usize> = None;
    // Byte offset of the current line. It only ever advances by whole lines, so
    // every slice boundary used below is a char boundary.
    let mut offset = 0;
    for line in full.split_inclusive('\n') {
        if line.starts_with("diff --git ") {
            // A new header closes the previous section (if there was one).
            if let Some(start) = open.replace(offset) {
                sections.push(&full[start..offset]);
            }
        }
        offset += line.len();
    }
    // The last section runs to the end of the input.
    if let Some(start) = open {
        sections.push(&full[start..]);
    }
    sections.into_iter()
}
137
138/// Determine the [`FileDiff`] for one `diff --git` section: change kind and path
139/// from the header lines, plus every `@@` hunk and its body.
140fn parse_section(section: &str) -> Option<FileDiff> {
141    let mut kind = ChangeKind::Modified;
142    let mut new_path = None;
143    let mut minus_path = None;
144    let mut rename_to = None;
145    let mut rename_from = None;
146    let mut hunks: Vec<Hunk> = Vec::new();
147    let mut current: Option<Hunk> = None;
148
149    for line in section.lines() {
150        if let Some(hunk) = parse_hunk_header(line) {
151            if let Some(done) = current.replace(hunk) {
152                hunks.push(done);
153            }
154            continue;
155        }
156        if let Some(hunk) = current.as_mut() {
157            // Inside a hunk body: classify by the leading marker. `\ No newline at
158            // end of file` annotations and any stray blank line are dropped.
159            match line.as_bytes().first() {
160                Some(b' ') => hunk.lines.push(DiffLine::Context(line[1..].to_string())),
161                Some(b'+') => hunk.lines.push(DiffLine::Added(line[1..].to_string())),
162                Some(b'-') => hunk.lines.push(DiffLine::Removed(line[1..].to_string())),
163                _ => {}
164            }
165            continue;
166        }
167        // Header region (before the first `@@`).
168        if line.starts_with("new file") {
169            kind = ChangeKind::Added;
170        } else if line.starts_with("deleted file") {
171            kind = ChangeKind::Deleted;
172        } else if let Some(p) = line.strip_prefix("rename to ") {
173            rename_to = Some(p.trim_end().to_string());
174        } else if let Some(p) = line.strip_prefix("rename from ") {
175            rename_from = Some(p.trim_end().to_string());
176        } else if let Some(p) = line.strip_prefix("+++ b/") {
177            new_path = Some(p.trim_end().to_string());
178        } else if let Some(p) = line.strip_prefix("--- a/") {
179            minus_path = Some(p.trim_end().to_string());
180        }
181    }
182    if let Some(done) = current.take() {
183        hunks.push(done);
184    }
185
186    let normalize = |p: String| p.replace('\\', "/");
187    // A rename keeps its old path so a caller can record the deletion too.
188    let old_path = if rename_to.is_some() {
189        kind = ChangeKind::Renamed;
190        rename_from.map(normalize)
191    } else {
192        None
193    };
194    let path = rename_to
195        .or(new_path)
196        .or(minus_path)
197        .or_else(|| header_b_path(section))?;
198    Some(FileDiff {
199        change: kind,
200        path: normalize(path),
201        old_path,
202        hunks,
203        raw: section.to_string(),
204    })
205}
206
207/// Parse a hunk header `@@ -<os>[,<ol>] +<ns>[,<nl>] @@[ <section>]` into an empty
208/// [`Hunk`]; `None` for any other line.
209fn parse_hunk_header(line: &str) -> Option<Hunk> {
210    let rest = line.strip_prefix("@@ ")?;
211    let (ranges, section) = rest.split_once(" @@")?;
212    let mut parts = ranges.split_whitespace();
213    let (old_start, old_lines) = parse_hunk_range(parts.next()?.strip_prefix('-')?);
214    let (new_start, new_lines) = parse_hunk_range(parts.next()?.strip_prefix('+')?);
215    Some(Hunk {
216        old_start,
217        old_lines,
218        new_start,
219        new_lines,
220        section: section.strip_prefix(' ').unwrap_or(section).to_string(),
221        lines: Vec::new(),
222    })
223}
224
/// Split a `<start>[,<count>]` hunk range into `(start, count)`.
///
/// A missing `,<count>` means one line. A component that is not a valid
/// `usize` is reported as `0` rather than as an error.
fn parse_hunk_range(range: &str) -> (usize, usize) {
    let number = |text: &str| text.parse::<usize>().unwrap_or(0);
    range.split_once(',').map_or_else(
        || (number(range), 1),
        |(start, count)| (number(start), number(count)),
    )
}
232
/// Fallback path extraction for sections with no `+++`/`---`/`rename` lines
/// (e.g. binary files): the `b/<new>` side of the `diff --git` header.
///
/// The header is `a/<old> b/<new>` with space-separated paths, so it cannot be
/// split reliably in general. Two strategies are tried in order:
///
/// 1. If the header has the exact shape `a/<p> b/<p>` — the same path on both
///    sides — return `<p>`. This stays correct when `<p>` contains spaces or
///    even the literal `" b/"`.
/// 2. Otherwise split at the *first* `" b/"` and return everything after it.
///    This remains a heuristic: it is wrong when the two paths differ and the
///    old one contains `" b/"`.
///
/// Returns `None` when the section's first line is not a `diff --git ` header
/// or contains no `" b/"` separator.
fn header_b_path(section: &str) -> Option<String> {
    let first = section.lines().next()?;
    let header = first.strip_prefix("diff --git ")?;
    if let Some(path) = same_path_on_both_sides(header) {
        return Some(path.to_string());
    }
    let (_, new_side) = header.split_once(" b/")?;
    Some(new_side.to_string())
}

/// If `header` is exactly `a/<p> b/<p>`, return `<p>`; otherwise `None`.
fn same_path_on_both_sides(header: &str) -> Option<&str> {
    const SEPARATOR: &str = " b/";
    let body = header.strip_prefix("a/")?;
    // body = <p> + " b/" + <p>, so its length minus the separator must be even.
    let doubled = body.len().checked_sub(SEPARATOR.len())?;
    if doubled % 2 != 0 {
        return None;
    }
    let half = doubled / 2;
    // `get` rather than indexing: on arbitrary text the midpoint may fall inside
    // a multibyte character, which must yield `None` instead of a panic.
    let old = body.get(..half)?;
    let new = body.get(half..)?.strip_prefix(SEPARATOR)?;
    if old == new {
        Some(old)
    } else {
        None
    }
}
242
// Example-based tests: each one feeds a small hand-written git-format diff to
// `parse_diff` and checks the resulting model.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn diff_covers_add_modify_delete_rename() {
        // Add (new), modify (mod), delete (gone), and a directory-changing rename
        // (old/f -> new/f). Ported from the vcs-flow section-parser test.
        // The rename section has no hunks, so its path comes from `rename to`.
        let full = concat!(
            "diff --git a/new b/new\n",
            "new file mode 100644\n--- /dev/null\n+++ b/new\n@@ -0,0 +1 @@\n+n\n",
            "diff --git a/mod b/mod\n",
            "--- a/mod\n+++ b/mod\n@@ -1 +1 @@\n-a\n+b\n",
            "diff --git a/gone b/gone\n",
            "deleted file mode 100644\n--- a/gone\n+++ /dev/null\n@@ -1 +0,0 @@\n-x\n",
            "diff --git a/old/f.txt b/new/f.txt\n",
            "similarity index 100%\nrename from old/f.txt\nrename to new/f.txt\n",
        );
        let files = parse_diff(full);
        // One entry per section, in input order, each with the expected kind.
        let kinds: Vec<_> = files.iter().map(|f| (f.path.as_str(), f.change)).collect();
        assert_eq!(
            kinds,
            vec![
                ("new", ChangeKind::Added),
                ("mod", ChangeKind::Modified),
                ("gone", ChangeKind::Deleted),
                ("new/f.txt", ChangeKind::Renamed),
            ]
        );
        // The rename carries its old path so the deletion is recorded too.
        let rename = files
            .iter()
            .find(|f| f.change == ChangeKind::Renamed)
            .unwrap();
        assert_eq!(rename.old_path.as_deref(), Some("old/f.txt"));
    }

    #[test]
    fn diff_handles_space_paths() {
        // git appends a trailing tab to `+++`/`---` paths containing spaces; the
        // path must survive intact (the `diff --git` header is ambiguous here).
        let full = "diff --git a/a b/c.txt b/a b/c.txt\n--- a/a b/c.txt\t\n+++ b/a b/c.txt\t\n@@ -1 +1 @@\n-x\n+y\n";
        let files = parse_diff(full);
        assert_eq!(files.len(), 1);
        // The trailing tab is trimmed; the embedded " b/" is kept.
        assert_eq!(files[0].path, "a b/c.txt");
    }

    #[test]
    fn diff_parses_hunk_ranges_and_body() {
        // A single hunk with explicit counts, a section heading, and one body
        // line of each kind.
        let full = "diff --git a/f b/f\n--- a/f\n+++ b/f\n@@ -1,2 +1,3 @@ fn main()\n ctx\n-old\n+new\n+added\n";
        let files = parse_diff(full);
        assert_eq!(files.len(), 1);
        // The verbatim section is preserved for display.
        assert_eq!(files[0].raw, full);
        let hunk = &files[0].hunks[0];
        assert_eq!(
            (
                hunk.old_start,
                hunk.old_lines,
                hunk.new_start,
                hunk.new_lines
            ),
            (1, 2, 1, 3)
        );
        // The heading is stored without the space that separates it from `@@`.
        assert_eq!(hunk.section, "fn main()");
        // Body lines keep their order and lose their marker character.
        assert_eq!(
            hunk.lines,
            vec![
                DiffLine::Context("ctx".into()),
                DiffLine::Removed("old".into()),
                DiffLine::Added("new".into()),
                DiffLine::Added("added".into()),
            ]
        );
    }

    #[test]
    fn diff_omitted_count_defaults_to_one() {
        // `@@ -3 +3 @@` (no `,count`) means a single line on each side.
        let full = "diff --git a/f b/f\n--- a/f\n+++ b/f\n@@ -3 +3 @@\n-a\n+b\n";
        let hunk = &parse_diff(full)[0].hunks[0];
        assert_eq!((hunk.old_start, hunk.old_lines), (3, 1));
        assert_eq!((hunk.new_start, hunk.new_lines), (3, 1));
    }
}
328
// Property-based fuzzing: `parse_diff` is a pure function over *arbitrary* CLI
// text (a git/jj on the user's machine we don't control), so the load-bearing
// invariant is "never panic, whatever the bytes" — the byte-offset slicing in
// `parse_section`/`header_b_path` must stay char-boundary-safe.
#[cfg(test)]
mod proptests {
    use super::*;
    use proptest::prelude::*;

    /// A line drawn from a git-format diff's structural vocabulary plus multibyte
    /// text, so a joined document reaches the byte-offset branches. Every
    /// generated line ends in `\n`.
    fn diff_line() -> impl Strategy<Value = String> {
        prop_oneof![
            Just("diff --git a/f b/f\n".to_string()),
            Just("--- a/f\n".to_string()),
            Just("+++ b/f\n".to_string()),
            Just("@@ -1,2 +3,4 @@ ctx\n".to_string()),
            Just("@@ -1 +1 @@\n".to_string()),
            Just("new file mode 100644\n".to_string()),
            Just("deleted file mode 100644\n".to_string()),
            Just("rename from {old => new}.rs\n".to_string()),
            Just("rename to é/r.rs\n".to_string()),
            "[-+ ]?[a-zé\t]{0,12}\n", // diff body / text incl. multibyte
        ]
    }

    /// A whole document: up to 39 generated lines concatenated in random order,
    /// so structural lines also show up in places a real diff would not put them.
    fn diff_doc() -> impl Strategy<Value = String> {
        prop::collection::vec(diff_line(), 0..40).prop_map(|lines| lines.concat())
    }

    proptest! {
        // Panic-freedom on completely arbitrary input.
        #[test]
        fn parse_diff_never_panics_on_arbitrary_text(s in any::<String>()) {
            let _ = parse_diff(&s);
        }

        // …and on structure-biased input that reaches the parsing branches.
        #[test]
        fn parse_diff_never_panics_on_structured_text(s in diff_doc()) {
            let _ = parse_diff(&s);
        }

        // parse_diff only yields files that come from a real `diff --git`
        // section: every returned FileDiff's `raw` starts with that marker.
        #[test]
        fn parse_diff_sections_are_well_formed(s in diff_doc()) {
            for file in parse_diff(&s) {
                prop_assert!(file.raw.starts_with("diff --git"));
            }
        }
    }
}
382
// The optional `serde` feature derives `Serialize` on the public model. These
// tests are compiled only when that feature is enabled.
#[cfg(all(test, feature = "serde"))]
mod serde_tests {
    use super::*;

    #[test]
    fn diff_stat_and_change_kind_serialize() {
        // A struct serializes as a JSON object keyed by its field names.
        assert_eq!(
            serde_json::to_value(DiffStat::new(3, 12, 4)).unwrap(),
            serde_json::json!({"files_changed": 3, "insertions": 12, "deletions": 4})
        );
        // Field-less enum variants serialize as their name.
        assert_eq!(
            serde_json::to_value(ChangeKind::Renamed).unwrap(),
            serde_json::json!("Renamed")
        );
    }
}