diff_parse/
diff.rs

1// Copyright (C) 2022 Daniel Mueller <deso@posteo.net>
2// SPDX-License-Identifier: GPL-3.0-or-later
3
4//! A module for parsing diffs.
5
6use std::io::BufRead;
7use std::io::Error;
8use std::io::ErrorKind;
9use std::io::Result as IoResult;
10use std::rc::Rc;
11use std::str::FromStr;
12
13use once_cell::sync::Lazy;
14
15use regex::Regex;
16
// Building blocks for the regular expressions below. Each one is spliced
// into a larger pattern via `format!`.

/// Pattern fragment matching any amount (including none) of spaces and
/// tabs.
const WS_STRING: &str = r"[ \t]*";
/// Pattern fragment capturing a file name, i.e., a non-empty run of
/// characters that are neither a space nor a tab.
const FILE_STRING: &str = r"([^ \t]+)";
/// Pattern fragment capturing a single '+' or '-' sign.
const ADDSUB_STRING: &str = r"([+\-])";
/// Pattern fragment capturing a non-empty run of decimal digits.
const NUMLINE_STRING: &str = r"([0-9]+)";
21
22static DIFF_DIFF_REGEX: Lazy<Regex> = Lazy::new(|| {
23  // Aside from '+' and '-' we have a "continuation" character ('\') in
24  // here which essentially just indicates a line that is being ignored.
25  // This character is used (in conjunction with the string "No newline at
26  // end of file") to indicate that a newline symbol at the end of a file
27  // is added or removed, for instance.
28  Regex::new(r"^[+\-\\ ]").unwrap()
29});
30static DIFF_NODIFF_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[^+\- ]").unwrap());
31static DIFF_SRC_REGEX: Lazy<Regex> =
32  Lazy::new(|| Regex::new(&format!("^---{WS_STRING}{FILE_STRING}")).unwrap());
33static DIFF_DST_REGEX: Lazy<Regex> =
34  Lazy::new(|| Regex::new(&format!(r"^\+\+\+{WS_STRING}{FILE_STRING}")).unwrap());
35static DIFF_HEAD_REGEX: Lazy<Regex> = Lazy::new(|| {
36  // Note that in case a new file containing a single line is added the
37  // diff header might not contain the second count.
38  Regex::new(&format!(
39    "^@@ {ADDSUB_STRING}{NUMLINE_STRING}(?:,{NUMLINE_STRING})? \
40         {ADDSUB_STRING}{NUMLINE_STRING}(?:,{NUMLINE_STRING})? @@"
41  ))
42  .unwrap()
43});
44
45
/// An enumeration of the supported operations in a diff.
///
/// The type is a plain, field-less enum and hence implements the full
/// set of cheap comparison and hashing traits, so that it can be used
/// as a map key or in `Eq`-bounded generic code.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum Op {
  /// Lines are being added.
  Add,
  /// Lines are being removed.
  Sub,
}
54
55impl FromStr for Op {
56  type Err = ();
57
58  fn from_str(s: &str) -> Result<Self, Self::Err> {
59    match s {
60      "+" => Ok(Self::Add),
61      "-" => Ok(Self::Sub),
62      _ => Err(()),
63    }
64  }
65}
66
67
68/// An object capturing meta data about a diff.
69#[derive(Debug)]
70pub struct File {
71  /// The file the diff belongs to.
72  pub file: Rc<String>,
73  /// Whether the diff adds or removes lines.
74  pub op: Op,
75  /// The start line of the diff.
76  pub line: usize,
77  /// The number of lines in the diff.
78  pub count: usize,
79}
80
81
/// An enumeration of all the states our parser can be in.
///
/// The states are traversed in the order `Start`, `Src`, `Dst`, `Hdr`
/// while the header of a file's diff is being recognized. From `Hdr` the
/// parser either stays in `Hdr` or goes back to `Start`. The file names
/// are reference counted so that cloning a state is cheap.
#[derive(Clone, Debug)]
enum State {
  /// The state when we expect a new file to start.
  Start,
  /// The state after we parsed the source file header part.
  Src { src: Rc<String> },
  /// The state after we parsed the destination file header part.
  Dst { src: Rc<String>, dst: Rc<String> },
  /// The state after we parsed the entire header.
  Hdr { src: Rc<String>, dst: Rc<String> },
}
94
95impl State {
96  /// A helper function advancing `self` to another state.
97  fn advance(&mut self, state: State) -> Option<IoResult<()>> {
98    *self = state;
99    Some(Ok(()))
100  }
101
102  /// Try parsing a line containing information about the changed lines.
103  fn parse_head(
104    &mut self,
105    diffs: &mut Vec<(File, File)>,
106    line: &str,
107    src: Rc<String>,
108    dst: Rc<String>,
109  ) -> Option<IoResult<()>> {
110    let captures = DIFF_HEAD_REGEX.captures(line)?;
111
112    let mut parse = || -> IoResult<()> {
113      // It is fine to unwrap captures 1-2 and 4-5 because we know they
114      // participate in the match unconditionally.
115      let add_src = captures.get(1).unwrap().as_str();
116      let start_src = captures.get(2).unwrap().as_str();
117      // Because a diff header might not contain counts if only a single
118      // line is affected, we provide the default "1" here.
119      let count_src = captures.get(3).map(|m| m.as_str()).unwrap_or("1");
120      let add_dst = captures.get(4).unwrap().as_str();
121      let start_dst = captures.get(5).unwrap().as_str();
122      let count_dst = captures.get(6).map(|m| m.as_str()).unwrap_or("1");
123
124      let src_file = File {
125        file: src.clone(),
126        // It is fine to unwrap here because the regex would not have
127        // matched if the operation was not valid.
128        op: add_src.parse().unwrap(),
129        line: start_src.parse().map_err(|error| {
130          Error::new(
131            ErrorKind::Other,
132            format!(r#"failed to parse start line number in line: "{line}": {error}"#),
133          )
134        })?,
135        count: count_src.parse().map_err(|error| {
136          Error::new(
137            ErrorKind::Other,
138            format!(r#"failed to parse line count in line: "{line}": {error}"#),
139          )
140        })?,
141      };
142      let dst_file = File {
143        file: dst.clone(),
144        // It is fine to unwrap here because the regex would not have
145        // matched if the operation was not valid.
146        op: add_dst.parse().unwrap(),
147        line: start_dst.parse().map_err(|error| {
148          Error::new(
149            ErrorKind::Other,
150            format!(r#"failed to parse start line number in line: "{line}": {error}"#),
151          )
152        })?,
153        count: count_dst.parse().map_err(|error| {
154          Error::new(
155            ErrorKind::Other,
156            format!(r#"failed to parse line count in line: "{line}": {error}"#),
157          )
158        })?,
159      };
160      diffs.push((src_file, dst_file));
161      Ok(())
162    };
163
164
165    if let Err(error) = parse() {
166      return Some(Err(error))
167    }
168    self.advance(Self::Hdr { src, dst })
169  }
170
171  /// Try parsing a line containing the source file.
172  fn parse_src(&mut self, line: &str) -> Option<IoResult<()>> {
173    let captures = DIFF_SRC_REGEX.captures(line)?;
174    // It is fine to unwrap here because we know the queried capture
175    // group participates in the match unconditionally.
176    let src = captures.get(1).unwrap();
177
178    self.advance(Self::Src {
179      src: Rc::new(src.as_str().to_owned()),
180    })
181  }
182
183  /// Try parsing a line containing the destination file.
184  fn parse_dst(&mut self, line: &str, src: Rc<String>) -> Option<IoResult<()>> {
185    let captures = DIFF_DST_REGEX.captures(line)?;
186    // It is fine to unwrap here because we know the queried capture
187    // group participates in the match unconditionally.
188    let dst = captures.get(1).unwrap();
189
190    self.advance(Self::Dst {
191      src,
192      dst: Rc::new(dst.as_str().to_owned()),
193    })
194  }
195
196  /// Try matching a line that contains no actual diff.
197  fn match_no_diff(&mut self, line: &str) -> Option<IoResult<()>> {
198    DIFF_NODIFF_REGEX.is_match(line).then(|| Ok(()))
199  }
200
201  /// Try matching an actual diff line.
202  fn match_diff(&mut self, line: &str) -> Option<IoResult<()>> {
203    DIFF_DIFF_REGEX.is_match(line).then(|| Ok(()))
204  }
205
206  /// Try matching a line not from an actual diff that indicates the
207  /// start of a new file.
208  fn restart(&mut self, line: &str) -> Option<IoResult<()>> {
209    DIFF_NODIFF_REGEX.is_match(line).then(|| ())?;
210    self.advance(Self::Start)
211  }
212
213  fn parse(&mut self, diffs: &mut Vec<(File, File)>, line: &str) -> IoResult<()> {
214    /// Check and evaluate the result of a parser function.
215    macro_rules! check {
216      ($result:expr) => {
217        match $result {
218          // The parser did not match. Continue with the next one.
219          None => (),
220          // The parser matched and then either continued parsing
221          // successfully or produced an error. Short circuit in both cases
222          // to bubble up the result.
223          Some(result) => return result,
224        }
225      };
226    }
227
228    // This clone is a mere bump of two `Rc` counts, at most.
229    match self.clone() {
230      State::Start => {
231        check!(self.parse_src(line));
232        check!(self.match_no_diff(line));
233      },
234      State::Src { src } => {
235        check!(self.parse_dst(line, src));
236      },
237      State::Dst { src, dst } => {
238        check!(self.parse_head(diffs, line, src, dst));
239      },
240      State::Hdr { src, dst } => {
241        check!(self.match_diff(line));
242        check!(self.parse_head(diffs, line, src, dst));
243        check!(self.restart(line));
244      },
245    };
246
247    Err(Error::new(
248      ErrorKind::Other,
249      format!(r#"encountered unexpected line: "{line}" (state: {self:?})"#),
250    ))
251  }
252}
253
254
255/// A type interpreting a diff and extracting relevant information.
256pub struct Parser {
257  state: State,
258  diffs: Vec<(File, File)>,
259}
260
261impl Parser {
262  /// Create a new `Parser` object in its initial state.
263  #[inline]
264  pub fn new() -> Self {
265    Self {
266      state: State::Start,
267      diffs: Vec::new(),
268    }
269  }
270
271  /// Parse a list of lines.
272  pub fn parse<L>(&mut self, mut lines: L) -> IoResult<()>
273  where
274    L: BufRead,
275  {
276    let mut line = String::new();
277
278    loop {
279      line.clear();
280
281      let count = lines.read_line(&mut line)?;
282      if count == 0 {
283        // We have reached end-of-file.
284        break Ok(())
285      }
286
287      // Remove trailing new line symbols, we already expect lines.
288      let line = if let Some(line) = line.strip_suffix('\n') {
289        line
290      } else {
291        &line
292      };
293      // We simply ignore any empty lines and do not even hand them into
294      // the state for further consideration because they cannot change
295      // anything.
296      if !line.is_empty() {
297        let () = self.state.parse(&mut self.diffs, line)?;
298      }
299    }
300  }
301
302  /// Retrieve all found diffs.
303  pub fn diffs(&self) -> &[(File, File)] {
304    &self.diffs
305  }
306}
307
308
#[cfg(test)]
mod tests {
  use super::*;

  use std::ops::Deref as _;


  /// Check that one side of a parsed diff carries the expected values.
  #[track_caller]
  fn assert_side(actual: &File, file: &str, op: Op, line: usize, count: usize) {
    assert_eq!(actual.file.deref(), file);
    assert_eq!(actual.op, op);
    assert_eq!(actual.line, line);
    assert_eq!(actual.count, count);
  }

  /// Parse `diff` and check that exactly one `(source, destination)`
  /// pair was found.
  ///
  /// `src` and `dst` are `(file, line, count)` triples. The source is
  /// expected to be a removal and the destination an addition.
  #[track_caller]
  fn assert_single_diff(diff: &str, src: (&str, usize, usize), dst: (&str, usize, usize)) {
    let mut parser = Parser::new();
    let () = parser.parse(diff.as_bytes()).unwrap();

    let diffs = parser.diffs();
    assert_eq!(diffs.len(), 1);

    let (src_file, dst_file) = &diffs[0];
    assert_side(src_file, src.0, Op::Sub, src.1, src.2);
    assert_side(dst_file, dst.0, Op::Add, dst.1, dst.2);
  }


  /// Test parsing of a very simple one-line-change diff.
  #[test]
  fn parse_simple_diff() {
    assert_single_diff(
      r#"
--- main.c
+++ main.c
@@ -6,6 +6,6 @@ int main(int argc, char const* argv[])
     fprintf(stderr, "Too many arguments.\n");
     return -1;
   }
-  printf("Hello world!");
+  printf("Hello world!\n");
   return 0;
 }"#,
      ("main.c", 6, 6),
      ("main.c", 6, 6),
    );
  }

  /// Test that we can parse a diff emitted by git if a file's trailing
  /// newline is added.
  #[test]
  fn parse_diff_adding_newline_at_end_of_file() {
    assert_single_diff(
      r#"
--- main.c
+++ main.c
@@ -8,4 +8,4 @@ int main(int argc, char const* argv[])
   }
   printf("Hello world!");
   return 0;
-}
\\ No newline at end of file
+}"#,
      ("main.c", 8, 4),
      ("main.c", 8, 4),
    );
  }

  /// Test that we can parse a diff emitted by git if a file's trailing
  /// newline is removed.
  #[test]
  fn parse_diff_removing_newline_at_end_of_file() {
    assert_single_diff(
      r#"
--- main.c
+++ main.c
@@ -8,4 +8,4 @@ int main(int argc, char const* argv[])
   }
   printf("Hello world!");
   return 0;
-}
+}
\\ No newline at end of file"#,
      ("main.c", 8, 4),
      ("main.c", 8, 4),
    );
  }

  /// Test that we can parse a diff adding a file with a single line.
  #[test]
  fn parse_diff_with_added_file_with_single_line() {
    assert_single_diff(
      r#"
--- /dev/null
+++ main.c
@@ -0,0 +1 @@
+main.c"#,
      ("/dev/null", 0, 0),
      ("main.c", 1, 1),
    );
  }

  /// Test that we can parse a diff removing a file with a single line.
  #[test]
  fn parse_diff_with_removed_file_with_single_line() {
    assert_single_diff(
      r#"
--- main.c
+++ /dev/null
@@ -1 +0,0 @@
-main.c"#,
      ("main.c", 1, 1),
      ("/dev/null", 0, 0),
    );
  }

  /// Verify that we can parse a diff containing an empty line.
  #[test]
  fn parse_diff_with_empty_line() {
    assert_single_diff(
      r#"
--- main.c
+++ main.c
@@ -1,6 +1,6 @@
 #include <stdio.h>
 
-int main(int argc, char const* argv[])
+int main(int argc, char* argv[])
 {
   if (argc > 1) {
     fprintf(stderr, "Too many arguments.\n");"#,
      ("main.c", 1, 6),
      ("main.c", 1, 6),
    );
  }
}