Skip to main content

git_bot_feedback/
git_diff.rs

1use regex::Regex;
2use std::{collections::HashMap, ops::Range, path::Path};
3
4use crate::{FileDiffLines, FileFilter, LinesChangedOnly, error::DiffError};
5
/// The header information of a single diff hunk.
///
/// The fields mirror the numbers in a unified diff hunk header of the form
/// `@@ -old_start,old_lines +new_start,new_lines @@`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub struct DiffHunkHeader {
    /// The starting line number of the hunk in the old version of the file.
    pub old_start: u32,
    /// The total number of lines the hunk spans in the old version of the file.
    pub old_lines: u32,
    /// The starting line number of the hunk in the new version of the file.
    pub new_start: u32,
    /// The total number of lines the hunk spans in the new version of the file.
    pub new_lines: u32,
}
17
18fn get_filename_from_front_matter(front_matter: &str) -> Result<Option<&str>, DiffError> {
19    let diff_file_name = Regex::new(r"(?m)^\+\+\+\sb?/(.*)$")?;
20    let diff_renamed_file = Regex::new(r"(?m)^rename to (.*)$")?;
21    let diff_binary_file = Regex::new(r"(?m)^Binary\sfiles\s")?;
22    if let Some(captures) = diff_file_name.captures(front_matter)
23        && let Some(name) = captures.get(1)
24    {
25        return Ok(Some(name.as_str()));
26    }
27    if front_matter.starts_with("similarity")
28        && let Some(captures) = diff_renamed_file.captures(front_matter)
29        && let Some(name) = captures.get(1)
30    {
31        return Ok(Some(name.as_str()));
32    }
33    if !diff_binary_file.is_match(front_matter) {
34        log::warn!("Unrecognized diff starting with:\n{}", front_matter);
35    }
36    Ok(None)
37}
38
/// The regex pattern of a hunk header, shared by multiple functions.
///
/// Capture group 1 is the starting line number in the new version of the file and
/// capture group 2 is the number of lines in the new version (empty for terse headers
/// like `@@ -3 +3 @@`).
///
/// The pattern is anchored to the start of a line so that hunk *content* which merely
/// contains header-like text (for example `+@@ -1,2 +1,2 @@` in a diff of a file that
/// stores diff fixtures) is not mistaken for a real hunk header.
static HUNK_INFO_PATTERN: &str = r"(?m)^@@\s\-\d+,?\d*\s\+(\d+),?(\d*)\s@@";
41
42/// Parses a single file's patch containing one or more hunks
43///
44/// Returns a 2-item tuple:
45///
46/// - the line numbers that contain additions
47/// - the ranges of lines that span each hunk
48fn parse_patch(patch: &str) -> Result<(Vec<u32>, Vec<Range<u32>>), DiffError> {
49    let mut diff_hunks = Vec::new();
50    let mut additions = Vec::new();
51
52    let hunk_info = Regex::new(HUNK_INFO_PATTERN)?;
53    let hunk_headers = hunk_info.captures_iter(patch).collect::<Vec<_>>();
54    if !hunk_headers.is_empty() {
55        // skip the first split because it is anything that precedes first hunk header
56        let hunks = hunk_info.split(patch).skip(1);
57        for (hunk, header) in hunks.zip(hunk_headers) {
58            // header.unwrap() is safe because the hunk_headers.iter() is parallel to hunk_info.split()
59            let [start_line, end_range] = header.extract().1.map(|v| v.parse::<u32>().unwrap_or(1));
60            let mut line_numb_in_diff = start_line;
61            diff_hunks.push(start_line..start_line + end_range);
62            for (line_index, line) in hunk.split('\n').enumerate() {
63                if line.starts_with('+') {
64                    additions.push(line_numb_in_diff);
65                }
66                if line_index > 0 && !line.starts_with('-') {
67                    line_numb_in_diff += 1;
68                }
69            }
70        }
71    }
72    Ok((additions, diff_hunks))
73}
74
75/// Parses a git `diff` string into a map of file names to their corresponding
76/// [`FileDiffLines`].
77///
78/// The `file_filter` is used to filter out files that are not of interest.
79/// The `lines_changed_only` parameter determines whether to include files
80/// based on their contents' changes.
81pub fn parse_diff(
82    diff: &str,
83    file_filter: &FileFilter,
84    lines_changed_only: &LinesChangedOnly,
85) -> Result<HashMap<String, FileDiffLines>, DiffError> {
86    let mut results = HashMap::new();
87    let diff_file_delimiter = Regex::new(r"(?m)^diff \-\-git a/.*$")?;
88    let hunk_info = Regex::new(HUNK_INFO_PATTERN)?;
89
90    let file_diffs = diff_file_delimiter.split(diff);
91    for file_diff in file_diffs {
92        if file_diff.is_empty() || file_diff.starts_with("deleted file") {
93            continue;
94        }
95        let hunk_start = if let Some(first_hunk) = hunk_info.find(file_diff) {
96            first_hunk.start()
97        } else {
98            file_diff.len()
99        };
100        let front_matter = &file_diff[..hunk_start];
101        if let Some(file_name) = get_filename_from_front_matter(front_matter.trim_start())? {
102            let file_name = file_name.strip_prefix('/').unwrap_or(file_name);
103            if file_filter.is_qualified(Path::new(file_name)) {
104                let (added_lines, diff_hunks) = parse_patch(&file_diff[hunk_start..])?;
105                if lines_changed_only
106                    .is_change_valid(!added_lines.is_empty(), !diff_hunks.is_empty())
107                {
108                    results
109                        .entry(file_name.to_string())
110                        .or_insert_with(|| FileDiffLines::with_info(added_lines, diff_hunks));
111                }
112            }
113        }
114    }
115    Ok(results)
116}
117
// ******************* UNIT TESTS ***********************
#[cfg(test)]
mod test {
    #![allow(clippy::unwrap_used)]

    use super::parse_diff;
    use crate::{FileFilter, LinesChangedOnly};

    // ---------------------------- fixtures ----------------------------

    /// A pure rename (no content change) followed by a newly added binary file.
    const RENAMED_DIFF: &str = r#"diff --git a/tests/demo/some source.cpp b/tests/demo/some source.c
similarity index 100%
rename from /tests/demo/some source.cpp
rename to /tests/demo/some source.c
diff --git a/some picture.png b/some picture.png
new file mode 100644
Binary files /dev/null and b/some picture.png differ
"#;

    /// A rename whose front matter is followed directly by a hunk.
    const RENAMED_DIFF_WITH_CHANGES: &str = r#"diff --git a/tests/demo/some source.cpp b/tests/demo/some source.c
similarity index 99%
rename from /tests/demo/some source.cpp
rename to /tests/demo/some source.c
@@ -3,7 +3,7 @@
\n \n \n-#include "math.h"
+#include <math.h>\n \n \n \n"#;

    /// A conventional single-hunk diff with `---`/`+++` lines.
    const TYPICAL_DIFF: &str = "diff --git a/path/for/Some file.cpp b/path/to/Some file.cpp\n\
        --- a/path/for/Some file.cpp\n\
        +++ b/path/to/Some file.cpp\n\
        @@ -3,7 +3,7 @@\n \n \n \n\
        -#include <some_lib/render/animation.hpp>\n\
        +#include <some_lib/render/animations.hpp>\n \n \n \n";

    /// A newly added binary file only.
    const BINARY_DIFF: &str = "diff --git a/some picture.png b/some picture.png\n\
        new file mode 100644\n\
        Binary files /dev/null and b/some picture.png differ\n";

    /// Hunk headers that omit line counts (as produced with zero context lines).
    const TERSE_HEADERS: &str = r#"diff --git a/src/demo.cpp b/src/demo.cpp
--- a/src/demo.cpp
+++ b/src/demo.cpp
@@ -3 +3 @@
-#include <stdio.h>
+#include "stdio.h"
@@ -4,0 +5,2 @@
+auto main() -> int
+{
@@ -18 +17,2 @@ int main(){
-    return 0;}
+    return 0;
+}"#;

    // ------------------------------ tests ------------------------------

    #[test]
    fn parse_renamed_diff() {
        let filter = FileFilter::new(&[], &["c"], None);
        let parsed = parse_diff(RENAMED_DIFF, &filter, &LinesChangedOnly::Off).unwrap();
        let renamed = parsed.get("tests/demo/some source.c").unwrap();
        assert!(renamed.added_lines.is_empty());
        assert!(renamed.diff_hunks.is_empty());
    }

    #[test]
    fn parse_renamed_only_diff() {
        let filter = FileFilter::new(&[], &["c"], None);
        let parsed = parse_diff(RENAMED_DIFF, &filter, &LinesChangedOnly::Diff).unwrap();
        assert!(parsed.is_empty());
    }

    #[test]
    fn parse_renamed_diff_with_patch() {
        let combined = [RENAMED_DIFF_WITH_CHANGES, TERSE_HEADERS].concat();
        // ignore src/demo.cpp file (in TERSE_HEADERS) via glob (src/*);
        // triggers code coverage of a `}` (region end)
        let filter = FileFilter::new(&["src/*"], &["c", "cpp"], None);
        let files = parse_diff(&combined, &filter, &LinesChangedOnly::On).unwrap();
        eprintln!("files: {files:#?}");
        let renamed = files.get("tests/demo/some source.c").unwrap();
        assert!(!renamed.is_line_in_diff(&1));
        assert!(renamed.is_line_in_diff(&4));
    }

    #[test]
    fn parse_typical_diff() {
        let filter = FileFilter::new(&[], &["cpp"], None);
        let parsed = parse_diff(TYPICAL_DIFF, &filter, &LinesChangedOnly::On).unwrap();
        assert!(!parsed.is_empty());
    }

    #[test]
    fn parse_binary_diff() {
        let filter = FileFilter::new(&[], &["png"], None);
        let parsed = parse_diff(BINARY_DIFF, &filter, &LinesChangedOnly::Diff).unwrap();
        assert!(parsed.is_empty());
    }

    #[test]
    fn terse_hunk_header() {
        let filter = FileFilter::new(&[], &["cpp"], None);
        let parsed = parse_diff(TERSE_HEADERS, &filter, &LinesChangedOnly::Diff).unwrap();
        let demo = parsed.get("src/demo.cpp").unwrap();
        assert_eq!(demo.diff_hunks, vec![3..4, 5..7, 17..19]);
    }
}