git_bot_feedback/
git_diff.rs

1use regex::Regex;
2use std::{collections::HashMap, ops::Range, path::PathBuf};
3
4use crate::{FileDiffLines, FileFilter, LinesChangedOnly};
5
/// The numeric fields of a unified-diff hunk header (`@@ -old_start,old_lines +new_start,new_lines @@`).
///
/// "Old" refers to the file before the change and "new" to the file after it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DiffHunkHeader {
    /// The starting line number of the old hunk.
    pub old_start: u32,
    /// The total number of lines in the old hunk.
    pub old_lines: u32,
    /// The starting line number of the new hunk.
    pub new_start: u32,
    /// The total number of lines in the new hunk.
    pub new_lines: u32,
}
17
18fn get_filename_from_front_matter(front_matter: &str) -> Option<&str> {
19    let diff_file_name = Regex::new(r"(?m)^\+\+\+\sb?/(.*)$").unwrap();
20    let diff_renamed_file = Regex::new(r"(?m)^rename to (.*)$").unwrap();
21    let diff_binary_file = Regex::new(r"(?m)^Binary\sfiles\s").unwrap();
22    if let Some(captures) = diff_file_name.captures(front_matter) {
23        return Some(captures.get(1).unwrap().as_str());
24    }
25    if front_matter.starts_with("similarity")
26        && let Some(captures) = diff_renamed_file.captures(front_matter)
27    {
28        return Some(captures.get(1).unwrap().as_str());
29    }
30    if !diff_binary_file.is_match(front_matter) {
31        log::warn!("Unrecognized diff starting with:\n{}", front_matter);
32    }
33    None
34}
35
/// A regex pattern (used in multiple functions) that matches a unified-diff hunk
/// header, e.g. `@@ -3,7 +3,7 @@` or the terse form `@@ -3 +3 @@`.
///
/// Capture groups:
///
/// 1. the starting line number of the new hunk
/// 2. the number of lines in the new hunk (empty when the header omits it)
///
/// The pattern is anchored to the start of a line. Every content line in a hunk is
/// prefixed with ` `, `+` or `-`, so anchoring prevents file content that merely
/// contains hunk-header-like text from being mistaken for a real header.
static HUNK_INFO_PATTERN: &str = r"(?m)^@@\s\-\d+,?\d*\s\+(\d+),?(\d*)\s@@";
38
39/// Parses a single file's patch containing one or more hunks
40///
41/// Returns a 2-item tuple:
42///
43/// - the line numbers that contain additions
44/// - the ranges of lines that span each hunk
45fn parse_patch(patch: &str) -> (Vec<u32>, Vec<Range<u32>>) {
46    let mut diff_hunks = Vec::new();
47    let mut additions = Vec::new();
48
49    let hunk_info = Regex::new(HUNK_INFO_PATTERN).unwrap();
50    let hunk_headers = hunk_info.captures_iter(patch).collect::<Vec<_>>();
51    if !hunk_headers.is_empty() {
52        // skip the first split because it is anything that precedes first hunk header
53        let hunks = hunk_info.split(patch).skip(1);
54        for (hunk, header) in hunks.zip(hunk_headers) {
55            // header.unwrap() is safe because the hunk_headers.iter() is parallel to hunk_info.split()
56            let [start_line, end_range] = header.extract().1.map(|v| v.parse::<u32>().unwrap_or(1));
57            let mut line_numb_in_diff = start_line;
58            diff_hunks.push(start_line..start_line + end_range);
59            for (line_index, line) in hunk.split('\n').enumerate() {
60                if line.starts_with('+') {
61                    additions.push(line_numb_in_diff);
62                }
63                if line_index > 0 && !line.starts_with('-') {
64                    line_numb_in_diff += 1;
65                }
66            }
67        }
68    }
69    (additions, diff_hunks)
70}
71
72/// Parses a git `diff` string into a map of file names to their corresponding
73/// [`FileDiffLines`].
74///
75/// The `file_filter` is used to filter out files that are not of interest.
76/// The `lines_changed_only` parameter determines whether to include files
77/// based on their contents' changes.
78pub fn parse_diff(
79    diff: &str,
80    file_filter: &FileFilter,
81    lines_changed_only: &LinesChangedOnly,
82) -> HashMap<String, FileDiffLines> {
83    let mut results = HashMap::new();
84    let diff_file_delimiter = Regex::new(r"(?m)^diff --git a/.*$").unwrap();
85    let hunk_info = Regex::new(HUNK_INFO_PATTERN).unwrap();
86
87    let file_diffs = diff_file_delimiter.split(diff);
88    for file_diff in file_diffs {
89        if file_diff.is_empty() || file_diff.starts_with("deleted file") {
90            continue;
91        }
92        let hunk_start = if let Some(first_hunk) = hunk_info.find(file_diff) {
93            first_hunk.start()
94        } else {
95            file_diff.len()
96        };
97        let front_matter = &file_diff[..hunk_start];
98        if let Some(file_name) = get_filename_from_front_matter(front_matter.trim_start()) {
99            let file_name = file_name.strip_prefix('/').unwrap_or(file_name);
100            let file_path = PathBuf::from(file_name);
101            if file_filter.is_not_ignored(&file_path) {
102                let (added_lines, diff_hunks) = parse_patch(&file_diff[hunk_start..]);
103                if lines_changed_only
104                    .is_change_valid(!added_lines.is_empty(), !diff_hunks.is_empty())
105                {
106                    results
107                        .entry(file_name.to_string())
108                        .or_insert_with(|| FileDiffLines::with_info(added_lines, diff_hunks));
109                }
110            }
111        }
112    }
113    results
114}
115
116// ******************* UNIT TESTS ***********************
#[cfg(test)]
mod test {
    use super::parse_diff;
    use crate::{FileFilter, LinesChangedOnly};

    /// A rename without content changes, followed by a newly added binary file.
    const RENAMED_DIFF: &str = r#"diff --git a/tests/demo/some source.cpp b/tests/demo/some source.c
similarity index 100%
rename from /tests/demo/some source.cpp
rename to /tests/demo/some source.c
diff --git a/some picture.png b/some picture.png
new file mode 100644
Binary files /dev/null and b/some picture.png differ
"#;

    /// With `LinesChangedOnly::Off`, a pure rename is reported under its new
    /// name with no added lines and no hunks.
    #[test]
    fn parse_renamed_diff() {
        let files = parse_diff(
            RENAMED_DIFF,
            &FileFilter::new(&[], &["c"], None),
            &LinesChangedOnly::Off,
        );
        let git_file = files.get("tests/demo/some source.c").unwrap();
        assert!(git_file.added_lines.is_empty());
        assert!(git_file.diff_hunks.is_empty());
    }

    /// With `LinesChangedOnly::Diff`, a pure rename (no hunks) is not reported.
    #[test]
    fn parse_renamed_only_diff() {
        let files = parse_diff(
            RENAMED_DIFF,
            &FileFilter::new(&[], &["c"], None),
            &LinesChangedOnly::Diff,
        );
        assert!(files.is_empty());
    }

    /// A rename that also carries a hunk.
    ///
    /// NOTE: this is a raw string, so the `\n` sequences inside the hunk are
    /// literal backslash-n characters rather than line breaks.
    const RENAMED_DIFF_WITH_CHANGES: &str = r#"diff --git a/tests/demo/some source.cpp b/tests/demo/some source.c
similarity index 99%
rename from /tests/demo/some source.cpp
rename to /tests/demo/some source.c
@@ -3,7 +3,7 @@
\n \n \n-#include "math.h"
+#include <math.h>\n \n \n \n"#;

    /// A renamed file with a hunk is reported under its new name, and the
    /// hunk's line range is reflected by `is_line_in_diff()`.
    #[test]
    fn parse_renamed_diff_with_patch() {
        let files = parse_diff(
            &String::from_iter([RENAMED_DIFF_WITH_CHANGES, TERSE_HEADERS]),
            // ignore src/demo.cpp file (in TERSE_HEADERS) via glob (src/*);
            // triggers code coverage of a `}` (region end)
            &FileFilter::new(&["src/*"], &["c", "cpp"], None),
            &LinesChangedOnly::On,
        );
        eprintln!("files: {files:#?}");
        let git_file = files.get("tests/demo/some source.c").unwrap();
        assert!(!git_file.is_line_in_diff(&1));
        assert!(git_file.is_line_in_diff(&4));
    }

    /// A conventional single-hunk diff with `---`/`+++` lines.
    const TYPICAL_DIFF: &str = "diff --git a/path/for/Some file.cpp b/path/to/Some file.cpp\n\
                            --- a/path/for/Some file.cpp\n\
                            +++ b/path/to/Some file.cpp\n\
                            @@ -3,7 +3,7 @@\n \n \n \n\
                            -#include <some_lib/render/animation.hpp>\n\
                            +#include <some_lib/render/animations.hpp>\n \n \n \n";

    /// A typical diff with an added line is reported with `LinesChangedOnly::On`.
    #[test]
    fn parse_typical_diff() {
        let files = parse_diff(
            TYPICAL_DIFF,
            &FileFilter::new(&[], &["cpp"], None),
            &LinesChangedOnly::On,
        );
        assert!(!files.is_empty());
    }

    /// A newly added binary file (no `+++` line and no hunks).
    const BINARY_DIFF: &str = "diff --git a/some picture.png b/some picture.png\n\
                new file mode 100644\n\
                Binary files /dev/null and b/some picture.png differ\n";

    /// Binary files are not reported, even when their extension is accepted
    /// by the file filter.
    #[test]
    fn parse_binary_diff() {
        let files = parse_diff(
            BINARY_DIFF,
            &FileFilter::new(&[], &["png"], None),
            &LinesChangedOnly::Diff,
        );
        assert!(files.is_empty());
    }

    /// A diff whose hunk headers omit the line count (and use zero context).
    const TERSE_HEADERS: &str = r#"diff --git a/src/demo.cpp b/src/demo.cpp
--- a/src/demo.cpp
+++ b/src/demo.cpp
@@ -3 +3 @@
-#include <stdio.h>
+#include "stdio.h"
@@ -4,0 +5,2 @@
+auto main() -> int
+{
@@ -18 +17,2 @@ int main(){
-    return 0;}
+    return 0;
+}"#;

    /// A hunk header without an explicit line count spans exactly one line.
    #[test]
    fn terse_hunk_header() {
        let file_filter = FileFilter::new(&[], &["cpp"], None);
        let files = parse_diff(TERSE_HEADERS, &file_filter, &LinesChangedOnly::Diff);
        let file_diff = files.get("src/demo.cpp").unwrap();
        assert_eq!(file_diff.diff_hunks, vec![3..4, 5..7, 17..19]);
    }
}