Skip to main content

git_bot_feedback/
git_diff.rs

1#[cfg(feature = "pyo3")]
2use pyo3::prelude::*;
3
4use regex::Regex;
5use std::{collections::HashMap, ops::Range, path::Path};
6
7use crate::{FileDiffLines, FileFilter, LinesChangedOnly, error::DiffError};
8
/// Header information of a single diff hunk.
///
/// Holds the four numbers of a hunk header: where the hunk starts and how
/// many lines it covers, for both the old and the new side of the diff.
#[cfg_attr(
    feature = "pyo3",
    pyclass(module = "git_bot_feedback", from_py_object, get_all, frozen)
)]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct DiffHunkHeader {
    /// Line number at which the hunk starts on the old side.
    pub old_start: u32,

    /// Number of lines the hunk covers on the old side.
    pub old_lines: u32,

    /// Line number at which the hunk starts on the new side.
    pub new_start: u32,

    /// Number of lines the hunk covers on the new side.
    pub new_lines: u32,
}
25
#[cfg(feature = "pyo3")]
#[pymethods]
impl DiffHunkHeader {
    /// Build a diff hunk header from (possibly out-of-range) integers.
    ///
    /// Each argument is saturated into the `u32` range: negative values
    /// become `0` and values above `u32::MAX` become `u32::MAX`.
    #[new]
    #[pyo3(text_signature = "(old_start: int, old_lines: int, new_start: int, new_lines: int)")]
    pub fn new_py(old_start: i64, old_lines: i64, new_start: i64, new_lines: i64) -> Self {
        // The clamp guarantees the value fits, so the `as` cast cannot truncate.
        let saturate = |value: i64| -> u32 { value.clamp(0, i64::from(u32::MAX)) as u32 };
        Self {
            old_start: saturate(old_start),
            old_lines: saturate(old_lines),
            new_start: saturate(new_start),
            new_lines: saturate(new_lines),
        }
    }
}
41
42fn get_filename_from_front_matter(front_matter: &str) -> Result<Option<&str>, DiffError> {
43    let diff_file_name = Regex::new(r"(?m)^\+\+\+\sb?/(.*)$")?;
44    let diff_renamed_file = Regex::new(r"(?m)^rename to (.*)$")?;
45    let diff_binary_file = Regex::new(r"(?m)^Binary\sfiles\s")?;
46    if let Some(captures) = diff_file_name.captures(front_matter)
47        && let Some(name) = captures.get(1)
48    {
49        return Ok(Some(name.as_str()));
50    }
51    if front_matter.starts_with("similarity")
52        && let Some(captures) = diff_renamed_file.captures(front_matter)
53        && let Some(name) = captures.get(1)
54    {
55        return Ok(Some(name.as_str()));
56    }
57    if !diff_binary_file.is_match(front_matter) {
58        log::warn!("Unrecognized diff starting with:\n{}", front_matter);
59    }
60    Ok(None)
61}
62
/// A regex pattern (used in multiple functions) that matches a hunk header
/// such as `@@ -3,7 +3,7 @@` or the terse form `@@ -3 +3 @@`.
///
/// Capture group 1 is the starting line on the new side and capture group 2
/// is the (possibly empty) line count on the new side.
///
/// The pattern is anchored to the start of a line (`^` in multi-line mode) so
/// that header-like text inside a diff content line (which always begins with
/// `+`, `-` or a space) is not mistaken for a hunk header.
static HUNK_INFO_PATTERN: &str = r"(?m)^@@\s\-\d+,?\d*\s\+(\d+),?(\d*)\s@@";
65
66/// Parses a single file's patch containing one or more hunks
67///
68/// Returns a 2-item tuple:
69///
70/// - the line numbers that contain additions
71/// - the ranges of lines that span each hunk
72fn parse_patch(patch: &str) -> Result<(Vec<u32>, Vec<Range<u32>>), DiffError> {
73    let mut diff_hunks = Vec::new();
74    let mut additions = Vec::new();
75
76    let hunk_info = Regex::new(HUNK_INFO_PATTERN)?;
77    let hunk_headers = hunk_info.captures_iter(patch).collect::<Vec<_>>();
78    if !hunk_headers.is_empty() {
79        // skip the first split because it is anything that precedes first hunk header
80        let hunks = hunk_info.split(patch).skip(1);
81        for (hunk, header) in hunks.zip(hunk_headers) {
82            // header.unwrap() is safe because the hunk_headers.iter() is parallel to hunk_info.split()
83            let [start_line, end_range] = header.extract().1.map(|v| v.parse::<u32>().unwrap_or(1));
84            let mut line_numb_in_diff = start_line;
85            diff_hunks.push(start_line..start_line + end_range);
86            for (line_index, line) in hunk.split('\n').enumerate() {
87                if line.starts_with('+') {
88                    additions.push(line_numb_in_diff);
89                }
90                if line_index > 0 && !line.starts_with('-') {
91                    line_numb_in_diff += 1;
92                }
93            }
94        }
95    }
96    Ok((additions, diff_hunks))
97}
98
99/// Parses a git `diff` string into a map of file names to their corresponding
100/// [`FileDiffLines`].
101///
102/// The `file_filter` is used to filter out files that are not of interest.
103/// The `lines_changed_only` parameter determines whether to include files
104/// based on their contents' changes.
105pub fn parse_diff(
106    diff: &str,
107    file_filter: &FileFilter,
108    lines_changed_only: &LinesChangedOnly,
109) -> Result<HashMap<String, FileDiffLines>, DiffError> {
110    let mut results = HashMap::new();
111    let diff_file_delimiter = Regex::new(r"(?m)^diff \-\-git a/.*$")?;
112    let hunk_info = Regex::new(HUNK_INFO_PATTERN)?;
113
114    let file_diffs = diff_file_delimiter.split(diff);
115    for file_diff in file_diffs {
116        if file_diff.is_empty() || file_diff.starts_with("deleted file") {
117            continue;
118        }
119        let hunk_start = if let Some(first_hunk) = hunk_info.find(file_diff) {
120            first_hunk.start()
121        } else {
122            file_diff.len()
123        };
124        let front_matter = &file_diff[..hunk_start];
125        if let Some(file_name) = get_filename_from_front_matter(front_matter.trim_start())? {
126            let file_name = file_name.strip_prefix('/').unwrap_or(file_name);
127            if file_filter.is_qualified(Path::new(file_name)) {
128                let (added_lines, diff_hunks) = parse_patch(&file_diff[hunk_start..])?;
129                if lines_changed_only
130                    .is_change_valid(!added_lines.is_empty(), !diff_hunks.is_empty())
131                {
132                    results
133                        .entry(file_name.to_string())
134                        .or_insert_with(|| FileDiffLines::with_info(added_lines, diff_hunks));
135                }
136            }
137        }
138    }
139    Ok(results)
140}
141
// ******************* UNIT TESTS ***********************
#[cfg(test)]
mod test {
    #![allow(clippy::unwrap_used)]

    use super::parse_diff;
    use crate::{FileFilter, LinesChangedOnly};

    // ------------------------------ fixtures ------------------------------

    /// A rename without content changes, followed by a new binary file.
    const RENAMED_DIFF: &str = r#"diff --git a/tests/demo/some source.cpp b/tests/demo/some source.c
similarity index 100%
rename from /tests/demo/some source.cpp
rename to /tests/demo/some source.c
diff --git a/some picture.png b/some picture.png
new file mode 100644
Binary files /dev/null and b/some picture.png differ
"#;

    /// A rename that also carries a hunk.
    const RENAMED_DIFF_WITH_CHANGES: &str = r#"diff --git a/tests/demo/some source.cpp b/tests/demo/some source.c
similarity index 99%
rename from /tests/demo/some source.cpp
rename to /tests/demo/some source.c
@@ -3,7 +3,7 @@
\n \n \n-#include "math.h"
+#include <math.h>\n \n \n \n"#;

    /// A modified file with `---`/`+++` lines and one hunk.
    const TYPICAL_DIFF: &str = concat!(
        "diff --git a/path/for/Some file.cpp b/path/to/Some file.cpp\n",
        "--- a/path/for/Some file.cpp\n",
        "+++ b/path/to/Some file.cpp\n",
        "@@ -3,7 +3,7 @@\n \n \n \n",
        "-#include <some_lib/render/animation.hpp>\n",
        "+#include <some_lib/render/animations.hpp>\n \n \n \n",
    );

    /// A newly added binary file.
    const BINARY_DIFF: &str = concat!(
        "diff --git a/some picture.png b/some picture.png\n",
        "new file mode 100644\n",
        "Binary files /dev/null and b/some picture.png differ\n",
    );

    /// Hunk headers that omit line counts.
    const TERSE_HEADERS: &str = r#"diff --git a/src/demo.cpp b/src/demo.cpp
--- a/src/demo.cpp
+++ b/src/demo.cpp
@@ -3 +3 @@
-#include <stdio.h>
+#include "stdio.h"
@@ -4,0 +5,2 @@
+auto main() -> int
+{
@@ -18 +17,2 @@ int main(){
-    return 0;}
+    return 0;
+}"#;

    // ------------------------------- tests --------------------------------

    #[test]
    fn parse_renamed_diff() {
        let filter = FileFilter::new(&[], &["c"], None);
        let files = parse_diff(RENAMED_DIFF, &filter, &LinesChangedOnly::Off).unwrap();
        let renamed = files.get("tests/demo/some source.c").unwrap();
        assert!(renamed.added_lines.is_empty());
        assert!(renamed.diff_hunks.is_empty());
    }

    #[test]
    fn parse_renamed_only_diff() {
        let filter = FileFilter::new(&[], &["c"], None);
        let files = parse_diff(RENAMED_DIFF, &filter, &LinesChangedOnly::Diff).unwrap();
        assert!(files.is_empty());
    }

    #[test]
    fn parse_renamed_diff_with_patch() {
        let combined = [RENAMED_DIFF_WITH_CHANGES, TERSE_HEADERS].concat();
        // ignore src/demo.cpp file (in TERSE_HEADERS) via glob (src/*);
        // triggers code coverage of a `}` (region end)
        let filter = FileFilter::new(&["src/*"], &["c", "cpp"], None);
        let files = parse_diff(&combined, &filter, &LinesChangedOnly::On).unwrap();
        eprintln!("files: {files:#?}");
        let renamed = files.get("tests/demo/some source.c").unwrap();
        assert!(!renamed.is_line_in_diff(&1));
        assert!(renamed.is_line_in_diff(&4));
    }

    #[test]
    fn parse_typical_diff() {
        let filter = FileFilter::new(&[], &["cpp"], None);
        let files = parse_diff(TYPICAL_DIFF, &filter, &LinesChangedOnly::On).unwrap();
        assert!(!files.is_empty());
    }

    #[test]
    fn parse_binary_diff() {
        let filter = FileFilter::new(&[], &["png"], None);
        let files = parse_diff(BINARY_DIFF, &filter, &LinesChangedOnly::Diff).unwrap();
        assert!(files.is_empty());
    }

    #[test]
    fn terse_hunk_header() {
        let filter = FileFilter::new(&[], &["cpp"], None);
        let files = parse_diff(TERSE_HEADERS, &filter, &LinesChangedOnly::Diff).unwrap();
        let demo = files.get("src/demo.cpp").unwrap();
        assert_eq!(demo.diff_hunks, vec![3..4, 5..7, 17..19]);
    }
}