cpp_linter/
git.rs

//! This module is primarily used to parse diff blobs.
//!
//! It can also be used (locally) to get a list of file changes from either the last
//! commit or the next commit's staging area.
//!
//! This also includes a private module that is used as a fallback (brute force)
//! mechanism when parsing a diff using libgit2 fails. NOTE: parsing a diff from a buffer
//! (str or bytes) only happens in CI or when libgit2 cannot be used to initialize a
//! repository.
10
11use std::{ops::RangeInclusive, path::PathBuf};
12
13use anyhow::{Context, Result};
14// non-std crates
15use git2::{Diff, Error, Patch, Repository};
16
17// project specific modules/crates
18use crate::common_fs::{FileFilter, FileObj};
19
20/// This (re-)initializes the repository located in the specified `path`.
21///
22/// This is actually not used in CI for file permissions and ownership reasons.
23/// Rather this is only (supposed to be) used when executed on a local developer
24/// machine.
25pub fn open_repo(path: &str) -> Result<Repository, Error> {
26    Repository::open(PathBuf::from(path).as_path())
27}
28
29/// Fetches the SHA1 of the commit for the specified [`git2::Repository`].
30///
31/// The optionally specified `depth` can be used to traverse the tree a number of times
32/// since the current `"HEAD"`.
33fn get_sha(repo: &Repository, depth: Option<u32>) -> Result<git2::Object<'_>, Error> {
34    match depth {
35        Some(int) => repo.revparse_single(format!("HEAD~{}", int).as_str()),
36        None => repo.revparse_single("HEAD"),
37    }
38}
39
40/// Fetch the [`git2::Diff`] about a given [`git2::Repository`].
41///
42/// This is actually not used in CI for file permissions and ownership reasons.
43/// Rather this is only (supposed to be) used when executed on a local developer
44/// machine.
45///
46/// If there are files staged for a commit, then the resulting [`Diff`] will describe
47/// the staged changes. However, if there are no staged changes, then the last commit's
48/// [`Diff`] is returned.
49pub fn get_diff(repo: &Repository) -> Result<git2::Diff> {
50    let head = get_sha(repo, None).unwrap().peel_to_tree().unwrap();
51    let mut has_staged_files = false;
52    for entry in repo.statuses(None).unwrap().iter() {
53        if entry.status().bits()
54            & (git2::Status::INDEX_NEW.bits()
55                | git2::Status::INDEX_MODIFIED.bits()
56                | git2::Status::INDEX_RENAMED.bits())
57            > 0
58        {
59            has_staged_files = true;
60            break;
61        }
62    }
63
64    if has_staged_files {
65        // get diff for staged files only
66        repo.diff_tree_to_index(Some(&head), None, None)
67            .with_context(|| "Could not get diff for current changes in local repo index")
68    } else {
69        // get diff for last commit only
70        let base = get_sha(repo, Some(1)).unwrap().peel_to_tree().unwrap();
71        repo.diff_tree_to_tree(Some(&base), Some(&head), None)
72            .with_context(|| "Could not get diff for last commit")
73    }
74}
75
76/// Parses a patch for a single file in a diff.
77///
78/// Returns the list of line numbers that have additions and the ranges spanning each
79/// chunk present in the `patch`.
80fn parse_patch(patch: &Patch) -> (Vec<u32>, Vec<RangeInclusive<u32>>) {
81    let mut additions = Vec::new();
82    let mut diff_hunks = Vec::new();
83    for hunk_idx in 0..patch.num_hunks() {
84        let (hunk, line_count) = patch.hunk(hunk_idx).unwrap();
85        diff_hunks.push(RangeInclusive::new(
86            hunk.new_start(),
87            hunk.new_start() + hunk.new_lines(),
88        ));
89        for line in 0..line_count {
90            let diff_line = patch.line_in_hunk(hunk_idx, line).unwrap();
91            if diff_line.origin_value() == git2::DiffLineType::Addition {
92                additions.push(diff_line.new_lineno().unwrap());
93            }
94        }
95    }
96    (additions, diff_hunks)
97}
98
99/// Parses a given [`git2::Diff`] and returns a list of [`FileObj`]s.
100///
101/// The specified list of `extensions`, `ignored` and `not_ignored` files are used as
102/// filters to expedite the process and only focus on the data cpp_linter can use.
103pub fn parse_diff(diff: &git2::Diff, file_filter: &FileFilter) -> Vec<FileObj> {
104    let mut files: Vec<FileObj> = Vec::new();
105    for file_idx in 0..diff.deltas().count() {
106        let diff_delta = diff.get_delta(file_idx).unwrap();
107        let file_path = diff_delta.new_file().path().unwrap().to_path_buf();
108        if [
109            git2::Delta::Added,
110            git2::Delta::Modified,
111            git2::Delta::Renamed,
112        ]
113        .contains(&diff_delta.status())
114            && file_filter.is_source_or_ignored(&file_path)
115        {
116            let (added_lines, diff_chunks) =
117                parse_patch(&Patch::from_diff(diff, file_idx).unwrap().unwrap());
118            files.push(FileObj::from(file_path, added_lines, diff_chunks));
119        }
120    }
121    files
122}
123
124/// Same as [`parse_diff`] but takes a buffer of bytes instead of a [`git2::Diff`].
125///
126/// In the case that libgit2 fails to parse the buffer of bytes, a private algorithm is
127/// used. In such a case, brute force parsing the diff as a string can be costly. So, a
128/// log warning and error are output when this occurs. Please report this instance for
129/// troubleshooting/diagnosis as this likely means the diff is malformed or there is a
130/// bug in libgit2 source.
131pub fn parse_diff_from_buf(buff: &[u8], file_filter: &FileFilter) -> Vec<FileObj> {
132    if let Ok(diff_obj) = &Diff::from_buffer(buff) {
133        parse_diff(diff_obj, file_filter)
134    } else {
135        log::warn!("libgit2 failed to parse the diff");
136        brute_force_parse_diff::parse_diff(&String::from_utf8_lossy(buff), file_filter)
137    }
138}
139
140mod brute_force_parse_diff {
141    //! A private module to house the brute force algorithms of parsing a diff as a string.
142    //! This module is only intended as a fall back mechanism when [super::parse_diff_from_buf]
143    //! fails to use libgit2 C bindings.
144    //!
145    //! Since this is a fail safe, there are log messages that indicate when it is used.
146    //! Any instance where this mechanism is used should be reported as it is likely a bug
147    //! in libgit2 source.
148
149    use regex::Regex;
150    use std::{ops::RangeInclusive, path::PathBuf};
151
152    use crate::common_fs::{FileFilter, FileObj};
153
154    fn get_filename_from_front_matter(front_matter: &str) -> Option<&str> {
155        let diff_file_name = Regex::new(r"(?m)^\+\+\+\sb?/(.*)$").unwrap();
156        let diff_renamed_file = Regex::new(r"(?m)^rename to (.*)$").unwrap();
157        let diff_binary_file = Regex::new(r"(?m)^Binary\sfiles\s").unwrap();
158        if let Some(captures) = diff_file_name.captures(front_matter) {
159            return Some(captures.get(1).unwrap().as_str());
160        }
161        if front_matter.trim_start().starts_with("similarity") {
162            if let Some(captures) = diff_renamed_file.captures(front_matter) {
163                return Some(captures.get(1).unwrap().as_str());
164            }
165        }
166        if !diff_binary_file.is_match(front_matter) {
167            log::warn!("Unrecognized diff starting with:\n{}", front_matter);
168        }
169        None
170    }
171
172    /// A regex pattern used in multiple functions
173    static HUNK_INFO_PATTERN: &str = r"(?m)@@\s\-\d+,\d+\s\+(\d+,\d+)\s@@";
174
175    /// Parses a single file's patch containing one or more hunks
176    /// Returns a 3-item tuple:
177    /// - the line numbers that contain additions
178    /// - the ranges of lines that span each hunk
179    fn parse_patch(patch: &str) -> (Vec<u32>, Vec<RangeInclusive<u32>>) {
180        let mut diff_chunks = Vec::new();
181        let mut additions = Vec::new();
182
183        let hunk_info = Regex::new(HUNK_INFO_PATTERN).unwrap();
184        if let Some(hunk_headers) = hunk_info.captures(patch) {
185            for (index, (hunk, header)) in
186                hunk_info.split(patch).zip(hunk_headers.iter()).enumerate()
187            {
188                if index == 0 {
189                    continue; // we don't need the whole match, just the capture groups
190                }
191                let new_range: Vec<u32> = header
192                    .unwrap()
193                    .as_str()
194                    .split(',')
195                    .take(2)
196                    .map(|val| val.parse::<u32>().unwrap())
197                    .collect();
198                let start_line = new_range[0];
199                let end_range = new_range[1];
200                let mut line_numb_in_diff = start_line;
201                diff_chunks.push(RangeInclusive::new(start_line, start_line + end_range));
202                for (line_index, line) in hunk.split('\n').enumerate() {
203                    if line.starts_with('+') {
204                        additions.push(line_numb_in_diff);
205                    }
206                    if line_index > 0 && !line.starts_with('-') {
207                        line_numb_in_diff += 1;
208                    }
209                }
210            }
211        }
212        (additions, diff_chunks)
213    }
214
215    pub fn parse_diff(diff: &str, file_filter: &FileFilter) -> Vec<FileObj> {
216        log::error!("Using brute force diff parsing!");
217        let mut results = Vec::new();
218        let diff_file_delimiter = Regex::new(r"(?m)^diff --git a/.*$").unwrap();
219        let hunk_info = Regex::new(HUNK_INFO_PATTERN).unwrap();
220
221        let file_diffs = diff_file_delimiter.split(diff);
222        for file_diff in file_diffs {
223            if file_diff.is_empty() || file_diff.starts_with("deleted file") {
224                continue;
225            }
226            let hunk_start = if let Some(first_hunk) = hunk_info.find(file_diff) {
227                first_hunk.start()
228            } else {
229                file_diff.len()
230            };
231            let front_matter = &file_diff[..hunk_start];
232            if let Some(file_name) = get_filename_from_front_matter(front_matter) {
233                let file_path = PathBuf::from(file_name);
234                if file_filter.is_source_or_ignored(&file_path) {
235                    let (added_lines, diff_chunks) = parse_patch(&file_diff[hunk_start..]);
236                    results.push(FileObj::from(file_path, added_lines, diff_chunks));
237                }
238            }
239            // } else {
240            //     // file has no changed content. moving on
241            //     continue;
242            // }
243        }
244        results
245    }
246
247    // ******************* UNIT TESTS ***********************
248    #[cfg(test)]
249    mod test {
250
251        use super::parse_diff;
252        use crate::{
253            common_fs::{FileFilter, FileObj},
254            git::parse_diff_from_buf,
255        };
256
257        static RENAMED_DIFF: &str = r#"diff --git a/tests/demo/some source.cpp b/tests/demo/some source.c
258similarity index 100%
259rename from /tests/demo/some source.cpp
260rename to /tests/demo/some source.c
261diff --git a/some picture.png b/some picture.png
262new file mode 100644
263Binary files /dev/null and b/some picture.png differ
264"#;
265
266        static RENAMED_DIFF_WITH_CHANGES: &str = r#"diff --git a/tests/demo/some source.cpp b/tests/demo/some source.c
267similarity index 99%
268rename from /tests/demo/some source.cpp
269rename to /tests/demo/some source.c
270@@ -3,7 +3,7 @@
271\n \n \n-#include "iomanip"
272+#include <cstdlib>\n \n \n \n"#;
273
274        #[test]
275        fn parse_renamed_diff() {
276            let diff_buf = RENAMED_DIFF.as_bytes();
277            let files = parse_diff_from_buf(
278                diff_buf,
279                &FileFilter::new(&["target".to_string()], vec!["c".to_string()]),
280            );
281            assert!(!files.is_empty());
282            assert!(files
283                .first()
284                .unwrap()
285                .name
286                .ends_with("tests/demo/some source.c"));
287        }
288
289        #[test]
290        fn parse_renamed_diff_with_patch() {
291            let diff_buf = RENAMED_DIFF_WITH_CHANGES.as_bytes();
292            let files = parse_diff_from_buf(
293                diff_buf,
294                &FileFilter::new(&["target".to_string()], vec!["c".to_string()]),
295            );
296            assert!(!files.is_empty());
297        }
298
299        /// Used to parse the same string buffer using both libgit2 and brute force regex.
300        /// Returns 2 vectors of [FileObj] that should be equivalent.
301        fn setup_parsed(buf: &str, extensions: &[String]) -> (Vec<FileObj>, Vec<FileObj>) {
302            let ignore = ["target".to_string()];
303            (
304                parse_diff_from_buf(
305                    buf.as_bytes(),
306                    &FileFilter::new(&ignore, extensions.to_owned()),
307                ),
308                parse_diff(buf, &FileFilter::new(&ignore, extensions.to_owned())),
309            )
310        }
311
312        fn assert_files_eq(files_from_a: &[FileObj], files_from_b: &[FileObj]) {
313            assert_eq!(files_from_a.len(), files_from_b.len());
314            for (a, b) in files_from_a.iter().zip(files_from_b) {
315                assert_eq!(a.name, b.name);
316                assert_eq!(a.added_lines, b.added_lines);
317                assert_eq!(a.added_ranges, b.added_ranges);
318                assert_eq!(a.diff_chunks, b.diff_chunks);
319            }
320        }
321
322        #[test]
323        fn parse_typical_diff() {
324            let diff_buf = "diff --git a/path/for/Some file.cpp b/path/to/Some file.cpp\n\
325                            --- a/path/for/Some file.cpp\n\
326                            +++ b/path/to/Some file.cpp\n\
327                            @@ -3,7 +3,7 @@\n \n \n \n\
328                            -#include <some_lib/render/animation.hpp>\n\
329                            +#include <some_lib/render/animations.hpp>\n \n \n \n";
330
331            let (files_from_buf, files_from_str) = setup_parsed(diff_buf, &[String::from("cpp")]);
332            assert!(!files_from_buf.is_empty());
333            assert_files_eq(&files_from_buf, &files_from_str);
334        }
335
336        #[test]
337        fn parse_binary_diff() {
338            let diff_buf = "diff --git a/some picture.png b/some picture.png\n\
339                new file mode 100644\n\
340                Binary files /dev/null and b/some picture.png differ\n";
341
342            let (files_from_buf, files_from_str) = setup_parsed(diff_buf, &[String::from("png")]);
343            assert!(files_from_buf.is_empty());
344            assert_files_eq(&files_from_buf, &files_from_str);
345        }
346    }
347}
348
#[cfg(test)]
mod test {
    use std::{
        env::{self, current_dir, set_current_dir},
        fs::read,
    };

    use git2::build::CheckoutBuilder;
    use git2::{ApplyLocation, Diff, IndexAddOption, Repository};
    use tempfile::{tempdir, TempDir};

    use crate::{
        common_fs::{FileFilter, FileObj},
        rest_api::{github::GithubApiClient, RestApiClient},
    };

    /// Sets up a testing stage: clones `url` into `path`, force-checks out `sha` as a
    /// detached HEAD, and (optionally) applies the patch file at `patch_path` before
    /// staging any file that matches `tests/demo/demo.*`.
    fn clone_repo(url: &str, sha: &str, path: &str, patch_path: Option<&str>) {
        let repo = Repository::clone(url, path).unwrap();
        let target = repo.revparse_single(sha).unwrap();
        let mut checkout_opts = CheckoutBuilder::new();
        checkout_opts.force().recreate_missing(true);
        repo.checkout_tree(&target, Some(&mut checkout_opts))
            .unwrap();
        repo.set_head_detached(target.id()).unwrap();

        let patch_file = match patch_path {
            Some(patch_file) => patch_file,
            None => return,
        };
        let patch_bytes = read(patch_file).unwrap();
        let diff = Diff::from_buffer(&patch_bytes).unwrap();
        repo.apply(&diff, ApplyLocation::Both, None).unwrap();
        let mut index = repo.index().unwrap();
        index
            .add_all(["tests/demo/demo.*"], IndexAddOption::DEFAULT, None)
            .unwrap();
        index.write().unwrap();
    }

    /// Creates a temporary folder and prints its location.
    fn get_temp_dir() -> TempDir {
        let tmp = tempdir().unwrap();
        println!("Using temp folder at {:?}", tmp.path());
        tmp
    }

    /// Clones the cpp-linter repo into `tmp` at the given `sha` (see [`clone_repo`]),
    /// changes the working directory to `tmp`, and returns the list of changed files
    /// reported by the REST API client for the given `extensions`.
    async fn checkout_cpp_linter_py_repo(
        sha: &str,
        extensions: &[String],
        tmp: &TempDir,
        patch_path: Option<&str>,
    ) -> Vec<FileObj> {
        let url = "https://github.com/cpp-linter/cpp-linter";
        let clone_destination = tmp.path().to_str().unwrap();
        clone_repo(url, sha, clone_destination, patch_path);
        let rest_api_client = GithubApiClient::new();
        let file_filter = FileFilter::new(&["target".to_string()], extensions.to_owned());
        set_current_dir(tmp).unwrap();
        env::set_var("CI", "false"); // avoid use of REST API when testing in CI
        let client = rest_api_client.unwrap();
        client
            .get_list_of_changed_files(&file_filter)
            .await
            .unwrap()
    }

    /// Asserts that every file in `files` has an extension listed in `extensions`.
    fn assert_extensions_allowed(files: Vec<FileObj>, extensions: &[String]) {
        for file in files {
            assert!(
                extensions.contains(&file.name.extension().unwrap().to_string_lossy().to_string())
            );
        }
    }

    #[tokio::test]
    async fn with_no_changed_sources() {
        // commit with no modified C/C++ sources
        let sha = "0c236809891000b16952576dc34de082d7a40bf3";
        let original_dir = current_dir().unwrap();
        let tmp = get_temp_dir();
        let extensions = vec!["cpp".to_string(), "hpp".to_string()];
        let files = checkout_cpp_linter_py_repo(sha, &extensions, &tmp, None).await;
        println!("files = {:?}", files);
        assert!(files.is_empty());
        set_current_dir(original_dir).unwrap(); // prep to delete temp_folder
        drop(tmp); // delete temp_folder
    }

    #[tokio::test]
    async fn with_changed_sources() {
        // commit with modified C/C++ sources
        let sha = "950ff0b690e1903797c303c5fc8d9f3b52f1d3c5";
        let original_dir = current_dir().unwrap();
        let tmp = get_temp_dir();
        let extensions = vec!["cpp".to_string(), "hpp".to_string()];
        let files = checkout_cpp_linter_py_repo(sha, &extensions, &tmp, None).await;
        println!("files = {:?}", files);
        assert!(files.len() >= 2);
        assert_extensions_allowed(files, &extensions);
        set_current_dir(original_dir).unwrap(); // prep to delete temp_folder
        drop(tmp); // delete temp_folder
    }

    #[tokio::test]
    async fn with_staged_changed_sources() {
        // commit with no modified C/C++ sources
        let sha = "0c236809891000b16952576dc34de082d7a40bf3";
        let original_dir = current_dir().unwrap();
        let tmp = get_temp_dir();
        let extensions = vec!["cpp".to_string(), "hpp".to_string()];
        let patch = "tests/git_status_test_assets/cpp-linter/cpp-linter/test_git_lib.patch";
        let files = checkout_cpp_linter_py_repo(sha, &extensions, &tmp, Some(patch)).await;
        println!("files = {:?}", files);
        assert!(!files.is_empty());
        assert_extensions_allowed(files, &extensions);
        set_current_dir(original_dir).unwrap(); // prep to delete temp_folder
        drop(tmp); // delete temp_folder
    }
}