cpp_linter/
git.rs

//! This module is primarily used to parse diff blobs.
//!
//! It can also be used (locally) to get a list of changed files from either the last
//! commit or the next commit's staging area.
//!
//! This also includes a private module that is used as a fallback (brute force)
//! mechanism when parsing a diff with libgit2 fails. NOTE: parsing a diff from a buffer
//! (str or bytes) only happens in CI or when libgit2 cannot be used to initialize a
//! repository.
10
11use std::{ops::RangeInclusive, path::PathBuf};
12
13use anyhow::{Context, Result};
14// non-std crates
15use git2::{Diff, Error, Patch, Repository};
16
17// project specific modules/crates
18use crate::{
19    cli::LinesChangedOnly,
20    common_fs::{FileFilter, FileObj},
21};
22
23/// This (re-)initializes the repository located in the specified `path`.
24///
25/// This is actually not used in CI for file permissions and ownership reasons.
26/// Rather this is only (supposed to be) used when executed on a local developer
27/// machine.
28pub fn open_repo(path: &str) -> Result<Repository, Error> {
29    Repository::open(PathBuf::from(path).as_path())
30}
31
32/// Fetches the SHA1 of the commit for the specified [`git2::Repository`].
33///
34/// The optionally specified `depth` can be used to traverse the tree a number of times
35/// since the current `"HEAD"`.
36fn get_sha(repo: &Repository, depth: Option<u32>) -> Result<git2::Object<'_>, Error> {
37    match depth {
38        Some(int) => repo.revparse_single(format!("HEAD~{}", int).as_str()),
39        None => repo.revparse_single("HEAD"),
40    }
41}
42
43/// Fetch the [`git2::Diff`] about a given [`git2::Repository`].
44///
45/// This is actually not used in CI for file permissions and ownership reasons.
46/// Rather this is only (supposed to be) used when executed on a local developer
47/// machine.
48///
49/// If there are files staged for a commit, then the resulting [`Diff`] will describe
50/// the staged changes. However, if there are no staged changes, then the last commit's
51/// [`Diff`] is returned.
52pub fn get_diff(repo: &'_ Repository) -> Result<git2::Diff<'_>> {
53    let head = get_sha(repo, None).unwrap().peel_to_tree().unwrap();
54    let mut has_staged_files = false;
55    for entry in repo.statuses(None).unwrap().iter() {
56        if entry.status().bits()
57            & (git2::Status::INDEX_NEW.bits()
58                | git2::Status::INDEX_MODIFIED.bits()
59                | git2::Status::INDEX_RENAMED.bits())
60            > 0
61        {
62            has_staged_files = true;
63            break;
64        }
65    }
66
67    // RARE BUG when `head` is the first commit in the repo! Affects local-only runs.
68    // > panicked at cpp-linter\src\git.rs:73:43:
69    // > called `Result::unwrap()` on an `Err` value:
70    // > Error { code: -3, class: 3, message: "parent 0 does not exist" }
71    if has_staged_files {
72        // get diff for staged files only
73        repo.diff_tree_to_index(Some(&head), None, None)
74            .with_context(|| "Could not get diff for current changes in local repo index")
75    } else {
76        // get diff for last commit only
77        let base = get_sha(repo, Some(1)).unwrap().peel_to_tree().unwrap();
78        repo.diff_tree_to_tree(Some(&base), Some(&head), None)
79            .with_context(|| "Could not get diff for last commit")
80    }
81}
82
83/// Parses a patch for a single file in a diff.
84///
85/// Returns the list of line numbers that have additions and the ranges spanning each
86/// chunk present in the `patch`.
87fn parse_patch(patch: &Patch) -> (Vec<u32>, Vec<RangeInclusive<u32>>) {
88    let mut additions = Vec::new();
89    let mut diff_hunks = Vec::new();
90    for hunk_idx in 0..patch.num_hunks() {
91        let (hunk, line_count) = patch.hunk(hunk_idx).unwrap();
92        diff_hunks.push(RangeInclusive::new(
93            hunk.new_start(),
94            hunk.new_start() + hunk.new_lines(),
95        ));
96        for line in 0..line_count {
97            let diff_line = patch.line_in_hunk(hunk_idx, line).unwrap();
98            if diff_line.origin_value() == git2::DiffLineType::Addition {
99                additions.push(diff_line.new_lineno().unwrap());
100            }
101        }
102    }
103    (additions, diff_hunks)
104}
105
106/// Parses a given [`git2::Diff`] and returns a list of [`FileObj`]s.
107///
108/// The specified list of `extensions`, `ignored` and `not_ignored` files are used as
109/// filters to expedite the process and only focus on the data cpp_linter can use.
110pub fn parse_diff(
111    diff: &git2::Diff,
112    file_filter: &FileFilter,
113    lines_changed_only: &LinesChangedOnly,
114) -> Vec<FileObj> {
115    let mut files: Vec<FileObj> = Vec::new();
116    for file_idx in 0..diff.deltas().count() {
117        let diff_delta = diff.get_delta(file_idx).unwrap();
118        let file_path = diff_delta.new_file().path().unwrap().to_path_buf();
119        if matches!(
120            diff_delta.status(),
121            git2::Delta::Added | git2::Delta::Modified | git2::Delta::Renamed,
122        ) && file_filter.is_source_or_ignored(&file_path)
123        {
124            let (added_lines, diff_chunks) =
125                parse_patch(&Patch::from_diff(diff, file_idx).unwrap().unwrap());
126            if lines_changed_only.is_change_valid(!added_lines.is_empty(), !diff_chunks.is_empty())
127            {
128                files.push(FileObj::from(file_path, added_lines, diff_chunks));
129            }
130        }
131    }
132    files
133}
134
135/// Same as [`parse_diff`] but takes a buffer of bytes instead of a [`git2::Diff`].
136///
137/// In the case that libgit2 fails to parse the buffer of bytes, a private algorithm is
138/// used. In such a case, brute force parsing the diff as a string can be costly. So, a
139/// log warning and error are output when this occurs. Please report this instance for
140/// troubleshooting/diagnosis as this likely means the diff is malformed or there is a
141/// bug in libgit2 source.
142pub fn parse_diff_from_buf(
143    buff: &[u8],
144    file_filter: &FileFilter,
145    lines_changed_only: &LinesChangedOnly,
146) -> Vec<FileObj> {
147    if let Ok(diff_obj) = &Diff::from_buffer(buff) {
148        parse_diff(diff_obj, file_filter, lines_changed_only)
149    } else {
150        log::warn!("libgit2 failed to parse the diff");
151        brute_force_parse_diff::parse_diff(
152            &String::from_utf8_lossy(buff),
153            file_filter,
154            lines_changed_only,
155        )
156    }
157}
158
159mod brute_force_parse_diff {
160    //! A private module to house the brute force algorithms of parsing a diff as a string.
161    //! This module is only intended as a fall back mechanism when [super::parse_diff_from_buf]
162    //! fails to use libgit2 C bindings.
163    //!
164    //! Since this is a fail safe, there are log messages that indicate when it is used.
165    //! Any instance where this mechanism is used should be reported as it is likely a bug
166    //! in libgit2 source.
167
168    use regex::Regex;
169    use std::{ops::RangeInclusive, path::PathBuf};
170
171    use crate::{
172        cli::LinesChangedOnly,
173        common_fs::{FileFilter, FileObj},
174    };
175
176    fn get_filename_from_front_matter(front_matter: &str) -> Option<&str> {
177        let diff_file_name = Regex::new(r"(?m)^\+\+\+\sb?/(.*)$").unwrap();
178        let diff_renamed_file = Regex::new(r"(?m)^rename to (.*)$").unwrap();
179        let diff_binary_file = Regex::new(r"(?m)^Binary\sfiles\s").unwrap();
180        if let Some(captures) = diff_file_name.captures(front_matter) {
181            return Some(captures.get(1).unwrap().as_str());
182        }
183        if front_matter.trim_start().starts_with("similarity") {
184            if let Some(captures) = diff_renamed_file.captures(front_matter) {
185                return Some(captures.get(1).unwrap().as_str());
186            }
187        }
188        if !diff_binary_file.is_match(front_matter) {
189            log::warn!("Unrecognized diff starting with:\n{}", front_matter);
190        }
191        None
192    }
193
194    /// A regex pattern used in multiple functions
195    static HUNK_INFO_PATTERN: &str = r"(?m)@@\s\-\d+,\d+\s\+(\d+,\d+)\s@@";
196
197    /// Parses a single file's patch containing one or more hunks
198    /// Returns a 3-item tuple:
199    /// - the line numbers that contain additions
200    /// - the ranges of lines that span each hunk
201    fn parse_patch(patch: &str) -> (Vec<u32>, Vec<RangeInclusive<u32>>) {
202        let mut diff_chunks = Vec::new();
203        let mut additions = Vec::new();
204
205        let hunk_info = Regex::new(HUNK_INFO_PATTERN).unwrap();
206        if let Some(hunk_headers) = hunk_info.captures(patch) {
207            for (index, (hunk, header)) in
208                hunk_info.split(patch).zip(hunk_headers.iter()).enumerate()
209            {
210                if index == 0 {
211                    continue; // we don't need the whole match, just the capture groups
212                }
213                let new_range: Vec<u32> = header
214                    .unwrap()
215                    .as_str()
216                    .split(',')
217                    .take(2)
218                    .map(|val| val.parse::<u32>().unwrap())
219                    .collect();
220                let start_line = new_range[0];
221                let end_range = new_range[1];
222                let mut line_numb_in_diff = start_line;
223                diff_chunks.push(RangeInclusive::new(start_line, start_line + end_range));
224                for (line_index, line) in hunk.split('\n').enumerate() {
225                    if line.starts_with('+') {
226                        additions.push(line_numb_in_diff);
227                    }
228                    if line_index > 0 && !line.starts_with('-') {
229                        line_numb_in_diff += 1;
230                    }
231                }
232            }
233        }
234        (additions, diff_chunks)
235    }
236
237    pub fn parse_diff(
238        diff: &str,
239        file_filter: &FileFilter,
240        lines_changed_only: &LinesChangedOnly,
241    ) -> Vec<FileObj> {
242        log::error!("Using brute force diff parsing!");
243        let mut results = Vec::new();
244        let diff_file_delimiter = Regex::new(r"(?m)^diff --git a/.*$").unwrap();
245        let hunk_info = Regex::new(HUNK_INFO_PATTERN).unwrap();
246
247        let file_diffs = diff_file_delimiter.split(diff);
248        for file_diff in file_diffs {
249            if file_diff.is_empty() || file_diff.starts_with("deleted file") {
250                continue;
251            }
252            let hunk_start = if let Some(first_hunk) = hunk_info.find(file_diff) {
253                first_hunk.start()
254            } else {
255                file_diff.len()
256            };
257            let front_matter = &file_diff[..hunk_start];
258            if let Some(file_name) = get_filename_from_front_matter(front_matter) {
259                let file_path = PathBuf::from(file_name);
260                if file_filter.is_source_or_ignored(&file_path) {
261                    let (added_lines, diff_chunks) = parse_patch(&file_diff[hunk_start..]);
262                    if lines_changed_only
263                        .is_change_valid(!added_lines.is_empty(), !diff_chunks.is_empty())
264                    {
265                        results.push(FileObj::from(file_path, added_lines, diff_chunks));
266                    }
267                }
268            }
269            // } else {
270            //     // file has no changed content. moving on
271            //     continue;
272            // }
273        }
274        results
275    }
276
277    // ******************* UNIT TESTS ***********************
278    #[cfg(test)]
279    mod test {
280
281        use super::parse_diff;
282        use crate::{
283            cli::LinesChangedOnly,
284            common_fs::{FileFilter, FileObj},
285            git::parse_diff_from_buf,
286        };
287
288        static RENAMED_DIFF: &str = r#"diff --git a/tests/demo/some source.cpp b/tests/demo/some source.c
289similarity index 100%
290rename from /tests/demo/some source.cpp
291rename to /tests/demo/some source.c
292diff --git a/some picture.png b/some picture.png
293new file mode 100644
294Binary files /dev/null and b/some picture.png differ
295"#;
296
297        static RENAMED_DIFF_WITH_CHANGES: &str = r#"diff --git a/tests/demo/some source.cpp b/tests/demo/some source.c
298similarity index 99%
299rename from /tests/demo/some source.cpp
300rename to /tests/demo/some source.c
301@@ -3,7 +3,7 @@
302\n \n \n-#include "iomanip"
303+#include <cstdlib>\n \n \n \n"#;
304
305        #[test]
306        fn parse_renamed_diff() {
307            let diff_buf = RENAMED_DIFF.as_bytes();
308            let files = parse_diff_from_buf(
309                diff_buf,
310                &FileFilter::new(&["target".to_string()], vec!["c".to_string()]),
311                &LinesChangedOnly::Off,
312            );
313            assert!(!files.is_empty());
314            assert!(files
315                .first()
316                .unwrap()
317                .name
318                .ends_with("tests/demo/some source.c"));
319        }
320
321        #[test]
322        fn parse_renamed_diff_with_patch() {
323            let diff_buf = RENAMED_DIFF_WITH_CHANGES.as_bytes();
324            let files = parse_diff_from_buf(
325                diff_buf,
326                &FileFilter::new(&["target".to_string()], vec!["c".to_string()]),
327                &LinesChangedOnly::Off,
328            );
329            assert!(!files.is_empty());
330        }
331
332        /// Used to parse the same string buffer using both libgit2 and brute force regex.
333        /// Returns 2 vectors of [FileObj] that should be equivalent.
334        fn setup_parsed(buf: &str, extensions: &[String]) -> (Vec<FileObj>, Vec<FileObj>) {
335            let ignore = ["target".to_string()];
336            (
337                parse_diff_from_buf(
338                    buf.as_bytes(),
339                    &FileFilter::new(&ignore, extensions.to_owned()),
340                    &LinesChangedOnly::Off,
341                ),
342                parse_diff(
343                    buf,
344                    &FileFilter::new(&ignore, extensions.to_owned()),
345                    &LinesChangedOnly::Off,
346                ),
347            )
348        }
349
350        fn assert_files_eq(files_from_a: &[FileObj], files_from_b: &[FileObj]) {
351            assert_eq!(files_from_a.len(), files_from_b.len());
352            for (a, b) in files_from_a.iter().zip(files_from_b) {
353                assert_eq!(a.name, b.name);
354                assert_eq!(a.added_lines, b.added_lines);
355                assert_eq!(a.added_ranges, b.added_ranges);
356                assert_eq!(a.diff_chunks, b.diff_chunks);
357            }
358        }
359
360        #[test]
361        fn parse_typical_diff() {
362            let diff_buf = "diff --git a/path/for/Some file.cpp b/path/to/Some file.cpp\n\
363                            --- a/path/for/Some file.cpp\n\
364                            +++ b/path/to/Some file.cpp\n\
365                            @@ -3,7 +3,7 @@\n \n \n \n\
366                            -#include <some_lib/render/animation.hpp>\n\
367                            +#include <some_lib/render/animations.hpp>\n \n \n \n";
368
369            let (files_from_buf, files_from_str) = setup_parsed(diff_buf, &[String::from("cpp")]);
370            assert!(!files_from_buf.is_empty());
371            assert_files_eq(&files_from_buf, &files_from_str);
372        }
373
374        #[test]
375        fn parse_binary_diff() {
376            let diff_buf = "diff --git a/some picture.png b/some picture.png\n\
377                new file mode 100644\n\
378                Binary files /dev/null and b/some picture.png differ\n";
379
380            let (files_from_buf, files_from_str) = setup_parsed(diff_buf, &[String::from("png")]);
381            assert!(files_from_buf.is_empty());
382            assert_files_eq(&files_from_buf, &files_from_str);
383        }
384    }
385}
386
387#[cfg(test)]
388mod test {
389    use std::{
390        env::{self, current_dir, set_current_dir},
391        fs::read,
392    };
393
394    use git2::build::CheckoutBuilder;
395    use git2::{ApplyLocation, Diff, IndexAddOption, Repository};
396
397    // used to setup a testing stage
398    fn clone_repo(url: &str, sha: &str, path: &str, patch_path: Option<&str>) {
399        let repo = Repository::clone(url, path).unwrap();
400        let commit = repo.revparse_single(sha).unwrap();
401        repo.checkout_tree(
402            &commit,
403            Some(CheckoutBuilder::new().force().recreate_missing(true)),
404        )
405        .unwrap();
406        repo.set_head_detached(commit.id()).unwrap();
407        if let Some(patch) = patch_path {
408            let diff = Diff::from_buffer(&read(patch).unwrap()).unwrap();
409            repo.apply(&diff, ApplyLocation::Both, None).unwrap();
410            let mut index = repo.index().unwrap();
411            index
412                .add_all(["tests/demo/demo.*"], IndexAddOption::DEFAULT, None)
413                .unwrap();
414            index.write().unwrap();
415        }
416    }
417
418    use tempfile::{tempdir, TempDir};
419
420    use crate::{
421        cli::LinesChangedOnly,
422        common_fs::FileFilter,
423        rest_api::{github::GithubApiClient, RestApiClient},
424    };
425
426    fn get_temp_dir() -> TempDir {
427        let tmp = tempdir().unwrap();
428        println!("Using temp folder at {:?}", tmp.path());
429        tmp
430    }
431
432    async fn checkout_cpp_linter_py_repo(
433        sha: &str,
434        extensions: &[String],
435        tmp: &TempDir,
436        patch_path: Option<&str>,
437    ) -> Vec<crate::common_fs::FileObj> {
438        let url = "https://github.com/cpp-linter/cpp-linter";
439        clone_repo(
440            url,
441            sha,
442            tmp.path().as_os_str().to_str().unwrap(),
443            patch_path,
444        );
445        let rest_api_client = GithubApiClient::new();
446        let file_filter = FileFilter::new(&["target".to_string()], extensions.to_owned());
447        set_current_dir(tmp).unwrap();
448        env::set_var("CI", "false"); // avoid use of REST API when testing in CI
449        rest_api_client
450            .unwrap()
451            .get_list_of_changed_files(&file_filter, &LinesChangedOnly::Off)
452            .await
453            .unwrap()
454    }
455
456    #[tokio::test]
457    async fn with_no_changed_sources() {
458        // commit with no modified C/C++ sources
459        let sha = "0c236809891000b16952576dc34de082d7a40bf3";
460        let cur_dir = current_dir().unwrap();
461        let tmp = get_temp_dir();
462        let extensions = vec!["cpp".to_string(), "hpp".to_string()];
463        let files = checkout_cpp_linter_py_repo(sha, &extensions, &tmp, None).await;
464        println!("files = {:?}", files);
465        assert!(files.is_empty());
466        set_current_dir(cur_dir).unwrap(); // prep to delete temp_folder
467        drop(tmp); // delete temp_folder
468    }
469
470    #[tokio::test]
471    async fn with_changed_sources() {
472        // commit with modified C/C++ sources
473        let sha = "950ff0b690e1903797c303c5fc8d9f3b52f1d3c5";
474        let cur_dir = current_dir().unwrap();
475        let tmp = get_temp_dir();
476        let extensions = vec!["cpp".to_string(), "hpp".to_string()];
477        let files = checkout_cpp_linter_py_repo(sha, &extensions.clone(), &tmp, None).await;
478        println!("files = {:?}", files);
479        assert!(files.len() >= 2);
480        for file in files {
481            assert!(
482                extensions.contains(&file.name.extension().unwrap().to_string_lossy().to_string())
483            );
484        }
485        set_current_dir(cur_dir).unwrap(); // prep to delete temp_folder
486        drop(tmp); // delete temp_folder
487    }
488
489    #[tokio::test]
490    async fn with_staged_changed_sources() {
491        // commit with no modified C/C++ sources
492        let sha = "0c236809891000b16952576dc34de082d7a40bf3";
493        let cur_dir = current_dir().unwrap();
494        let tmp = get_temp_dir();
495        let extensions = vec!["cpp".to_string(), "hpp".to_string()];
496        let files = checkout_cpp_linter_py_repo(
497            sha,
498            &extensions.clone(),
499            &tmp,
500            Some("tests/git_status_test_assets/cpp-linter/cpp-linter/test_git_lib.patch"),
501        )
502        .await;
503        println!("files = {:?}", files);
504        assert!(!files.is_empty());
505        for file in files {
506            assert!(
507                extensions.contains(&file.name.extension().unwrap().to_string_lossy().to_string())
508            );
509        }
510        set_current_dir(cur_dir).unwrap(); // prep to delete temp_folder
511        drop(tmp); // delete temp_folder
512    }
513}