git_scanner/
git_logger.rs

1#![warn(clippy::all)]
2use crate::git_file_future::{FileNameChange, GitFileFutureRegistry};
3use failure::Error;
4use git2::Revwalk;
5use git2::{Commit, Delta, DiffDelta, ObjectType, Odb, Oid, Patch, Repository, Tree};
6use regex::Regex;
7use serde::Serialize;
8use std::cell::RefCell;
9use std::path::{Path, PathBuf};
10use std::rc::Rc;
11use std::time::{Duration, SystemTime};
12
/// Options controlling which commits and file changes a git log reports.
///
/// The default configuration excludes merge-commit file stats and applies no
/// time limit (`earliest_time == 0`).
#[derive(Debug, Clone, Copy, Default)]
pub struct GitLogConfig {
    /// include merge commits in file stats - usually excluded by `git log` - see https://stackoverflow.com/questions/37801342/using-git-log-to-display-files-changed-during-merge
    include_merges: bool,
    /// earliest commit for filtering - secs since the epoch - could use Option but this is pretty cheap to check
    earliest_time: u64,
}

impl GitLogConfig {
    /// returns a copy of this config with merge-commit file stats switched on or off
    // builder method that is not used by every build target
    #[allow(dead_code)]
    pub fn include_merges(self, include_merges: bool) -> GitLogConfig {
        GitLogConfig {
            include_merges,
            ..self
        }
    }

    /// filter log by unix timestamp
    pub fn since(self, earliest_time: u64) -> GitLogConfig {
        GitLogConfig {
            earliest_time,
            ..self
        }
    }

    /// filter log by number of years before now
    ///
    /// A year is counted as 365 days. If the cutoff would fall before the
    /// unix epoch (or cannot be represented at all) it is clamped to 0, which
    /// means "no time limit", rather than panicking. Negative or NaN `years`
    /// are treated as zero years, i.e. the cutoff is "now".
    pub fn since_years(self, years: f64) -> GitLogConfig {
        const SECS_PER_YEAR: f64 = (60 * 60 * 24 * 365) as f64;
        // float -> integer `as` casts saturate: negative/NaN => 0, too big => u64::MAX
        let secs = (SECS_PER_YEAR * years) as u64;
        let earliest_time = SystemTime::now()
            .checked_sub(Duration::from_secs(secs))
            .and_then(|cutoff| cutoff.duration_since(SystemTime::UNIX_EPOCH).ok())
            .map_or(0, |since_epoch| since_epoch.as_secs());
        self.since(earliest_time)
    }
}
52
53pub struct GitLog {
54    /// repo work dir - always canonical
55    workdir: PathBuf,
56    repo: Repository,
57    config: GitLogConfig,
58}
59
60pub struct GitLogIterator<'a> {
61    git_log: &'a GitLog,
62    odb: Odb<'a>,
63    revwalk: Revwalk<'a>,
64    // this is an RC as we need to use it after the iterator has been consumed
65    git_file_future_registry: Rc<RefCell<GitFileFutureRegistry>>,
66}
67
68/// simplified user info - based on git2::Signature
69/// everything is derived, seems to work OK as the structure is so simple
70#[derive(Debug, PartialEq, Eq, Hash, Clone, PartialOrd, Ord, Serialize)]
71pub struct User {
72    name: Option<String>,
73    email: Option<String>,
74}
75
76impl User {
77    pub fn new(name: Option<&str>, email: Option<&str>) -> User {
78        User {
79            name: name.map(|x| x.to_owned()),
80            email: email.map(|x| x.to_owned()),
81        }
82    }
83}
84
85/// simplified commit log entry
86#[derive(Debug, Serialize, Clone, Getters)]
87pub struct GitLogEntry {
88    id: String,
89    summary: String,
90    parents: Vec<String>,
91    committer: User,
92    commit_time: u64,
93    author: User,
94    author_time: u64,
95    co_authors: Vec<User>,
96    file_changes: Vec<FileChange>,
97}
98
99/// the various kinds of git change we care about - a serializable subset of git2::Delta
100#[derive(Debug, Serialize, Clone, PartialEq)]
101pub enum CommitChange {
102    Add,
103    Rename,
104    Delete,
105    Modify,
106    Copied,
107}
108
109/// Stats for file changes
110#[derive(Debug, Serialize, Clone, Getters)]
111pub struct FileChange {
112    file: PathBuf,
113    old_file: Option<PathBuf>,
114    change: CommitChange,
115    lines_added: u64,
116    lines_deleted: u64,
117}
118
119impl GitLog {
120    pub fn workdir(&self) -> &Path {
121        &self.workdir
122    }
123
124    pub fn new(start_dir: &Path, config: GitLogConfig) -> Result<GitLog, Error> {
125        let repo = Repository::discover(start_dir)?;
126
127        let workdir = repo
128            .workdir()
129            .ok_or_else(|| format_err!("bare repository - no workdir"))?
130            .canonicalize()?;
131
132        debug!("work dir: {:?}", workdir);
133
134        Ok(GitLog {
135            workdir,
136            repo,
137            config,
138        })
139    }
140
141    pub fn iterator(&self) -> Result<GitLogIterator, Error> {
142        let odb = self.repo.odb()?;
143        let mut revwalk = self.repo.revwalk()?;
144        revwalk.set_sorting(git2::Sort::TOPOLOGICAL)?;
145        revwalk.push_head()?;
146        Ok(GitLogIterator {
147            git_log: &self,
148            odb,
149            revwalk,
150            git_file_future_registry: Rc::new(RefCell::new(GitFileFutureRegistry::new())),
151        })
152    }
153}
154
155impl<'a> Iterator for GitLogIterator<'a> {
156    type Item = Result<GitLogEntry, Error>;
157
158    fn next(&mut self) -> Option<Self::Item> {
159        let mut next_item = self.revwalk.next();
160        while next_item.is_some() {
161            let c = self.summarise_commit(next_item.unwrap());
162            match c {
163                Ok(Some(c)) => {
164                    if c.commit_time >= self.git_log.config.earliest_time {
165                        self.register_file_futures(&c);
166                        return Some(Ok(c));
167                    } else {
168                        return None; // short circuit!
169                    }
170                }
171                Ok(None) => {}
172                Err(e) => return Some(Err(e)),
173            };
174            next_item = self.revwalk.next();
175        }
176        None
177    }
178}
179
180impl<'a> GitLogIterator<'a> {
181    pub fn git_file_future_registry(&self) -> Rc<RefCell<GitFileFutureRegistry>> {
182        self.git_file_future_registry.clone()
183    }
184
185    /// registers renames and deletes
186    fn register_file_futures(&mut self, entry: &GitLogEntry) {
187        // TODO: probably should be using Oid not String globally, then this would be simpler:
188        let parents: Vec<Oid> = entry
189            .parents
190            .iter()
191            .map(|id| Oid::from_str(&id).unwrap())
192            .collect();
193        let mut file_changes: Vec<(PathBuf, FileNameChange)> = Vec::new();
194        for file_change in &entry.file_changes {
195            match file_change.change {
196                CommitChange::Rename => {
197                    let old_name = file_change.old_file.as_ref().unwrap().clone();
198                    let new_name = file_change.file.clone();
199                    file_changes.push((old_name, FileNameChange::Renamed(new_name)))
200                }
201                CommitChange::Delete => {
202                    let name = file_change.file.clone();
203                    file_changes.push((name, FileNameChange::Deleted()))
204                }
205                _ => (),
206            }
207        }
208        self.git_file_future_registry.borrow_mut().register(
209            &Oid::from_str(&entry.id).unwrap(),
210            &parents,
211            &file_changes,
212        );
213    }
214
215    /// Summarises a git commit
216    /// returns Error if error, Result<None> if the id was not actually a commit, or Result<Some<GitLogEntry>> if valid
217    fn summarise_commit(
218        &self,
219        oid: Result<Oid, git2::Error>,
220    ) -> Result<Option<GitLogEntry>, Error> {
221        let oid = oid?;
222        let kind = self.odb.read(oid)?.kind();
223        match kind {
224            ObjectType::Commit => {
225                let commit = self.git_log.repo.find_commit(oid)?;
226                debug!("processing {:?}", commit);
227                let author = commit.author();
228                let committer = commit.committer();
229                let author_time = author.when().seconds() as u64;
230                let commit_time = committer.when().seconds() as u64;
231                let other_time = commit.time().seconds() as u64;
232                if commit_time != other_time {
233                    error!(
234                        "Commit {:?} time {:?} != commit time {:?}",
235                        commit, other_time, commit_time
236                    );
237                }
238                let co_authors = if let Some(message) = commit.message() {
239                    find_coauthors(message)
240                } else {
241                    Vec::new()
242                };
243
244                let commit_tree = commit.tree()?;
245                let file_changes = commit_file_changes(
246                    &self.git_log.repo,
247                    &commit,
248                    &commit_tree,
249                    self.git_log.config,
250                );
251                Ok(Some(GitLogEntry {
252                    id: oid.to_string(),
253                    summary: commit.summary().unwrap_or("[no message]").to_string(),
254                    parents: commit.parent_ids().map(|p| p.to_string()).collect(),
255                    committer: signature_to_user(&committer),
256                    commit_time,
257                    author: signature_to_user(&author),
258                    author_time,
259                    co_authors,
260                    file_changes,
261                }))
262            }
263            _ => {
264                info!("ignoring object type: {}", kind);
265                Ok(None)
266            }
267        }
268    }
269}
270
271fn signature_to_user(signature: &git2::Signature) -> User {
272    User {
273        name: signature.name().map(|x| x.to_owned()),
274        email: signature.email().map(|x| x.to_owned()),
275    }
276}
277
/// Trims surrounding whitespace from `s`; returns `None` if nothing is left.
fn trim_string(s: &str) -> Option<&str> {
    Some(s.trim()).filter(|rest| !rest.is_empty())
}
286
287fn find_coauthors(message: &str) -> Vec<User> {
288    lazy_static! {
289        static ref CO_AUTH_LINE: Regex = Regex::new(r"(?m)^\s*Co-authored-by:(.*)$").unwrap();
290        static ref CO_AUTH_ANGLE_BRACKETS: Regex = Regex::new(r"^(.*)<([^>]+)>$").unwrap();
291    }
292
293    CO_AUTH_LINE
294        .captures_iter(message)
295        .map(|capture_group| {
296            let co_author_text = &capture_group[1];
297            if let Some(co_author_bits) = CO_AUTH_ANGLE_BRACKETS.captures(co_author_text) {
298                User::new(
299                    trim_string(&co_author_bits.get(1).unwrap().as_str()),
300                    trim_string(co_author_bits.get(2).unwrap().as_str()),
301                )
302            } else if co_author_text.contains('@') {
303                // no angle brackets, but an @
304                User::new(None, trim_string(co_author_text))
305            } else {
306                User::new(trim_string(co_author_text), None)
307            }
308        })
309        .collect()
310}
311
312fn commit_file_changes(
313    repo: &Repository,
314    commit: &Commit,
315    commit_tree: &Tree,
316    config: GitLogConfig,
317) -> Vec<FileChange> {
318    if commit.parent_count() == 0 {
319        info!("Commit {} has no parent", commit.id());
320
321        scan_diffs(&repo, &commit_tree, None, &commit, None).expect("Can't scan for diffs")
322    } else if commit.parent_count() > 1 && !config.include_merges {
323        debug!(
324            "Not showing file changes for merge commit {:?}",
325            commit.id()
326        );
327        Vec::new()
328    } else {
329        commit
330            .parents()
331            .flat_map(|parent| {
332                debug!("Getting changes for parent {:?}:", parent);
333                let parent_tree = parent.tree().expect("can't get parent tree");
334                scan_diffs(
335                    &repo,
336                    &commit_tree,
337                    Some(&parent_tree),
338                    &commit,
339                    Some(&parent),
340                )
341                .expect("Can't scan for diffs")
342            })
343            .collect()
344    }
345}
346
347fn scan_diffs(
348    repo: &Repository,
349    commit_tree: &Tree,
350    parent_tree: Option<&Tree>,
351    commit: &Commit,
352    parent: Option<&Commit>,
353) -> Result<Vec<FileChange>, Error> {
354    let mut diff = repo.diff_tree_to_tree(parent_tree, Some(&commit_tree), None)?;
355    // Identify renames, None means default settings - see https://libgit2.org/libgit2/#HEAD/group/diff/git_diff_find_similar
356    diff.find_similar(None)?;
357    let file_changes = diff
358        .deltas()
359        .enumerate()
360        .filter_map(|(delta_index, delta)| {
361            // can we / should we get bytes for binary changes?  Adds show as 0 lines.
362            let patch =
363                Patch::from_diff(&diff, delta_index).expect("can't get a patch from a diff");
364            let (_, lines_added, lines_deleted) = if let Some(patch) = patch {
365                patch
366                    .line_stats()
367                    .expect("Couldn't get line stats from a patch")
368            } else {
369                warn!("No patch possible diffing {:?} -> {:?}", commit, parent);
370                (0, 0, 0)
371            };
372            summarise_delta(delta, lines_added as u64, lines_deleted as u64)
373        });
374    Ok(file_changes.collect())
375}
376
377fn summarise_delta(delta: DiffDelta, lines_added: u64, lines_deleted: u64) -> Option<FileChange> {
378    match delta.status() {
379        Delta::Added => {
380            let name = delta.new_file().path().unwrap();
381            Some(FileChange {
382                file: name.to_path_buf(),
383                old_file: None,
384                change: CommitChange::Add,
385                lines_added,
386                lines_deleted,
387            })
388        }
389        Delta::Renamed => {
390            let old_name = delta.old_file().path().unwrap();
391            let new_name = delta.new_file().path().unwrap();
392            Some(FileChange {
393                file: new_name.to_path_buf(),
394                old_file: Some(old_name.to_path_buf()),
395                change: CommitChange::Rename,
396                lines_added,
397                lines_deleted,
398            })
399        }
400        Delta::Deleted => {
401            let name = delta.old_file().path().unwrap();
402            Some(FileChange {
403                file: name.to_path_buf(),
404                old_file: None,
405                change: CommitChange::Delete,
406                lines_added,
407                lines_deleted,
408            })
409        }
410        Delta::Modified => {
411            let name = delta.new_file().path().unwrap();
412            Some(FileChange {
413                file: name.to_path_buf(),
414                old_file: None,
415                change: CommitChange::Modify,
416                lines_added,
417                lines_deleted,
418            })
419        }
420        Delta::Copied => {
421            let old_name = delta.old_file().path().unwrap();
422            let new_name = delta.new_file().path().unwrap();
423            Some(FileChange {
424                file: new_name.to_path_buf(),
425                old_file: Some(old_name.to_path_buf()),
426                change: CommitChange::Copied,
427                lines_added,
428                lines_deleted,
429            })
430        }
431        _ => {
432            error!("Not able to handle delta of status {:?}", delta.status());
433            None
434        }
435    }
436}
437
#[cfg(test)]
mod test {
    use super::*;
    use pretty_assertions::assert_eq;
    use serde_json::json;
    use tempfile::tempdir;
    use test_shared::*;

    /// walks the whole log once and checks that no entry is an error
    fn assert_no_errors(git_log: &GitLog) -> Result<(), Error> {
        let err_count = git_log.iterator()?.filter(Result::is_err).count();
        assert_eq!(err_count, 0);
        Ok(())
    }

    /// walks the whole log once and keeps the successful entries
    fn ok_entries(git_log: &GitLog) -> Result<Vec<GitLogEntry>, Error> {
        Ok(git_log.iterator()?.filter_map(Result::ok).collect())
    }

    #[test]
    fn authorless_message_has_no_coauthors() {
        let no_users: Vec<User> = Vec::new();
        assert_eq!(find_coauthors("do be do be do"), no_users);
    }

    #[test]
    fn can_get_coauthors_from_message() {
        let message = r#"This is a commit message
        not valid: Co-authored-by: fred jones
        Co-authored-by: valid user <valid@thing.com>
        Co-authored-by: <be.lenient@any-domain.com>
        Co-authored-by: bad@user <this isn't really trying to be clever>
        ignore random lines
        Co-authored-by: if there's no at it's a name
        Co-authored-by: if there's an @ it's email@thing.com
        ignore trailing lines
        "#;

        let expected = vec![
            User::new(Some("valid user"), Some("valid@thing.com")),
            User::new(None, Some("be.lenient@any-domain.com")),
            User::new(
                Some("bad@user"),
                Some("this isn't really trying to be clever"),
            ),
            User::new(Some("if there's no at it's a name"), None),
            User::new(None, Some("if there's an @ it's email@thing.com")),
        ];

        assert_eq!(find_coauthors(message), expected);
    }

    #[test]
    fn can_extract_basic_git_log() -> Result<(), Error> {
        let gitdir = tempdir()?;
        let git_root = unzip_git_sample("git_sample", gitdir.path())?;
        let git_log = GitLog::new(&git_root, GitLogConfig::default())?;

        assert_eq!(git_log.workdir.canonicalize()?, git_root.canonicalize()?);

        assert_no_errors(&git_log)?;

        let entries = ok_entries(&git_log)?;
        assert_eq_json_file(&entries, "./tests/expected/git/git_sample.json");

        Ok(())
    }

    #[test]
    fn git_log_can_include_merge_changes() -> Result<(), Error> {
        let gitdir = tempdir()?;
        let git_root = unzip_git_sample("git_sample", gitdir.path())?;

        let config = GitLogConfig::default().include_merges(true);
        let git_log = GitLog::new(&git_root, config)?;

        assert_no_errors(&git_log)?;

        let entries = ok_entries(&git_log)?;
        assert_eq_json_file(&entries, "./tests/expected/git/git_sample_with_merges.json");

        Ok(())
    }

    #[allow(clippy::unreadable_literal)]
    #[test]
    fn git_log_can_limit_to_recent_history() -> Result<(), Error> {
        let gitdir = tempdir()?;
        let git_root = unzip_git_sample("git_sample", gitdir.path())?;

        let config = GitLogConfig::default().since(1558521694);
        let git_log = GitLog::new(&git_root, config)?;

        assert_no_errors(&git_log)?;

        let ids: Vec<_> = ok_entries(&git_log)?
            .into_iter()
            .map(|entry| (entry.summary, entry.commit_time))
            .collect();
        let expected = vec![
            ("renaming".to_owned(), 1558533240u64),
            ("just changed parent.clj".to_owned(), 1558524371u64),
            ("Merge branch 'fiddling'".to_owned(), 1558521695u64),
        ];
        assert_eq!(ids, expected);

        Ok(())
    }

    #[test]
    fn git_log_tracks_renames() -> Result<(), Error> {
        let gitdir = tempdir()?;
        let git_root = unzip_git_sample("rename_simple", gitdir.path())?;

        let git_log = GitLog::new(&git_root, GitLogConfig::default())?;

        assert_no_errors(&git_log)?;

        // oldest first, so the expectations below read in history order
        let mut entries = ok_entries(&git_log)?;
        entries.sort_by_key(|entry| entry.author_time);

        let changes: Vec<String> = entries.iter().map(|entry| entry.summary.clone()).collect();

        assert_eq!(
            changes,
            vec![
                "initial commit",
                "unrelated commit",
                "moving a to c",
                "moving and renaming"
            ]
        );

        let file_changes: Vec<Vec<FileChange>> = entries
            .iter()
            .map(|entry| {
                let mut sorted = entry.file_changes.clone();
                sorted.sort_by(|a, b| a.file.cmp(&b.file));
                sorted
            })
            .collect();

        assert_eq_json_value(
            &file_changes,
            &json!([
                [{"change":"Add",
                  "file":"a.txt",
                  "lines_added": 4,
                  "lines_deleted": 0,
                  "old_file": null}
                ],
                [{"change":"Add",
                  "file":"b.txt",
                  "lines_added": 1,
                  "lines_deleted": 0,
                  "old_file": null}
                ],
                [{"change":"Rename",
                  "file":"c.txt",
                  "lines_added": 0,
                  "lines_deleted": 0,
                  "old_file": "a.txt"}
                ],
                [{"change":"Rename",
                  "file":"d.txt",
                  "lines_added": 1,
                  "lines_deleted": 0,
                  "old_file": "c.txt"}
                ]
               ]
            ),
        );

        Ok(())
    }
}
612/*
613<Array([
614<    Array([
615<        Object({
616<            "change": String(
617<                "Add",
618<            ),
619<            "file": String(
620<                "a.txt",
621<            ),
622<            "lines_added": Number(
623<                4,
624<            ),
625<            "lines_deleted": Number(
626<                0,
627<            ),
628<            "old_file": Null,
629<        }),
630<    ]),
631<    Array([
632<        Object({
633<            "change": String(
634<                "Add",
635<            ),
636<            "file": String(
637<                "b.txt",
638<            ),
639<            "lines_added": Number(
640<                1,
641<            ),
642<            "lines_deleted": Number(
643<                0,
644<            ),
645<            "old_file": Null,
646<        }),
647<    ]),
648<    Array([
649<        Object({
650<            "change": String(
651<                "Rename",
652<            ),
653<            "file": String(
654<                "c.txt",
655<            ),
656<            "lines_added": Number(
657<                0,
658<            ),
659<            "lines_deleted": Number(
660<                0,
661<            ),
662<            "old_file": String(
663<                "a.txt",
664<            ),
665<        }),
666<    ]),
667<    Array([
668<        Object({
669<            "change": String(
670<                "Rename",
671<            ),
672<            "file": String(
673<                "d.txt",
674<            ),
675<            "lines_added": Number(
676<                1,
677<            ),
678<            "lines_deleted": Number(
679<                0,
680<            ),
681<            "old_file": String(
682<                "c.txt",
683<            ),
684<        }),
685<    ]),
686<])
687*/
688// run a single test with:
689// cargo test -- --nocapture can_extract_basic_git_log | grep -v "running 0 tests" | grep -v "0 passed" | grep -v -e '^\s*$'