git_scanner/
git_file_history.rs

1#![warn(clippy::all)]
2use crate::git_logger::{CommitChange, FileChange, GitLog, GitLogEntry, User};
3use chrono::offset::TimeZone;
4use chrono::Utc;
5use failure::Error;
6use git2::Oid;
7use indicatif::{ProgressBar, ProgressStyle};
8use serde::Serialize;
9use std::collections::HashMap;
10use std::path::Path;
11use std::path::PathBuf;
12
/// For each file we just keep a simplified history - what the changes were, by whom, and when.
///
/// One entry describes a single commit's effect on a single file.
/// The `Builder` derive generates `FileHistoryEntryBuilder`; with `pattern = "owned"` its
/// setters consume and return the builder, and `setter(into)` lets them accept any value
/// convertible into the field type.
#[derive(Debug, Serialize, Builder)]
#[builder(setter(into), pattern = "owned")]
pub struct FileHistoryEntry {
    /// Commit id as a string (parsed elsewhere in this file with `Oid::from_str`).
    pub id: String,
    /// Who committed the change.
    pub committer: User,
    /// Commit timestamp, treated in this file as whole seconds since the Unix epoch.
    pub commit_time: u64,
    /// Who authored the change.
    pub author: User,
    /// Author timestamp - presumably the same units as `commit_time`; TODO confirm.
    pub author_time: u64,
    /// Co-authors recorded on the log entry.
    pub co_authors: Vec<User>,
    /// The kind of change made to the file (for example `CommitChange::Add`).
    pub change: CommitChange,
    /// Lines added, as reported by the `FileChange` this entry was built from.
    pub lines_added: u64,
    /// Lines deleted, as reported by the `FileChange` this entry was built from.
    pub lines_deleted: u64,
}
27
28impl FileHistoryEntry {
29    fn from(entry: &GitLogEntry, file_change: &FileChange) -> FileHistoryEntry {
30        let entry = entry.clone();
31        let file_change = file_change.clone();
32        FileHistoryEntry {
33            id: entry.id().to_owned(),
34            committer: entry.committer().clone(),
35            commit_time: *entry.commit_time(),
36            author: entry.author().clone(),
37            author_time: *entry.author_time(),
38            co_authors: entry.co_authors().clone(),
39            change: file_change.change().clone(),
40            lines_added: *file_change.lines_added(),
41            lines_deleted: *file_change.lines_deleted(),
42        }
43    }
44}
45
#[cfg(test)]
impl FileHistoryEntryBuilder {
    /// Test helper: a builder with no co-authors, an `Add` change and zero line counts.
    ///
    /// The id, users and timestamps are not set here.
    pub fn test_default() -> Self {
        let builder = Self::default();
        let builder = builder.co_authors(Vec::new());
        let builder = builder.change(CommitChange::Add);
        builder.lines_added(0_u64).lines_deleted(0_u64)
    }

    /// Test helper: uses the same email for both the committer and the author.
    pub fn emails(self, email: &str) -> Self {
        let committer = User::new(None, Some(email));
        let author = User::new(None, Some(email));
        self.committer(committer).author(author)
    }

    /// Test helper: uses the same timestamp for both the commit time and the author time.
    pub fn times(self, time: u64) -> Self {
        let with_commit_time = self.commit_time(time);
        with_commit_time.author_time(time)
    }
}
64
/// Per-file change history for one git repository, built once from a `GitLog`.
#[derive(Debug, Serialize)]
pub struct GitFileHistory {
    /// repo work dir - always canonical
    workdir: PathBuf,
    /// History entries per file. Lookups use the file's path relative to `workdir`.
    history_by_file: HashMap<PathBuf, Vec<FileHistoryEntry>>,
    /// Largest commit time seen while building the history; 0 if no valid entries were read.
    last_commit: u64,
}
72
73impl GitFileHistory {
74    pub fn new(log: &mut GitLog) -> Result<GitFileHistory, Error> {
75        let mut last_commit: u64 = 0;
76        let mut history_by_file = HashMap::<PathBuf, Vec<FileHistoryEntry>>::new();
77        let progress_bar = ProgressBar::new_spinner()
78            .with_style(ProgressStyle::default_spinner().template("[{elapsed}] {msg}"));
79
80        // for handling renames, this needs to be a 2-pass process
81
82        // This is ugly! I need to think of cleaning up, probably in one of two ways:
83        // 1. ditch the whole "expose an iterator" interface - if we're loading it all into memory anyway, there's no point, could make the code cleaner and maybe get rid of the ugly use of Rc<RefCell<>>
84        // 2. fully split the parsing into two passes, one to get parent/child info and one to get file summary.  This would use less memory - but might be slower?  YAGNI I think.
85
86        let log_iterator = log.iterator()?;
87        // I can't find a cleaner way for an iterator to have side effects
88        let git_file_future_registry = log_iterator.git_file_future_registry();
89        let log_entries: Vec<Result<GitLogEntry, Error>> = log_iterator.collect();
90
91        // safe to borrow this now as the iterator has gone and can't mutate any more
92        let git_file_future_registry = git_file_future_registry.borrow();
93
94        for entry in log_entries {
95            progress_bar.tick();
96            match entry {
97                Ok(entry) => {
98                    let commit_time = *entry.commit_time();
99                    let fmt_time = Utc.timestamp(commit_time as i64, 0).to_string();
100                    progress_bar.set_message(&fmt_time);
101                    if commit_time > last_commit {
102                        last_commit = commit_time;
103                    }
104                    for file_change in entry.clone().file_changes() {
105                        // TODO: use Oids so we don't need ugly conversion.
106                        let final_filename = git_file_future_registry
107                            .final_name(&Oid::from_str(entry.id()).unwrap(), file_change.file());
108                        if let Some(filename) = final_filename {
109                            let hash_entry =
110                                history_by_file.entry(filename).or_insert_with(Vec::new);
111                            let new_entry = FileHistoryEntry::from(&entry, &file_change);
112                            hash_entry.push(new_entry);
113                        } else {
114                            debug!(
115                                "Not storing history for deleted file {:?}",
116                                file_change.file()
117                            );
118                        }
119                    }
120                }
121                Err(e) => {
122                    warn!("Ignoring invalid git log entry: {:?}", e);
123                }
124            }
125        }
126
127        Ok(GitFileHistory {
128            workdir: log.workdir().to_owned(),
129            history_by_file,
130            last_commit,
131        })
132    }
133
134    /// true if this repo is valid for this file - file must exist (as we canonicalize it)
135    pub fn is_repo_for(&self, file: &Path) -> Result<bool, Error> {
136        let canonical_file = file.canonicalize()?;
137        Ok(canonical_file.starts_with(&self.workdir))
138    }
139
140    /// get git history for this file - file must exist (as we canonicalize it)
141    pub fn history_for(&self, file: &Path) -> Result<Option<&Vec<FileHistoryEntry>>, Error> {
142        let canonical_file = file.canonicalize()?;
143        let relative_file = canonical_file.strip_prefix(&self.workdir)?;
144        Ok(self.history_by_file.get(relative_file))
145    }
146
147    pub fn last_commit(&self) -> u64 {
148        self.last_commit
149    }
150}
151
/// Tests that build a `GitFileHistory` from zipped sample repositories unpacked into a
/// temporary directory.
#[cfg(test)]
mod test {
    use super::*;
    use crate::git_logger::GitLogConfig;
    use pretty_assertions::assert_eq;
    use tempfile::tempdir;
    use test_shared::*;

    #[test]
    fn can_get_log_by_filename() -> Result<(), Error> {
        let gitdir = tempdir()?;
        let git_root = unzip_git_sample("git_sample", gitdir.path())?;

        let mut git_log = GitLog::new(&git_root, GitLogConfig::default())?;

        let history = GitFileHistory::new(&mut git_log)?;

        assert_eq!(history.workdir.canonicalize()?, git_root.canonicalize()?);

        // assert_eq_json_str(&history.history_by_file, "{}");
        assert_eq_json_file(
            &history.history_by_file,
            "./tests/expected/git/git_sample_by_filename.json",
        );

        Ok(())
    }

    #[test]
    fn can_tell_if_file_is_in_git_repo() -> Result<(), Error> {
        let gitdir = tempdir()?;
        let git_root = unzip_git_sample("git_sample", gitdir.path())?;

        let mut git_log = GitLog::new(&git_root, GitLogConfig::default())?;

        let history = GitFileHistory::new(&mut git_log)?;

        assert!(history.is_repo_for(&git_root.join("simple/parent.clj"))?);

        Ok(())
    }

    #[test]
    fn can_get_history_for_file() -> Result<(), Error> {
        let gitdir = tempdir()?;
        let git_root = unzip_git_sample("git_sample", gitdir.path())?;

        let mut git_log = GitLog::new(&git_root, GitLogConfig::default())?;

        let history = GitFileHistory::new(&mut git_log)?;

        let file_history = history
            .history_for(&git_root.join("simple/parent.clj"))?
            .expect("simple/parent.clj should have history");

        let ids: Vec<_> = file_history.iter().map(|h| &h.id).collect();
        assert_eq!(
            ids,
            vec![
                "0dbd54d4c524ecc776f381e660cce9b2dd92162c",
                "a0ae9997cfdf49fd0cbf54dacc72c778af337519",
                "ca239efb9b26db57ac9e2ec3e2df1c42578a46f8"
            ]
        );

        assert_eq!(history.last_commit(), 1_558_533_240);

        Ok(())
    }

    #[test]
    fn no_history_for_files_not_known() -> Result<(), Error> {
        let gitdir = tempdir()?;
        let git_root = unzip_git_sample("git_sample", gitdir.path())?;

        let mut git_log = GitLog::new(&git_root, GitLogConfig::default())?;

        let history = GitFileHistory::new(&mut git_log)?;

        // The file has to exist on disk, because history_for canonicalizes the path.
        let new_file = git_root.join("simple/nonesuch.clj");
        std::fs::File::create(&new_file)?;

        let file_history = history.history_for(&new_file)?;

        assert!(file_history.is_none());

        Ok(())
    }

    #[test]
    fn can_get_history_for_complex_renamed_files() -> Result<(), Error> {
        let gitdir = tempdir()?;
        let git_root = unzip_git_sample("rename_complex", gitdir.path())?;
        /*
        This is generated by the script in tests/data/builders/renaming/rename_complex.sh

        log is:

        * 3629e5a (HEAD -> master) restoring deleted z
        *   261e027 merging dave work with fixes
        |\
        | * c3b47c3 (dave_work) rename bb to b, a2 back to a
        | * 500a621 rename a1 to a2, add bb, kill z
        * |   fac9419 merging jay work
        |\ \
        | * | 34b904b (jay_work) rename bee to b, aa back to a
        | * | 3bd2d90 rename a1 to aa, add bee
        | |/
        * | 8be47df rename a1 back to a prep merging
        |/
        * 388e644 rename a to a1
        * bd6d7df initial commit
        */

        let mut git_log = GitLog::new(&git_root, GitLogConfig::default())?;

        let history = GitFileHistory::new(&mut git_log)?;

        let file_history = history
            .history_for(&git_root.join("a.txt"))?
            .expect("a.txt should have history");

        let ids: Vec<_> = file_history.iter().map(|h| &h.id).collect();
        assert_eq!(
            ids,
            // all of these refs have a file that ends up being "a.txt" via renames and merges:
            vec![
                "c3b47c335ebd9dbb9b0c9922bc258555a2cf71c9",
                "500a621e9e83612f51dbce15202cd7bef3c88f00",
                "34b904b010abf316167bba7a7ce2b4a5996cc0d1",
                "3bd2d9088ee5b051ada1bd30f07e7bcd390f6327",
                "8be47dfc0a25ec27941413619f632a1fa66e5ba5",
                "388e644e9240aa333fe669069bb00d418ffca500",
                "bd6d7dfa063ec95ebc3bad7bffd4262e3702b77c",
            ]
        );

        Ok(())
    }

    #[test]
    fn deleted_files_dont_have_history() -> Result<(), Error> {
        let gitdir = tempdir()?;
        let git_root = unzip_git_sample("rename_complex", gitdir.path())?;

        let mut git_log = GitLog::new(&git_root, GitLogConfig::default())?;

        let history = GitFileHistory::new(&mut git_log)?;

        let file_history = history
            .history_for(&git_root.join("z.txt"))?
            .expect("restored z.txt should have history");

        let ids: Vec<_> = file_history.iter().map(|h| &h.id).collect();
        assert_eq!(
            ids,
            // z.txt is only using the final commit, not the earlier file that was deleted.
            vec!["3629e5a8d8d7547bac749530eb540d0f61535cd1",]
        );

        Ok(())
    }
}