Skip to main content

gitoxide_core/hours/
mod.rs

1use std::{collections::BTreeSet, io, path::Path, time::Instant};
2
3use anyhow::bail;
4use gix::{
5    bstr::{BStr, ByteSlice},
6    prelude::*,
7    progress, Count, NestedProgress, Progress,
8};
9use smallvec::{smallvec, SmallVec};
10
/// Options controlling how [`estimate()`] gathers and reports its data.
///
/// `W` is the type of the sink that receives the textual report.
pub struct Context<W> {
    /// If `true`, leave out authors whose name contains the `[bot]` marker used by GitHub bots.
    pub ignore_bots: bool,
    /// If `true`, print a per-author breakdown ahead of the summary. This reveals personally
    /// identifiable information, namely names and email addresses.
    pub show_pii: bool,
    /// If `true`, count the files that were added, removed and modified (no rename tracking).
    pub file_stats: bool,
    /// If `true`, count the lines that were added, removed and modified (no rename tracking).
    pub line_stats: bool,
    /// How many threads to use. If unset, use all cores, if 0 use all physical cores.
    pub threads: Option<usize>,
    /// If `true`, skip the step that unifies identities by name and email. The same person may then
    /// be listed several times if they committed under different names or email addresses.
    pub omit_unify_identities: bool,
    /// The sink the report is written to.
    pub out: W,
}
29
30pub struct SignatureRef<'a> {
31    name: &'a BStr,
32    email: &'a BStr,
33    time: gix::date::Time,
34}
35
36impl SignatureRef<'_> {
37    fn seconds(&self) -> gix::date::SecondsSinceUnixEpoch {
38        self.time.seconds
39    }
40}
41
42/// Return `(commit_author, [commit_author, co_authors...])`. Use the `commit_author` for easy access to the commit author itself.
43fn commit_author_identities(
44    commit_data: &[u8],
45) -> Result<(gix::actor::SignatureRef<'_>, SmallVec<[gix::actor::IdentityRef<'_>; 2]>), gix::objs::decode::Error> {
46    let commit = gix::objs::CommitRef::from_bytes(commit_data)?;
47    let author = commit.author()?.trim();
48    let mut authors = smallvec![gix::actor::IdentityRef::from(author)];
49    authors.extend(commit.co_authored_by_trailers().filter_map(|trailer| {
50        gix::actor::IdentityRef::from_bytes::<gix::objs::decode::ParseError>(trailer.value.as_ref())
51            .ok()
52            .map(|identity| identity.trim())
53    }));
54    Ok((author, authors))
55}
56
57/// Estimate the hours it takes to produce the content of the repository in `_working_dir_`, with `_refname_` for
58/// the start of the commit graph traversal.
59///
60/// * `_working_dir_` - The directory containing a '.git/' folder.
61/// * `_refname_` - The name of the ref like 'main' or 'master' at which to start iterating the commit graph.
62/// * `_progress_` - A way to provide progress and performance information
63pub fn estimate<W, P>(
64    working_dir: &Path,
65    rev_spec: &BStr,
66    mut progress: P,
67    Context {
68        show_pii,
69        ignore_bots,
70        file_stats,
71        line_stats,
72        omit_unify_identities,
73        threads,
74        mut out,
75    }: Context<W>,
76) -> anyhow::Result<()>
77where
78    W: io::Write,
79    P: NestedProgress,
80{
81    let repo = gix::discover(working_dir)?;
82    let commit_id = repo.rev_parse_single(rev_spec)?.detach();
83    let mut string_heap = BTreeSet::<&'static [u8]>::new();
84    let needs_stats = file_stats || line_stats;
85    let threads = gix::features::parallel::num_threads(threads);
86
87    let (commit_authors, stats, is_shallow, skipped_merge_commits, num_commits) = {
88        std::thread::scope(|scope| -> anyhow::Result<_> {
89            let start = Instant::now();
90            let (tx, rx) = std::sync::mpsc::channel::<(u32, Vec<u8>)>();
91            let mailmap = repo.open_mailmap();
92
93            let extract_signatures = scope.spawn(move || -> anyhow::Result<Vec<_>> {
94                let mut out = Vec::new();
95                for (commit_idx, commit_data) in rx {
96                    if let Ok((commit_author, authors)) = commit_author_identities(&commit_data) {
97                        let mut string_ref = |s: &[u8]| -> &'static BStr {
98                            match string_heap.get(s) {
99                                Some(n) => n.as_bstr(),
100                                None => {
101                                    let sv: Vec<u8> = s.to_owned();
102                                    string_heap.insert(Box::leak(sv.into_boxed_slice()));
103                                    (*string_heap.get(s).expect("present")).as_ref()
104                                }
105                            }
106                        };
107                        let mut authors_for_commit = SmallVec::<[SignatureRef<'static>; 2]>::new();
108                        for identity in authors {
109                            let author = mailmap.resolve_cow(gix::actor::SignatureRef {
110                                name: identity.name,
111                                email: identity.email,
112                                time: commit_author.time,
113                            });
114                            let name = string_ref(author.name.as_ref());
115                            let email = string_ref(author.email.as_ref());
116                            if authors_for_commit
117                                .iter()
118                                .any(|existing| existing.name == name && existing.email == email)
119                            {
120                                continue;
121                            }
122                            authors_for_commit.push(SignatureRef {
123                                name,
124                                email,
125                                time: author.time,
126                            });
127                        }
128                        out.extend(authors_for_commit.into_iter().map(|author| (commit_idx, author)));
129                    }
130                }
131                out.shrink_to_fit();
132                out.sort_by(|a, b| {
133                    a.1.email
134                        .cmp(b.1.email)
135                        .then(a.1.seconds().cmp(&b.1.seconds()).reverse())
136                        .then(a.0.cmp(&b.0))
137                });
138                Ok(out)
139            });
140
141            let (stats_progresses, stats_counters) = if needs_stats {
142                {
143                    let mut sp = progress.add_child("extract stats");
144                    sp.init(None, progress::count("commits"));
145                    let sc = sp.counter();
146
147                    let mut cp = progress.add_child("find changes");
148                    cp.init(None, progress::count("modified files"));
149                    let cc = cp.counter();
150
151                    let mut lp = progress.add_child("find changes");
152                    lp.init(None, progress::count("diff lines"));
153                    let lc = lp.counter();
154
155                    (Some((sp, cp, lp)), Some((sc, cc, lc)))
156                }
157            } else {
158                Default::default()
159            };
160
161            let mut progress = progress.add_child("traverse commit graph");
162            progress.init(None, progress::count("commits"));
163
164            let (tx_tree_id, stat_threads) = if needs_stats {
165                {
166                    let (tx, threads) = spawn_tree_delta_threads(
167                        scope,
168                        threads,
169                        line_stats,
170                        repo.clone(),
171                        stats_counters.clone().expect("counters are set"),
172                    );
173                    (Some(tx), threads)
174                }
175            } else {
176                Default::default()
177            };
178
179            let mut commit_idx = 0_u32;
180            let mut skipped_merge_commits = 0;
181            const CHUNK_SIZE: usize = 50;
182            let mut chunk = Vec::with_capacity(CHUNK_SIZE);
183            let mut commit_iter = commit_id.ancestors(&repo.objects);
184            let mut is_shallow = false;
185            while let Some(c) = commit_iter.next() {
186                progress.inc();
187                if gix::interrupt::is_triggered() {
188                    bail!("Cancelled by user");
189                }
190                match c {
191                    Ok(c) => {
192                        tx.send((commit_idx, commit_iter.commit_data().to_owned())).ok();
193                        let tree_delta_info = tx_tree_id.as_ref().and_then(|tx| {
194                            let mut parents = c.parent_ids.into_iter();
195                            parents
196                                .next()
197                                .map(|first_parent| (tx, Some(first_parent), c.id.to_owned()))
198                                .filter(|_| {
199                                    if parents.next().is_some() {
200                                        skipped_merge_commits += 1;
201                                        false
202                                    } else {
203                                        true
204                                    }
205                                })
206                        });
207                        if let Some((tx_tree, first_parent, commit)) = tree_delta_info {
208                            if chunk.len() == CHUNK_SIZE {
209                                tx_tree
210                                    .send(std::mem::replace(&mut chunk, Vec::with_capacity(CHUNK_SIZE)))
211                                    .ok();
212                            } else {
213                                chunk.push((commit_idx, first_parent, commit));
214                            }
215                        }
216                        commit_idx += 1;
217                    }
218                    Err(gix::traverse::commit::simple::Error::Find { .. }) => {
219                        is_shallow = true;
220                        break;
221                    }
222                    Err(err) => return Err(err.into()),
223                }
224            }
225            if let Some(tx) = tx_tree_id {
226                tx.send(chunk).ok();
227            }
228            drop(tx);
229            progress.show_throughput(start);
230            drop(progress);
231
232            let stats_by_commit_idx = match stats_progresses {
233                Some((mut stat_progress, change_progress, line_progress)) => {
234                    stat_progress.set_max(Some(commit_idx as usize - skipped_merge_commits));
235                    let mut stats = Vec::new();
236                    for handle in stat_threads {
237                        stats.extend(handle.join().expect("no panic")?);
238                        if gix::interrupt::is_triggered() {
239                            bail!("Cancelled by user");
240                        }
241                    }
242                    stats.sort_by_key(|t| t.0);
243                    stat_progress.show_throughput(start);
244                    change_progress.show_throughput(start);
245                    line_progress.show_throughput(start);
246                    stats
247                }
248                None => Vec::new(),
249            };
250
251            Ok((
252                extract_signatures.join().expect("no panic")?,
253                stats_by_commit_idx,
254                is_shallow,
255                skipped_merge_commits,
256                commit_idx,
257            ))
258        })?
259    };
260
261    if commit_authors.is_empty() {
262        bail!("No commits to process");
263    }
264
265    let start = Instant::now();
266    let mut current_email = &commit_authors[0].1.email;
267    let mut slice_start = 0;
268    let mut results_by_hours = Vec::new();
269    let mut ignored_bot_commits = 0_u32;
270    let mut push_estimate = |commits: &[(u32, SignatureRef<'static>)]| {
271        let estimate = estimate_hours(commits, &stats);
272        if ignore_bots && estimate.name.contains_str(b"[bot]") {
273            ignored_bot_commits += estimate.num_commits;
274            return;
275        }
276        results_by_hours.push(estimate);
277    };
278    for (idx, (_, elm)) in commit_authors.iter().enumerate() {
279        if elm.email != *current_email {
280            push_estimate(&commit_authors[slice_start..idx]);
281            slice_start = idx;
282            current_email = &elm.email;
283        }
284    }
285    if let Some(commits) = commit_authors.get(slice_start..) {
286        push_estimate(commits);
287    }
288
289    let num_authors = results_by_hours.len();
290    let mut results_by_hours = if !omit_unify_identities {
291        deduplicate_identities(&results_by_hours)
292    } else {
293        results_by_hours
294            .iter()
295            .fold(Vec::with_capacity(results_by_hours.len()), |mut acc, e| {
296                acc.push(e.into());
297                acc
298            })
299    };
300    let elapsed = start.elapsed();
301    progress.done(format!(
302        "Extracted and organized data from {} commits in {:?} ({:0.0} commits/s)",
303        num_commits,
304        elapsed,
305        num_commits as f32 / elapsed.as_secs_f32()
306    ));
307
308    let num_unique_authors = results_by_hours.len();
309    let total_hours = results_by_hours.iter().map(|e| e.hours).sum::<f32>();
310    let included_commit_ids = commit_authors
311        .iter()
312        .filter(|(_, author)| !(ignore_bots && author.name.contains_str(b"[bot]")))
313        .map(|(commit_idx, _)| *commit_idx)
314        .collect::<BTreeSet<_>>();
315    let total_commits = included_commit_ids.len() as u32;
316    let (total_files, total_lines) = stats
317        .iter()
318        .filter(|(commit_idx, _, _)| included_commit_ids.contains(commit_idx))
319        .fold(
320            (FileStats::default(), LineStats::default()),
321            |mut acc, (_, files, lines)| {
322                acc.0.add(files);
323                acc.1.add(lines);
324                acc
325            },
326        );
327    if show_pii {
328        results_by_hours.sort_by(|a, b| a.hours.partial_cmp(&b.hours).unwrap_or(std::cmp::Ordering::Equal));
329        for entry in &results_by_hours {
330            entry.write_to(
331                total_hours,
332                file_stats.then_some(total_files),
333                line_stats.then_some(total_lines),
334                &mut out,
335            )?;
336            writeln!(out)?;
337        }
338    }
339    writeln!(
340        out,
341        "total hours: {:.02}\ntotal 8h days: {:.02}\ntotal commits = {}{}\ntotal authors: {}",
342        total_hours,
343        total_hours / HOURS_PER_WORKDAY,
344        total_commits,
345        if is_shallow { " (shallow)" } else { Default::default() },
346        num_authors
347    )?;
348    if file_stats {
349        writeln!(
350            out,
351            "total files added/removed/modified/remaining: {}/{}/{}/{}",
352            total_files.added,
353            total_files.removed,
354            total_files.modified,
355            total_files.added - total_files.removed
356        )?;
357    }
358    if line_stats {
359        writeln!(
360            out,
361            "total lines added/removed/remaining: {}/{}/{}",
362            total_lines.added,
363            total_lines.removed,
364            total_lines.added - total_lines.removed
365        )?;
366    }
367    if !omit_unify_identities {
368        writeln!(
369            out,
370            "total unique authors: {} ({:.02}% duplication)",
371            num_unique_authors,
372            (1.0 - (num_unique_authors as f32 / num_authors as f32)) * 100.0
373        )?;
374    }
375    if ignored_bot_commits != 0 {
376        writeln!(out, "commits by bots: {ignored_bot_commits}")?;
377    }
378    if needs_stats && skipped_merge_commits != 0 {
379        writeln!(out, "stats omitted for {skipped_merge_commits} merge commits")?;
380    }
381    debug_assert!(total_commits <= num_commits);
382    Ok(())
383}
384
385mod core;
386use self::core::{deduplicate_identities, estimate_hours, HOURS_PER_WORKDAY};
387
388mod util;
389use util::{CommitIdx, FileStats, LineStats, WorkByEmail, WorkByPerson};
390
391use crate::hours::core::spawn_tree_delta_threads;
392
#[cfg(test)]
mod tests {
    use gix::bstr::{BStr, ByteSlice};

    use super::commit_author_identities;

    /// View `text` as a `BStr` so it can be compared with parsed names and emails.
    fn bstr(text: &str) -> &BStr {
        text.as_bytes().as_bstr()
    }

    /// The author comes first, followed by every well-formed `Co-authored-by` trailer in order.
    #[test]
    fn commit_author_identities_include_coauthors() {
        let commit = b"tree 1111111111111111111111111111111111111111\n\
author Main Author <main@example.com> 1710000000 +0000\n\
committer Main Author <main@example.com> 1710000000 +0000\n\
\n\
subject\n\
\n\
body\n\
\n\
Co-authored-by: Second Author <second@example.com>\n\
Co-authored-by: Third Author <third@example.com>\n";
        let (author, authors) = commit_author_identities(commit).expect("valid commit");
        assert_eq!(author.time, "1710000000 +0000");

        let actual: Vec<_> = authors
            .iter()
            .map(|identity| (identity.name, identity.email))
            .collect();
        let expected = vec![
            (bstr("Main Author"), bstr("main@example.com")),
            (bstr("Second Author"), bstr("second@example.com")),
            (bstr("Third Author"), bstr("third@example.com")),
        ];
        assert_eq!(actual, expected);
    }

    /// A trailer whose value is not an identity is dropped, leaving only the commit author.
    #[test]
    fn commit_author_identities_skip_invalid_coauthors() {
        let commit = b"tree 1111111111111111111111111111111111111111\n\
author Main Author <main@example.com> 1710000000 +0000\n\
committer Main Author <main@example.com> 1710000000 +0000\n\
\n\
subject\n\
\n\
Co-authored-by: not a signature\n";
        let (_, authors) = commit_author_identities(commit).expect("valid commit");
        assert_eq!(authors.len(), 1);

        let only = &authors[0];
        assert_eq!(only.name, bstr("Main Author"));
        assert_eq!(only.email, bstr("main@example.com"));
    }
}