Skip to main content

gitoxide_core/hours/
mod.rs

1use std::{collections::BTreeSet, io, path::Path, time::Instant};
2
3use anyhow::bail;
4use gix::{
5    actor::{Identity, IdentityRef},
6    bstr::{BStr, ByteSlice},
7    prelude::*,
8    progress, Count, NestedProgress, Progress,
9};
10use smallvec::{smallvec, SmallVec};
11
/// Options controlling how [`estimate()`] gathers and reports its numbers.
pub struct Context<W> {
    /// Leave out authors whose name contains the `[bot]` marker used by GitHub bots.
    pub ignore_bots: bool,
    /// Print per-author details (names and email addresses, i.e. personally identifiable
    /// information) ahead of the summary.
    pub show_pii: bool,
    /// Count added, removed and modified files. Renames are not tracked.
    pub file_stats: bool,
    /// Count added and removed lines within files. Renames are not tracked.
    pub line_stats: bool,
    /// Thread limit, handed to `gix::features::parallel::num_threads()`.
    /// If unset, use all cores, if 0 use all physical cores.
    pub threads: Option<usize>,
    /// Skip the step that merges identities by name and email. With this set, one person who
    /// committed under several names or addresses shows up once per variant.
    pub omit_unify_identities: bool,
    /// Destination for the generated report.
    pub out: W,
}
30
31pub struct SignatureRef<'a> {
32    name: &'a BStr,
33    email: &'a BStr,
34    time: gix::date::Time,
35}
36
37impl SignatureRef<'_> {
38    fn seconds(&self) -> gix::date::SecondsSinceUnixEpoch {
39        self.time.seconds
40    }
41}
42
43/// A parsed author identity that can either borrow from commit data or own its
44/// storage when trailer parsing had to synthesize/unfold the value first.
45///
46/// This is not a `Cow<IdentityRef<'a>>` because `IdentityRef<'a>` is itself a
47/// borrowed view, while the owned case here is a different type altogether:
48/// [`Identity`]. We keep this enum private so callers can use `name()` and
49/// `email()` without caring whether the identity is borrowed or owned, and so
50/// the common borrowed case stays allocation-free.
51enum ParsedIdentity<'a> {
52    Borrowed(IdentityRef<'a>),
53    Owned(Identity),
54}
55
56impl ParsedIdentity<'_> {
57    fn name(&self) -> &BStr {
58        match self {
59            ParsedIdentity::Borrowed(identity) => identity.name,
60            ParsedIdentity::Owned(identity) => identity.name.as_ref(),
61        }
62    }
63
64    fn email(&self) -> &BStr {
65        match self {
66            ParsedIdentity::Borrowed(identity) => identity.email,
67            ParsedIdentity::Owned(identity) => identity.email.as_ref(),
68        }
69    }
70}
71
72fn parse_trailer_identity(trailer: gix::objs::commit::message::body::TrailerRef<'_>) -> Option<ParsedIdentity<'_>> {
73    match trailer.value {
74        std::borrow::Cow::Borrowed(value) => IdentityRef::from_bytes(value.as_ref())
75            .ok()
76            .map(|identity| ParsedIdentity::Borrowed(identity.trim())),
77        std::borrow::Cow::Owned(value) => IdentityRef::from_bytes(value.as_ref())
78            .ok()
79            .map(|identity| ParsedIdentity::Owned(identity.trim().to_owned())),
80    }
81}
82
83/// Return `(commit_author, [commit_author, co_authors...])`. Use the `commit_author` for easy access to the commit author itself.
84fn commit_author_identities(
85    commit_data: &[u8],
86    hash_kind: gix::hash::Kind,
87) -> Result<(gix::actor::SignatureRef<'_>, SmallVec<[ParsedIdentity<'_>; 2]>), gix::objs::decode::Error> {
88    let commit = gix::objs::CommitRef::from_bytes(commit_data, hash_kind)?;
89    let author = commit.author()?.trim();
90    let mut authors = smallvec![ParsedIdentity::Borrowed(gix::actor::IdentityRef::from(author))];
91    authors.extend(commit.co_authored_by_trailers().filter_map(parse_trailer_identity));
92    Ok((author, authors))
93}
94
95/// Estimate the hours it takes to produce the content of the repository in `_working_dir_`, with `_refname_` for
96/// the start of the commit graph traversal.
97///
98/// * `_working_dir_` - The directory containing a '.git/' folder.
99/// * `_refname_` - The name of the ref like 'main' or 'master' at which to start iterating the commit graph.
100/// * `_progress_` - A way to provide progress and performance information
101pub fn estimate<W, P>(
102    working_dir: &Path,
103    rev_spec: &BStr,
104    mut progress: P,
105    Context {
106        show_pii,
107        ignore_bots,
108        file_stats,
109        line_stats,
110        omit_unify_identities,
111        threads,
112        mut out,
113    }: Context<W>,
114) -> anyhow::Result<()>
115where
116    W: io::Write,
117    P: NestedProgress,
118{
119    let repo = gix::discover(working_dir)?;
120    let commit_id = repo.rev_parse_single(rev_spec)?.detach();
121    let mut string_heap = BTreeSet::<&'static [u8]>::new();
122    let needs_stats = file_stats || line_stats;
123    let threads = gix::features::parallel::num_threads(threads);
124
125    let (commit_authors, stats, is_shallow, skipped_merge_commits, num_commits) = {
126        std::thread::scope(|scope| -> anyhow::Result<_> {
127            let start = Instant::now();
128            let (tx, rx) = std::sync::mpsc::channel::<(u32, Vec<u8>)>();
129            let mailmap = repo.open_mailmap();
130
131            let extract_signatures = scope.spawn(move || -> anyhow::Result<Vec<_>> {
132                let mut out = Vec::new();
133                for (commit_idx, commit_data) in rx {
134                    if let Ok((commit_author, authors)) = commit_author_identities(&commit_data, commit_id.kind()) {
135                        let mut string_ref = |s: &[u8]| -> &'static BStr {
136                            match string_heap.get(s) {
137                                Some(n) => n.as_bstr(),
138                                None => {
139                                    let sv: Vec<u8> = s.to_owned();
140                                    string_heap.insert(Box::leak(sv.into_boxed_slice()));
141                                    (*string_heap.get(s).expect("present")).as_ref()
142                                }
143                            }
144                        };
145                        let mut authors_for_commit = SmallVec::<[SignatureRef<'static>; 2]>::new();
146                        for identity in authors {
147                            let author = mailmap.resolve_cow(gix::actor::SignatureRef {
148                                name: identity.name(),
149                                email: identity.email(),
150                                time: commit_author.time,
151                            });
152                            let name = string_ref(author.name.as_ref());
153                            let email = string_ref(author.email.as_ref());
154                            if authors_for_commit
155                                .iter()
156                                .any(|existing| existing.name == name && existing.email == email)
157                            {
158                                continue;
159                            }
160                            authors_for_commit.push(SignatureRef {
161                                name,
162                                email,
163                                time: author.time,
164                            });
165                        }
166                        out.extend(authors_for_commit.into_iter().map(|author| (commit_idx, author)));
167                    }
168                }
169                out.shrink_to_fit();
170                out.sort_by(|a, b| {
171                    a.1.email
172                        .cmp(b.1.email)
173                        .then(a.1.seconds().cmp(&b.1.seconds()).reverse())
174                        .then(a.0.cmp(&b.0))
175                });
176                Ok(out)
177            });
178
179            let (stats_progresses, stats_counters) = if needs_stats {
180                {
181                    let mut sp = progress.add_child("extract stats");
182                    sp.init(None, progress::count("commits"));
183                    let sc = sp.counter();
184
185                    let mut cp = progress.add_child("find changes");
186                    cp.init(None, progress::count("modified files"));
187                    let cc = cp.counter();
188
189                    let mut lp = progress.add_child("find changes");
190                    lp.init(None, progress::count("diff lines"));
191                    let lc = lp.counter();
192
193                    (Some((sp, cp, lp)), Some((sc, cc, lc)))
194                }
195            } else {
196                Default::default()
197            };
198
199            let mut progress = progress.add_child("traverse commit graph");
200            progress.init(None, progress::count("commits"));
201
202            let (tx_tree_id, stat_threads) = if needs_stats {
203                {
204                    let (tx, threads) = spawn_tree_delta_threads(
205                        scope,
206                        threads,
207                        line_stats,
208                        repo.clone(),
209                        stats_counters.clone().expect("counters are set"),
210                    );
211                    (Some(tx), threads)
212                }
213            } else {
214                Default::default()
215            };
216
217            let mut commit_idx = 0_u32;
218            let mut skipped_merge_commits = 0;
219            const CHUNK_SIZE: usize = 50;
220            let mut chunk = Vec::with_capacity(CHUNK_SIZE);
221            let mut commit_iter = commit_id.ancestors(&repo.objects);
222            let mut is_shallow = false;
223            while let Some(c) = commit_iter.next() {
224                progress.inc();
225                if gix::interrupt::is_triggered() {
226                    bail!("Cancelled by user");
227                }
228                match c {
229                    Ok(c) => {
230                        tx.send((commit_idx, commit_iter.commit_data().to_owned())).ok();
231                        let tree_delta_info = tx_tree_id.as_ref().and_then(|tx| {
232                            let mut parents = c.parent_ids.into_iter();
233                            parents
234                                .next()
235                                .map(|first_parent| (tx, Some(first_parent), c.id.to_owned()))
236                                .filter(|_| {
237                                    if parents.next().is_some() {
238                                        skipped_merge_commits += 1;
239                                        false
240                                    } else {
241                                        true
242                                    }
243                                })
244                        });
245                        if let Some((tx_tree, first_parent, commit)) = tree_delta_info {
246                            if chunk.len() == CHUNK_SIZE {
247                                tx_tree
248                                    .send(std::mem::replace(&mut chunk, Vec::with_capacity(CHUNK_SIZE)))
249                                    .ok();
250                            } else {
251                                chunk.push((commit_idx, first_parent, commit));
252                            }
253                        }
254                        commit_idx += 1;
255                    }
256                    Err(gix::traverse::commit::simple::Error::Find { .. }) => {
257                        is_shallow = true;
258                        break;
259                    }
260                    Err(err) => return Err(err.into()),
261                }
262            }
263            if let Some(tx) = tx_tree_id {
264                tx.send(chunk).ok();
265            }
266            drop(tx);
267            progress.show_throughput(start);
268            drop(progress);
269
270            let stats_by_commit_idx = match stats_progresses {
271                Some((mut stat_progress, change_progress, line_progress)) => {
272                    stat_progress.set_max(Some(commit_idx as usize - skipped_merge_commits));
273                    let mut stats = Vec::new();
274                    for handle in stat_threads {
275                        stats.extend(handle.join().expect("no panic")?);
276                        if gix::interrupt::is_triggered() {
277                            bail!("Cancelled by user");
278                        }
279                    }
280                    stats.sort_by_key(|t| t.0);
281                    stat_progress.show_throughput(start);
282                    change_progress.show_throughput(start);
283                    line_progress.show_throughput(start);
284                    stats
285                }
286                None => Vec::new(),
287            };
288
289            Ok((
290                extract_signatures.join().expect("no panic")?,
291                stats_by_commit_idx,
292                is_shallow,
293                skipped_merge_commits,
294                commit_idx,
295            ))
296        })?
297    };
298
299    if commit_authors.is_empty() {
300        bail!("No commits to process");
301    }
302
303    let start = Instant::now();
304    let mut current_email = &commit_authors[0].1.email;
305    let mut slice_start = 0;
306    let mut results_by_hours = Vec::new();
307    let mut ignored_bot_commits = 0_u32;
308    let mut push_estimate = |commits: &[(u32, SignatureRef<'static>)]| {
309        let estimate = estimate_hours(commits, &stats);
310        if ignore_bots && estimate.name.contains_str(b"[bot]") {
311            ignored_bot_commits += estimate.num_commits;
312            return;
313        }
314        results_by_hours.push(estimate);
315    };
316    for (idx, (_, elm)) in commit_authors.iter().enumerate() {
317        if elm.email != *current_email {
318            push_estimate(&commit_authors[slice_start..idx]);
319            slice_start = idx;
320            current_email = &elm.email;
321        }
322    }
323    if let Some(commits) = commit_authors.get(slice_start..) {
324        push_estimate(commits);
325    }
326
327    let num_authors = results_by_hours.len();
328    let mut results_by_hours = if !omit_unify_identities {
329        deduplicate_identities(&results_by_hours)
330    } else {
331        results_by_hours
332            .iter()
333            .fold(Vec::with_capacity(results_by_hours.len()), |mut acc, e| {
334                acc.push(e.into());
335                acc
336            })
337    };
338    let elapsed = start.elapsed();
339    progress.done(format!(
340        "Extracted and organized data from {} commits in {:?} ({:0.0} commits/s)",
341        num_commits,
342        elapsed,
343        num_commits as f32 / elapsed.as_secs_f32()
344    ));
345
346    let num_unique_authors = results_by_hours.len();
347    let total_hours = results_by_hours.iter().map(|e| e.hours).sum::<f32>();
348    let included_commit_ids = commit_authors
349        .iter()
350        .filter(|(_, author)| !(ignore_bots && author.name.contains_str(b"[bot]")))
351        .map(|(commit_idx, _)| *commit_idx)
352        .collect::<BTreeSet<_>>();
353    let total_commits = included_commit_ids.len() as u32;
354    let (total_files, total_lines) = stats
355        .iter()
356        .filter(|(commit_idx, _, _)| included_commit_ids.contains(commit_idx))
357        .fold(
358            (FileStats::default(), LineStats::default()),
359            |mut acc, (_, files, lines)| {
360                acc.0.add(files);
361                acc.1.add(lines);
362                acc
363            },
364        );
365    if show_pii {
366        results_by_hours.sort_by(|a, b| a.hours.partial_cmp(&b.hours).unwrap_or(std::cmp::Ordering::Equal));
367        for entry in &results_by_hours {
368            entry.write_to(
369                total_hours,
370                file_stats.then_some(total_files),
371                line_stats.then_some(total_lines),
372                &mut out,
373            )?;
374            writeln!(out)?;
375        }
376    }
377    writeln!(
378        out,
379        "total hours: {:.02}\ntotal 8h days: {:.02}\ntotal commits = {}{}\ntotal authors: {}",
380        total_hours,
381        total_hours / HOURS_PER_WORKDAY,
382        total_commits,
383        if is_shallow { " (shallow)" } else { Default::default() },
384        num_authors
385    )?;
386    if file_stats {
387        writeln!(
388            out,
389            "total files added/removed/modified/remaining: {}/{}/{}/{}",
390            total_files.added,
391            total_files.removed,
392            total_files.modified,
393            total_files.added - total_files.removed
394        )?;
395    }
396    if line_stats {
397        writeln!(
398            out,
399            "total lines added/removed/remaining: {}/{}/{}",
400            total_lines.added,
401            total_lines.removed,
402            total_lines.added - total_lines.removed
403        )?;
404    }
405    if !omit_unify_identities {
406        writeln!(
407            out,
408            "total unique authors: {} ({:.02}% duplication)",
409            num_unique_authors,
410            (1.0 - (num_unique_authors as f32 / num_authors as f32)) * 100.0
411        )?;
412    }
413    if ignored_bot_commits != 0 {
414        writeln!(out, "commits by bots: {ignored_bot_commits}")?;
415    }
416    if needs_stats && skipped_merge_commits != 0 {
417        writeln!(out, "stats omitted for {skipped_merge_commits} merge commits")?;
418    }
419    debug_assert!(total_commits <= num_commits);
420    Ok(())
421}
422
423mod core;
424use self::core::{deduplicate_identities, estimate_hours, HOURS_PER_WORKDAY};
425
426mod util;
427use util::{CommitIdx, FileStats, LineStats, WorkByEmail, WorkByPerson};
428
429use crate::hours::core::spawn_tree_delta_threads;
430
#[cfg(test)]
mod tests {
    use gix::bstr::ByteSlice;

    use super::commit_author_identities;

    /// The commit author comes first, followed by all co-authors in trailer order.
    #[test]
    fn commit_author_identities_include_coauthors() {
        let commit = b"tree 1111111111111111111111111111111111111111\n\
author Main Author <main@example.com> 1710000000 +0000\n\
committer Main Author <main@example.com> 1710000000 +0000\n\
\n\
subject\n\
\n\
body\n\
\n\
Co-authored-by: Second Author <second@example.com>\n\
Co-authored-by: Third Author <third@example.com>\n";
        let (author, authors) = commit_author_identities(commit, gix::hash::Kind::Sha1).expect("valid commit");
        assert_eq!(author.time, "1710000000 +0000");

        let actual: Vec<_> = authors
            .iter()
            .map(|identity| (identity.name(), identity.email()))
            .collect();
        let expected: Vec<_> = [
            ("Main Author", "main@example.com"),
            ("Second Author", "second@example.com"),
            ("Third Author", "third@example.com"),
        ]
        .iter()
        .map(|&(name, email)| (name.as_bytes().as_bstr(), email.as_bytes().as_bstr()))
        .collect();
        assert_eq!(actual, expected);
    }

    /// A trailer whose value is not an identity is dropped, leaving only the commit author.
    #[test]
    fn commit_author_identities_skip_invalid_coauthors() {
        let commit = b"tree 1111111111111111111111111111111111111111\n\
author Main Author <main@example.com> 1710000000 +0000\n\
committer Main Author <main@example.com> 1710000000 +0000\n\
\n\
subject\n\
\n\
Co-authored-by: not a signature\n";
        let (_, authors) = commit_author_identities(commit, gix::hash::Kind::Sha1).expect("valid commit");
        assert_eq!(authors.len(), 1);

        let only = &authors[0];
        assert_eq!(only.name(), "Main Author".as_bytes().as_bstr());
        assert_eq!(only.email(), "main@example.com".as_bytes().as_bstr());
    }
}
487}