Skip to main content

gitoxide_core/hours/
mod.rs

1use std::{collections::BTreeSet, io, path::Path, time::Instant};
2
3use anyhow::bail;
4use gix::{
5    actor::{Identity, IdentityRef},
6    bstr::{BStr, ByteSlice},
7    prelude::*,
8    progress, Count, NestedProgress, Progress,
9};
10use smallvec::{smallvec, SmallVec};
11
/// Options that control how [`estimate()`] gathers data and what it reports.
pub struct Context<W> {
    /// If `true`, leave out authors whose name contains the `[bot]` search string, as used by GitHub bots.
    pub ignore_bots: bool,
    /// If `true`, print per-author information, which includes names and email addresses (personally
    /// identifiable information), ahead of the summary.
    pub show_pii: bool,
    /// If `true`, count how many files were added, removed and modified (without rename tracking).
    pub file_stats: bool,
    /// If `true`, count how many lines were added, removed and modified in files (without rename tracking).
    pub line_stats: bool,
    /// How many threads to use. If unset, use all cores, if 0 use all physical cores.
    pub threads: Option<usize>,
    /// If `true`, do not unify identities by name and email, which can lead to the same author appearing
    /// multiple times due to the use of different names or email addresses.
    pub omit_unify_identities: bool,
    /// The destination that receives the textual report.
    pub out: W,
}
30
31pub struct SignatureRef<'a> {
32    name: &'a BStr,
33    email: &'a BStr,
34    time: gix::date::Time,
35}
36
37impl SignatureRef<'_> {
38    fn seconds(&self) -> gix::date::SecondsSinceUnixEpoch {
39        self.time.seconds
40    }
41}
42
43/// A parsed author identity that can either borrow from commit data or own its
44/// storage when trailer parsing had to synthesize/unfold the value first.
45///
46/// This is not a `Cow<IdentityRef<'a>>` because `IdentityRef<'a>` is itself a
47/// borrowed view, while the owned case here is a different type altogether:
48/// [`Identity`]. We keep this enum private so callers can use `name()` and
49/// `email()` without caring whether the identity is borrowed or owned, and so
50/// the common borrowed case stays allocation-free.
51enum ParsedIdentity<'a> {
52    Borrowed(IdentityRef<'a>),
53    Owned(Identity),
54}
55
56impl ParsedIdentity<'_> {
57    fn name(&self) -> &BStr {
58        match self {
59            ParsedIdentity::Borrowed(identity) => identity.name,
60            ParsedIdentity::Owned(identity) => identity.name.as_ref(),
61        }
62    }
63
64    fn email(&self) -> &BStr {
65        match self {
66            ParsedIdentity::Borrowed(identity) => identity.email,
67            ParsedIdentity::Owned(identity) => identity.email.as_ref(),
68        }
69    }
70}
71
72fn parse_trailer_identity(trailer: gix::objs::commit::message::body::TrailerRef<'_>) -> Option<ParsedIdentity<'_>> {
73    match trailer.value {
74        std::borrow::Cow::Borrowed(value) => IdentityRef::from_bytes::<gix::objs::decode::ParseError>(value.as_ref())
75            .ok()
76            .map(|identity| ParsedIdentity::Borrowed(identity.trim())),
77        std::borrow::Cow::Owned(value) => IdentityRef::from_bytes::<gix::objs::decode::ParseError>(value.as_ref())
78            .ok()
79            .map(|identity| ParsedIdentity::Owned(identity.trim().to_owned())),
80    }
81}
82
83/// Return `(commit_author, [commit_author, co_authors...])`. Use the `commit_author` for easy access to the commit author itself.
84fn commit_author_identities(
85    commit_data: &[u8],
86) -> Result<(gix::actor::SignatureRef<'_>, SmallVec<[ParsedIdentity<'_>; 2]>), gix::objs::decode::Error> {
87    let commit = gix::objs::CommitRef::from_bytes(commit_data)?;
88    let author = commit.author()?.trim();
89    let mut authors = smallvec![ParsedIdentity::Borrowed(gix::actor::IdentityRef::from(author))];
90    authors.extend(commit.co_authored_by_trailers().filter_map(parse_trailer_identity));
91    Ok((author, authors))
92}
93
94/// Estimate the hours it takes to produce the content of the repository in `_working_dir_`, with `_refname_` for
95/// the start of the commit graph traversal.
96///
97/// * `_working_dir_` - The directory containing a '.git/' folder.
98/// * `_refname_` - The name of the ref like 'main' or 'master' at which to start iterating the commit graph.
99/// * `_progress_` - A way to provide progress and performance information
100pub fn estimate<W, P>(
101    working_dir: &Path,
102    rev_spec: &BStr,
103    mut progress: P,
104    Context {
105        show_pii,
106        ignore_bots,
107        file_stats,
108        line_stats,
109        omit_unify_identities,
110        threads,
111        mut out,
112    }: Context<W>,
113) -> anyhow::Result<()>
114where
115    W: io::Write,
116    P: NestedProgress,
117{
118    let repo = gix::discover(working_dir)?;
119    let commit_id = repo.rev_parse_single(rev_spec)?.detach();
120    let mut string_heap = BTreeSet::<&'static [u8]>::new();
121    let needs_stats = file_stats || line_stats;
122    let threads = gix::features::parallel::num_threads(threads);
123
124    let (commit_authors, stats, is_shallow, skipped_merge_commits, num_commits) = {
125        std::thread::scope(|scope| -> anyhow::Result<_> {
126            let start = Instant::now();
127            let (tx, rx) = std::sync::mpsc::channel::<(u32, Vec<u8>)>();
128            let mailmap = repo.open_mailmap();
129
130            let extract_signatures = scope.spawn(move || -> anyhow::Result<Vec<_>> {
131                let mut out = Vec::new();
132                for (commit_idx, commit_data) in rx {
133                    if let Ok((commit_author, authors)) = commit_author_identities(&commit_data) {
134                        let mut string_ref = |s: &[u8]| -> &'static BStr {
135                            match string_heap.get(s) {
136                                Some(n) => n.as_bstr(),
137                                None => {
138                                    let sv: Vec<u8> = s.to_owned();
139                                    string_heap.insert(Box::leak(sv.into_boxed_slice()));
140                                    (*string_heap.get(s).expect("present")).as_ref()
141                                }
142                            }
143                        };
144                        let mut authors_for_commit = SmallVec::<[SignatureRef<'static>; 2]>::new();
145                        for identity in authors {
146                            let author = mailmap.resolve_cow(gix::actor::SignatureRef {
147                                name: identity.name(),
148                                email: identity.email(),
149                                time: commit_author.time,
150                            });
151                            let name = string_ref(author.name.as_ref());
152                            let email = string_ref(author.email.as_ref());
153                            if authors_for_commit
154                                .iter()
155                                .any(|existing| existing.name == name && existing.email == email)
156                            {
157                                continue;
158                            }
159                            authors_for_commit.push(SignatureRef {
160                                name,
161                                email,
162                                time: author.time,
163                            });
164                        }
165                        out.extend(authors_for_commit.into_iter().map(|author| (commit_idx, author)));
166                    }
167                }
168                out.shrink_to_fit();
169                out.sort_by(|a, b| {
170                    a.1.email
171                        .cmp(b.1.email)
172                        .then(a.1.seconds().cmp(&b.1.seconds()).reverse())
173                        .then(a.0.cmp(&b.0))
174                });
175                Ok(out)
176            });
177
178            let (stats_progresses, stats_counters) = if needs_stats {
179                {
180                    let mut sp = progress.add_child("extract stats");
181                    sp.init(None, progress::count("commits"));
182                    let sc = sp.counter();
183
184                    let mut cp = progress.add_child("find changes");
185                    cp.init(None, progress::count("modified files"));
186                    let cc = cp.counter();
187
188                    let mut lp = progress.add_child("find changes");
189                    lp.init(None, progress::count("diff lines"));
190                    let lc = lp.counter();
191
192                    (Some((sp, cp, lp)), Some((sc, cc, lc)))
193                }
194            } else {
195                Default::default()
196            };
197
198            let mut progress = progress.add_child("traverse commit graph");
199            progress.init(None, progress::count("commits"));
200
201            let (tx_tree_id, stat_threads) = if needs_stats {
202                {
203                    let (tx, threads) = spawn_tree_delta_threads(
204                        scope,
205                        threads,
206                        line_stats,
207                        repo.clone(),
208                        stats_counters.clone().expect("counters are set"),
209                    );
210                    (Some(tx), threads)
211                }
212            } else {
213                Default::default()
214            };
215
216            let mut commit_idx = 0_u32;
217            let mut skipped_merge_commits = 0;
218            const CHUNK_SIZE: usize = 50;
219            let mut chunk = Vec::with_capacity(CHUNK_SIZE);
220            let mut commit_iter = commit_id.ancestors(&repo.objects);
221            let mut is_shallow = false;
222            while let Some(c) = commit_iter.next() {
223                progress.inc();
224                if gix::interrupt::is_triggered() {
225                    bail!("Cancelled by user");
226                }
227                match c {
228                    Ok(c) => {
229                        tx.send((commit_idx, commit_iter.commit_data().to_owned())).ok();
230                        let tree_delta_info = tx_tree_id.as_ref().and_then(|tx| {
231                            let mut parents = c.parent_ids.into_iter();
232                            parents
233                                .next()
234                                .map(|first_parent| (tx, Some(first_parent), c.id.to_owned()))
235                                .filter(|_| {
236                                    if parents.next().is_some() {
237                                        skipped_merge_commits += 1;
238                                        false
239                                    } else {
240                                        true
241                                    }
242                                })
243                        });
244                        if let Some((tx_tree, first_parent, commit)) = tree_delta_info {
245                            if chunk.len() == CHUNK_SIZE {
246                                tx_tree
247                                    .send(std::mem::replace(&mut chunk, Vec::with_capacity(CHUNK_SIZE)))
248                                    .ok();
249                            } else {
250                                chunk.push((commit_idx, first_parent, commit));
251                            }
252                        }
253                        commit_idx += 1;
254                    }
255                    Err(gix::traverse::commit::simple::Error::Find { .. }) => {
256                        is_shallow = true;
257                        break;
258                    }
259                    Err(err) => return Err(err.into()),
260                }
261            }
262            if let Some(tx) = tx_tree_id {
263                tx.send(chunk).ok();
264            }
265            drop(tx);
266            progress.show_throughput(start);
267            drop(progress);
268
269            let stats_by_commit_idx = match stats_progresses {
270                Some((mut stat_progress, change_progress, line_progress)) => {
271                    stat_progress.set_max(Some(commit_idx as usize - skipped_merge_commits));
272                    let mut stats = Vec::new();
273                    for handle in stat_threads {
274                        stats.extend(handle.join().expect("no panic")?);
275                        if gix::interrupt::is_triggered() {
276                            bail!("Cancelled by user");
277                        }
278                    }
279                    stats.sort_by_key(|t| t.0);
280                    stat_progress.show_throughput(start);
281                    change_progress.show_throughput(start);
282                    line_progress.show_throughput(start);
283                    stats
284                }
285                None => Vec::new(),
286            };
287
288            Ok((
289                extract_signatures.join().expect("no panic")?,
290                stats_by_commit_idx,
291                is_shallow,
292                skipped_merge_commits,
293                commit_idx,
294            ))
295        })?
296    };
297
298    if commit_authors.is_empty() {
299        bail!("No commits to process");
300    }
301
302    let start = Instant::now();
303    let mut current_email = &commit_authors[0].1.email;
304    let mut slice_start = 0;
305    let mut results_by_hours = Vec::new();
306    let mut ignored_bot_commits = 0_u32;
307    let mut push_estimate = |commits: &[(u32, SignatureRef<'static>)]| {
308        let estimate = estimate_hours(commits, &stats);
309        if ignore_bots && estimate.name.contains_str(b"[bot]") {
310            ignored_bot_commits += estimate.num_commits;
311            return;
312        }
313        results_by_hours.push(estimate);
314    };
315    for (idx, (_, elm)) in commit_authors.iter().enumerate() {
316        if elm.email != *current_email {
317            push_estimate(&commit_authors[slice_start..idx]);
318            slice_start = idx;
319            current_email = &elm.email;
320        }
321    }
322    if let Some(commits) = commit_authors.get(slice_start..) {
323        push_estimate(commits);
324    }
325
326    let num_authors = results_by_hours.len();
327    let mut results_by_hours = if !omit_unify_identities {
328        deduplicate_identities(&results_by_hours)
329    } else {
330        results_by_hours
331            .iter()
332            .fold(Vec::with_capacity(results_by_hours.len()), |mut acc, e| {
333                acc.push(e.into());
334                acc
335            })
336    };
337    let elapsed = start.elapsed();
338    progress.done(format!(
339        "Extracted and organized data from {} commits in {:?} ({:0.0} commits/s)",
340        num_commits,
341        elapsed,
342        num_commits as f32 / elapsed.as_secs_f32()
343    ));
344
345    let num_unique_authors = results_by_hours.len();
346    let total_hours = results_by_hours.iter().map(|e| e.hours).sum::<f32>();
347    let included_commit_ids = commit_authors
348        .iter()
349        .filter(|(_, author)| !(ignore_bots && author.name.contains_str(b"[bot]")))
350        .map(|(commit_idx, _)| *commit_idx)
351        .collect::<BTreeSet<_>>();
352    let total_commits = included_commit_ids.len() as u32;
353    let (total_files, total_lines) = stats
354        .iter()
355        .filter(|(commit_idx, _, _)| included_commit_ids.contains(commit_idx))
356        .fold(
357            (FileStats::default(), LineStats::default()),
358            |mut acc, (_, files, lines)| {
359                acc.0.add(files);
360                acc.1.add(lines);
361                acc
362            },
363        );
364    if show_pii {
365        results_by_hours.sort_by(|a, b| a.hours.partial_cmp(&b.hours).unwrap_or(std::cmp::Ordering::Equal));
366        for entry in &results_by_hours {
367            entry.write_to(
368                total_hours,
369                file_stats.then_some(total_files),
370                line_stats.then_some(total_lines),
371                &mut out,
372            )?;
373            writeln!(out)?;
374        }
375    }
376    writeln!(
377        out,
378        "total hours: {:.02}\ntotal 8h days: {:.02}\ntotal commits = {}{}\ntotal authors: {}",
379        total_hours,
380        total_hours / HOURS_PER_WORKDAY,
381        total_commits,
382        if is_shallow { " (shallow)" } else { Default::default() },
383        num_authors
384    )?;
385    if file_stats {
386        writeln!(
387            out,
388            "total files added/removed/modified/remaining: {}/{}/{}/{}",
389            total_files.added,
390            total_files.removed,
391            total_files.modified,
392            total_files.added - total_files.removed
393        )?;
394    }
395    if line_stats {
396        writeln!(
397            out,
398            "total lines added/removed/remaining: {}/{}/{}",
399            total_lines.added,
400            total_lines.removed,
401            total_lines.added - total_lines.removed
402        )?;
403    }
404    if !omit_unify_identities {
405        writeln!(
406            out,
407            "total unique authors: {} ({:.02}% duplication)",
408            num_unique_authors,
409            (1.0 - (num_unique_authors as f32 / num_authors as f32)) * 100.0
410        )?;
411    }
412    if ignored_bot_commits != 0 {
413        writeln!(out, "commits by bots: {ignored_bot_commits}")?;
414    }
415    if needs_stats && skipped_merge_commits != 0 {
416        writeln!(out, "stats omitted for {skipped_merge_commits} merge commits")?;
417    }
418    debug_assert!(total_commits <= num_commits);
419    Ok(())
420}
421
422mod core;
423use self::core::{deduplicate_identities, estimate_hours, HOURS_PER_WORKDAY};
424
425mod util;
426use util::{CommitIdx, FileStats, LineStats, WorkByEmail, WorkByPerson};
427
428use crate::hours::core::spawn_tree_delta_threads;
429
#[cfg(test)]
mod tests {
    use gix::bstr::{BStr, ByteSlice};

    use super::commit_author_identities;

    /// View `text` as `&BStr` to make comparisons with parsed names and emails concise.
    fn bstr(text: &str) -> &BStr {
        text.as_bytes().as_bstr()
    }

    /// The commit author comes first, followed by all co-authors in trailer order.
    #[test]
    fn commit_author_identities_include_coauthors() {
        let commit = b"tree 1111111111111111111111111111111111111111\n\
author Main Author <main@example.com> 1710000000 +0000\n\
committer Main Author <main@example.com> 1710000000 +0000\n\
\n\
subject\n\
\n\
body\n\
\n\
Co-authored-by: Second Author <second@example.com>\n\
Co-authored-by: Third Author <third@example.com>\n";
        let (author, authors) = commit_author_identities(commit).expect("valid commit");
        assert_eq!(author.time, "1710000000 +0000");

        let actual: Vec<_> = authors
            .iter()
            .map(|identity| (identity.name(), identity.email()))
            .collect();
        let expected = vec![
            (bstr("Main Author"), bstr("main@example.com")),
            (bstr("Second Author"), bstr("second@example.com")),
            (bstr("Third Author"), bstr("third@example.com")),
        ];
        assert_eq!(actual, expected);
    }

    /// A co-author trailer that is not a valid identity is ignored, leaving only the commit author.
    #[test]
    fn commit_author_identities_skip_invalid_coauthors() {
        let commit = b"tree 1111111111111111111111111111111111111111\n\
author Main Author <main@example.com> 1710000000 +0000\n\
committer Main Author <main@example.com> 1710000000 +0000\n\
\n\
subject\n\
\n\
Co-authored-by: not a signature\n";
        let (_, authors) = commit_author_identities(commit).expect("valid commit");
        assert_eq!(authors.len(), 1);

        let only_author = &authors[0];
        assert_eq!(only_author.name(), bstr("Main Author"));
        assert_eq!(only_author.email(), bstr("main@example.com"));
    }
}