Skip to main content

gitoxide_core/hours/
mod.rs

1use std::{collections::BTreeSet, io, path::Path, time::Instant};
2
3use anyhow::bail;
4use gix::{
5    Count, NestedProgress, Progress,
6    actor::{Identity, IdentityRef},
7    bstr::{BStr, ByteSlice},
8    prelude::*,
9    progress,
10};
11use smallvec::{SmallVec, smallvec};
12
/// Options controlling how [`estimate()`] gathers its data and what it prints.
pub struct Context<W> {
    /// If `true`, leave out github bots, i.e. authors whose name contains the `[bot]` search string.
    pub ignore_bots: bool,
    /// If `true`, print per-author information ahead of the summary.
    /// This is personally identifiable information as it includes names and email addresses.
    pub show_pii: bool,
    /// If `true`, count how many files were added, removed and modified (without rename tracking).
    pub file_stats: bool,
    /// If `true`, count how many lines in files were added, removed and modified (without rename tracking).
    pub line_stats: bool,
    /// The number of threads to use. If unset, use all cores, if 0 use all physical cores.
    pub threads: Option<usize>,
    /// If `true`, do not unify identities by name and email, which can lead to the same author appearing
    /// multiple times due to using different names or email addresses.
    pub omit_unify_identities: bool,
    /// The stream receiving all of our output.
    pub out: W,
}
31
32pub struct SignatureRef<'a> {
33    name: &'a BStr,
34    email: &'a BStr,
35    time: gix::date::Time,
36}
37
38impl SignatureRef<'_> {
39    fn seconds(&self) -> gix::date::SecondsSinceUnixEpoch {
40        self.time.seconds
41    }
42}
43
44/// A parsed author identity that can either borrow from commit data or own its
45/// storage when trailer parsing had to synthesize/unfold the value first.
46///
47/// This is not a `Cow<IdentityRef<'a>>` because `IdentityRef<'a>` is itself a
48/// borrowed view, while the owned case here is a different type altogether:
49/// [`Identity`]. We keep this enum private so callers can use `name()` and
50/// `email()` without caring whether the identity is borrowed or owned, and so
51/// the common borrowed case stays allocation-free.
52enum ParsedIdentity<'a> {
53    Borrowed(IdentityRef<'a>),
54    Owned(Identity),
55}
56
57impl ParsedIdentity<'_> {
58    fn name(&self) -> &BStr {
59        match self {
60            ParsedIdentity::Borrowed(identity) => identity.name,
61            ParsedIdentity::Owned(identity) => identity.name.as_ref(),
62        }
63    }
64
65    fn email(&self) -> &BStr {
66        match self {
67            ParsedIdentity::Borrowed(identity) => identity.email,
68            ParsedIdentity::Owned(identity) => identity.email.as_ref(),
69        }
70    }
71}
72
73fn parse_trailer_identity(trailer: gix::objs::commit::message::body::TrailerRef<'_>) -> Option<ParsedIdentity<'_>> {
74    match trailer.value {
75        std::borrow::Cow::Borrowed(value) => IdentityRef::from_bytes(value.as_ref())
76            .ok()
77            .map(|identity| ParsedIdentity::Borrowed(identity.trim())),
78        std::borrow::Cow::Owned(value) => IdentityRef::from_bytes(value.as_ref())
79            .ok()
80            .map(|identity| ParsedIdentity::Owned(identity.trim().to_owned())),
81    }
82}
83
84/// Return `(commit_author, [commit_author, co_authors...])`. Use the `commit_author` for easy access to the commit author itself.
85fn commit_author_identities(
86    commit_data: &[u8],
87    object_hash: gix::hash::Kind,
88) -> Result<(gix::actor::SignatureRef<'_>, SmallVec<[ParsedIdentity<'_>; 2]>), gix::objs::decode::Error> {
89    let commit = gix::objs::CommitRef::from_bytes(commit_data, object_hash)?;
90    let author = commit.author()?.trim();
91    let mut authors = smallvec![ParsedIdentity::Borrowed(gix::actor::IdentityRef::from(author))];
92    authors.extend(commit.co_authored_by_trailers().filter_map(parse_trailer_identity));
93    Ok((author, authors))
94}
95
96/// Estimate the hours it takes to produce the content of the repository in `_working_dir_`, with `_refname_` for
97/// the start of the commit graph traversal.
98///
99/// * `_working_dir_` - The directory containing a '.git/' folder.
100/// * `_refname_` - The name of the ref like 'main' or 'master' at which to start iterating the commit graph.
101/// * `_progress_` - A way to provide progress and performance information
102pub fn estimate<W, P>(
103    working_dir: &Path,
104    rev_spec: &BStr,
105    mut progress: P,
106    Context {
107        show_pii,
108        ignore_bots,
109        file_stats,
110        line_stats,
111        omit_unify_identities,
112        threads,
113        mut out,
114    }: Context<W>,
115) -> anyhow::Result<()>
116where
117    W: io::Write,
118    P: NestedProgress,
119{
120    let repo = gix::discover(working_dir)?;
121    let commit_id = repo.rev_parse_single(rev_spec)?.detach();
122    let mut string_heap = BTreeSet::<&'static [u8]>::new();
123    let needs_stats = file_stats || line_stats;
124    let threads = gix::features::parallel::num_threads(threads);
125
126    let (commit_authors, stats, is_shallow, skipped_merge_commits, num_commits) = {
127        std::thread::scope(|scope| -> anyhow::Result<_> {
128            let start = Instant::now();
129            let (tx, rx) = std::sync::mpsc::channel::<(u32, Vec<u8>)>();
130            let mailmap = repo.open_mailmap();
131
132            let extract_signatures = scope.spawn(move || -> anyhow::Result<Vec<_>> {
133                let mut out = Vec::new();
134                for (commit_idx, commit_data) in rx {
135                    if let Ok((commit_author, authors)) = commit_author_identities(&commit_data, commit_id.kind()) {
136                        let mut string_ref = |s: &[u8]| -> &'static BStr {
137                            match string_heap.get(s) {
138                                Some(n) => n.as_bstr(),
139                                None => {
140                                    let sv: Vec<u8> = s.to_owned();
141                                    string_heap.insert(Box::leak(sv.into_boxed_slice()));
142                                    (*string_heap.get(s).expect("present")).as_ref()
143                                }
144                            }
145                        };
146                        let mut authors_for_commit = SmallVec::<[SignatureRef<'static>; 2]>::new();
147                        for identity in authors {
148                            let author = mailmap.resolve_cow(gix::actor::SignatureRef {
149                                name: identity.name(),
150                                email: identity.email(),
151                                time: commit_author.time,
152                            });
153                            let name = string_ref(author.name.as_ref());
154                            let email = string_ref(author.email.as_ref());
155                            if authors_for_commit
156                                .iter()
157                                .any(|existing| existing.name == name && existing.email == email)
158                            {
159                                continue;
160                            }
161                            authors_for_commit.push(SignatureRef {
162                                name,
163                                email,
164                                time: author.time,
165                            });
166                        }
167                        out.extend(authors_for_commit.into_iter().map(|author| (commit_idx, author)));
168                    }
169                }
170                out.shrink_to_fit();
171                out.sort_by(|a, b| {
172                    a.1.email
173                        .cmp(b.1.email)
174                        .then(a.1.seconds().cmp(&b.1.seconds()).reverse())
175                        .then(a.0.cmp(&b.0))
176                });
177                Ok(out)
178            });
179
180            let (stats_progresses, stats_counters) = if needs_stats {
181                {
182                    let mut sp = progress.add_child("extract stats");
183                    sp.init(None, progress::count("commits"));
184                    let sc = sp.counter();
185
186                    let mut cp = progress.add_child("find changes");
187                    cp.init(None, progress::count("modified files"));
188                    let cc = cp.counter();
189
190                    let mut lp = progress.add_child("find changes");
191                    lp.init(None, progress::count("diff lines"));
192                    let lc = lp.counter();
193
194                    (Some((sp, cp, lp)), Some((sc, cc, lc)))
195                }
196            } else {
197                Default::default()
198            };
199
200            let mut progress = progress.add_child("traverse commit graph");
201            progress.init(None, progress::count("commits"));
202
203            let (tx_tree_id, stat_threads) = if needs_stats {
204                {
205                    let (tx, threads) = spawn_tree_delta_threads(
206                        scope,
207                        threads,
208                        line_stats,
209                        repo.clone(),
210                        stats_counters.clone().expect("counters are set"),
211                    );
212                    (Some(tx), threads)
213                }
214            } else {
215                Default::default()
216            };
217
218            let mut commit_idx = 0_u32;
219            let mut skipped_merge_commits = 0;
220            const CHUNK_SIZE: usize = 50;
221            let mut chunk = Vec::with_capacity(CHUNK_SIZE);
222            let mut commit_iter = commit_id.ancestors(&repo.objects);
223            let mut is_shallow = false;
224            while let Some(c) = commit_iter.next() {
225                progress.inc();
226                if gix::interrupt::is_triggered() {
227                    bail!("Cancelled by user");
228                }
229                match c {
230                    Ok(c) => {
231                        tx.send((commit_idx, commit_iter.commit_data().to_owned())).ok();
232                        let tree_delta_info = tx_tree_id.as_ref().and_then(|tx| {
233                            let mut parents = c.parent_ids.into_iter();
234                            parents
235                                .next()
236                                .map(|first_parent| (tx, Some(first_parent), c.id.to_owned()))
237                                .filter(|_| {
238                                    if parents.next().is_some() {
239                                        skipped_merge_commits += 1;
240                                        false
241                                    } else {
242                                        true
243                                    }
244                                })
245                        });
246                        if let Some((tx_tree, first_parent, commit)) = tree_delta_info {
247                            if chunk.len() == CHUNK_SIZE {
248                                tx_tree
249                                    .send(std::mem::replace(&mut chunk, Vec::with_capacity(CHUNK_SIZE)))
250                                    .ok();
251                            } else {
252                                chunk.push((commit_idx, first_parent, commit));
253                            }
254                        }
255                        commit_idx += 1;
256                    }
257                    Err(gix::traverse::commit::simple::Error::Find { .. }) => {
258                        is_shallow = true;
259                        break;
260                    }
261                    Err(err) => return Err(err.into()),
262                }
263            }
264            if let Some(tx) = tx_tree_id {
265                tx.send(chunk).ok();
266            }
267            drop(tx);
268            progress.show_throughput(start);
269            drop(progress);
270
271            let stats_by_commit_idx = match stats_progresses {
272                Some((mut stat_progress, change_progress, line_progress)) => {
273                    stat_progress.set_max(Some(commit_idx as usize - skipped_merge_commits));
274                    let mut stats = Vec::new();
275                    for handle in stat_threads {
276                        stats.extend(handle.join().expect("no panic")?);
277                        if gix::interrupt::is_triggered() {
278                            bail!("Cancelled by user");
279                        }
280                    }
281                    stats.sort_by_key(|t| t.0);
282                    stat_progress.show_throughput(start);
283                    change_progress.show_throughput(start);
284                    line_progress.show_throughput(start);
285                    stats
286                }
287                None => Vec::new(),
288            };
289
290            Ok((
291                extract_signatures.join().expect("no panic")?,
292                stats_by_commit_idx,
293                is_shallow,
294                skipped_merge_commits,
295                commit_idx,
296            ))
297        })?
298    };
299
300    if commit_authors.is_empty() {
301        bail!("No commits to process");
302    }
303
304    let start = Instant::now();
305    let mut current_email = &commit_authors[0].1.email;
306    let mut slice_start = 0;
307    let mut results_by_hours = Vec::new();
308    let mut ignored_bot_commits = 0_u32;
309    let mut push_estimate = |commits: &[(u32, SignatureRef<'static>)]| {
310        let estimate = estimate_hours(commits, &stats);
311        if ignore_bots && estimate.name.contains_str(b"[bot]") {
312            ignored_bot_commits += estimate.num_commits;
313            return;
314        }
315        results_by_hours.push(estimate);
316    };
317    for (idx, (_, elm)) in commit_authors.iter().enumerate() {
318        if elm.email != *current_email {
319            push_estimate(&commit_authors[slice_start..idx]);
320            slice_start = idx;
321            current_email = &elm.email;
322        }
323    }
324    if let Some(commits) = commit_authors.get(slice_start..) {
325        push_estimate(commits);
326    }
327
328    let num_authors = results_by_hours.len();
329    let mut results_by_hours = if !omit_unify_identities {
330        deduplicate_identities(&results_by_hours)
331    } else {
332        results_by_hours
333            .iter()
334            .fold(Vec::with_capacity(results_by_hours.len()), |mut acc, e| {
335                acc.push(e.into());
336                acc
337            })
338    };
339    let elapsed = start.elapsed();
340    progress.done(format!(
341        "Extracted and organized data from {} commits in {:?} ({:0.0} commits/s)",
342        num_commits,
343        elapsed,
344        num_commits as f32 / elapsed.as_secs_f32()
345    ));
346
347    let num_unique_authors = results_by_hours.len();
348    let total_hours = results_by_hours.iter().map(|e| e.hours).sum::<f32>();
349    let included_commit_ids = commit_authors
350        .iter()
351        .filter(|(_, author)| !(ignore_bots && author.name.contains_str(b"[bot]")))
352        .map(|(commit_idx, _)| *commit_idx)
353        .collect::<BTreeSet<_>>();
354    let total_commits = included_commit_ids.len() as u32;
355    let (total_files, total_lines) = stats
356        .iter()
357        .filter(|(commit_idx, _, _)| included_commit_ids.contains(commit_idx))
358        .fold(
359            (FileStats::default(), LineStats::default()),
360            |mut acc, (_, files, lines)| {
361                acc.0.add(files);
362                acc.1.add(lines);
363                acc
364            },
365        );
366    if show_pii {
367        results_by_hours.sort_by(|a, b| a.hours.partial_cmp(&b.hours).unwrap_or(std::cmp::Ordering::Equal));
368        for entry in &results_by_hours {
369            entry.write_to(
370                total_hours,
371                file_stats.then_some(total_files),
372                line_stats.then_some(total_lines),
373                &mut out,
374            )?;
375            writeln!(out)?;
376        }
377    }
378    writeln!(
379        out,
380        "total hours: {:.02}\ntotal 8h days: {:.02}\ntotal commits = {}{}\ntotal authors: {}",
381        total_hours,
382        total_hours / HOURS_PER_WORKDAY,
383        total_commits,
384        if is_shallow { " (shallow)" } else { Default::default() },
385        num_authors
386    )?;
387    if file_stats {
388        writeln!(
389            out,
390            "total files added/removed/modified/remaining: {}/{}/{}/{}",
391            total_files.added,
392            total_files.removed,
393            total_files.modified,
394            total_files.added - total_files.removed
395        )?;
396    }
397    if line_stats {
398        writeln!(
399            out,
400            "total lines added/removed/remaining: {}/{}/{}",
401            total_lines.added,
402            total_lines.removed,
403            total_lines.added - total_lines.removed
404        )?;
405    }
406    if !omit_unify_identities {
407        writeln!(
408            out,
409            "total unique authors: {} ({:.02}% duplication)",
410            num_unique_authors,
411            (1.0 - (num_unique_authors as f32 / num_authors as f32)) * 100.0
412        )?;
413    }
414    if ignored_bot_commits != 0 {
415        writeln!(out, "commits by bots: {ignored_bot_commits}")?;
416    }
417    if needs_stats && skipped_merge_commits != 0 {
418        writeln!(out, "stats omitted for {skipped_merge_commits} merge commits")?;
419    }
420    debug_assert!(total_commits <= num_commits);
421    Ok(())
422}
423
424mod core;
425use self::core::{HOURS_PER_WORKDAY, deduplicate_identities, estimate_hours};
426
427mod util;
428use util::{CommitIdx, FileStats, LineStats, WorkByEmail, WorkByPerson};
429
430use crate::hours::core::spawn_tree_delta_threads;
431
#[cfg(test)]
mod tests {
    use gix::bstr::{BStr, ByteSlice};

    use super::commit_author_identities;

    /// View `text` as byte string to compare it with names and emails of parsed identities.
    fn bstr(text: &str) -> &BStr {
        text.as_bytes().as_bstr()
    }

    /// The commit author comes first, followed by each co-author in trailer order.
    #[test]
    fn commit_author_identities_include_coauthors() {
        let raw_commit = b"tree 1111111111111111111111111111111111111111\n\
author Main Author <main@example.com> 1710000000 +0000\n\
committer Main Author <main@example.com> 1710000000 +0000\n\
\n\
subject\n\
\n\
body\n\
\n\
Co-authored-by: Second Author <second@example.com>\n\
Co-authored-by: Third Author <third@example.com>\n";
        let (primary, identities) =
            commit_author_identities(raw_commit, gix::hash::Kind::Sha1).expect("valid commit");
        assert_eq!(primary.time, "1710000000 +0000");

        let actual: Vec<_> = identities.iter().map(|id| (id.name(), id.email())).collect();
        let expected = vec![
            (bstr("Main Author"), bstr("main@example.com")),
            (bstr("Second Author"), bstr("second@example.com")),
            (bstr("Third Author"), bstr("third@example.com")),
        ];
        assert_eq!(actual, expected);
    }

    /// A trailer whose value isn't a signature contributes no identity.
    #[test]
    fn commit_author_identities_skip_invalid_coauthors() {
        let raw_commit = b"tree 1111111111111111111111111111111111111111\n\
author Main Author <main@example.com> 1710000000 +0000\n\
committer Main Author <main@example.com> 1710000000 +0000\n\
\n\
subject\n\
\n\
Co-authored-by: not a signature\n";
        let (_, identities) = commit_author_identities(raw_commit, gix::hash::Kind::Sha1).expect("valid commit");
        assert_eq!(identities.len(), 1);

        let only = &identities[0];
        assert_eq!(only.name(), bstr("Main Author"));
        assert_eq!(only.email(), bstr("main@example.com"));
    }
}