1use std::{collections::BTreeSet, io, path::Path, time::Instant};
2
3use anyhow::bail;
4use gix::{
5 actor::{Identity, IdentityRef},
6 bstr::{BStr, ByteSlice},
7 prelude::*,
8 progress, Count, NestedProgress, Progress,
9};
10use smallvec::{smallvec, SmallVec};
11
/// Configuration and output sink for an hours-estimation run.
pub struct Context<W> {
    /// If `true`, commits whose (resolved) author name contains `[bot]` are
    /// excluded from the estimate and counted separately.
    pub ignore_bots: bool,
    /// If `true`, print a per-author breakdown (names and email addresses,
    /// i.e. personally identifiable information) before the totals.
    pub show_pii: bool,
    /// Collect and print file statistics (files added/removed/modified).
    pub file_stats: bool,
    /// Collect and print line statistics (lines added/removed).
    pub line_stats: bool,
    /// The desired number of worker threads; `None` picks a default via
    /// `gix::features::parallel::num_threads`.
    pub threads: Option<usize>,
    /// If `true`, skip the identity-deduplication pass and report authors
    /// exactly as grouped by email.
    pub omit_unify_identities: bool,
    /// Where the final report is written.
    pub out: W,
}
30
/// A mailmap-resolved author signature whose name and email borrow from a
/// shared string heap (in practice used with `'a = 'static` after interning).
pub struct SignatureRef<'a> {
    name: &'a BStr,
    email: &'a BStr,
    // Commit time of the commit this signature was taken from.
    time: gix::date::Time,
}
36
37impl SignatureRef<'_> {
38 fn seconds(&self) -> gix::date::SecondsSinceUnixEpoch {
39 self.time.seconds
40 }
41}
42
/// An author identity parsed from a commit, either borrowing directly from the
/// commit buffer or owned when the trailer value had to be allocated.
enum ParsedIdentity<'a> {
    /// Borrows name and email from the underlying commit data.
    Borrowed(IdentityRef<'a>),
    /// Owns name and email (used when the trailer value was a `Cow::Owned`).
    Owned(Identity),
}
55
56impl ParsedIdentity<'_> {
57 fn name(&self) -> &BStr {
58 match self {
59 ParsedIdentity::Borrowed(identity) => identity.name,
60 ParsedIdentity::Owned(identity) => identity.name.as_ref(),
61 }
62 }
63
64 fn email(&self) -> &BStr {
65 match self {
66 ParsedIdentity::Borrowed(identity) => identity.email,
67 ParsedIdentity::Owned(identity) => identity.email.as_ref(),
68 }
69 }
70}
71
72fn parse_trailer_identity(trailer: gix::objs::commit::message::body::TrailerRef<'_>) -> Option<ParsedIdentity<'_>> {
73 match trailer.value {
74 std::borrow::Cow::Borrowed(value) => IdentityRef::from_bytes(value.as_ref())
75 .ok()
76 .map(|identity| ParsedIdentity::Borrowed(identity.trim())),
77 std::borrow::Cow::Owned(value) => IdentityRef::from_bytes(value.as_ref())
78 .ok()
79 .map(|identity| ParsedIdentity::Owned(identity.trim().to_owned())),
80 }
81}
82
83fn commit_author_identities(
85 commit_data: &[u8],
86 hash_kind: gix::hash::Kind,
87) -> Result<(gix::actor::SignatureRef<'_>, SmallVec<[ParsedIdentity<'_>; 2]>), gix::objs::decode::Error> {
88 let commit = gix::objs::CommitRef::from_bytes(commit_data, hash_kind)?;
89 let author = commit.author()?.trim();
90 let mut authors = smallvec![ParsedIdentity::Borrowed(gix::actor::IdentityRef::from(author))];
91 authors.extend(commit.co_authored_by_trailers().filter_map(parse_trailer_identity));
92 Ok((author, authors))
93}
94
95pub fn estimate<W, P>(
102 working_dir: &Path,
103 rev_spec: &BStr,
104 mut progress: P,
105 Context {
106 show_pii,
107 ignore_bots,
108 file_stats,
109 line_stats,
110 omit_unify_identities,
111 threads,
112 mut out,
113 }: Context<W>,
114) -> anyhow::Result<()>
115where
116 W: io::Write,
117 P: NestedProgress,
118{
119 let repo = gix::discover(working_dir)?;
120 let commit_id = repo.rev_parse_single(rev_spec)?.detach();
121 let mut string_heap = BTreeSet::<&'static [u8]>::new();
122 let needs_stats = file_stats || line_stats;
123 let threads = gix::features::parallel::num_threads(threads);
124
125 let (commit_authors, stats, is_shallow, skipped_merge_commits, num_commits) = {
126 std::thread::scope(|scope| -> anyhow::Result<_> {
127 let start = Instant::now();
128 let (tx, rx) = std::sync::mpsc::channel::<(u32, Vec<u8>)>();
129 let mailmap = repo.open_mailmap();
130
131 let extract_signatures = scope.spawn(move || -> anyhow::Result<Vec<_>> {
132 let mut out = Vec::new();
133 for (commit_idx, commit_data) in rx {
134 if let Ok((commit_author, authors)) = commit_author_identities(&commit_data, commit_id.kind()) {
135 let mut string_ref = |s: &[u8]| -> &'static BStr {
136 match string_heap.get(s) {
137 Some(n) => n.as_bstr(),
138 None => {
139 let sv: Vec<u8> = s.to_owned();
140 string_heap.insert(Box::leak(sv.into_boxed_slice()));
141 (*string_heap.get(s).expect("present")).as_ref()
142 }
143 }
144 };
145 let mut authors_for_commit = SmallVec::<[SignatureRef<'static>; 2]>::new();
146 for identity in authors {
147 let author = mailmap.resolve_cow(gix::actor::SignatureRef {
148 name: identity.name(),
149 email: identity.email(),
150 time: commit_author.time,
151 });
152 let name = string_ref(author.name.as_ref());
153 let email = string_ref(author.email.as_ref());
154 if authors_for_commit
155 .iter()
156 .any(|existing| existing.name == name && existing.email == email)
157 {
158 continue;
159 }
160 authors_for_commit.push(SignatureRef {
161 name,
162 email,
163 time: author.time,
164 });
165 }
166 out.extend(authors_for_commit.into_iter().map(|author| (commit_idx, author)));
167 }
168 }
169 out.shrink_to_fit();
170 out.sort_by(|a, b| {
171 a.1.email
172 .cmp(b.1.email)
173 .then(a.1.seconds().cmp(&b.1.seconds()).reverse())
174 .then(a.0.cmp(&b.0))
175 });
176 Ok(out)
177 });
178
179 let (stats_progresses, stats_counters) = if needs_stats {
180 {
181 let mut sp = progress.add_child("extract stats");
182 sp.init(None, progress::count("commits"));
183 let sc = sp.counter();
184
185 let mut cp = progress.add_child("find changes");
186 cp.init(None, progress::count("modified files"));
187 let cc = cp.counter();
188
189 let mut lp = progress.add_child("find changes");
190 lp.init(None, progress::count("diff lines"));
191 let lc = lp.counter();
192
193 (Some((sp, cp, lp)), Some((sc, cc, lc)))
194 }
195 } else {
196 Default::default()
197 };
198
199 let mut progress = progress.add_child("traverse commit graph");
200 progress.init(None, progress::count("commits"));
201
202 let (tx_tree_id, stat_threads) = if needs_stats {
203 {
204 let (tx, threads) = spawn_tree_delta_threads(
205 scope,
206 threads,
207 line_stats,
208 repo.clone(),
209 stats_counters.clone().expect("counters are set"),
210 );
211 (Some(tx), threads)
212 }
213 } else {
214 Default::default()
215 };
216
217 let mut commit_idx = 0_u32;
218 let mut skipped_merge_commits = 0;
219 const CHUNK_SIZE: usize = 50;
220 let mut chunk = Vec::with_capacity(CHUNK_SIZE);
221 let mut commit_iter = commit_id.ancestors(&repo.objects);
222 let mut is_shallow = false;
223 while let Some(c) = commit_iter.next() {
224 progress.inc();
225 if gix::interrupt::is_triggered() {
226 bail!("Cancelled by user");
227 }
228 match c {
229 Ok(c) => {
230 tx.send((commit_idx, commit_iter.commit_data().to_owned())).ok();
231 let tree_delta_info = tx_tree_id.as_ref().and_then(|tx| {
232 let mut parents = c.parent_ids.into_iter();
233 parents
234 .next()
235 .map(|first_parent| (tx, Some(first_parent), c.id.to_owned()))
236 .filter(|_| {
237 if parents.next().is_some() {
238 skipped_merge_commits += 1;
239 false
240 } else {
241 true
242 }
243 })
244 });
245 if let Some((tx_tree, first_parent, commit)) = tree_delta_info {
246 if chunk.len() == CHUNK_SIZE {
247 tx_tree
248 .send(std::mem::replace(&mut chunk, Vec::with_capacity(CHUNK_SIZE)))
249 .ok();
250 } else {
251 chunk.push((commit_idx, first_parent, commit));
252 }
253 }
254 commit_idx += 1;
255 }
256 Err(gix::traverse::commit::simple::Error::Find { .. }) => {
257 is_shallow = true;
258 break;
259 }
260 Err(err) => return Err(err.into()),
261 }
262 }
263 if let Some(tx) = tx_tree_id {
264 tx.send(chunk).ok();
265 }
266 drop(tx);
267 progress.show_throughput(start);
268 drop(progress);
269
270 let stats_by_commit_idx = match stats_progresses {
271 Some((mut stat_progress, change_progress, line_progress)) => {
272 stat_progress.set_max(Some(commit_idx as usize - skipped_merge_commits));
273 let mut stats = Vec::new();
274 for handle in stat_threads {
275 stats.extend(handle.join().expect("no panic")?);
276 if gix::interrupt::is_triggered() {
277 bail!("Cancelled by user");
278 }
279 }
280 stats.sort_by_key(|t| t.0);
281 stat_progress.show_throughput(start);
282 change_progress.show_throughput(start);
283 line_progress.show_throughput(start);
284 stats
285 }
286 None => Vec::new(),
287 };
288
289 Ok((
290 extract_signatures.join().expect("no panic")?,
291 stats_by_commit_idx,
292 is_shallow,
293 skipped_merge_commits,
294 commit_idx,
295 ))
296 })?
297 };
298
299 if commit_authors.is_empty() {
300 bail!("No commits to process");
301 }
302
303 let start = Instant::now();
304 let mut current_email = &commit_authors[0].1.email;
305 let mut slice_start = 0;
306 let mut results_by_hours = Vec::new();
307 let mut ignored_bot_commits = 0_u32;
308 let mut push_estimate = |commits: &[(u32, SignatureRef<'static>)]| {
309 let estimate = estimate_hours(commits, &stats);
310 if ignore_bots && estimate.name.contains_str(b"[bot]") {
311 ignored_bot_commits += estimate.num_commits;
312 return;
313 }
314 results_by_hours.push(estimate);
315 };
316 for (idx, (_, elm)) in commit_authors.iter().enumerate() {
317 if elm.email != *current_email {
318 push_estimate(&commit_authors[slice_start..idx]);
319 slice_start = idx;
320 current_email = &elm.email;
321 }
322 }
323 if let Some(commits) = commit_authors.get(slice_start..) {
324 push_estimate(commits);
325 }
326
327 let num_authors = results_by_hours.len();
328 let mut results_by_hours = if !omit_unify_identities {
329 deduplicate_identities(&results_by_hours)
330 } else {
331 results_by_hours
332 .iter()
333 .fold(Vec::with_capacity(results_by_hours.len()), |mut acc, e| {
334 acc.push(e.into());
335 acc
336 })
337 };
338 let elapsed = start.elapsed();
339 progress.done(format!(
340 "Extracted and organized data from {} commits in {:?} ({:0.0} commits/s)",
341 num_commits,
342 elapsed,
343 num_commits as f32 / elapsed.as_secs_f32()
344 ));
345
346 let num_unique_authors = results_by_hours.len();
347 let total_hours = results_by_hours.iter().map(|e| e.hours).sum::<f32>();
348 let included_commit_ids = commit_authors
349 .iter()
350 .filter(|(_, author)| !(ignore_bots && author.name.contains_str(b"[bot]")))
351 .map(|(commit_idx, _)| *commit_idx)
352 .collect::<BTreeSet<_>>();
353 let total_commits = included_commit_ids.len() as u32;
354 let (total_files, total_lines) = stats
355 .iter()
356 .filter(|(commit_idx, _, _)| included_commit_ids.contains(commit_idx))
357 .fold(
358 (FileStats::default(), LineStats::default()),
359 |mut acc, (_, files, lines)| {
360 acc.0.add(files);
361 acc.1.add(lines);
362 acc
363 },
364 );
365 if show_pii {
366 results_by_hours.sort_by(|a, b| a.hours.partial_cmp(&b.hours).unwrap_or(std::cmp::Ordering::Equal));
367 for entry in &results_by_hours {
368 entry.write_to(
369 total_hours,
370 file_stats.then_some(total_files),
371 line_stats.then_some(total_lines),
372 &mut out,
373 )?;
374 writeln!(out)?;
375 }
376 }
377 writeln!(
378 out,
379 "total hours: {:.02}\ntotal 8h days: {:.02}\ntotal commits = {}{}\ntotal authors: {}",
380 total_hours,
381 total_hours / HOURS_PER_WORKDAY,
382 total_commits,
383 if is_shallow { " (shallow)" } else { Default::default() },
384 num_authors
385 )?;
386 if file_stats {
387 writeln!(
388 out,
389 "total files added/removed/modified/remaining: {}/{}/{}/{}",
390 total_files.added,
391 total_files.removed,
392 total_files.modified,
393 total_files.added - total_files.removed
394 )?;
395 }
396 if line_stats {
397 writeln!(
398 out,
399 "total lines added/removed/remaining: {}/{}/{}",
400 total_lines.added,
401 total_lines.removed,
402 total_lines.added - total_lines.removed
403 )?;
404 }
405 if !omit_unify_identities {
406 writeln!(
407 out,
408 "total unique authors: {} ({:.02}% duplication)",
409 num_unique_authors,
410 (1.0 - (num_unique_authors as f32 / num_authors as f32)) * 100.0
411 )?;
412 }
413 if ignored_bot_commits != 0 {
414 writeln!(out, "commits by bots: {ignored_bot_commits}")?;
415 }
416 if needs_stats && skipped_merge_commits != 0 {
417 writeln!(out, "stats omitted for {skipped_merge_commits} merge commits")?;
418 }
419 debug_assert!(total_commits <= num_commits);
420 Ok(())
421}
422
423mod core;
424use self::core::{deduplicate_identities, estimate_hours, HOURS_PER_WORKDAY};
425
426mod util;
427use util::{CommitIdx, FileStats, LineStats, WorkByEmail, WorkByPerson};
428
429use crate::hours::core::spawn_tree_delta_threads;
430
#[cfg(test)]
mod tests {
    use gix::bstr::ByteSlice;

    use super::commit_author_identities;

    #[test]
    fn commit_author_identities_include_coauthors() {
        let commit = b"tree 1111111111111111111111111111111111111111\n\
            author Main Author <main@example.com> 1710000000 +0000\n\
            committer Main Author <main@example.com> 1710000000 +0000\n\
            \n\
            subject\n\
            \n\
            body\n\
            \n\
            Co-authored-by: Second Author <second@example.com>\n\
            Co-authored-by: Third Author <third@example.com>\n";
        let (author, authors) = commit_author_identities(commit, gix::hash::Kind::Sha1).expect("valid commit");
        assert_eq!(author.time, "1710000000 +0000");

        // The author comes first, followed by the co-authors in trailer order.
        let actual: Vec<_> = authors
            .iter()
            .map(|identity| (identity.name(), identity.email()))
            .collect();
        let expected = [
            ("Main Author", "main@example.com"),
            ("Second Author", "second@example.com"),
            ("Third Author", "third@example.com"),
        ];
        assert_eq!(actual.len(), expected.len());
        for ((name, email), (want_name, want_email)) in actual.into_iter().zip(expected) {
            assert_eq!(name, want_name.as_bytes().as_bstr());
            assert_eq!(email, want_email.as_bytes().as_bstr());
        }
    }

    #[test]
    fn commit_author_identities_skip_invalid_coauthors() {
        let commit = b"tree 1111111111111111111111111111111111111111\n\
            author Main Author <main@example.com> 1710000000 +0000\n\
            committer Main Author <main@example.com> 1710000000 +0000\n\
            \n\
            subject\n\
            \n\
            Co-authored-by: not a signature\n";
        let (_, authors) = commit_author_identities(commit, gix::hash::Kind::Sha1).expect("valid commit");
        // Only the main author remains; the malformed trailer is dropped.
        assert_eq!(authors.len(), 1);
        let identity = &authors[0];
        assert_eq!(identity.name(), "Main Author".as_bytes().as_bstr());
        assert_eq!(identity.email(), "main@example.com".as_bytes().as_bstr());
    }
}