1use std::{collections::BTreeSet, io, path::Path, time::Instant};
2
3use anyhow::bail;
4use gix::{
5 Count, NestedProgress, Progress,
6 actor::{Identity, IdentityRef},
7 bstr::{BStr, ByteSlice},
8 prelude::*,
9 progress,
10};
11use smallvec::{SmallVec, smallvec};
12
/// Options controlling what [`estimate()`] computes and where it writes its report.
pub struct Context<W> {
    /// If set, per-author estimates whose name contains `[bot]` are left out of the results
    /// and their commits are only counted in the "commits by bots" line.
    pub ignore_bots: bool,
    /// If set, one entry per author is written before the totals.
    pub show_pii: bool,
    /// If set, tree-delta worker threads are spawned and the file totals are printed.
    pub file_stats: bool,
    /// If set, tree-delta worker threads are spawned with line counting enabled and the
    /// line totals are printed.
    pub line_stats: bool,
    /// Thread-count hint, forwarded to `gix::features::parallel::num_threads`.
    pub threads: Option<usize>,
    /// If set, identities are not deduplicated and the "total unique authors" line is omitted.
    pub omit_unify_identities: bool,
    /// The writer receiving the report.
    pub out: W,
}
31
/// A signature with borrowed name and email plus a timestamp.
///
/// Within `estimate()` the name and email point at interned, leaked byte strings, which is
/// why it is used there with the `'static` lifetime.
pub struct SignatureRef<'a> {
    // Author name.
    name: &'a BStr,
    // Author email; `estimate()` groups commits by this field.
    email: &'a BStr,
    // Time attached to the signature.
    time: gix::date::Time,
}
37
38impl SignatureRef<'_> {
39 fn seconds(&self) -> gix::date::SecondsSinceUnixEpoch {
40 self.time.seconds
41 }
42}
43
/// An identity (name and email) that either borrows from the underlying commit data or owns
/// its bytes.
enum ParsedIdentity<'a> {
    /// The identity borrows its name and email from the buffer it was parsed from.
    Borrowed(IdentityRef<'a>),
    /// The identity owns its name and email; produced when the parsed trailer value was
    /// itself an owned buffer.
    Owned(Identity),
}
56
57impl ParsedIdentity<'_> {
58 fn name(&self) -> &BStr {
59 match self {
60 ParsedIdentity::Borrowed(identity) => identity.name,
61 ParsedIdentity::Owned(identity) => identity.name.as_ref(),
62 }
63 }
64
65 fn email(&self) -> &BStr {
66 match self {
67 ParsedIdentity::Borrowed(identity) => identity.email,
68 ParsedIdentity::Owned(identity) => identity.email.as_ref(),
69 }
70 }
71}
72
73fn parse_trailer_identity(trailer: gix::objs::commit::message::body::TrailerRef<'_>) -> Option<ParsedIdentity<'_>> {
74 match trailer.value {
75 std::borrow::Cow::Borrowed(value) => IdentityRef::from_bytes(value.as_ref())
76 .ok()
77 .map(|identity| ParsedIdentity::Borrowed(identity.trim())),
78 std::borrow::Cow::Owned(value) => IdentityRef::from_bytes(value.as_ref())
79 .ok()
80 .map(|identity| ParsedIdentity::Owned(identity.trim().to_owned())),
81 }
82}
83
84fn commit_author_identities(
86 commit_data: &[u8],
87 object_hash: gix::hash::Kind,
88) -> Result<(gix::actor::SignatureRef<'_>, SmallVec<[ParsedIdentity<'_>; 2]>), gix::objs::decode::Error> {
89 let commit = gix::objs::CommitRef::from_bytes(commit_data, object_hash)?;
90 let author = commit.author()?.trim();
91 let mut authors = smallvec![ParsedIdentity::Borrowed(gix::actor::IdentityRef::from(author))];
92 authors.extend(commit.co_authored_by_trailers().filter_map(parse_trailer_identity));
93 Ok((author, authors))
94}
95
96pub fn estimate<W, P>(
103 working_dir: &Path,
104 rev_spec: &BStr,
105 mut progress: P,
106 Context {
107 show_pii,
108 ignore_bots,
109 file_stats,
110 line_stats,
111 omit_unify_identities,
112 threads,
113 mut out,
114 }: Context<W>,
115) -> anyhow::Result<()>
116where
117 W: io::Write,
118 P: NestedProgress,
119{
120 let repo = gix::discover(working_dir)?;
121 let commit_id = repo.rev_parse_single(rev_spec)?.detach();
122 let mut string_heap = BTreeSet::<&'static [u8]>::new();
123 let needs_stats = file_stats || line_stats;
124 let threads = gix::features::parallel::num_threads(threads);
125
126 let (commit_authors, stats, is_shallow, skipped_merge_commits, num_commits) = {
127 std::thread::scope(|scope| -> anyhow::Result<_> {
128 let start = Instant::now();
129 let (tx, rx) = std::sync::mpsc::channel::<(u32, Vec<u8>)>();
130 let mailmap = repo.open_mailmap();
131
132 let extract_signatures = scope.spawn(move || -> anyhow::Result<Vec<_>> {
133 let mut out = Vec::new();
134 for (commit_idx, commit_data) in rx {
135 if let Ok((commit_author, authors)) = commit_author_identities(&commit_data, commit_id.kind()) {
136 let mut string_ref = |s: &[u8]| -> &'static BStr {
137 match string_heap.get(s) {
138 Some(n) => n.as_bstr(),
139 None => {
140 let sv: Vec<u8> = s.to_owned();
141 string_heap.insert(Box::leak(sv.into_boxed_slice()));
142 (*string_heap.get(s).expect("present")).as_ref()
143 }
144 }
145 };
146 let mut authors_for_commit = SmallVec::<[SignatureRef<'static>; 2]>::new();
147 for identity in authors {
148 let author = mailmap.resolve_cow(gix::actor::SignatureRef {
149 name: identity.name(),
150 email: identity.email(),
151 time: commit_author.time,
152 });
153 let name = string_ref(author.name.as_ref());
154 let email = string_ref(author.email.as_ref());
155 if authors_for_commit
156 .iter()
157 .any(|existing| existing.name == name && existing.email == email)
158 {
159 continue;
160 }
161 authors_for_commit.push(SignatureRef {
162 name,
163 email,
164 time: author.time,
165 });
166 }
167 out.extend(authors_for_commit.into_iter().map(|author| (commit_idx, author)));
168 }
169 }
170 out.shrink_to_fit();
171 out.sort_by(|a, b| {
172 a.1.email
173 .cmp(b.1.email)
174 .then(a.1.seconds().cmp(&b.1.seconds()).reverse())
175 .then(a.0.cmp(&b.0))
176 });
177 Ok(out)
178 });
179
180 let (stats_progresses, stats_counters) = if needs_stats {
181 {
182 let mut sp = progress.add_child("extract stats");
183 sp.init(None, progress::count("commits"));
184 let sc = sp.counter();
185
186 let mut cp = progress.add_child("find changes");
187 cp.init(None, progress::count("modified files"));
188 let cc = cp.counter();
189
190 let mut lp = progress.add_child("find changes");
191 lp.init(None, progress::count("diff lines"));
192 let lc = lp.counter();
193
194 (Some((sp, cp, lp)), Some((sc, cc, lc)))
195 }
196 } else {
197 Default::default()
198 };
199
200 let mut progress = progress.add_child("traverse commit graph");
201 progress.init(None, progress::count("commits"));
202
203 let (tx_tree_id, stat_threads) = if needs_stats {
204 {
205 let (tx, threads) = spawn_tree_delta_threads(
206 scope,
207 threads,
208 line_stats,
209 repo.clone(),
210 stats_counters.clone().expect("counters are set"),
211 );
212 (Some(tx), threads)
213 }
214 } else {
215 Default::default()
216 };
217
218 let mut commit_idx = 0_u32;
219 let mut skipped_merge_commits = 0;
220 const CHUNK_SIZE: usize = 50;
221 let mut chunk = Vec::with_capacity(CHUNK_SIZE);
222 let mut commit_iter = commit_id.ancestors(&repo.objects);
223 let mut is_shallow = false;
224 while let Some(c) = commit_iter.next() {
225 progress.inc();
226 if gix::interrupt::is_triggered() {
227 bail!("Cancelled by user");
228 }
229 match c {
230 Ok(c) => {
231 tx.send((commit_idx, commit_iter.commit_data().to_owned())).ok();
232 let tree_delta_info = tx_tree_id.as_ref().and_then(|tx| {
233 let mut parents = c.parent_ids.into_iter();
234 parents
235 .next()
236 .map(|first_parent| (tx, Some(first_parent), c.id.to_owned()))
237 .filter(|_| {
238 if parents.next().is_some() {
239 skipped_merge_commits += 1;
240 false
241 } else {
242 true
243 }
244 })
245 });
246 if let Some((tx_tree, first_parent, commit)) = tree_delta_info {
247 if chunk.len() == CHUNK_SIZE {
248 tx_tree
249 .send(std::mem::replace(&mut chunk, Vec::with_capacity(CHUNK_SIZE)))
250 .ok();
251 } else {
252 chunk.push((commit_idx, first_parent, commit));
253 }
254 }
255 commit_idx += 1;
256 }
257 Err(gix::traverse::commit::simple::Error::Find { .. }) => {
258 is_shallow = true;
259 break;
260 }
261 Err(err) => return Err(err.into()),
262 }
263 }
264 if let Some(tx) = tx_tree_id {
265 tx.send(chunk).ok();
266 }
267 drop(tx);
268 progress.show_throughput(start);
269 drop(progress);
270
271 let stats_by_commit_idx = match stats_progresses {
272 Some((mut stat_progress, change_progress, line_progress)) => {
273 stat_progress.set_max(Some(commit_idx as usize - skipped_merge_commits));
274 let mut stats = Vec::new();
275 for handle in stat_threads {
276 stats.extend(handle.join().expect("no panic")?);
277 if gix::interrupt::is_triggered() {
278 bail!("Cancelled by user");
279 }
280 }
281 stats.sort_by_key(|t| t.0);
282 stat_progress.show_throughput(start);
283 change_progress.show_throughput(start);
284 line_progress.show_throughput(start);
285 stats
286 }
287 None => Vec::new(),
288 };
289
290 Ok((
291 extract_signatures.join().expect("no panic")?,
292 stats_by_commit_idx,
293 is_shallow,
294 skipped_merge_commits,
295 commit_idx,
296 ))
297 })?
298 };
299
300 if commit_authors.is_empty() {
301 bail!("No commits to process");
302 }
303
304 let start = Instant::now();
305 let mut current_email = &commit_authors[0].1.email;
306 let mut slice_start = 0;
307 let mut results_by_hours = Vec::new();
308 let mut ignored_bot_commits = 0_u32;
309 let mut push_estimate = |commits: &[(u32, SignatureRef<'static>)]| {
310 let estimate = estimate_hours(commits, &stats);
311 if ignore_bots && estimate.name.contains_str(b"[bot]") {
312 ignored_bot_commits += estimate.num_commits;
313 return;
314 }
315 results_by_hours.push(estimate);
316 };
317 for (idx, (_, elm)) in commit_authors.iter().enumerate() {
318 if elm.email != *current_email {
319 push_estimate(&commit_authors[slice_start..idx]);
320 slice_start = idx;
321 current_email = &elm.email;
322 }
323 }
324 if let Some(commits) = commit_authors.get(slice_start..) {
325 push_estimate(commits);
326 }
327
328 let num_authors = results_by_hours.len();
329 let mut results_by_hours = if !omit_unify_identities {
330 deduplicate_identities(&results_by_hours)
331 } else {
332 results_by_hours
333 .iter()
334 .fold(Vec::with_capacity(results_by_hours.len()), |mut acc, e| {
335 acc.push(e.into());
336 acc
337 })
338 };
339 let elapsed = start.elapsed();
340 progress.done(format!(
341 "Extracted and organized data from {} commits in {:?} ({:0.0} commits/s)",
342 num_commits,
343 elapsed,
344 num_commits as f32 / elapsed.as_secs_f32()
345 ));
346
347 let num_unique_authors = results_by_hours.len();
348 let total_hours = results_by_hours.iter().map(|e| e.hours).sum::<f32>();
349 let included_commit_ids = commit_authors
350 .iter()
351 .filter(|(_, author)| !(ignore_bots && author.name.contains_str(b"[bot]")))
352 .map(|(commit_idx, _)| *commit_idx)
353 .collect::<BTreeSet<_>>();
354 let total_commits = included_commit_ids.len() as u32;
355 let (total_files, total_lines) = stats
356 .iter()
357 .filter(|(commit_idx, _, _)| included_commit_ids.contains(commit_idx))
358 .fold(
359 (FileStats::default(), LineStats::default()),
360 |mut acc, (_, files, lines)| {
361 acc.0.add(files);
362 acc.1.add(lines);
363 acc
364 },
365 );
366 if show_pii {
367 results_by_hours.sort_by(|a, b| a.hours.partial_cmp(&b.hours).unwrap_or(std::cmp::Ordering::Equal));
368 for entry in &results_by_hours {
369 entry.write_to(
370 total_hours,
371 file_stats.then_some(total_files),
372 line_stats.then_some(total_lines),
373 &mut out,
374 )?;
375 writeln!(out)?;
376 }
377 }
378 writeln!(
379 out,
380 "total hours: {:.02}\ntotal 8h days: {:.02}\ntotal commits = {}{}\ntotal authors: {}",
381 total_hours,
382 total_hours / HOURS_PER_WORKDAY,
383 total_commits,
384 if is_shallow { " (shallow)" } else { Default::default() },
385 num_authors
386 )?;
387 if file_stats {
388 writeln!(
389 out,
390 "total files added/removed/modified/remaining: {}/{}/{}/{}",
391 total_files.added,
392 total_files.removed,
393 total_files.modified,
394 total_files.added - total_files.removed
395 )?;
396 }
397 if line_stats {
398 writeln!(
399 out,
400 "total lines added/removed/remaining: {}/{}/{}",
401 total_lines.added,
402 total_lines.removed,
403 total_lines.added - total_lines.removed
404 )?;
405 }
406 if !omit_unify_identities {
407 writeln!(
408 out,
409 "total unique authors: {} ({:.02}% duplication)",
410 num_unique_authors,
411 (1.0 - (num_unique_authors as f32 / num_authors as f32)) * 100.0
412 )?;
413 }
414 if ignored_bot_commits != 0 {
415 writeln!(out, "commits by bots: {ignored_bot_commits}")?;
416 }
417 if needs_stats && skipped_merge_commits != 0 {
418 writeln!(out, "stats omitted for {skipped_merge_commits} merge commits")?;
419 }
420 debug_assert!(total_commits <= num_commits);
421 Ok(())
422}
423
424mod core;
425use self::core::{HOURS_PER_WORKDAY, deduplicate_identities, estimate_hours};
426
427mod util;
428use util::{CommitIdx, FileStats, LineStats, WorkByEmail, WorkByPerson};
429
430use crate::hours::core::spawn_tree_delta_threads;
431
#[cfg(test)]
mod tests {
    use gix::bstr::ByteSlice;

    use super::commit_author_identities;

    /// The author comes first, followed by all co-authors in trailer order.
    #[test]
    fn commit_author_identities_include_coauthors() {
        let commit = b"tree 1111111111111111111111111111111111111111\n\
            author Main Author <main@example.com> 1710000000 +0000\n\
            committer Main Author <main@example.com> 1710000000 +0000\n\
            \n\
            subject\n\
            \n\
            body\n\
            \n\
            Co-authored-by: Second Author <second@example.com>\n\
            Co-authored-by: Third Author <third@example.com>\n";
        let (author, authors) = commit_author_identities(commit, gix::hash::Kind::Sha1).expect("valid commit");
        assert_eq!(author.time, "1710000000 +0000");

        let actual: Vec<_> = authors
            .iter()
            .map(|identity| (identity.name(), identity.email()))
            .collect();
        let expected: Vec<_> = [
            ("Main Author", "main@example.com"),
            ("Second Author", "second@example.com"),
            ("Third Author", "third@example.com"),
        ]
        .into_iter()
        .map(|(name, email)| (name.as_bytes().as_bstr(), email.as_bytes().as_bstr()))
        .collect();
        assert_eq!(actual, expected);
    }

    /// A trailer value that is not an identity is ignored, leaving only the author.
    #[test]
    fn commit_author_identities_skip_invalid_coauthors() {
        let commit = b"tree 1111111111111111111111111111111111111111\n\
            author Main Author <main@example.com> 1710000000 +0000\n\
            committer Main Author <main@example.com> 1710000000 +0000\n\
            \n\
            subject\n\
            \n\
            Co-authored-by: not a signature\n";
        let (_, authors) = commit_author_identities(commit, gix::hash::Kind::Sha1).expect("valid commit");
        assert_eq!(authors.len(), 1);

        let sole = &authors[0];
        assert_eq!(sole.name(), "Main Author".as_bytes().as_bstr());
        assert_eq!(sole.email(), "main@example.com".as_bytes().as_bstr());
    }
}