1use std::{collections::BTreeSet, io, path::Path, time::Instant};
2
3use anyhow::bail;
4use gix::{
5 actor::{Identity, IdentityRef},
6 bstr::{BStr, ByteSlice},
7 prelude::*,
8 progress, Count, NestedProgress, Progress,
9};
10use smallvec::{smallvec, SmallVec};
11
/// Configuration for the hours estimation, along with the stream the report is written to.
pub struct Context<W> {
    /// Skip authors whose resolved name contains `[bot]` when aggregating results.
    pub ignore_bots: bool,
    /// Print per-author information (names, emails, per-author numbers) in the report.
    pub show_pii: bool,
    /// Collect and print file-level change statistics (added/removed/modified files).
    pub file_stats: bool,
    /// Collect and print line-level diff statistics (added/removed lines).
    pub line_stats: bool,
    /// The number of threads to use for stats extraction; `None` lets `gix` pick a default.
    pub threads: Option<usize>,
    /// If `true`, do not merge similar identities into a single author.
    pub omit_unify_identities: bool,
    /// Where the textual report is written.
    pub out: W,
}
30
/// A minimal author signature whose name and email borrow from the interned string heap,
/// allowing cheap comparison and grouping.
pub struct SignatureRef<'a> {
    /// The author's name.
    name: &'a BStr,
    /// The author's email address — the primary key used for sorting and grouping.
    email: &'a BStr,
    /// The time taken from the commit this signature belongs to.
    time: gix::date::Time,
}
36
impl SignatureRef<'_> {
    /// The signature's time in seconds since the unix epoch.
    fn seconds(&self) -> gix::date::SecondsSinceUnixEpoch {
        self.time.seconds
    }
}
42
/// An identity parsed from a commit author or a `Co-authored-by:` trailer,
/// borrowing from the commit buffer whenever possible.
enum ParsedIdentity<'a> {
    /// An identity that borrows directly from the underlying commit data.
    Borrowed(IdentityRef<'a>),
    /// An identity that had to be copied because its trailer value was an owned `Cow`.
    Owned(Identity),
}
55
56impl ParsedIdentity<'_> {
57 fn name(&self) -> &BStr {
58 match self {
59 ParsedIdentity::Borrowed(identity) => identity.name,
60 ParsedIdentity::Owned(identity) => identity.name.as_ref(),
61 }
62 }
63
64 fn email(&self) -> &BStr {
65 match self {
66 ParsedIdentity::Borrowed(identity) => identity.email,
67 ParsedIdentity::Owned(identity) => identity.email.as_ref(),
68 }
69 }
70}
71
72fn parse_trailer_identity(trailer: gix::objs::commit::message::body::TrailerRef<'_>) -> Option<ParsedIdentity<'_>> {
73 match trailer.value {
74 std::borrow::Cow::Borrowed(value) => IdentityRef::from_bytes::<gix::objs::decode::ParseError>(value.as_ref())
75 .ok()
76 .map(|identity| ParsedIdentity::Borrowed(identity.trim())),
77 std::borrow::Cow::Owned(value) => IdentityRef::from_bytes::<gix::objs::decode::ParseError>(value.as_ref())
78 .ok()
79 .map(|identity| ParsedIdentity::Owned(identity.trim().to_owned())),
80 }
81}
82
83fn commit_author_identities(
85 commit_data: &[u8],
86) -> Result<(gix::actor::SignatureRef<'_>, SmallVec<[ParsedIdentity<'_>; 2]>), gix::objs::decode::Error> {
87 let commit = gix::objs::CommitRef::from_bytes(commit_data)?;
88 let author = commit.author()?.trim();
89 let mut authors = smallvec![ParsedIdentity::Borrowed(gix::actor::IdentityRef::from(author))];
90 authors.extend(commit.co_authored_by_trailers().filter_map(parse_trailer_identity));
91 Ok((author, authors))
92}
93
94pub fn estimate<W, P>(
101 working_dir: &Path,
102 rev_spec: &BStr,
103 mut progress: P,
104 Context {
105 show_pii,
106 ignore_bots,
107 file_stats,
108 line_stats,
109 omit_unify_identities,
110 threads,
111 mut out,
112 }: Context<W>,
113) -> anyhow::Result<()>
114where
115 W: io::Write,
116 P: NestedProgress,
117{
118 let repo = gix::discover(working_dir)?;
119 let commit_id = repo.rev_parse_single(rev_spec)?.detach();
120 let mut string_heap = BTreeSet::<&'static [u8]>::new();
121 let needs_stats = file_stats || line_stats;
122 let threads = gix::features::parallel::num_threads(threads);
123
124 let (commit_authors, stats, is_shallow, skipped_merge_commits, num_commits) = {
125 std::thread::scope(|scope| -> anyhow::Result<_> {
126 let start = Instant::now();
127 let (tx, rx) = std::sync::mpsc::channel::<(u32, Vec<u8>)>();
128 let mailmap = repo.open_mailmap();
129
130 let extract_signatures = scope.spawn(move || -> anyhow::Result<Vec<_>> {
131 let mut out = Vec::new();
132 for (commit_idx, commit_data) in rx {
133 if let Ok((commit_author, authors)) = commit_author_identities(&commit_data) {
134 let mut string_ref = |s: &[u8]| -> &'static BStr {
135 match string_heap.get(s) {
136 Some(n) => n.as_bstr(),
137 None => {
138 let sv: Vec<u8> = s.to_owned();
139 string_heap.insert(Box::leak(sv.into_boxed_slice()));
140 (*string_heap.get(s).expect("present")).as_ref()
141 }
142 }
143 };
144 let mut authors_for_commit = SmallVec::<[SignatureRef<'static>; 2]>::new();
145 for identity in authors {
146 let author = mailmap.resolve_cow(gix::actor::SignatureRef {
147 name: identity.name(),
148 email: identity.email(),
149 time: commit_author.time,
150 });
151 let name = string_ref(author.name.as_ref());
152 let email = string_ref(author.email.as_ref());
153 if authors_for_commit
154 .iter()
155 .any(|existing| existing.name == name && existing.email == email)
156 {
157 continue;
158 }
159 authors_for_commit.push(SignatureRef {
160 name,
161 email,
162 time: author.time,
163 });
164 }
165 out.extend(authors_for_commit.into_iter().map(|author| (commit_idx, author)));
166 }
167 }
168 out.shrink_to_fit();
169 out.sort_by(|a, b| {
170 a.1.email
171 .cmp(b.1.email)
172 .then(a.1.seconds().cmp(&b.1.seconds()).reverse())
173 .then(a.0.cmp(&b.0))
174 });
175 Ok(out)
176 });
177
178 let (stats_progresses, stats_counters) = if needs_stats {
179 {
180 let mut sp = progress.add_child("extract stats");
181 sp.init(None, progress::count("commits"));
182 let sc = sp.counter();
183
184 let mut cp = progress.add_child("find changes");
185 cp.init(None, progress::count("modified files"));
186 let cc = cp.counter();
187
188 let mut lp = progress.add_child("find changes");
189 lp.init(None, progress::count("diff lines"));
190 let lc = lp.counter();
191
192 (Some((sp, cp, lp)), Some((sc, cc, lc)))
193 }
194 } else {
195 Default::default()
196 };
197
198 let mut progress = progress.add_child("traverse commit graph");
199 progress.init(None, progress::count("commits"));
200
201 let (tx_tree_id, stat_threads) = if needs_stats {
202 {
203 let (tx, threads) = spawn_tree_delta_threads(
204 scope,
205 threads,
206 line_stats,
207 repo.clone(),
208 stats_counters.clone().expect("counters are set"),
209 );
210 (Some(tx), threads)
211 }
212 } else {
213 Default::default()
214 };
215
216 let mut commit_idx = 0_u32;
217 let mut skipped_merge_commits = 0;
218 const CHUNK_SIZE: usize = 50;
219 let mut chunk = Vec::with_capacity(CHUNK_SIZE);
220 let mut commit_iter = commit_id.ancestors(&repo.objects);
221 let mut is_shallow = false;
222 while let Some(c) = commit_iter.next() {
223 progress.inc();
224 if gix::interrupt::is_triggered() {
225 bail!("Cancelled by user");
226 }
227 match c {
228 Ok(c) => {
229 tx.send((commit_idx, commit_iter.commit_data().to_owned())).ok();
230 let tree_delta_info = tx_tree_id.as_ref().and_then(|tx| {
231 let mut parents = c.parent_ids.into_iter();
232 parents
233 .next()
234 .map(|first_parent| (tx, Some(first_parent), c.id.to_owned()))
235 .filter(|_| {
236 if parents.next().is_some() {
237 skipped_merge_commits += 1;
238 false
239 } else {
240 true
241 }
242 })
243 });
244 if let Some((tx_tree, first_parent, commit)) = tree_delta_info {
245 if chunk.len() == CHUNK_SIZE {
246 tx_tree
247 .send(std::mem::replace(&mut chunk, Vec::with_capacity(CHUNK_SIZE)))
248 .ok();
249 } else {
250 chunk.push((commit_idx, first_parent, commit));
251 }
252 }
253 commit_idx += 1;
254 }
255 Err(gix::traverse::commit::simple::Error::Find { .. }) => {
256 is_shallow = true;
257 break;
258 }
259 Err(err) => return Err(err.into()),
260 }
261 }
262 if let Some(tx) = tx_tree_id {
263 tx.send(chunk).ok();
264 }
265 drop(tx);
266 progress.show_throughput(start);
267 drop(progress);
268
269 let stats_by_commit_idx = match stats_progresses {
270 Some((mut stat_progress, change_progress, line_progress)) => {
271 stat_progress.set_max(Some(commit_idx as usize - skipped_merge_commits));
272 let mut stats = Vec::new();
273 for handle in stat_threads {
274 stats.extend(handle.join().expect("no panic")?);
275 if gix::interrupt::is_triggered() {
276 bail!("Cancelled by user");
277 }
278 }
279 stats.sort_by_key(|t| t.0);
280 stat_progress.show_throughput(start);
281 change_progress.show_throughput(start);
282 line_progress.show_throughput(start);
283 stats
284 }
285 None => Vec::new(),
286 };
287
288 Ok((
289 extract_signatures.join().expect("no panic")?,
290 stats_by_commit_idx,
291 is_shallow,
292 skipped_merge_commits,
293 commit_idx,
294 ))
295 })?
296 };
297
298 if commit_authors.is_empty() {
299 bail!("No commits to process");
300 }
301
302 let start = Instant::now();
303 let mut current_email = &commit_authors[0].1.email;
304 let mut slice_start = 0;
305 let mut results_by_hours = Vec::new();
306 let mut ignored_bot_commits = 0_u32;
307 let mut push_estimate = |commits: &[(u32, SignatureRef<'static>)]| {
308 let estimate = estimate_hours(commits, &stats);
309 if ignore_bots && estimate.name.contains_str(b"[bot]") {
310 ignored_bot_commits += estimate.num_commits;
311 return;
312 }
313 results_by_hours.push(estimate);
314 };
315 for (idx, (_, elm)) in commit_authors.iter().enumerate() {
316 if elm.email != *current_email {
317 push_estimate(&commit_authors[slice_start..idx]);
318 slice_start = idx;
319 current_email = &elm.email;
320 }
321 }
322 if let Some(commits) = commit_authors.get(slice_start..) {
323 push_estimate(commits);
324 }
325
326 let num_authors = results_by_hours.len();
327 let mut results_by_hours = if !omit_unify_identities {
328 deduplicate_identities(&results_by_hours)
329 } else {
330 results_by_hours
331 .iter()
332 .fold(Vec::with_capacity(results_by_hours.len()), |mut acc, e| {
333 acc.push(e.into());
334 acc
335 })
336 };
337 let elapsed = start.elapsed();
338 progress.done(format!(
339 "Extracted and organized data from {} commits in {:?} ({:0.0} commits/s)",
340 num_commits,
341 elapsed,
342 num_commits as f32 / elapsed.as_secs_f32()
343 ));
344
345 let num_unique_authors = results_by_hours.len();
346 let total_hours = results_by_hours.iter().map(|e| e.hours).sum::<f32>();
347 let included_commit_ids = commit_authors
348 .iter()
349 .filter(|(_, author)| !(ignore_bots && author.name.contains_str(b"[bot]")))
350 .map(|(commit_idx, _)| *commit_idx)
351 .collect::<BTreeSet<_>>();
352 let total_commits = included_commit_ids.len() as u32;
353 let (total_files, total_lines) = stats
354 .iter()
355 .filter(|(commit_idx, _, _)| included_commit_ids.contains(commit_idx))
356 .fold(
357 (FileStats::default(), LineStats::default()),
358 |mut acc, (_, files, lines)| {
359 acc.0.add(files);
360 acc.1.add(lines);
361 acc
362 },
363 );
364 if show_pii {
365 results_by_hours.sort_by(|a, b| a.hours.partial_cmp(&b.hours).unwrap_or(std::cmp::Ordering::Equal));
366 for entry in &results_by_hours {
367 entry.write_to(
368 total_hours,
369 file_stats.then_some(total_files),
370 line_stats.then_some(total_lines),
371 &mut out,
372 )?;
373 writeln!(out)?;
374 }
375 }
376 writeln!(
377 out,
378 "total hours: {:.02}\ntotal 8h days: {:.02}\ntotal commits = {}{}\ntotal authors: {}",
379 total_hours,
380 total_hours / HOURS_PER_WORKDAY,
381 total_commits,
382 if is_shallow { " (shallow)" } else { Default::default() },
383 num_authors
384 )?;
385 if file_stats {
386 writeln!(
387 out,
388 "total files added/removed/modified/remaining: {}/{}/{}/{}",
389 total_files.added,
390 total_files.removed,
391 total_files.modified,
392 total_files.added - total_files.removed
393 )?;
394 }
395 if line_stats {
396 writeln!(
397 out,
398 "total lines added/removed/remaining: {}/{}/{}",
399 total_lines.added,
400 total_lines.removed,
401 total_lines.added - total_lines.removed
402 )?;
403 }
404 if !omit_unify_identities {
405 writeln!(
406 out,
407 "total unique authors: {} ({:.02}% duplication)",
408 num_unique_authors,
409 (1.0 - (num_unique_authors as f32 / num_authors as f32)) * 100.0
410 )?;
411 }
412 if ignored_bot_commits != 0 {
413 writeln!(out, "commits by bots: {ignored_bot_commits}")?;
414 }
415 if needs_stats && skipped_merge_commits != 0 {
416 writeln!(out, "stats omitted for {skipped_merge_commits} merge commits")?;
417 }
418 debug_assert!(total_commits <= num_commits);
419 Ok(())
420}
421
422mod core;
423use self::core::{deduplicate_identities, estimate_hours, HOURS_PER_WORKDAY};
424
425mod util;
426use util::{CommitIdx, FileStats, LineStats, WorkByEmail, WorkByPerson};
427
428use crate::hours::core::spawn_tree_delta_threads;
429
430#[cfg(test)]
431mod tests {
432 use gix::bstr::ByteSlice;
433
434 use super::commit_author_identities;
435
436 #[test]
437 fn commit_author_identities_include_coauthors() {
438 let commit = b"tree 1111111111111111111111111111111111111111\n\
439author Main Author <main@example.com> 1710000000 +0000\n\
440committer Main Author <main@example.com> 1710000000 +0000\n\
441\n\
442subject\n\
443\n\
444body\n\
445\n\
446Co-authored-by: Second Author <second@example.com>\n\
447Co-authored-by: Third Author <third@example.com>\n";
448 let (author, authors) = commit_author_identities(commit).expect("valid commit");
449 assert_eq!(author.time, "1710000000 +0000");
450 assert_eq!(
451 authors
452 .iter()
453 .map(|identity| (identity.name(), identity.email()))
454 .collect::<Vec<_>>(),
455 vec![
456 (
457 "Main Author".as_bytes().as_bstr(),
458 "main@example.com".as_bytes().as_bstr()
459 ),
460 (
461 "Second Author".as_bytes().as_bstr(),
462 "second@example.com".as_bytes().as_bstr()
463 ),
464 (
465 "Third Author".as_bytes().as_bstr(),
466 "third@example.com".as_bytes().as_bstr()
467 ),
468 ]
469 );
470 }
471
472 #[test]
473 fn commit_author_identities_skip_invalid_coauthors() {
474 let commit = b"tree 1111111111111111111111111111111111111111\n\
475author Main Author <main@example.com> 1710000000 +0000\n\
476committer Main Author <main@example.com> 1710000000 +0000\n\
477\n\
478subject\n\
479\n\
480Co-authored-by: not a signature\n";
481 let (_, authors) = commit_author_identities(commit).expect("valid commit");
482 assert_eq!(authors.len(), 1);
483 assert_eq!(authors[0].name(), "Main Author".as_bytes().as_bstr());
484 assert_eq!(authors[0].email(), "main@example.com".as_bytes().as_bstr());
485 }
486}