1use std::{collections::BTreeSet, io, path::Path, time::Instant};
2
3use anyhow::bail;
4use gix::{
5 bstr::{BStr, ByteSlice},
6 prelude::*,
7 progress, Count, NestedProgress, Progress,
8};
9use smallvec::{smallvec, SmallVec};
10
/// Configuration for the [`estimate()`] operation, along with the writer that receives the report.
pub struct Context<W> {
    /// If `true`, exclude authors whose name contains `[bot]` from the report.
    pub ignore_bots: bool,
    /// If `true`, print per-author entries including names and emails (personally identifiable information).
    pub show_pii: bool,
    /// If `true`, collect and print file-level statistics (files added/removed/modified).
    pub file_stats: bool,
    /// If `true`, collect and print line-level statistics (lines added/removed).
    pub line_stats: bool,
    /// The number of threads to use for stats extraction; `None` chooses a default.
    pub threads: Option<usize>,
    /// If `true`, skip deduplication of author identities that share a name or email.
    pub omit_unify_identities: bool,
    /// Where the textual report is written to.
    pub out: W,
}
29
/// A minimal author signature whose name and email borrow from interned storage,
/// produced after mailmap resolution of a commit's author or co-author.
pub struct SignatureRef<'a> {
    /// The author's name (mailmap-resolved where a mapping exists).
    name: &'a BStr,
    /// The author's email (mailmap-resolved where a mapping exists).
    email: &'a BStr,
    /// The authorship time of the commit this signature was taken from.
    time: gix::date::Time,
}
35
36impl SignatureRef<'_> {
37 fn seconds(&self) -> gix::date::SecondsSinceUnixEpoch {
38 self.time.seconds
39 }
40}
41
42fn commit_author_identities(
44 commit_data: &[u8],
45) -> Result<(gix::actor::SignatureRef<'_>, SmallVec<[gix::actor::IdentityRef<'_>; 2]>), gix::objs::decode::Error> {
46 let commit = gix::objs::CommitRef::from_bytes(commit_data)?;
47 let author = commit.author()?.trim();
48 let mut authors = smallvec![gix::actor::IdentityRef::from(author)];
49 authors.extend(commit.co_authored_by_trailers().filter_map(|trailer| {
50 gix::actor::IdentityRef::from_bytes::<gix::objs::decode::ParseError>(trailer.value.as_ref())
51 .ok()
52 .map(|identity| identity.trim())
53 }));
54 Ok((author, authors))
55}
56
57pub fn estimate<W, P>(
64 working_dir: &Path,
65 rev_spec: &BStr,
66 mut progress: P,
67 Context {
68 show_pii,
69 ignore_bots,
70 file_stats,
71 line_stats,
72 omit_unify_identities,
73 threads,
74 mut out,
75 }: Context<W>,
76) -> anyhow::Result<()>
77where
78 W: io::Write,
79 P: NestedProgress,
80{
81 let repo = gix::discover(working_dir)?;
82 let commit_id = repo.rev_parse_single(rev_spec)?.detach();
83 let mut string_heap = BTreeSet::<&'static [u8]>::new();
84 let needs_stats = file_stats || line_stats;
85 let threads = gix::features::parallel::num_threads(threads);
86
87 let (commit_authors, stats, is_shallow, skipped_merge_commits, num_commits) = {
88 std::thread::scope(|scope| -> anyhow::Result<_> {
89 let start = Instant::now();
90 let (tx, rx) = std::sync::mpsc::channel::<(u32, Vec<u8>)>();
91 let mailmap = repo.open_mailmap();
92
93 let extract_signatures = scope.spawn(move || -> anyhow::Result<Vec<_>> {
94 let mut out = Vec::new();
95 for (commit_idx, commit_data) in rx {
96 if let Ok((commit_author, authors)) = commit_author_identities(&commit_data) {
97 let mut string_ref = |s: &[u8]| -> &'static BStr {
98 match string_heap.get(s) {
99 Some(n) => n.as_bstr(),
100 None => {
101 let sv: Vec<u8> = s.to_owned();
102 string_heap.insert(Box::leak(sv.into_boxed_slice()));
103 (*string_heap.get(s).expect("present")).as_ref()
104 }
105 }
106 };
107 let mut authors_for_commit = SmallVec::<[SignatureRef<'static>; 2]>::new();
108 for identity in authors {
109 let author = mailmap.resolve_cow(gix::actor::SignatureRef {
110 name: identity.name,
111 email: identity.email,
112 time: commit_author.time,
113 });
114 let name = string_ref(author.name.as_ref());
115 let email = string_ref(author.email.as_ref());
116 if authors_for_commit
117 .iter()
118 .any(|existing| existing.name == name && existing.email == email)
119 {
120 continue;
121 }
122 authors_for_commit.push(SignatureRef {
123 name,
124 email,
125 time: author.time,
126 });
127 }
128 out.extend(authors_for_commit.into_iter().map(|author| (commit_idx, author)));
129 }
130 }
131 out.shrink_to_fit();
132 out.sort_by(|a, b| {
133 a.1.email
134 .cmp(b.1.email)
135 .then(a.1.seconds().cmp(&b.1.seconds()).reverse())
136 .then(a.0.cmp(&b.0))
137 });
138 Ok(out)
139 });
140
141 let (stats_progresses, stats_counters) = if needs_stats {
142 {
143 let mut sp = progress.add_child("extract stats");
144 sp.init(None, progress::count("commits"));
145 let sc = sp.counter();
146
147 let mut cp = progress.add_child("find changes");
148 cp.init(None, progress::count("modified files"));
149 let cc = cp.counter();
150
151 let mut lp = progress.add_child("find changes");
152 lp.init(None, progress::count("diff lines"));
153 let lc = lp.counter();
154
155 (Some((sp, cp, lp)), Some((sc, cc, lc)))
156 }
157 } else {
158 Default::default()
159 };
160
161 let mut progress = progress.add_child("traverse commit graph");
162 progress.init(None, progress::count("commits"));
163
164 let (tx_tree_id, stat_threads) = if needs_stats {
165 {
166 let (tx, threads) = spawn_tree_delta_threads(
167 scope,
168 threads,
169 line_stats,
170 repo.clone(),
171 stats_counters.clone().expect("counters are set"),
172 );
173 (Some(tx), threads)
174 }
175 } else {
176 Default::default()
177 };
178
179 let mut commit_idx = 0_u32;
180 let mut skipped_merge_commits = 0;
181 const CHUNK_SIZE: usize = 50;
182 let mut chunk = Vec::with_capacity(CHUNK_SIZE);
183 let mut commit_iter = commit_id.ancestors(&repo.objects);
184 let mut is_shallow = false;
185 while let Some(c) = commit_iter.next() {
186 progress.inc();
187 if gix::interrupt::is_triggered() {
188 bail!("Cancelled by user");
189 }
190 match c {
191 Ok(c) => {
192 tx.send((commit_idx, commit_iter.commit_data().to_owned())).ok();
193 let tree_delta_info = tx_tree_id.as_ref().and_then(|tx| {
194 let mut parents = c.parent_ids.into_iter();
195 parents
196 .next()
197 .map(|first_parent| (tx, Some(first_parent), c.id.to_owned()))
198 .filter(|_| {
199 if parents.next().is_some() {
200 skipped_merge_commits += 1;
201 false
202 } else {
203 true
204 }
205 })
206 });
207 if let Some((tx_tree, first_parent, commit)) = tree_delta_info {
208 if chunk.len() == CHUNK_SIZE {
209 tx_tree
210 .send(std::mem::replace(&mut chunk, Vec::with_capacity(CHUNK_SIZE)))
211 .ok();
212 } else {
213 chunk.push((commit_idx, first_parent, commit));
214 }
215 }
216 commit_idx += 1;
217 }
218 Err(gix::traverse::commit::simple::Error::Find { .. }) => {
219 is_shallow = true;
220 break;
221 }
222 Err(err) => return Err(err.into()),
223 }
224 }
225 if let Some(tx) = tx_tree_id {
226 tx.send(chunk).ok();
227 }
228 drop(tx);
229 progress.show_throughput(start);
230 drop(progress);
231
232 let stats_by_commit_idx = match stats_progresses {
233 Some((mut stat_progress, change_progress, line_progress)) => {
234 stat_progress.set_max(Some(commit_idx as usize - skipped_merge_commits));
235 let mut stats = Vec::new();
236 for handle in stat_threads {
237 stats.extend(handle.join().expect("no panic")?);
238 if gix::interrupt::is_triggered() {
239 bail!("Cancelled by user");
240 }
241 }
242 stats.sort_by_key(|t| t.0);
243 stat_progress.show_throughput(start);
244 change_progress.show_throughput(start);
245 line_progress.show_throughput(start);
246 stats
247 }
248 None => Vec::new(),
249 };
250
251 Ok((
252 extract_signatures.join().expect("no panic")?,
253 stats_by_commit_idx,
254 is_shallow,
255 skipped_merge_commits,
256 commit_idx,
257 ))
258 })?
259 };
260
261 if commit_authors.is_empty() {
262 bail!("No commits to process");
263 }
264
265 let start = Instant::now();
266 let mut current_email = &commit_authors[0].1.email;
267 let mut slice_start = 0;
268 let mut results_by_hours = Vec::new();
269 let mut ignored_bot_commits = 0_u32;
270 let mut push_estimate = |commits: &[(u32, SignatureRef<'static>)]| {
271 let estimate = estimate_hours(commits, &stats);
272 if ignore_bots && estimate.name.contains_str(b"[bot]") {
273 ignored_bot_commits += estimate.num_commits;
274 return;
275 }
276 results_by_hours.push(estimate);
277 };
278 for (idx, (_, elm)) in commit_authors.iter().enumerate() {
279 if elm.email != *current_email {
280 push_estimate(&commit_authors[slice_start..idx]);
281 slice_start = idx;
282 current_email = &elm.email;
283 }
284 }
285 if let Some(commits) = commit_authors.get(slice_start..) {
286 push_estimate(commits);
287 }
288
289 let num_authors = results_by_hours.len();
290 let mut results_by_hours = if !omit_unify_identities {
291 deduplicate_identities(&results_by_hours)
292 } else {
293 results_by_hours
294 .iter()
295 .fold(Vec::with_capacity(results_by_hours.len()), |mut acc, e| {
296 acc.push(e.into());
297 acc
298 })
299 };
300 let elapsed = start.elapsed();
301 progress.done(format!(
302 "Extracted and organized data from {} commits in {:?} ({:0.0} commits/s)",
303 num_commits,
304 elapsed,
305 num_commits as f32 / elapsed.as_secs_f32()
306 ));
307
308 let num_unique_authors = results_by_hours.len();
309 let total_hours = results_by_hours.iter().map(|e| e.hours).sum::<f32>();
310 let included_commit_ids = commit_authors
311 .iter()
312 .filter(|(_, author)| !(ignore_bots && author.name.contains_str(b"[bot]")))
313 .map(|(commit_idx, _)| *commit_idx)
314 .collect::<BTreeSet<_>>();
315 let total_commits = included_commit_ids.len() as u32;
316 let (total_files, total_lines) = stats
317 .iter()
318 .filter(|(commit_idx, _, _)| included_commit_ids.contains(commit_idx))
319 .fold(
320 (FileStats::default(), LineStats::default()),
321 |mut acc, (_, files, lines)| {
322 acc.0.add(files);
323 acc.1.add(lines);
324 acc
325 },
326 );
327 if show_pii {
328 results_by_hours.sort_by(|a, b| a.hours.partial_cmp(&b.hours).unwrap_or(std::cmp::Ordering::Equal));
329 for entry in &results_by_hours {
330 entry.write_to(
331 total_hours,
332 file_stats.then_some(total_files),
333 line_stats.then_some(total_lines),
334 &mut out,
335 )?;
336 writeln!(out)?;
337 }
338 }
339 writeln!(
340 out,
341 "total hours: {:.02}\ntotal 8h days: {:.02}\ntotal commits = {}{}\ntotal authors: {}",
342 total_hours,
343 total_hours / HOURS_PER_WORKDAY,
344 total_commits,
345 if is_shallow { " (shallow)" } else { Default::default() },
346 num_authors
347 )?;
348 if file_stats {
349 writeln!(
350 out,
351 "total files added/removed/modified/remaining: {}/{}/{}/{}",
352 total_files.added,
353 total_files.removed,
354 total_files.modified,
355 total_files.added - total_files.removed
356 )?;
357 }
358 if line_stats {
359 writeln!(
360 out,
361 "total lines added/removed/remaining: {}/{}/{}",
362 total_lines.added,
363 total_lines.removed,
364 total_lines.added - total_lines.removed
365 )?;
366 }
367 if !omit_unify_identities {
368 writeln!(
369 out,
370 "total unique authors: {} ({:.02}% duplication)",
371 num_unique_authors,
372 (1.0 - (num_unique_authors as f32 / num_authors as f32)) * 100.0
373 )?;
374 }
375 if ignored_bot_commits != 0 {
376 writeln!(out, "commits by bots: {ignored_bot_commits}")?;
377 }
378 if needs_stats && skipped_merge_commits != 0 {
379 writeln!(out, "stats omitted for {skipped_merge_commits} merge commits")?;
380 }
381 debug_assert!(total_commits <= num_commits);
382 Ok(())
383}
384
385mod core;
386use self::core::{deduplicate_identities, estimate_hours, HOURS_PER_WORKDAY};
387
388mod util;
389use util::{CommitIdx, FileStats, LineStats, WorkByEmail, WorkByPerson};
390
391use crate::hours::core::spawn_tree_delta_threads;
392
#[cfg(test)]
mod tests {
    use gix::bstr::ByteSlice;

    use super::commit_author_identities;

    // The author itself plus every parseable `Co-authored-by` trailer must be returned,
    // in trailer order, with the author first.
    #[test]
    fn commit_author_identities_include_coauthors() {
        let commit = b"tree 1111111111111111111111111111111111111111\n\
author Main Author <main@example.com> 1710000000 +0000\n\
committer Main Author <main@example.com> 1710000000 +0000\n\
\n\
subject\n\
\n\
body\n\
\n\
Co-authored-by: Second Author <second@example.com>\n\
Co-authored-by: Third Author <third@example.com>\n";
        let (author, authors) = commit_author_identities(commit).expect("valid commit");
        // The returned author keeps the raw timestamp from the `author` header.
        assert_eq!(author.time, "1710000000 +0000");
        assert_eq!(
            authors
                .iter()
                .map(|identity| (identity.name, identity.email))
                .collect::<Vec<_>>(),
            vec![
                (
                    "Main Author".as_bytes().as_bstr(),
                    "main@example.com".as_bytes().as_bstr()
                ),
                (
                    "Second Author".as_bytes().as_bstr(),
                    "second@example.com".as_bytes().as_bstr()
                ),
                (
                    "Third Author".as_bytes().as_bstr(),
                    "third@example.com".as_bytes().as_bstr()
                ),
            ]
        );
    }

    // Trailers that don't parse as `Name <email>` are skipped without failing the whole commit.
    #[test]
    fn commit_author_identities_skip_invalid_coauthors() {
        let commit = b"tree 1111111111111111111111111111111111111111\n\
author Main Author <main@example.com> 1710000000 +0000\n\
committer Main Author <main@example.com> 1710000000 +0000\n\
\n\
subject\n\
\n\
Co-authored-by: not a signature\n";
        let (_, authors) = commit_author_identities(commit).expect("valid commit");
        assert_eq!(authors.len(), 1);
        assert_eq!(authors[0].name, "Main Author".as_bytes().as_bstr());
        assert_eq!(authors[0].email, "main@example.com".as_bytes().as_bstr());
    }
}