Skip to main content

contributor_graphs/
lib.rs

1//! Build contributor timelines from a git or GitHub repository.
2//!
3//! `contributor-graphs` is primarily a command-line tool, but the same engine
4//! is available as a library. The usual flow is [`analyze`] to turn a
5//! repository into [`Contributor`] rows plus [`RepoMeta`], then one of the
6//! renderers in [`svg`] or [`html`].
7//!
8//! ```no_run
9//! use contributor_graphs::{analyze, svg, Config};
10//!
11//! let analysis = analyze("nf-core/rnaseq", &Config::default())?;
12//! let rows: Vec<_> = analysis.contributors.iter().filter(|c| !c.bot).cloned().collect();
13//! let opts = svg::SvgOptions {
14//!     title: analysis.meta.name.clone(),
15//!     ..Default::default()
16//! };
17//! std::fs::write("rnaseq.svg", svg::render_svg(&rows, &opts))?;
18//! # Ok::<(), anyhow::Error>(())
19//! ```
20//!
21//! The lower-level modules ([`repo`], [`identity`], [`github`]) are public too,
22//! for callers who want to assemble a custom pipeline.
23
24pub mod cache;
25pub mod github;
26pub mod html;
27pub mod identity;
28pub mod model;
29pub mod progress;
30pub mod repo;
31pub mod svg;
32pub mod theme;
33
34use anyhow::{bail, Result};
35use std::io::IsTerminal;
36use std::sync::atomic::{AtomicUsize, Ordering};
37use std::sync::Mutex;
38
39pub use model::{Contributor, RepoMeta};
40
/// Upper bound on the worker threads used to read per-source history in
/// parallel. Fewer are spawned when there are fewer sources than this.
const READ_THREADS: usize = 8;
43
44/// How to read history and resolve identities. Construct with `Config::default()`
45/// and override fields as needed.
46#[derive(Clone)]
47pub struct Config {
48    /// Branch or ref to read (default: `HEAD`).
49    pub branch: Option<String>,
50    /// Only include commits after this date (passed to `git log --since`).
51    pub since: Option<String>,
52    /// Only include commits before this date (`git log --until`).
53    pub until: Option<String>,
54    /// Skip merge commits.
55    pub no_merges: bool,
56    /// Override the chart/repository title.
57    pub title: Option<String>,
58    /// Exclude contributors whose name or login contains any of these strings.
59    pub exclude: Vec<String>,
60    /// When expanding a bare owner into its repositories, drop any whose slug
61    /// (`owner/repo`) or bare repo name matches one of these (case-insensitive).
62    pub exclude_repos: Vec<String>,
63    /// Manual `matcher → group` rules (matcher = name, email, or login),
64    /// optionally date-bounded for affiliations that change over time.
65    pub groups: Vec<model::GroupRule>,
66    /// Manual group-name aliases: `(canonical, [variants])`. Variants are
67    /// folded into the canonical name, which is then authoritative.
68    pub group_aliases: Vec<(String, Vec<String>)>,
69    /// Manual identity merges: each row is `[canonical, alias, …]`.
70    pub identities: Vec<Vec<String>>,
71    /// Authoritative display names: `(matcher, name)`. When a matcher (name,
72    /// email, or login) hits a cluster, `name` overrides the GitHub profile
73    /// name and the commit-derived name. Comes from the TSV affiliations file's
74    /// `full name` column; empty there for most people.
75    pub forced_names: Vec<(String, String)>,
76    /// Query the GitHub API for logins, avatars, and profiles.
77    pub use_github: bool,
78    /// Auto-detect affiliations from GitHub profile companies.
79    pub detect_affiliation: bool,
80    /// Merge identities that share a normalised author name.
81    pub merge_names: bool,
82    /// Count `Co-authored-by` trailers as commits for those contributors.
83    pub count_coauthors: bool,
84    /// Download avatars and embed them as data URIs.
85    pub embed_avatars: bool,
86    /// Avatar pixel size to request when embedding.
87    pub avatar_size: u32,
88    /// Ignore cached git history and GitHub lookups, forcing a fresh pull
89    /// (the caches are still refreshed with the new results).
90    pub refresh: bool,
91    /// Print progress to stderr.
92    pub verbose: bool,
93}
94
95impl Default for Config {
96    fn default() -> Self {
97        Config {
98            branch: None,
99            since: None,
100            until: None,
101            no_merges: false,
102            title: None,
103            exclude: Vec::new(),
104            exclude_repos: Vec::new(),
105            groups: Vec::new(),
106            group_aliases: Vec::new(),
107            identities: Vec::new(),
108            forced_names: Vec::new(),
109            use_github: true,
110            detect_affiliation: true,
111            merge_names: true,
112            count_coauthors: true,
113            embed_avatars: true,
114            avatar_size: 64,
115            refresh: false,
116            verbose: false,
117        }
118    }
119}
120
121/// The result of [`analyze`]: every contributor (bots included — filter on
122/// [`Contributor::bot`] if you don't want them) and repository metadata.
123pub struct Analysis {
124    pub contributors: Vec<Contributor>,
125    pub meta: RepoMeta,
126}
127
/// Row ordering for [`sort`].
///
/// Derives `Debug` so the chosen ordering can be logged or shown in assertion
/// failures by library callers.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum Sort {
    /// Oldest first commit at the top.
    First,
    /// Most recent commit first.
    Last,
    /// Most commits first.
    Commits,
    /// Longest active period first.
    Duration,
    /// Alphabetical by name.
    Name,
}
142
143/// Sort contributor rows in place.
144pub fn sort(rows: &mut [Contributor], key: Sort) {
145    match key {
146        Sort::First => rows.sort_by(|a, b| a.first.cmp(&b.first).then(b.commits.cmp(&a.commits))),
147        Sort::Last => rows.sort_by(|a, b| b.last.cmp(&a.last).then(b.commits.cmp(&a.commits))),
148        Sort::Commits => rows.sort_by_key(|c| std::cmp::Reverse(c.commits)),
149        Sort::Duration => rows.sort_by_key(|c| std::cmp::Reverse(c.last - c.first)),
150        Sort::Name => rows.sort_by_key(|a| a.name.to_lowercase()),
151    }
152}
153
154/// Resolve a single repository (local path, `owner/repo` slug, or git URL)
155/// into contributor data and metadata. Shorthand for [`analyze_many`] with one
156/// source.
157pub fn analyze(input: &str, cfg: &Config) -> Result<Analysis> {
158    analyze_many(std::slice::from_ref(&input), cfg)
159}
160
/// Decide whether an org-expanded slug (`owner/repo`) is on the user's exclude
/// list.
///
/// An entry matches when, after trimming and lowercasing, it equals either the
/// whole slug or just the part after the last `/`. So `nf-validation` and
/// `nextflow-io/nf-validation` both exclude the same repository. Blank entries
/// never match.
fn repo_excluded(slug: &str, excludes: &[String]) -> bool {
    if excludes.is_empty() {
        return false;
    }

    let slug_lower = slug.to_lowercase();
    // Bare repo name: everything after the final '/', or the whole slug when
    // there is no '/'.
    let repo_name = match slug_lower.rfind('/') {
        Some(idx) => &slug_lower[idx + 1..],
        None => slug_lower.as_str(),
    };

    excludes
        .iter()
        .map(|raw| raw.trim().to_lowercase())
        .filter(|pattern| !pattern.is_empty())
        .any(|pattern| pattern == slug_lower || pattern == repo_name)
}
176
177/// Resolve one or more repositories into a single combined timeline. Commits
178/// from every source are pooled, author identities are clustered across the
179/// whole pool, and commits that appear in more than one source (overlapping
180/// histories) are de-duplicated by commit SHA. Disjoint sources contribute
181/// distinct SHAs and so are simply concatenated.
182pub fn analyze_many(inputs: &[&str], cfg: &Config) -> Result<Analysis> {
183    macro_rules! log {
184        ($($arg:tt)*) => { if cfg.verbose { eprintln!($($arg)*); } };
185    }
186    if inputs.is_empty() {
187        bail!("no repository sources given");
188    }
189
190    let client = github::GhClient::new(if cfg.use_github {
191        github::find_token()
192    } else {
193        None
194    });
195    let now = chrono::Utc::now().timestamp();
196    let mut caches = cache::Caches::load(cfg.refresh, now);
197
198    // Expand any bare owner names (a single token that is not a local path,
199    // slug, or URL) into every repository under that GitHub org or user.
200    // Everything else passes through unchanged. Explicit repos and whole orgs
201    // can be mixed freely; overlaps are dropped later by commit SHA.
202    let mut sources: Vec<String> = Vec::new();
203    for input in inputs {
204        if repo::looks_like_owner(input) {
205            if !cfg.use_github {
206                bail!("'{input}' looks like an org/user, but listing its repositories needs GitHub access (remove --no-github, or pass owner/repo slugs)");
207            }
208            let (slugs, cached) = match caches.org_repos(input) {
209                Some(repos) => (repos, true),
210                None => {
211                    log!("→ listing repositories for '{input}'");
212                    let fetched = client.list_owner_repos(input);
213                    if !fetched.is_empty() {
214                        caches.put_org_repos((*input).to_string(), fetched.clone());
215                    }
216                    (fetched, false)
217                }
218            };
219            if slugs.is_empty() {
220                if inputs.len() == 1 {
221                    bail!("no repositories found for org/user '{input}' (it may not exist or has no non-fork repos)");
222                }
223                log!("  warning: no repositories found for '{input}'");
224            } else {
225                let before = slugs.len();
226                let kept: Vec<String> = slugs
227                    .into_iter()
228                    .filter(|s| !repo_excluded(s, &cfg.exclude_repos))
229                    .collect();
230                let excluded = before - kept.len();
231                log!(
232                    "  {} repositories{}{}",
233                    kept.len(),
234                    if cached { " (cached)" } else { "" },
235                    if excluded > 0 {
236                        format!(", {excluded} excluded")
237                    } else {
238                        String::new()
239                    }
240                );
241                sources.extend(kept);
242            }
243        } else {
244            sources.push((*input).to_string());
245        }
246    }
247    if sources.is_empty() {
248        bail!("no usable repository sources");
249    }
250
251    // Prepare each source. With several sources, a single repo that fails to
252    // clone or has no commits shouldn't abort the whole pool: skip it with a
253    // warning. A single-source run still surfaces the error.
254    // Show progress bars only when there are several items to count, the caller
255    // wants output, and stderr is a real terminal (so piped/CI logs stay plain).
256    let multi = sources.len() > 1;
257    let show_bars = cfg.verbose && multi && std::io::stderr().is_terminal();
258    let mut prepared: Vec<repo::PreparedRepo> = Vec::new();
259    let clone_bar = progress::bar("cloning repositories", sources.len(), show_bars);
260    for input in &sources {
261        // With the bar active, `prepare` stays quiet so the bar owns the line.
262        match repo::prepare(input, cfg.branch.as_deref(), show_bars) {
263            Ok(p) => prepared.push(p),
264            Err(e) if multi => {
265                clone_bar.suspend(|| log!("  warning: skipping source '{input}' ({e})"))
266            }
267            Err(e) => return Err(e),
268        }
269        clone_bar.inc(1);
270    }
271    clone_bar.finish_and_clear();
272    if prepared.is_empty() {
273        bail!("no usable repository sources");
274    }
275    let source_slugs: Vec<Option<String>> = prepared.iter().map(|p| p.slug.clone()).collect();
276    // A line per source is just noise once the bar is in play (the commit
277    // summary below covers the totals); keep it for a single source or plain logs.
278    if !show_bars {
279        for p in &prepared {
280            log!("→ source: {} (branch {})", p.display_name, p.branch);
281        }
282    }
283
284    let filter = model::CommitFilter {
285        since: cfg.since.clone(),
286        until: cfg.until.clone(),
287        no_merges: cfg.no_merges,
288    };
289    let branch = cfg.branch.as_deref();
290
291    // Read each source's history, reusing the cached `git log` when the branch
292    // tip is unchanged. Sources are independent, so read them in parallel and
293    // merge in source order afterwards, which keeps de-duplication deterministic
294    // (an earlier source keeps a SHA it shares with a later one).
295    let outcomes: Vec<Mutex<Option<Result<SourceRead>>>> =
296        (0..prepared.len()).map(|_| Mutex::new(None)).collect();
297    let cursor = AtomicUsize::new(0);
298    let read_bar = progress::bar(
299        "reading history",
300        prepared.len(),
301        show_bars && prepared.len() > 1,
302    );
303    std::thread::scope(|s| {
304        for _ in 0..READ_THREADS.min(prepared.len()) {
305            s.spawn(|| loop {
306                let i = cursor.fetch_add(1, Ordering::Relaxed);
307                let Some(p) = prepared.get(i) else { break };
308                let r = read_source(p, &caches, &filter, branch);
309                *outcomes[i].lock().unwrap() = Some(r);
310                read_bar.inc(1);
311            });
312        }
313    });
314    read_bar.finish_and_clear();
315
316    let mut commits: Vec<model::Commit> = Vec::new();
317    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
318    let mut duplicates = 0u64;
319    let mut cached_sources = 0usize;
320    for (i, (p, slot)) in prepared.iter().zip(outcomes).enumerate() {
321        let read = match slot.into_inner().unwrap() {
322            Some(Ok(r)) => r,
323            Some(Err(e)) if prepared.len() > 1 => {
324                log!("  warning: skipping {} ({e})", p.display_name);
325                continue;
326            }
327            Some(Err(e)) => return Err(e),
328            None => continue,
329        };
330        if read.from_cache {
331            cached_sources += 1;
332        }
333        for mut c in read.commits {
334            if !seen.insert(c.sha.clone()) {
335                duplicates += 1;
336                continue;
337            }
338            c.src = i as u32;
339            commits.push(c);
340        }
341    }
342    if commits.is_empty() {
343        bail!("no commits found");
344    }
345    if cached_sources > 0 {
346        log!(
347            "→ reused cached history for {cached_sources}/{} sources",
348            prepared.len()
349        );
350    }
351    if prepared.len() > 1 {
352        log!(
353            "→ {} commits from {} sources ({} duplicate commits dropped), {} distinct author emails",
354            model::thousands(commits.len() as u64),
355            prepared.len(),
356            model::thousands(duplicates),
357            distinct_emails(&commits)
358        );
359    } else {
360        log!(
361            "→ {} commits from {} distinct author emails",
362            model::thousands(commits.len() as u64),
363            distinct_emails(&commits)
364        );
365    }
366
367    let mut clusters = identity::cluster_commits(&commits, cfg.merge_names);
368
369    let any_slug = source_slugs.iter().any(|s| s.is_some());
370    if cfg.use_github {
371        if any_slug {
372            log!("→ enriching from GitHub");
373            github::enrich_clusters(
374                &mut clusters,
375                &commits,
376                &source_slugs,
377                &client,
378                &mut caches,
379                cfg.verbose,
380            );
381            clusters = identity::merge_by_login(clusters);
382            github::fetch_profiles(&mut clusters, &client, &mut caches, cfg.verbose);
383            if !cfg.detect_affiliation {
384                for cl in clusters.iter_mut() {
385                    cl.affiliation = None;
386                }
387            }
388        } else {
389            log!("→ no GitHub sources, skipping enrichment");
390        }
391    }
392
393    if !cfg.identities.is_empty() {
394        clusters = identity::apply_identity_file(clusters, &cfg.identities);
395        log!("→ applied {} identity overrides", cfg.identities.len());
396    }
397
398    let mut contributors = identity::build_contributors(
399        &clusters,
400        &commits,
401        &cfg.groups,
402        &cfg.forced_names,
403        cfg.count_coauthors,
404    );
405
406    // Apply manual group-name aliases: fold each variant into its canonical
407    // name (case-insensitively), on both the primary group and the per-month
408    // affiliations.
409    apply_group_aliases(&mut contributors, &cfg.group_aliases);
410
411    // Collapse a leading "The " (case-insensitively) so e.g. "The Francis Crick
412    // Institute" and "Francis Crick Institute" are treated as one affiliation.
413    strip_leading_the(&mut contributors);
414
415    // Names the user supplied (manual affiliations and alias canonicals) are
416    // authoritative; canonicalisation only folds auto-detected variants and
417    // leaves these exactly as written ("The"-stripped to match the rows).
418    let mut manual_groups: std::collections::HashSet<String> = cfg
419        .groups
420        .iter()
421        .map(|r| strip_the(&r.group).to_string())
422        .collect();
423    manual_groups.extend(
424        cfg.group_aliases
425            .iter()
426            .map(|(canon, _)| strip_the(canon).to_string()),
427    );
428    let n_groups = canonicalize_groups(&mut contributors, &manual_groups);
429    if n_groups > 0 {
430        log!("→ {n_groups} distinct affiliations/groups");
431    }
432
433    if !cfg.exclude.is_empty() {
434        contributors.retain(|c| {
435            !cfg.exclude.iter().any(|pat| {
436                let p = pat.to_lowercase();
437                c.name.to_lowercase().contains(&p)
438                    || c.login
439                        .as_deref()
440                        .is_some_and(|l| l.to_lowercase().contains(&p))
441            })
442        });
443    }
444
445    log!(
446        "→ merged to {} contributors ({} bots)",
447        contributors.len(),
448        contributors.iter().filter(|c| c.bot).count()
449    );
450
451    if cfg.embed_avatars && cfg.use_github {
452        github::embed_avatars(
453            &mut contributors,
454            &client,
455            &mut caches,
456            cfg.avatar_size,
457            cfg.verbose,
458        );
459    }
460
461    // A single source keeps its slug/url/branch; multiple sources collapse to a
462    // combined label with no single canonical URL.
463    let single = if prepared.len() == 1 {
464        Some(&prepared[0])
465    } else {
466        None
467    };
468
469    // The GitHub owner shared by every source, if any (a single repo or a
470    // same-owner pool such as a whole-org timeline).
471    let owner = common_owner(&prepared);
472
473    // Owner/org avatar for the interactive page header. Shown for a single
474    // GitHub source and for a same-owner pool, but not for a mix of owners.
475    let owner_avatar = if cfg.use_github && cfg.embed_avatars {
476        owner
477            .as_deref()
478            .and_then(|owner| github::fetch_avatar(&client, &mut caches, owner, 48))
479    } else {
480        None
481    };
482
483    // Repository description (single GitHub source).
484    let description = if cfg.use_github {
485        single
486            .and_then(|p| p.slug.as_deref())
487            .and_then(|slug| github::fetch_repo_description(&client, slug))
488    } else {
489        None
490    };
491
492    // A single repo keeps its name; a same-owner pool is labelled by the owner
493    // (so a whole org reads as "nf-core"); a mixed pool joins the repo names.
494    let default_name = match (single, &owner) {
495        (Some(p), _) => p.display_name.clone(),
496        (None, Some(owner)) => owner.clone(),
497        (None, None) => combined_name(&prepared),
498    };
499    let branch = match single {
500        Some(p) => p.branch.clone(),
501        None => "combined".to_string(),
502    };
503
504    // Tagged releases, for timeline release markers. Gathered from every
505    // source; with more than one, the repo name is prefixed onto each tag so a
506    // pooled / org timeline's markers stay attributable (and necessarily
507    // busier — the page can toggle them off).
508    let releases: Vec<model::Release> = if prepared.len() == 1 {
509        repo::read_tags(&prepared[0])
510    } else {
511        prepared
512            .iter()
513            .flat_map(|p| {
514                repo::read_tags(p).into_iter().map(|mut r| {
515                    r.name = format!("{} {}", p.display_name, r.name);
516                    r
517                })
518            })
519            .collect()
520    };
521    if !releases.is_empty() {
522        log!("→ {} releases", releases.len());
523    }
524
525    let first = contributors.iter().map(|c| c.first).min().unwrap_or(0);
526    let last = contributors.iter().map(|c| c.last).max().unwrap_or(0);
527    let meta = RepoMeta {
528        name: cfg.title.clone().unwrap_or(default_name),
529        url: single.and_then(|p| p.url.clone()),
530        slug: single.and_then(|p| p.slug.clone()),
531        branch,
532        first,
533        last,
534        total_commits: commits.len() as u64,
535        total_contributors: contributors.iter().filter(|c| !c.bot).count(),
536        generated: chrono::Utc::now().format("%Y-%m-%d").to_string(),
537        owner_avatar,
538        description,
539        releases,
540    };
541
542    caches.save();
543    Ok(Analysis { contributors, meta })
544}
545
546struct SourceRead {
547    commits: Vec<model::Commit>,
548    from_cache: bool,
549}
550
551/// Read one source's history, reusing the cached `git log` when the branch tip
552/// is unchanged. On a miss, the clone is brought up to date, parsed, and the
553/// result is written back to the cache against the new tip.
554fn read_source(
555    p: &repo::PreparedRepo,
556    caches: &cache::Caches,
557    filter: &model::CommitFilter,
558    branch: Option<&str>,
559) -> Result<SourceRead> {
560    let key = source_cache_key(p);
561    // Freshness token: the remote tip via `ls-remote` (no object transfer),
562    // falling back to the local tip when offline or for a local checkout.
563    let remote = repo::remote_tip(p);
564    let tip = remote.clone().or_else(|| repo::local_tip(p));
565
566    if let Some(tip) = &tip {
567        if let Some(cached) = caches.commits(&key, tip, filter) {
568            let commits = cached
569                .into_iter()
570                .map(|c| model::Commit {
571                    sha: c.sha,
572                    ts: c.ts,
573                    name: c.name,
574                    email: c.email,
575                    coauthors: c.coauthors,
576                    src: 0,
577                })
578                .collect();
579            return Ok(SourceRead {
580                commits,
581                from_cache: true,
582            });
583        }
584    }
585
586    // Cache miss. Update the clone only when it is genuinely behind the remote;
587    // a just-cloned repo or an offline run reads what is already on disk.
588    let local = repo::local_tip(p);
589    if p.is_remote && remote.is_some() && remote != local {
590        repo::fetch(p);
591    }
592    let commits = repo::read_commits(p, branch, filter)?;
593    // After a fetch the tip moved, so re-read it; otherwise the pre-fetch local
594    // tip still stands.
595    if let Some(tip) = repo::local_tip(p) {
596        let cached = commits
597            .iter()
598            .map(|c| cache::CachedCommit {
599                sha: c.sha.clone(),
600                ts: c.ts,
601                name: c.name.clone(),
602                email: c.email.clone(),
603                coauthors: c.coauthors.clone(),
604            })
605            .collect();
606        caches.put_commits(&key, &tip, filter, cached);
607    }
608    Ok(SourceRead {
609        commits,
610        from_cache: false,
611    })
612}
613
614/// Stable per-repo cache key: the slug (or display name) plus the branch.
615fn source_cache_key(p: &repo::PreparedRepo) -> String {
616    let base = p.slug.as_deref().unwrap_or(&p.display_name);
617    repo::sanitize(&format!("{base}__{}", p.branch))
618}
619
620/// Build a chart title for a multi-source run: join up to three source names
621/// with " + ", and summarise the rest as "+N more".
622fn combined_name(prepared: &[repo::PreparedRepo]) -> String {
623    let names: Vec<&str> = prepared.iter().map(|p| p.display_name.as_str()).collect();
624    match names.len() {
625        0 => "repositories".to_string(),
626        1..=3 => names.join(" + "),
627        n => format!("{} + {} more", names[..2].join(" + "), n - 2),
628    }
629}
630
631/// The GitHub owner shared by every source, if all sources are GitHub slugs
632/// under one owner. Returns `None` if any source has no slug or the owners
633/// differ, so a mixed-owner run falls back to the generic header icon.
634fn common_owner(prepared: &[repo::PreparedRepo]) -> Option<String> {
635    let mut owner: Option<String> = None;
636    for p in prepared {
637        let o = p.slug.as_deref()?.split('/').next()?.to_string();
638        match &owner {
639            Some(prev) if *prev != o => return None,
640            _ => owner = Some(o),
641        }
642    }
643    owner
644}
645
646fn distinct_emails(commits: &[model::Commit]) -> usize {
647    let mut e: Vec<&str> = commits.iter().map(|c| c.email.as_str()).collect();
648    e.sort_unstable();
649    e.dedup();
650    e.len()
651}
652
653/// Merge group-name variants that refer to the same organisation:
654/// case/punctuation differences ("Seqera Labs" vs "seqeralabs"), a leading
655/// "The", and prefix forms ("Seqera" vs "Seqera Labs"). Returns the final
656/// group count.
657/// Rewrite group names that match a manual alias to their canonical form, on
658/// both the primary `group` and per-month affiliations (matched case-insensitively).
659fn apply_group_aliases(contributors: &mut [Contributor], aliases: &[(String, Vec<String>)]) {
660    if aliases.is_empty() {
661        return;
662    }
663    let mut map: std::collections::HashMap<String, String> = std::collections::HashMap::new();
664    for (canon, variants) in aliases {
665        map.insert(canon.to_lowercase(), canon.clone());
666        for v in variants {
667            map.insert(v.to_lowercase(), canon.clone());
668        }
669    }
670    let canon = |g: &str| map.get(&g.to_lowercase()).cloned();
671    for c in contributors.iter_mut() {
672        if let Some(g) = &c.group {
673            if let Some(cn) = canon(g) {
674                c.group = Some(cn);
675            }
676        }
677        if let Some(mg) = &mut c.month_groups {
678            for slot in mg.iter_mut().flatten() {
679                if let Some(cn) = canon(slot) {
680                    *slot = cn;
681                }
682            }
683        }
684    }
685}
686
/// Return `g` without a leading "The " (matched ASCII case-insensitively) and
/// without any whitespace that follows it.
///
/// The input is returned unchanged when it does not start with "the " or when
/// nothing but whitespace would remain after stripping.
fn strip_the(g: &str) -> &str {
    // Fewer than four bytes, or byte 4 falls inside a multi-byte character:
    // the prefix cannot be the ASCII text "the ".
    if !g.is_char_boundary(4) {
        return g;
    }
    let (head, tail) = g.split_at(4);
    if !head.eq_ignore_ascii_case("the ") {
        return g;
    }
    let rest = tail.trim_start();
    if rest.is_empty() {
        g
    } else {
        rest
    }
}
702
703/// Drop a leading "The " from every group name (primary and per-month) so
704/// "The X" collapses into "X".
705fn strip_leading_the(contributors: &mut [Contributor]) {
706    for c in contributors.iter_mut() {
707        if let Some(g) = &mut c.group {
708            if strip_the(g).len() != g.len() {
709                *g = strip_the(g).to_string();
710            }
711        }
712        if let Some(mg) = &mut c.month_groups {
713            for slot in mg.iter_mut().flatten() {
714                if strip_the(slot).len() != slot.len() {
715                    *slot = strip_the(slot).to_string();
716                }
717            }
718        }
719    }
720}
721
722fn canonicalize_groups(
723    contributors: &mut [Contributor],
724    manual: &std::collections::HashSet<String>,
725) -> usize {
726    use std::collections::HashMap;
727    let alnum_key = |g: &str| -> String {
728        let lower = g.to_lowercase();
729        let trimmed = lower.strip_prefix("the ").unwrap_or(&lower);
730        trimmed.chars().filter(|c| c.is_alphanumeric()).collect()
731    };
732
733    let mut variants: HashMap<String, usize> = HashMap::new();
734    for c in contributors.iter() {
735        if let Some(g) = &c.group {
736            *variants.entry(g.clone()).or_default() += 1;
737        }
738    }
739
740    // Prefix-merge keys come only from auto-detected names: a name the user
741    // wrote in --groups is authoritative, gets its own cluster, and is never
742    // folded into (or renamed by) a detected variant.
743    let mut keys: Vec<String> = variants
744        .keys()
745        .filter(|g| !manual.contains(*g))
746        .map(|g| alnum_key(g))
747        .collect();
748    keys.sort();
749    keys.dedup();
750    let resolve = |key: &str| -> String {
751        keys.iter()
752            .filter(|k| k.len() >= 6 && key.starts_with(*k))
753            .min_by_key(|k| k.len())
754            .map(|k| k.to_string())
755            .unwrap_or_else(|| key.to_string())
756    };
757    let cluster_of = |g: &str| -> String {
758        if manual.contains(g) {
759            format!("\u{0}{g}")
760        } else {
761            resolve(&alnum_key(g))
762        }
763    };
764
765    let mut best: HashMap<String, (&String, usize)> = HashMap::new();
766    for (g, n) in &variants {
767        let cluster = cluster_of(g);
768        let score = |g: &str, n: usize| {
769            n * 4
770                + usize::from(g.contains(' ')) * 2
771                + usize::from(g.chars().any(|c| c.is_uppercase()))
772        };
773        let entry = best.entry(cluster).or_insert((g, *n));
774        if score(g, *n) > score(entry.0, entry.1) {
775            *entry = (g, *n);
776        }
777    }
778
779    let display: HashMap<String, String> = best
780        .iter()
781        .map(|(k, (g, _))| (k.clone(), (*g).clone()))
782        .collect();
783    for c in contributors.iter_mut() {
784        if let Some(g) = &c.group {
785            c.group = display.get(&cluster_of(g)).cloned().or(c.group.clone());
786        }
787    }
788    display.len()
789}