Skip to main content

contributor_graphs/
lib.rs

//! Build contributor timelines from a git or GitHub repository.
//!
//! `contributor-graphs` is primarily a command-line tool, but the same engine
//! is available as a library. The usual flow is [`analyze`] to turn a
//! repository into [`Contributor`] rows plus [`RepoMeta`], then one of the
//! renderers in [`svg`] or [`html`].
//!
//! ```no_run
//! use contributor_graphs::{analyze, svg, Config};
//!
//! let analysis = analyze("nf-core/rnaseq", &Config::default())?;
//! let rows: Vec<_> = analysis.contributors.iter().filter(|c| !c.bot).cloned().collect();
//! let opts = svg::SvgOptions {
//!     title: analysis.meta.name.clone(),
//!     ..Default::default()
//! };
//! std::fs::write("rnaseq.svg", svg::render_svg(&rows, &opts))?;
//! # Ok::<(), anyhow::Error>(())
//! ```
//!
//! The lower-level modules ([`repo`], [`identity`], [`github`]) are public too,
//! for callers who want to assemble a custom pipeline.
23
24pub mod cache;
25pub mod github;
26pub mod html;
27pub mod identity;
28pub mod model;
29pub mod repo;
30pub mod svg;
31pub mod theme;
32
33use anyhow::{bail, Result};
34use std::sync::atomic::{AtomicUsize, Ordering};
35use std::sync::Mutex;
36
37pub use model::{Contributor, RepoMeta};
38
/// Upper bound on worker threads for reading per-source history in parallel.
/// [`analyze_many`] spawns `READ_THREADS.min(number of prepared sources)`
/// workers, so a run with fewer sources starts fewer threads.
const READ_THREADS: usize = 8;
41
/// How to read history and resolve identities. Construct with `Config::default()`
/// and override fields as needed.
#[derive(Clone)]
pub struct Config {
    /// Branch or ref to read (default: `HEAD`).
    pub branch: Option<String>,
    /// Only include commits after this date (passed to `git log --since`).
    pub since: Option<String>,
    /// Only include commits before this date (`git log --until`).
    pub until: Option<String>,
    /// Skip merge commits.
    pub no_merges: bool,
    /// Override the chart/repository title. When `None`, the title is derived
    /// from the source(s): the repo name, the shared owner, or a joined label.
    pub title: Option<String>,
    /// Exclude contributors whose name or login contains any of these strings
    /// (compared case-insensitively).
    pub exclude: Vec<String>,
    /// Manual `matcher → group` rules (matcher = name, email, or login),
    /// optionally date-bounded for affiliations that change over time.
    pub groups: Vec<model::GroupRule>,
    /// Manual group-name aliases: `(canonical, [variants])`. Variants are
    /// folded into the canonical name, which is then authoritative.
    pub group_aliases: Vec<(String, Vec<String>)>,
    /// Manual identity merges: each row is `[canonical, alias, …]`.
    pub identities: Vec<Vec<String>>,
    /// Authoritative display names: `(matcher, name)`. When a matcher (name,
    /// email, or login) hits a cluster, `name` overrides the GitHub profile
    /// name and the commit-derived name. Comes from the TSV affiliations file's
    /// `full name` column; empty there for most people.
    pub forced_names: Vec<(String, String)>,
    /// Query the GitHub API for logins, avatars, and profiles. Also required
    /// for expanding a bare org/user name into its repositories.
    pub use_github: bool,
    /// Auto-detect affiliations from GitHub profile companies. When `false`,
    /// affiliations gathered during GitHub enrichment are discarded.
    pub detect_affiliation: bool,
    /// Merge identities that share a normalised author name.
    pub merge_names: bool,
    /// Count `Co-authored-by` trailers as commits for those contributors.
    pub count_coauthors: bool,
    /// Download avatars and embed them as data URIs. Only takes effect when
    /// `use_github` is also set.
    pub embed_avatars: bool,
    /// Avatar pixel size to request when embedding contributor avatars.
    pub avatar_size: u32,
    /// Ignore cached git history and GitHub lookups, forcing a fresh pull
    /// (the caches are still refreshed with the new results).
    pub refresh: bool,
    /// Print progress to stderr.
    pub verbose: bool,
}
89
90impl Default for Config {
91    fn default() -> Self {
92        Config {
93            branch: None,
94            since: None,
95            until: None,
96            no_merges: false,
97            title: None,
98            exclude: Vec::new(),
99            groups: Vec::new(),
100            group_aliases: Vec::new(),
101            identities: Vec::new(),
102            forced_names: Vec::new(),
103            use_github: true,
104            detect_affiliation: true,
105            merge_names: true,
106            count_coauthors: true,
107            embed_avatars: true,
108            avatar_size: 64,
109            refresh: false,
110            verbose: false,
111        }
112    }
113}
114
/// The result of [`analyze`]: every contributor (bots included — filter on
/// [`Contributor::bot`] if you don't want them) and repository metadata.
pub struct Analysis {
    /// One row per resolved contributor, after group aliasing/canonicalisation
    /// and after the `Config::exclude` filter has been applied.
    pub contributors: Vec<Contributor>,
    /// Title, source details, date range, totals, and releases for the run.
    pub meta: RepoMeta,
}
121
/// Row ordering for [`sort`].
///
/// Derives `Debug` so values can be printed and used in `assert_eq!`.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum Sort {
    /// Oldest first commit at the top (ties: most commits first).
    First,
    /// Most recent commit first (ties: most commits first).
    Last,
    /// Most commits first.
    Commits,
    /// Longest active period (last commit minus first commit) first.
    Duration,
    /// Alphabetical by name, ignoring case.
    Name,
}
136
137/// Sort contributor rows in place.
138pub fn sort(rows: &mut [Contributor], key: Sort) {
139    match key {
140        Sort::First => rows.sort_by(|a, b| a.first.cmp(&b.first).then(b.commits.cmp(&a.commits))),
141        Sort::Last => rows.sort_by(|a, b| b.last.cmp(&a.last).then(b.commits.cmp(&a.commits))),
142        Sort::Commits => rows.sort_by_key(|c| std::cmp::Reverse(c.commits)),
143        Sort::Duration => rows.sort_by_key(|c| std::cmp::Reverse(c.last - c.first)),
144        Sort::Name => rows.sort_by_key(|a| a.name.to_lowercase()),
145    }
146}
147
148/// Resolve a single repository (local path, `owner/repo` slug, or git URL)
149/// into contributor data and metadata. Shorthand for [`analyze_many`] with one
150/// source.
151pub fn analyze(input: &str, cfg: &Config) -> Result<Analysis> {
152    analyze_many(std::slice::from_ref(&input), cfg)
153}
154
155/// Resolve one or more repositories into a single combined timeline. Commits
156/// from every source are pooled, author identities are clustered across the
157/// whole pool, and commits that appear in more than one source (overlapping
158/// histories) are de-duplicated by commit SHA. Disjoint sources contribute
159/// distinct SHAs and so are simply concatenated.
160pub fn analyze_many(inputs: &[&str], cfg: &Config) -> Result<Analysis> {
161    macro_rules! log {
162        ($($arg:tt)*) => { if cfg.verbose { eprintln!($($arg)*); } };
163    }
164    if inputs.is_empty() {
165        bail!("no repository sources given");
166    }
167
168    let client = github::GhClient::new(if cfg.use_github {
169        github::find_token()
170    } else {
171        None
172    });
173    let now = chrono::Utc::now().timestamp();
174    let mut caches = cache::Caches::load(cfg.refresh, now);
175
176    // Expand any bare owner names (a single token that is not a local path,
177    // slug, or URL) into every repository under that GitHub org or user.
178    // Everything else passes through unchanged. Explicit repos and whole orgs
179    // can be mixed freely; overlaps are dropped later by commit SHA.
180    let mut sources: Vec<String> = Vec::new();
181    for input in inputs {
182        if repo::looks_like_owner(input) {
183            if !cfg.use_github {
184                bail!("'{input}' looks like an org/user, but listing its repositories needs GitHub access (remove --no-github, or pass owner/repo slugs)");
185            }
186            let (slugs, cached) = match caches.org_repos(input) {
187                Some(repos) => (repos, true),
188                None => {
189                    log!("→ listing repositories for '{input}'");
190                    let fetched = client.list_owner_repos(input);
191                    if !fetched.is_empty() {
192                        caches.put_org_repos((*input).to_string(), fetched.clone());
193                    }
194                    (fetched, false)
195                }
196            };
197            if slugs.is_empty() {
198                if inputs.len() == 1 {
199                    bail!("no repositories found for org/user '{input}' (it may not exist or has no non-fork repos)");
200                }
201                log!("  warning: no repositories found for '{input}'");
202            } else {
203                log!(
204                    "  {} repositories{}",
205                    slugs.len(),
206                    if cached { " (cached)" } else { "" }
207                );
208                sources.extend(slugs);
209            }
210        } else {
211            sources.push((*input).to_string());
212        }
213    }
214    if sources.is_empty() {
215        bail!("no usable repository sources");
216    }
217
218    // Prepare each source. With several sources, a single repo that fails to
219    // clone or has no commits shouldn't abort the whole pool: skip it with a
220    // warning. A single-source run still surfaces the error.
221    let mut prepared: Vec<repo::PreparedRepo> = Vec::new();
222    for input in &sources {
223        match repo::prepare(input, cfg.branch.as_deref()) {
224            Ok(p) => prepared.push(p),
225            Err(e) if sources.len() > 1 => log!("  warning: skipping source '{input}' ({e})"),
226            Err(e) => return Err(e),
227        }
228    }
229    if prepared.is_empty() {
230        bail!("no usable repository sources");
231    }
232    let source_slugs: Vec<Option<String>> = prepared.iter().map(|p| p.slug.clone()).collect();
233    for p in &prepared {
234        log!("→ source: {} (branch {})", p.display_name, p.branch);
235    }
236
237    let filter = model::CommitFilter {
238        since: cfg.since.clone(),
239        until: cfg.until.clone(),
240        no_merges: cfg.no_merges,
241    };
242    let branch = cfg.branch.as_deref();
243
244    // Read each source's history, reusing the cached `git log` when the branch
245    // tip is unchanged. Sources are independent, so read them in parallel and
246    // merge in source order afterwards, which keeps de-duplication deterministic
247    // (an earlier source keeps a SHA it shares with a later one).
248    let outcomes: Vec<Mutex<Option<Result<SourceRead>>>> =
249        (0..prepared.len()).map(|_| Mutex::new(None)).collect();
250    let cursor = AtomicUsize::new(0);
251    std::thread::scope(|s| {
252        for _ in 0..READ_THREADS.min(prepared.len()) {
253            s.spawn(|| loop {
254                let i = cursor.fetch_add(1, Ordering::Relaxed);
255                let Some(p) = prepared.get(i) else { break };
256                let r = read_source(p, &caches, &filter, branch);
257                *outcomes[i].lock().unwrap() = Some(r);
258            });
259        }
260    });
261
262    let mut commits: Vec<model::Commit> = Vec::new();
263    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
264    let mut duplicates = 0u64;
265    let mut cached_sources = 0usize;
266    for (i, (p, slot)) in prepared.iter().zip(outcomes).enumerate() {
267        let read = match slot.into_inner().unwrap() {
268            Some(Ok(r)) => r,
269            Some(Err(e)) if prepared.len() > 1 => {
270                log!("  warning: skipping {} ({e})", p.display_name);
271                continue;
272            }
273            Some(Err(e)) => return Err(e),
274            None => continue,
275        };
276        if read.from_cache {
277            cached_sources += 1;
278        }
279        for mut c in read.commits {
280            if !seen.insert(c.sha.clone()) {
281                duplicates += 1;
282                continue;
283            }
284            c.src = i as u32;
285            commits.push(c);
286        }
287    }
288    if commits.is_empty() {
289        bail!("no commits found");
290    }
291    if cached_sources > 0 {
292        log!(
293            "→ reused cached history for {cached_sources}/{} sources",
294            prepared.len()
295        );
296    }
297    if prepared.len() > 1 {
298        log!(
299            "→ {} commits from {} sources ({} duplicate commits dropped), {} distinct author emails",
300            model::thousands(commits.len() as u64),
301            prepared.len(),
302            model::thousands(duplicates),
303            distinct_emails(&commits)
304        );
305    } else {
306        log!(
307            "→ {} commits from {} distinct author emails",
308            model::thousands(commits.len() as u64),
309            distinct_emails(&commits)
310        );
311    }
312
313    let mut clusters = identity::cluster_commits(&commits, cfg.merge_names);
314
315    let any_slug = source_slugs.iter().any(|s| s.is_some());
316    if cfg.use_github {
317        if any_slug {
318            log!("→ enriching from GitHub");
319            github::enrich_clusters(
320                &mut clusters,
321                &commits,
322                &source_slugs,
323                &client,
324                &mut caches,
325                cfg.verbose,
326            );
327            clusters = identity::merge_by_login(clusters);
328            github::fetch_profiles(&mut clusters, &client, &mut caches, cfg.verbose);
329            if !cfg.detect_affiliation {
330                for cl in clusters.iter_mut() {
331                    cl.affiliation = None;
332                }
333            }
334        } else {
335            log!("→ no GitHub sources, skipping enrichment");
336        }
337    }
338
339    if !cfg.identities.is_empty() {
340        clusters = identity::apply_identity_file(clusters, &cfg.identities);
341        log!("→ applied {} identity overrides", cfg.identities.len());
342    }
343
344    let mut contributors = identity::build_contributors(
345        &clusters,
346        &commits,
347        &cfg.groups,
348        &cfg.forced_names,
349        cfg.count_coauthors,
350    );
351
352    // Apply manual group-name aliases: fold each variant into its canonical
353    // name (case-insensitively), on both the primary group and the per-month
354    // affiliations.
355    apply_group_aliases(&mut contributors, &cfg.group_aliases);
356
357    // Collapse a leading "The " (case-insensitively) so e.g. "The Francis Crick
358    // Institute" and "Francis Crick Institute" are treated as one affiliation.
359    strip_leading_the(&mut contributors);
360
361    // Names the user supplied (manual affiliations and alias canonicals) are
362    // authoritative; canonicalisation only folds auto-detected variants and
363    // leaves these exactly as written ("The"-stripped to match the rows).
364    let mut manual_groups: std::collections::HashSet<String> = cfg
365        .groups
366        .iter()
367        .map(|r| strip_the(&r.group).to_string())
368        .collect();
369    manual_groups.extend(
370        cfg.group_aliases
371            .iter()
372            .map(|(canon, _)| strip_the(canon).to_string()),
373    );
374    let n_groups = canonicalize_groups(&mut contributors, &manual_groups);
375    if n_groups > 0 {
376        log!("→ {n_groups} distinct affiliations/groups");
377    }
378
379    if !cfg.exclude.is_empty() {
380        contributors.retain(|c| {
381            !cfg.exclude.iter().any(|pat| {
382                let p = pat.to_lowercase();
383                c.name.to_lowercase().contains(&p)
384                    || c.login
385                        .as_deref()
386                        .is_some_and(|l| l.to_lowercase().contains(&p))
387            })
388        });
389    }
390
391    log!(
392        "→ merged to {} contributors ({} bots)",
393        contributors.len(),
394        contributors.iter().filter(|c| c.bot).count()
395    );
396
397    if cfg.embed_avatars && cfg.use_github {
398        github::embed_avatars(
399            &mut contributors,
400            &client,
401            &mut caches,
402            cfg.avatar_size,
403            cfg.verbose,
404        );
405    }
406
407    // A single source keeps its slug/url/branch; multiple sources collapse to a
408    // combined label with no single canonical URL.
409    let single = if prepared.len() == 1 {
410        Some(&prepared[0])
411    } else {
412        None
413    };
414
415    // The GitHub owner shared by every source, if any (a single repo or a
416    // same-owner pool such as a whole-org timeline).
417    let owner = common_owner(&prepared);
418
419    // Owner/org avatar for the interactive page header. Shown for a single
420    // GitHub source and for a same-owner pool, but not for a mix of owners.
421    let owner_avatar = if cfg.use_github && cfg.embed_avatars {
422        owner
423            .as_deref()
424            .and_then(|owner| github::fetch_avatar(&client, &mut caches, owner, 48))
425    } else {
426        None
427    };
428
429    // Repository description (single GitHub source).
430    let description = if cfg.use_github {
431        single
432            .and_then(|p| p.slug.as_deref())
433            .and_then(|slug| github::fetch_repo_description(&client, slug))
434    } else {
435        None
436    };
437
438    // A single repo keeps its name; a same-owner pool is labelled by the owner
439    // (so a whole org reads as "nf-core"); a mixed pool joins the repo names.
440    let default_name = match (single, &owner) {
441        (Some(p), _) => p.display_name.clone(),
442        (None, Some(owner)) => owner.clone(),
443        (None, None) => combined_name(&prepared),
444    };
445    let branch = match single {
446        Some(p) => p.branch.clone(),
447        None => "combined".to_string(),
448    };
449
450    // Tagged releases, for timeline release markers. Gathered from every
451    // source; with more than one, the repo name is prefixed onto each tag so a
452    // pooled / org timeline's markers stay attributable (and necessarily
453    // busier — the page can toggle them off).
454    let releases: Vec<model::Release> = if prepared.len() == 1 {
455        repo::read_tags(&prepared[0])
456    } else {
457        prepared
458            .iter()
459            .flat_map(|p| {
460                repo::read_tags(p).into_iter().map(|mut r| {
461                    r.name = format!("{} {}", p.display_name, r.name);
462                    r
463                })
464            })
465            .collect()
466    };
467    if !releases.is_empty() {
468        log!("→ {} releases", releases.len());
469    }
470
471    let first = contributors.iter().map(|c| c.first).min().unwrap_or(0);
472    let last = contributors.iter().map(|c| c.last).max().unwrap_or(0);
473    let meta = RepoMeta {
474        name: cfg.title.clone().unwrap_or(default_name),
475        url: single.and_then(|p| p.url.clone()),
476        slug: single.and_then(|p| p.slug.clone()),
477        branch,
478        first,
479        last,
480        total_commits: commits.len() as u64,
481        total_contributors: contributors.iter().filter(|c| !c.bot).count(),
482        generated: chrono::Utc::now().format("%Y-%m-%d").to_string(),
483        owner_avatar,
484        description,
485        releases,
486    };
487
488    caches.save();
489    Ok(Analysis { contributors, meta })
490}
491
/// Outcome of reading one source's history in [`read_source`].
struct SourceRead {
    /// Commits for the source, either from the cache or freshly parsed.
    commits: Vec<model::Commit>,
    /// `true` when `commits` came from the cache rather than a fresh read.
    from_cache: bool,
}
496
497/// Read one source's history, reusing the cached `git log` when the branch tip
498/// is unchanged. On a miss, the clone is brought up to date, parsed, and the
499/// result is written back to the cache against the new tip.
500fn read_source(
501    p: &repo::PreparedRepo,
502    caches: &cache::Caches,
503    filter: &model::CommitFilter,
504    branch: Option<&str>,
505) -> Result<SourceRead> {
506    let key = source_cache_key(p);
507    // Freshness token: the remote tip via `ls-remote` (no object transfer),
508    // falling back to the local tip when offline or for a local checkout.
509    let remote = repo::remote_tip(p);
510    let tip = remote.clone().or_else(|| repo::local_tip(p));
511
512    if let Some(tip) = &tip {
513        if let Some(cached) = caches.commits(&key, tip, filter) {
514            let commits = cached
515                .into_iter()
516                .map(|c| model::Commit {
517                    sha: c.sha,
518                    ts: c.ts,
519                    name: c.name,
520                    email: c.email,
521                    coauthors: c.coauthors,
522                    src: 0,
523                })
524                .collect();
525            return Ok(SourceRead {
526                commits,
527                from_cache: true,
528            });
529        }
530    }
531
532    // Cache miss. Update the clone only when it is genuinely behind the remote;
533    // a just-cloned repo or an offline run reads what is already on disk.
534    let local = repo::local_tip(p);
535    if p.is_remote && remote.is_some() && remote != local {
536        repo::fetch(p);
537    }
538    let commits = repo::read_commits(p, branch, filter)?;
539    // After a fetch the tip moved, so re-read it; otherwise the pre-fetch local
540    // tip still stands.
541    if let Some(tip) = repo::local_tip(p) {
542        let cached = commits
543            .iter()
544            .map(|c| cache::CachedCommit {
545                sha: c.sha.clone(),
546                ts: c.ts,
547                name: c.name.clone(),
548                email: c.email.clone(),
549                coauthors: c.coauthors.clone(),
550            })
551            .collect();
552        caches.put_commits(&key, &tip, filter, cached);
553    }
554    Ok(SourceRead {
555        commits,
556        from_cache: false,
557    })
558}
559
560/// Stable per-repo cache key: the slug (or display name) plus the branch.
561fn source_cache_key(p: &repo::PreparedRepo) -> String {
562    let base = p.slug.as_deref().unwrap_or(&p.display_name);
563    repo::sanitize(&format!("{base}__{}", p.branch))
564}
565
566/// Build a chart title for a multi-source run: join up to three source names
567/// with " + ", and summarise the rest as "+N more".
568fn combined_name(prepared: &[repo::PreparedRepo]) -> String {
569    let names: Vec<&str> = prepared.iter().map(|p| p.display_name.as_str()).collect();
570    match names.len() {
571        0 => "repositories".to_string(),
572        1..=3 => names.join(" + "),
573        n => format!("{} + {} more", names[..2].join(" + "), n - 2),
574    }
575}
576
577/// The GitHub owner shared by every source, if all sources are GitHub slugs
578/// under one owner. Returns `None` if any source has no slug or the owners
579/// differ, so a mixed-owner run falls back to the generic header icon.
580fn common_owner(prepared: &[repo::PreparedRepo]) -> Option<String> {
581    let mut owner: Option<String> = None;
582    for p in prepared {
583        let o = p.slug.as_deref()?.split('/').next()?.to_string();
584        match &owner {
585            Some(prev) if *prev != o => return None,
586            _ => owner = Some(o),
587        }
588    }
589    owner
590}
591
592fn distinct_emails(commits: &[model::Commit]) -> usize {
593    let mut e: Vec<&str> = commits.iter().map(|c| c.email.as_str()).collect();
594    e.sort_unstable();
595    e.dedup();
596    e.len()
597}
598
599/// Merge group-name variants that refer to the same organisation:
600/// case/punctuation differences ("Seqera Labs" vs "seqeralabs"), a leading
601/// "The", and prefix forms ("Seqera" vs "Seqera Labs"). Returns the final
602/// group count.
603/// Rewrite group names that match a manual alias to their canonical form, on
604/// both the primary `group` and per-month affiliations (matched case-insensitively).
605fn apply_group_aliases(contributors: &mut [Contributor], aliases: &[(String, Vec<String>)]) {
606    if aliases.is_empty() {
607        return;
608    }
609    let mut map: std::collections::HashMap<String, String> = std::collections::HashMap::new();
610    for (canon, variants) in aliases {
611        map.insert(canon.to_lowercase(), canon.clone());
612        for v in variants {
613            map.insert(v.to_lowercase(), canon.clone());
614        }
615    }
616    let canon = |g: &str| map.get(&g.to_lowercase()).cloned();
617    for c in contributors.iter_mut() {
618        if let Some(g) = &c.group {
619            if let Some(cn) = canon(g) {
620                c.group = Some(cn);
621            }
622        }
623        if let Some(mg) = &mut c.month_groups {
624            for slot in mg.iter_mut().flatten() {
625                if let Some(cn) = canon(slot) {
626                    *slot = cn;
627                }
628            }
629        }
630    }
631}
632
/// Return `g` without a leading "The " (matched ASCII case-insensitively) and
/// without any whitespace that follows it. The input is returned untouched
/// when it does not start with that prefix, or when nothing but whitespace
/// would be left after removing it.
fn strip_the(g: &str) -> &str {
    // Shorter than four bytes, or byte 4 falls inside a multi-byte character:
    // the name cannot start with the ASCII prefix.
    if !g.is_char_boundary(4) {
        return g;
    }
    let (head, tail) = g.split_at(4);
    if !head.eq_ignore_ascii_case("the ") {
        return g;
    }
    match tail.trim_start() {
        "" => g,
        rest => rest,
    }
}
648
649/// Drop a leading "The " from every group name (primary and per-month) so
650/// "The X" collapses into "X".
651fn strip_leading_the(contributors: &mut [Contributor]) {
652    for c in contributors.iter_mut() {
653        if let Some(g) = &mut c.group {
654            if strip_the(g).len() != g.len() {
655                *g = strip_the(g).to_string();
656            }
657        }
658        if let Some(mg) = &mut c.month_groups {
659            for slot in mg.iter_mut().flatten() {
660                if strip_the(slot).len() != slot.len() {
661                    *slot = strip_the(slot).to_string();
662                }
663            }
664        }
665    }
666}
667
668fn canonicalize_groups(
669    contributors: &mut [Contributor],
670    manual: &std::collections::HashSet<String>,
671) -> usize {
672    use std::collections::HashMap;
673    let alnum_key = |g: &str| -> String {
674        let lower = g.to_lowercase();
675        let trimmed = lower.strip_prefix("the ").unwrap_or(&lower);
676        trimmed.chars().filter(|c| c.is_alphanumeric()).collect()
677    };
678
679    let mut variants: HashMap<String, usize> = HashMap::new();
680    for c in contributors.iter() {
681        if let Some(g) = &c.group {
682            *variants.entry(g.clone()).or_default() += 1;
683        }
684    }
685
686    // Prefix-merge keys come only from auto-detected names: a name the user
687    // wrote in --groups is authoritative, gets its own cluster, and is never
688    // folded into (or renamed by) a detected variant.
689    let mut keys: Vec<String> = variants
690        .keys()
691        .filter(|g| !manual.contains(*g))
692        .map(|g| alnum_key(g))
693        .collect();
694    keys.sort();
695    keys.dedup();
696    let resolve = |key: &str| -> String {
697        keys.iter()
698            .filter(|k| k.len() >= 6 && key.starts_with(*k))
699            .min_by_key(|k| k.len())
700            .map(|k| k.to_string())
701            .unwrap_or_else(|| key.to_string())
702    };
703    let cluster_of = |g: &str| -> String {
704        if manual.contains(g) {
705            format!("\u{0}{g}")
706        } else {
707            resolve(&alnum_key(g))
708        }
709    };
710
711    let mut best: HashMap<String, (&String, usize)> = HashMap::new();
712    for (g, n) in &variants {
713        let cluster = cluster_of(g);
714        let score = |g: &str, n: usize| {
715            n * 4
716                + usize::from(g.contains(' ')) * 2
717                + usize::from(g.chars().any(|c| c.is_uppercase()))
718        };
719        let entry = best.entry(cluster).or_insert((g, *n));
720        if score(g, *n) > score(entry.0, entry.1) {
721            *entry = (g, *n);
722        }
723    }
724
725    let display: HashMap<String, String> = best
726        .iter()
727        .map(|(k, (g, _))| (k.clone(), (*g).clone()))
728        .collect();
729    for c in contributors.iter_mut() {
730        if let Some(g) = &c.group {
731            c.group = display.get(&cluster_of(g)).cloned().or(c.group.clone());
732        }
733    }
734    display.len()
735}