Skip to main content

contributor_graphs/
lib.rs

1//! Build contributor timelines from a git or GitHub repository.
2//!
3//! `contributor-graphs` is primarily a command-line tool, but the same engine
4//! is available as a library. The usual flow is [`analyze`] to turn a
5//! repository into [`Contributor`] rows plus [`RepoMeta`], then one of the
6//! renderers in [`svg`] or [`html`].
7//!
8//! ```no_run
9//! use contributor_graphs::{analyze, svg, Config};
10//!
11//! let analysis = analyze("nf-core/rnaseq", &Config::default())?;
12//! let rows: Vec<_> = analysis.contributors.iter().filter(|c| !c.bot).cloned().collect();
13//! let opts = svg::SvgOptions {
14//!     title: analysis.meta.name.clone(),
15//!     ..Default::default()
16//! };
17//! std::fs::write("rnaseq.svg", svg::render_svg(&rows, &opts))?;
18//! # Ok::<(), anyhow::Error>(())
19//! ```
20//!
21//! The lower-level modules ([`repo`], [`identity`], [`github`]) are public too,
22//! for callers who want to assemble a custom pipeline.
23
24pub mod cache;
25pub mod github;
26pub mod html;
27pub mod identity;
28pub mod model;
29pub mod repo;
30pub mod svg;
31pub mod theme;
32
33use anyhow::{bail, Result};
34use std::sync::atomic::{AtomicUsize, Ordering};
35use std::sync::Mutex;
36
37pub use model::{Contributor, RepoMeta};
38
/// Upper bound on worker threads used to read per-source history in parallel.
/// [`analyze_many`] spawns `min(READ_THREADS, number of prepared sources)`
/// workers, so a single-source run uses one thread.
const READ_THREADS: usize = 8;
41
/// How to read history and resolve identities. Construct with `Config::default()`
/// and override fields as needed.
///
/// The defaults (see the `Default` impl) read `HEAD` with no date bounds and
/// enable GitHub lookups, affiliation detection, name merging, co-author
/// counting, and avatar embedding.
#[derive(Clone)]
pub struct Config {
    /// Branch or ref to read (default: `HEAD`).
    pub branch: Option<String>,
    /// Only include commits after this date (passed to `git log --since`).
    pub since: Option<String>,
    /// Only include commits before this date (`git log --until`).
    pub until: Option<String>,
    /// Skip merge commits (default: `false`).
    pub no_merges: bool,
    /// Override the chart/repository title. When `None`, the title is derived
    /// from the source(s).
    pub title: Option<String>,
    /// Exclude contributors whose name or login contains any of these strings.
    /// Matching is a case-insensitive substring test.
    pub exclude: Vec<String>,
    /// Manual `matcher → group` rules (matcher = name, email, or login),
    /// optionally date-bounded for affiliations that change over time.
    pub groups: Vec<model::GroupRule>,
    /// Manual group-name aliases: `(canonical, [variants])`. Variants are
    /// folded into the canonical name, which is then authoritative.
    pub group_aliases: Vec<(String, Vec<String>)>,
    /// Manual identity merges: each row is `[canonical, alias, …]`.
    pub identities: Vec<Vec<String>>,
    /// Query the GitHub API for logins, avatars, and profiles (default: `true`).
    /// Bare org/user inputs cannot be expanded when this is `false`.
    pub use_github: bool,
    /// Auto-detect affiliations from GitHub profile companies (default: `true`).
    pub detect_affiliation: bool,
    /// Merge identities that share a normalised author name (default: `true`).
    pub merge_names: bool,
    /// Count `Co-authored-by` trailers as commits for those contributors
    /// (default: `true`).
    pub count_coauthors: bool,
    /// Download avatars and embed them as data URIs (default: `true`). Only
    /// takes effect together with `use_github`.
    pub embed_avatars: bool,
    /// Avatar pixel size to request when embedding (default: `64`).
    pub avatar_size: u32,
    /// Ignore cached git history and GitHub lookups, forcing a fresh pull
    /// (the caches are still refreshed with the new results).
    pub refresh: bool,
    /// Print progress to stderr (default: `false`).
    pub verbose: bool,
}
84
85impl Default for Config {
86    fn default() -> Self {
87        Config {
88            branch: None,
89            since: None,
90            until: None,
91            no_merges: false,
92            title: None,
93            exclude: Vec::new(),
94            groups: Vec::new(),
95            group_aliases: Vec::new(),
96            identities: Vec::new(),
97            use_github: true,
98            detect_affiliation: true,
99            merge_names: true,
100            count_coauthors: true,
101            embed_avatars: true,
102            avatar_size: 64,
103            refresh: false,
104            verbose: false,
105        }
106    }
107}
108
/// The result of [`analyze`]: every contributor (bots included — filter on
/// [`Contributor::bot`] if you don't want them) and repository metadata.
pub struct Analysis {
    /// One row per resolved contributor, after identity merging and any
    /// `Config::exclude` filtering.
    pub contributors: Vec<Contributor>,
    /// Title, source details, date range, and totals for the analysed source(s).
    pub meta: RepoMeta,
}
115
/// Row ordering for [`sort`].
///
/// A small `Copy` key; it also implements `Debug` and `Hash` so it can be
/// logged and used in maps or sets by callers.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub enum Sort {
    /// Oldest first commit at the top.
    First,
    /// Most recent commit first.
    Last,
    /// Most commits first.
    Commits,
    /// Longest active period first.
    Duration,
    /// Alphabetical by name.
    Name,
}
130
131/// Sort contributor rows in place.
132pub fn sort(rows: &mut [Contributor], key: Sort) {
133    match key {
134        Sort::First => rows.sort_by(|a, b| a.first.cmp(&b.first).then(b.commits.cmp(&a.commits))),
135        Sort::Last => rows.sort_by(|a, b| b.last.cmp(&a.last).then(b.commits.cmp(&a.commits))),
136        Sort::Commits => rows.sort_by_key(|c| std::cmp::Reverse(c.commits)),
137        Sort::Duration => rows.sort_by_key(|c| std::cmp::Reverse(c.last - c.first)),
138        Sort::Name => rows.sort_by_key(|a| a.name.to_lowercase()),
139    }
140}
141
142/// Resolve a single repository (local path, `owner/repo` slug, or git URL)
143/// into contributor data and metadata. Shorthand for [`analyze_many`] with one
144/// source.
145pub fn analyze(input: &str, cfg: &Config) -> Result<Analysis> {
146    analyze_many(std::slice::from_ref(&input), cfg)
147}
148
/// Resolve one or more repositories into a single combined timeline. Commits
/// from every source are pooled, author identities are clustered across the
/// whole pool, and commits that appear in more than one source (overlapping
/// histories) are de-duplicated by commit SHA. Disjoint sources contribute
/// distinct SHAs and so are simply concatenated.
///
/// The steps, in order: expand bare owner names into repositories, prepare
/// each source, read histories in parallel, pool and de-duplicate commits,
/// cluster identities (optionally enriched from GitHub), build contributor
/// rows, normalise group names, apply `cfg.exclude`, embed avatars, and
/// assemble [`RepoMeta`].
///
/// # Errors
///
/// Returns an error when:
/// * `inputs` is empty;
/// * an input looks like an org/user but `cfg.use_github` is `false`;
/// * the only input is an org/user for which no repositories were found;
/// * no source remains after expansion or preparation;
/// * the run has exactly one source and preparing or reading it fails (with
///   several sources a failing one is skipped with a warning instead);
/// * no commits were found in any source.
///
/// The caches are saved only at the end of a successful run.
pub fn analyze_many(inputs: &[&str], cfg: &Config) -> Result<Analysis> {
    // Progress output goes to stderr and only when `cfg.verbose` is set. This
    // includes the "skipping source" warnings below.
    macro_rules! log {
        ($($arg:tt)*) => { if cfg.verbose { eprintln!($($arg)*); } };
    }
    if inputs.is_empty() {
        bail!("no repository sources given");
    }

    // Without GitHub access the client is built with no token.
    let client = github::GhClient::new(if cfg.use_github {
        github::find_token()
    } else {
        None
    });
    let now = chrono::Utc::now().timestamp();
    let mut caches = cache::Caches::load(cfg.refresh, now);

    // Expand any bare owner names (a single token that is not a local path,
    // slug, or URL) into every repository under that GitHub org or user.
    // Everything else passes through unchanged. Explicit repos and whole orgs
    // can be mixed freely; overlaps are dropped later by commit SHA.
    let mut sources: Vec<String> = Vec::new();
    for input in inputs {
        if repo::looks_like_owner(input) {
            if !cfg.use_github {
                bail!("'{input}' looks like an org/user, but listing its repositories needs GitHub access (remove --no-github, or pass owner/repo slugs)");
            }
            let (slugs, cached) = match caches.org_repos(input) {
                Some(repos) => (repos, true),
                None => {
                    log!("→ listing repositories for '{input}'");
                    let fetched = client.list_owner_repos(input);
                    // An empty listing is not cached, so the next run asks again.
                    if !fetched.is_empty() {
                        caches.put_org_repos((*input).to_string(), fetched.clone());
                    }
                    (fetched, false)
                }
            };
            if slugs.is_empty() {
                // An empty owner is fatal only when it was the sole input;
                // otherwise the remaining inputs can still produce a timeline.
                if inputs.len() == 1 {
                    bail!("no repositories found for org/user '{input}' (it may not exist or has no non-fork repos)");
                }
                log!("  warning: no repositories found for '{input}'");
            } else {
                log!(
                    "  {} repositories{}",
                    slugs.len(),
                    if cached { " (cached)" } else { "" }
                );
                sources.extend(slugs);
            }
        } else {
            sources.push((*input).to_string());
        }
    }
    if sources.is_empty() {
        bail!("no usable repository sources");
    }

    // Prepare each source. With several sources, a single repo that fails to
    // clone or has no commits shouldn't abort the whole pool: skip it with a
    // warning. A single-source run still surfaces the error.
    let mut prepared: Vec<repo::PreparedRepo> = Vec::new();
    for input in &sources {
        match repo::prepare(input, cfg.branch.as_deref()) {
            Ok(p) => prepared.push(p),
            Err(e) if sources.len() > 1 => log!("  warning: skipping source '{input}' ({e})"),
            Err(e) => return Err(e),
        }
    }
    if prepared.is_empty() {
        bail!("no usable repository sources");
    }
    // Indexed like `prepared`; each commit's `src` (set below) is an index
    // into `prepared`, and therefore into this list as well.
    let source_slugs: Vec<Option<String>> = prepared.iter().map(|p| p.slug.clone()).collect();
    for p in &prepared {
        log!("→ source: {} (branch {})", p.display_name, p.branch);
    }

    let filter = model::CommitFilter {
        since: cfg.since.clone(),
        until: cfg.until.clone(),
        no_merges: cfg.no_merges,
    };
    let branch = cfg.branch.as_deref();

    // Read each source's history, reusing the cached `git log` when the branch
    // tip is unchanged. Sources are independent, so read them in parallel and
    // merge in source order afterwards, which keeps de-duplication deterministic
    // (an earlier source keeps a SHA it shares with a later one).
    //
    // Workers claim the next source index from a shared atomic cursor and
    // write the result into that index's own slot, so results stay in source
    // order no matter which worker finishes first.
    let outcomes: Vec<Mutex<Option<Result<SourceRead>>>> =
        (0..prepared.len()).map(|_| Mutex::new(None)).collect();
    let cursor = AtomicUsize::new(0);
    std::thread::scope(|s| {
        for _ in 0..READ_THREADS.min(prepared.len()) {
            s.spawn(|| loop {
                let i = cursor.fetch_add(1, Ordering::Relaxed);
                let Some(p) = prepared.get(i) else { break };
                let r = read_source(p, &caches, &filter, branch);
                *outcomes[i].lock().unwrap() = Some(r);
            });
        }
    });

    let mut commits: Vec<model::Commit> = Vec::new();
    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
    let mut duplicates = 0u64;
    let mut cached_sources = 0usize;
    for (i, (p, slot)) in prepared.iter().zip(outcomes).enumerate() {
        let read = match slot.into_inner().unwrap() {
            Some(Ok(r)) => r,
            Some(Err(e)) if prepared.len() > 1 => {
                log!("  warning: skipping {} ({e})", p.display_name);
                continue;
            }
            Some(Err(e)) => return Err(e),
            // A slot no worker filled; not expected in practice, since the
            // workers run until the cursor passes the last index.
            None => continue,
        };
        if read.from_cache {
            cached_sources += 1;
        }
        for mut c in read.commits {
            // First source to contribute a SHA keeps it; later copies are
            // counted and dropped.
            if !seen.insert(c.sha.clone()) {
                duplicates += 1;
                continue;
            }
            c.src = i as u32;
            commits.push(c);
        }
    }
    if commits.is_empty() {
        bail!("no commits found");
    }
    if cached_sources > 0 {
        log!(
            "→ reused cached history for {cached_sources}/{} sources",
            prepared.len()
        );
    }
    if prepared.len() > 1 {
        log!(
            "→ {} commits from {} sources ({} duplicate commits dropped), {} distinct author emails",
            model::thousands(commits.len() as u64),
            prepared.len(),
            model::thousands(duplicates),
            distinct_emails(&commits)
        );
    } else {
        log!(
            "→ {} commits from {} distinct author emails",
            model::thousands(commits.len() as u64),
            distinct_emails(&commits)
        );
    }

    let mut clusters = identity::cluster_commits(&commits, cfg.merge_names);

    // GitHub enrichment needs at least one source with a slug.
    let any_slug = source_slugs.iter().any(|s| s.is_some());
    if cfg.use_github {
        if any_slug {
            log!("→ enriching from GitHub");
            github::enrich_clusters(
                &mut clusters,
                &commits,
                &source_slugs,
                &client,
                &mut caches,
                cfg.verbose,
            );
            clusters = identity::merge_by_login(clusters);
            github::fetch_profiles(&mut clusters, &client, &mut caches, cfg.verbose);
            // With detection disabled, clear every cluster's affiliation
            // after the profile fetch.
            if !cfg.detect_affiliation {
                for cl in clusters.iter_mut() {
                    cl.affiliation = None;
                }
            }
        } else {
            log!("→ no GitHub sources, skipping enrichment");
        }
    }

    if !cfg.identities.is_empty() {
        clusters = identity::apply_identity_file(clusters, &cfg.identities);
        log!("→ applied {} identity overrides", cfg.identities.len());
    }

    let mut contributors =
        identity::build_contributors(&clusters, &commits, &cfg.groups, cfg.count_coauthors);

    // Apply manual group-name aliases: fold each variant into its canonical
    // name (case-insensitively), on both the primary group and the per-month
    // affiliations.
    apply_group_aliases(&mut contributors, &cfg.group_aliases);

    // Names the user supplied (manual affiliations and alias canonicals) are
    // authoritative; canonicalisation only folds auto-detected variants and
    // leaves these exactly as written.
    let mut manual_groups: std::collections::HashSet<String> =
        cfg.groups.iter().map(|r| r.group.clone()).collect();
    manual_groups.extend(cfg.group_aliases.iter().map(|(canon, _)| canon.clone()));
    let n_groups = canonicalize_groups(&mut contributors, &manual_groups);
    if n_groups > 0 {
        log!("→ {n_groups} distinct affiliations/groups");
    }

    // Drop contributors whose name or login contains any exclude pattern
    // (case-insensitive substring match).
    if !cfg.exclude.is_empty() {
        contributors.retain(|c| {
            !cfg.exclude.iter().any(|pat| {
                let p = pat.to_lowercase();
                c.name.to_lowercase().contains(&p)
                    || c.login
                        .as_deref()
                        .is_some_and(|l| l.to_lowercase().contains(&p))
            })
        });
    }

    log!(
        "→ merged to {} contributors ({} bots)",
        contributors.len(),
        contributors.iter().filter(|c| c.bot).count()
    );

    if cfg.embed_avatars && cfg.use_github {
        github::embed_avatars(
            &mut contributors,
            &client,
            &mut caches,
            cfg.avatar_size,
            cfg.verbose,
        );
    }

    // A single source keeps its slug/url/branch; multiple sources collapse to a
    // combined label with no single canonical URL.
    let single = if prepared.len() == 1 {
        Some(&prepared[0])
    } else {
        None
    };

    // The GitHub owner shared by every source, if any (a single repo or a
    // same-owner pool such as a whole-org timeline).
    let owner = common_owner(&prepared);

    // Owner/org avatar for the interactive page header. Shown for a single
    // GitHub source and for a same-owner pool, but not for a mix of owners.
    let owner_avatar = if cfg.use_github && cfg.embed_avatars {
        owner
            .as_deref()
            .and_then(|owner| github::fetch_avatar(&client, &mut caches, owner, 48))
    } else {
        None
    };

    // Repository description (single GitHub source).
    let description = if cfg.use_github {
        single
            .and_then(|p| p.slug.as_deref())
            .and_then(|slug| github::fetch_repo_description(&client, slug))
    } else {
        None
    };

    // A single repo keeps its name; a same-owner pool is labelled by the owner
    // (so a whole org reads as "nf-core"); a mixed pool joins the repo names.
    let default_name = match (single, &owner) {
        (Some(p), _) => p.display_name.clone(),
        (None, Some(owner)) => owner.clone(),
        (None, None) => combined_name(&prepared),
    };
    let branch = match single {
        Some(p) => p.branch.clone(),
        None => "combined".to_string(),
    };

    // The date range comes from the contributors that remain after exclusion,
    // while `total_commits` counts every pooled (de-duplicated) commit.
    let first = contributors.iter().map(|c| c.first).min().unwrap_or(0);
    let last = contributors.iter().map(|c| c.last).max().unwrap_or(0);
    let meta = RepoMeta {
        name: cfg.title.clone().unwrap_or(default_name),
        url: single.and_then(|p| p.url.clone()),
        slug: single.and_then(|p| p.slug.clone()),
        branch,
        first,
        last,
        total_commits: commits.len() as u64,
        total_contributors: contributors.iter().filter(|c| !c.bot).count(),
        generated: chrono::Utc::now().format("%Y-%m-%d").to_string(),
        owner_avatar,
        description,
    };

    caches.save();
    Ok(Analysis { contributors, meta })
}
447
/// Outcome of reading one source's history in `read_source`.
struct SourceRead {
    /// Commits for the source. `src` is not meaningful yet; the caller
    /// assigns it when pooling.
    commits: Vec<model::Commit>,
    /// `true` when the commits came from the history cache rather than a
    /// fresh read of the repository.
    from_cache: bool,
}
452
453/// Read one source's history, reusing the cached `git log` when the branch tip
454/// is unchanged. On a miss, the clone is brought up to date, parsed, and the
455/// result is written back to the cache against the new tip.
456fn read_source(
457    p: &repo::PreparedRepo,
458    caches: &cache::Caches,
459    filter: &model::CommitFilter,
460    branch: Option<&str>,
461) -> Result<SourceRead> {
462    let key = source_cache_key(p);
463    // Freshness token: the remote tip via `ls-remote` (no object transfer),
464    // falling back to the local tip when offline or for a local checkout.
465    let remote = repo::remote_tip(p);
466    let tip = remote.clone().or_else(|| repo::local_tip(p));
467
468    if let Some(tip) = &tip {
469        if let Some(cached) = caches.commits(&key, tip, filter) {
470            let commits = cached
471                .into_iter()
472                .map(|c| model::Commit {
473                    sha: c.sha,
474                    ts: c.ts,
475                    name: c.name,
476                    email: c.email,
477                    coauthors: c.coauthors,
478                    src: 0,
479                })
480                .collect();
481            return Ok(SourceRead {
482                commits,
483                from_cache: true,
484            });
485        }
486    }
487
488    // Cache miss. Update the clone only when it is genuinely behind the remote;
489    // a just-cloned repo or an offline run reads what is already on disk.
490    let local = repo::local_tip(p);
491    if p.is_remote && remote.is_some() && remote != local {
492        repo::fetch(p);
493    }
494    let commits = repo::read_commits(p, branch, filter)?;
495    // After a fetch the tip moved, so re-read it; otherwise the pre-fetch local
496    // tip still stands.
497    if let Some(tip) = repo::local_tip(p) {
498        let cached = commits
499            .iter()
500            .map(|c| cache::CachedCommit {
501                sha: c.sha.clone(),
502                ts: c.ts,
503                name: c.name.clone(),
504                email: c.email.clone(),
505                coauthors: c.coauthors.clone(),
506            })
507            .collect();
508        caches.put_commits(&key, &tip, filter, cached);
509    }
510    Ok(SourceRead {
511        commits,
512        from_cache: false,
513    })
514}
515
516/// Stable per-repo cache key: the slug (or display name) plus the branch.
517fn source_cache_key(p: &repo::PreparedRepo) -> String {
518    let base = p.slug.as_deref().unwrap_or(&p.display_name);
519    repo::sanitize(&format!("{base}__{}", p.branch))
520}
521
522/// Build a chart title for a multi-source run: join up to three source names
523/// with " + ", and summarise the rest as "+N more".
524fn combined_name(prepared: &[repo::PreparedRepo]) -> String {
525    let names: Vec<&str> = prepared.iter().map(|p| p.display_name.as_str()).collect();
526    match names.len() {
527        0 => "repositories".to_string(),
528        1..=3 => names.join(" + "),
529        n => format!("{} + {} more", names[..2].join(" + "), n - 2),
530    }
531}
532
533/// The GitHub owner shared by every source, if all sources are GitHub slugs
534/// under one owner. Returns `None` if any source has no slug or the owners
535/// differ, so a mixed-owner run falls back to the generic header icon.
536fn common_owner(prepared: &[repo::PreparedRepo]) -> Option<String> {
537    let mut owner: Option<String> = None;
538    for p in prepared {
539        let o = p.slug.as_deref()?.split('/').next()?.to_string();
540        match &owner {
541            Some(prev) if *prev != o => return None,
542            _ => owner = Some(o),
543        }
544    }
545    owner
546}
547
548fn distinct_emails(commits: &[model::Commit]) -> usize {
549    let mut e: Vec<&str> = commits.iter().map(|c| c.email.as_str()).collect();
550    e.sort_unstable();
551    e.dedup();
552    e.len()
553}
554
555/// Merge group-name variants that refer to the same organisation:
556/// case/punctuation differences ("Seqera Labs" vs "seqeralabs"), a leading
557/// "The", and prefix forms ("Seqera" vs "Seqera Labs"). Returns the final
558/// group count.
559/// Rewrite group names that match a manual alias to their canonical form, on
560/// both the primary `group` and per-month affiliations (matched case-insensitively).
561fn apply_group_aliases(contributors: &mut [Contributor], aliases: &[(String, Vec<String>)]) {
562    if aliases.is_empty() {
563        return;
564    }
565    let mut map: std::collections::HashMap<String, String> = std::collections::HashMap::new();
566    for (canon, variants) in aliases {
567        map.insert(canon.to_lowercase(), canon.clone());
568        for v in variants {
569            map.insert(v.to_lowercase(), canon.clone());
570        }
571    }
572    let canon = |g: &str| map.get(&g.to_lowercase()).cloned();
573    for c in contributors.iter_mut() {
574        if let Some(g) = &c.group {
575            if let Some(cn) = canon(g) {
576                c.group = Some(cn);
577            }
578        }
579        if let Some(mg) = &mut c.month_groups {
580            for slot in mg.iter_mut().flatten() {
581                if let Some(cn) = canon(slot) {
582                    *slot = cn;
583                }
584            }
585        }
586    }
587}
588
589fn canonicalize_groups(
590    contributors: &mut [Contributor],
591    manual: &std::collections::HashSet<String>,
592) -> usize {
593    use std::collections::HashMap;
594    let alnum_key = |g: &str| -> String {
595        let lower = g.to_lowercase();
596        let trimmed = lower.strip_prefix("the ").unwrap_or(&lower);
597        trimmed.chars().filter(|c| c.is_alphanumeric()).collect()
598    };
599
600    let mut variants: HashMap<String, usize> = HashMap::new();
601    for c in contributors.iter() {
602        if let Some(g) = &c.group {
603            *variants.entry(g.clone()).or_default() += 1;
604        }
605    }
606
607    // Prefix-merge keys come only from auto-detected names: a name the user
608    // wrote in --groups is authoritative, gets its own cluster, and is never
609    // folded into (or renamed by) a detected variant.
610    let mut keys: Vec<String> = variants
611        .keys()
612        .filter(|g| !manual.contains(*g))
613        .map(|g| alnum_key(g))
614        .collect();
615    keys.sort();
616    keys.dedup();
617    let resolve = |key: &str| -> String {
618        keys.iter()
619            .filter(|k| k.len() >= 6 && key.starts_with(*k))
620            .min_by_key(|k| k.len())
621            .map(|k| k.to_string())
622            .unwrap_or_else(|| key.to_string())
623    };
624    let cluster_of = |g: &str| -> String {
625        if manual.contains(g) {
626            format!("\u{0}{g}")
627        } else {
628            resolve(&alnum_key(g))
629        }
630    };
631
632    let mut best: HashMap<String, (&String, usize)> = HashMap::new();
633    for (g, n) in &variants {
634        let cluster = cluster_of(g);
635        let score = |g: &str, n: usize| {
636            n * 4
637                + usize::from(g.contains(' ')) * 2
638                + usize::from(g.chars().any(|c| c.is_uppercase()))
639        };
640        let entry = best.entry(cluster).or_insert((g, *n));
641        if score(g, *n) > score(entry.0, entry.1) {
642            *entry = (g, *n);
643        }
644    }
645
646    let display: HashMap<String, String> = best
647        .iter()
648        .map(|(k, (g, _))| (k.clone(), (*g).clone()))
649        .collect();
650    for c in contributors.iter_mut() {
651        if let Some(g) = &c.group {
652            c.group = display.get(&cluster_of(g)).cloned().or(c.group.clone());
653        }
654    }
655    display.len()
656}