// contributor_graphs/identity.rs

1use crate::model::{month_index, month_start_ts, Commit, Contributor, GroupRule};
2use std::collections::HashMap;
3
/// One person, as a bundle of every commit identity attributed to them.
#[derive(Clone, Debug, Default)]
pub struct Cluster {
    /// Distinct non-empty emails seen for this person.
    pub emails: Vec<String>,
    /// Distinct non-empty names seen for this person.
    pub names: Vec<String>,
    /// Indices (into the commit list) of commits this person authored.
    pub commit_idxs: Vec<usize>,
    /// Indices of commits where this person appears only as a
    /// `Co-authored-by` (they are not the author of those commits).
    pub coauthored_idxs: Vec<usize>,
    /// GitHub login, when one is known.
    pub login: Option<String>,
    /// Avatar image URL, when one is known.
    pub avatar_url: Option<String>,
    /// Display name taken from the GitHub user profile.
    pub profile_name: Option<String>,
    /// Affiliation taken from the GitHub profile `company` field.
    pub affiliation: Option<String>,
}
20
/// Minimal disjoint-set (union-find) over densely numbered nodes.
///
/// Element `i` of the inner vector is the parent of node `i`; a node that is
/// its own parent is the representative (root) of its set.
struct Dsu(Vec<usize>);

impl Dsu {
    /// Create an empty structure with no nodes.
    fn new() -> Self {
        Dsu(Vec::new())
    }

    /// Add a new singleton set and return its node id (ids are handed out
    /// sequentially from 0).
    fn make(&mut self) -> usize {
        let id = self.0.len();
        self.0.push(id);
        id
    }

    /// Return the representative of the set containing `x`, re-pointing every
    /// node on the walked path directly at that representative.
    ///
    /// Implemented with loops rather than recursion: `union` does no rank or
    /// size balancing, so parent chains can grow linearly with the number of
    /// unions and a recursive walk could exhaust the stack on a long chain.
    ///
    /// Panics if `x` was not returned by [`Dsu::make`].
    fn find(&mut self, x: usize) -> usize {
        // Pass 1: walk up to the root.
        let mut root = x;
        while self.0[root] != root {
            root = self.0[root];
        }
        // Pass 2: compress the path so later lookups are a single hop.
        let mut cur = x;
        while cur != root {
            cur = std::mem::replace(&mut self.0[cur], root);
        }
        root
    }

    /// Merge the sets containing `a` and `b`; the representative of `a`'s set
    /// becomes the representative of the merged set.
    fn union(&mut self, a: usize, b: usize) {
        let (ra, rb) = (self.find(a), self.find(b));
        if ra != rb {
            self.0[rb] = ra;
        }
    }
}
45
/// Normalise a name for comparison: surrounding whitespace is dropped, inner
/// whitespace runs collapse to a single space, and the result is lowercased.
fn norm_name(name: &str) -> String {
    let mut collapsed = String::with_capacity(name.len());
    for word in name.split_whitespace() {
        // Words from `split_whitespace` are never empty, so an empty buffer
        // means this is the first word and needs no separator.
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed.to_lowercase()
}
52
53/// The graph node for an identity: keyed by email (or name when email-less),
54/// unioned with any prior identity sharing a normalised name.
55fn node_for(
56    dsu: &mut Dsu,
57    by_email: &mut HashMap<String, usize>,
58    by_name: &mut HashMap<String, usize>,
59    merge_names: bool,
60    name: &str,
61    email: &str,
62) -> usize {
63    let key = if email.is_empty() { name } else { email };
64    let node = match by_email.get(key) {
65        Some(&n) => n,
66        None => {
67            let n = dsu.make();
68            by_email.insert(key.to_string(), n);
69            n
70        }
71    };
72    if merge_names {
73        let nn = norm_name(name);
74        if !nn.is_empty() {
75            match by_name.get(&nn) {
76                Some(&other) => dsu.union(node, other),
77                None => {
78                    by_name.insert(nn, node);
79                }
80            }
81        }
82    }
83    node
84}
85
86fn add_identity(cl: &mut Cluster, name: &str, email: &str) {
87    if !email.is_empty() && !cl.emails.iter().any(|e| e == email) {
88        cl.emails.push(email.to_string());
89    }
90    if !name.is_empty() && !cl.names.iter().any(|n| n == name) {
91        cl.names.push(name.to_string());
92    }
93}
94
95fn cluster_index(
96    dsu: &mut Dsu,
97    map: &mut HashMap<usize, usize>,
98    clusters: &mut Vec<Cluster>,
99    node: usize,
100) -> usize {
101    let root = dsu.find(node);
102    *map.entry(root).or_insert_with(|| {
103        clusters.push(Cluster::default());
104        clusters.len() - 1
105    })
106}
107
108/// Group commit identities into clusters, one per person. Authors and their
109/// `Co-authored-by` identities all share the graph (so a co-author merges with
110/// the same person's authored commits); each cluster records the commits it
111/// authored and, separately, those it only co-authored.
112pub fn cluster_commits(commits: &[Commit], merge_names: bool) -> Vec<Cluster> {
113    let mut dsu = Dsu::new();
114    let mut by_email: HashMap<String, usize> = HashMap::new();
115    let mut by_name: HashMap<String, usize> = HashMap::new();
116    let mut author_node: Vec<usize> = Vec::with_capacity(commits.len());
117    let mut coauthor_nodes: Vec<Vec<usize>> = Vec::with_capacity(commits.len());
118
119    for c in commits {
120        author_node.push(node_for(
121            &mut dsu,
122            &mut by_email,
123            &mut by_name,
124            merge_names,
125            &c.name,
126            &c.email,
127        ));
128        let cns = c
129            .coauthors
130            .iter()
131            .map(|(n, e)| node_for(&mut dsu, &mut by_email, &mut by_name, merge_names, n, e))
132            .collect();
133        coauthor_nodes.push(cns);
134    }
135
136    let mut clusters: Vec<Cluster> = Vec::new();
137    let mut root_to_cluster: HashMap<usize, usize> = HashMap::new();
138    for (i, c) in commits.iter().enumerate() {
139        let ci_a = cluster_index(
140            &mut dsu,
141            &mut root_to_cluster,
142            &mut clusters,
143            author_node[i],
144        );
145        add_identity(&mut clusters[ci_a], &c.name, &c.email);
146        clusters[ci_a].commit_idxs.push(i);
147        for (k, (n, e)) in c.coauthors.iter().enumerate() {
148            let ci_c = cluster_index(
149                &mut dsu,
150                &mut root_to_cluster,
151                &mut clusters,
152                coauthor_nodes[i][k],
153            );
154            add_identity(&mut clusters[ci_c], n, e);
155            // Skip when the co-author is the author, or already credited for
156            // this commit (a duplicate trailer, or two of their aliases on it).
157            if ci_c != ci_a && clusters[ci_c].coauthored_idxs.last() != Some(&i) {
158                clusters[ci_c].coauthored_idxs.push(i);
159            }
160        }
161    }
162    clusters
163}
164
165/// Merge clusters that resolved to the same GitHub login.
166pub fn merge_by_login(clusters: Vec<Cluster>) -> Vec<Cluster> {
167    let mut by_login: HashMap<String, usize> = HashMap::new();
168    let mut out: Vec<Cluster> = Vec::new();
169    for cl in clusters {
170        if let Some(login) = cl.login.clone() {
171            let key = login.to_lowercase();
172            if let Some(&i) = by_login.get(&key) {
173                merge_into(&mut out[i], cl);
174                continue;
175            }
176            by_login.insert(key, out.len());
177        }
178        out.push(cl);
179    }
180    out
181}
182
183/// Apply a manual identity file: each TSV row lists a canonical name followed
184/// by aliases. Any cluster whose name, email, or login matches any field is
185/// merged, and the first field becomes the display name.
186pub fn apply_identity_file(clusters: Vec<Cluster>, rows: &[Vec<String>]) -> Vec<Cluster> {
187    let mut clusters: Vec<Option<Cluster>> = clusters.into_iter().map(Some).collect();
188    for row in rows {
189        if row.is_empty() {
190            continue;
191        }
192        let canonical = &row[0];
193        let matches: Vec<usize> = clusters
194            .iter()
195            .enumerate()
196            .filter_map(|(i, c)| {
197                let c = c.as_ref()?;
198                let hit = row.iter().any(|alias| cluster_matches(c, alias));
199                hit.then_some(i)
200            })
201            .collect();
202        if matches.is_empty() {
203            continue;
204        }
205        let target = matches[0];
206        for &i in matches.iter().skip(1) {
207            let donor = clusters[i].take().unwrap();
208            let t = clusters[target].as_mut().unwrap();
209            merge_into(t, donor);
210        }
211        let t = clusters[target].as_mut().unwrap();
212        // Force the canonical display name to sort first.
213        t.names.retain(|n| n != canonical);
214        t.names.insert(0, canonical.clone());
215    }
216    clusters.into_iter().flatten().collect()
217}
218
219pub fn cluster_matches(c: &Cluster, needle: &str) -> bool {
220    let n = needle.trim().to_lowercase();
221    if n.is_empty() {
222        return false;
223    }
224    c.emails.iter().any(|e| e.to_lowercase() == n)
225        || c.names.iter().any(|name| name.to_lowercase() == n)
226        || c.login.as_deref().is_some_and(|l| l.to_lowercase() == n)
227}
228
229fn merge_into(target: &mut Cluster, donor: Cluster) {
230    for e in donor.emails {
231        if !target.emails.contains(&e) {
232            target.emails.push(e);
233        }
234    }
235    for n in donor.names {
236        if !target.names.contains(&n) {
237            target.names.push(n);
238        }
239    }
240    target.commit_idxs.extend(donor.commit_idxs);
241    target.coauthored_idxs.extend(donor.coauthored_idxs);
242    if target.login.is_none() {
243        target.login = donor.login;
244    }
245    if target.avatar_url.is_none() {
246        target.avatar_url = donor.avatar_url;
247    }
248    if target.profile_name.is_none() {
249        target.profile_name = donor.profile_name;
250    }
251    if target.affiliation.is_none() {
252        target.affiliation = donor.affiliation;
253    }
254}
255
256const BOT_NAMES: &[&str] = &[
257    "github-actions",
258    "github actions",
259    "dependabot",
260    "renovate",
261    "renovate bot",
262    "greenkeeper",
263    "snyk-bot",
264    "travis ci user",
265    "travis ci",
266    "travis",
267    "runner",
268    "nf-core-bot",
269    "semantic-release-bot",
270    "allcontributors",
271    "pre-commit-ci",
272    "imgbot",
273    "codecov",
274    "whitesource",
275    "deepsource",
276    "pyup.io bot",
277    "pyup-bot",
278    "mergify",
279    "copilot",
280];
281
282pub fn is_bot(cl: &Cluster) -> bool {
283    let hit = |s: &str| {
284        let l = s.to_lowercase();
285        l.contains("[bot]") || BOT_NAMES.contains(&l.as_str())
286    };
287    cl.names.iter().any(|n| hit(n))
288        || cl.login.as_deref().is_some_and(hit)
289        || cl.emails.iter().any(|e| {
290            e.contains("[bot]@") || e.starts_with("actions@github.com") || e.contains("dependabot")
291        })
292}
293
294/// Pick the best human-readable display name for a cluster: the most frequent
295/// author name, preferring "Firstname Lastname"-style over login-style names.
296fn display_name(cl: &Cluster, commits: &[Commit]) -> String {
297    let mut freq: HashMap<&str, (u32, usize)> = HashMap::new();
298    for (order, &i) in cl.commit_idxs.iter().enumerate() {
299        let name = commits[i].name.as_str();
300        if name.is_empty() {
301            continue;
302        }
303        let e = freq.entry(name).or_insert((0, order));
304        e.0 += 1;
305    }
306    let score = |name: &str, count: u32| {
307        let mut s = count as f64;
308        if name.contains(' ') {
309            s *= 3.0; // prefer full names over handles
310        }
311        if name.chars().next().is_some_and(|c| c.is_uppercase()) {
312            s *= 1.5;
313        }
314        s
315    };
316    freq.iter()
317        .max_by(|(a, (ca, oa)), (b, (cb, ob))| {
318            score(a, *ca)
319                .partial_cmp(&score(b, *cb))
320                .unwrap()
321                .then(ob.cmp(oa)) // earlier-seen wins ties
322        })
323        .map(|(n, _)| n.to_string())
324        .unwrap_or_else(|| {
325            cl.login.clone().unwrap_or_else(|| {
326                cl.names
327                    .first()
328                    .cloned()
329                    .unwrap_or_else(|| "unknown".into())
330            })
331        })
332}
333
334/// Build final contributors with stats and monthly activity bins. With
335/// `count_coauthors`, each `Co-authored-by` credit is counted alongside the
336/// authored commits (and tracked separately in `co_months` / `co_commits`).
337pub fn build_contributors(
338    clusters: &[Cluster],
339    commits: &[Commit],
340    groups: &[GroupRule],
341    count_coauthors: bool,
342) -> Vec<Contributor> {
343    let mut out = Vec::with_capacity(clusters.len());
344    for cl in clusters {
345        let coauthored: &[usize] = if count_coauthors {
346            &cl.coauthored_idxs
347        } else {
348            &[]
349        };
350        if cl.commit_idxs.is_empty() && coauthored.is_empty() {
351            continue;
352        }
353        let mut first = i64::MAX;
354        let mut last = i64::MIN;
355        for &i in cl.commit_idxs.iter().chain(coauthored.iter()) {
356            first = first.min(commits[i].ts);
357            last = last.max(commits[i].ts);
358        }
359        let m0 = month_index(first);
360        let m1 = month_index(last);
361        // Clamp the span so a single corrupt/extreme commit date can't trigger
362        // a huge allocation (commits outside the window are simply not binned).
363        let len = (m1 - m0 + 1).clamp(1, 6000) as usize;
364        let mut months = vec![0u32; len];
365        // Only the co-authored rows allocate a second array.
366        let mut co_months = vec![0u32; if coauthored.is_empty() { 0 } else { len }];
367        for &i in &cl.commit_idxs {
368            if let Some(slot) = months.get_mut((month_index(commits[i].ts) - m0) as usize) {
369                *slot += 1;
370            }
371        }
372        for &i in coauthored {
373            let mi = (month_index(commits[i].ts) - m0) as usize;
374            if let Some(slot) = months.get_mut(mi) {
375                *slot += 1;
376            }
377            if let Some(slot) = co_months.get_mut(mi) {
378                *slot += 1;
379            }
380        }
381        let name = cl
382            .profile_name
383            .clone()
384            .filter(|n| !n.trim().is_empty())
385            .unwrap_or_else(|| display_name(cl, commits));
386        // Manual group rules win over the auto-detected affiliation. With
387        // dated rules, the person's months are coloured by the org active at
388        // the time (later `since` wins overlaps); the primary `group` is their
389        // most recent affiliation.
390        let matching: Vec<&GroupRule> = groups
391            .iter()
392            .filter(|r| cluster_matches(cl, &r.matcher))
393            .collect();
394        let (group, month_groups) = if matching.is_empty() {
395            (cl.affiliation.clone(), None)
396        } else if !matching.iter().any(|r| r.dated()) {
397            (Some(matching[0].group.clone()), None)
398        } else {
399            let active_at = |ts: i64| -> Option<&str> {
400                matching
401                    .iter()
402                    .filter(|r| r.covers(ts))
403                    .max_by_key(|r| r.since.unwrap_or(i64::MIN))
404                    .map(|r| r.group.as_str())
405            };
406            let mg: Vec<Option<String>> = (0..len)
407                .map(|mi| active_at(month_start_ts(m0 + mi as i32)).map(str::to_string))
408                .collect();
409            let primary = matching
410                .iter()
411                .max_by_key(|r| r.since.unwrap_or(i64::MIN))
412                .map(|r| r.group.clone());
413            let month_groups = mg.iter().any(|g| g.is_some()).then_some(mg);
414            (primary, month_groups)
415        };
416        let url = cl.login.as_ref().map(|l| format!("https://github.com/{l}"));
417        out.push(Contributor {
418            name,
419            login: cl.login.clone(),
420            avatar: cl.avatar_url.clone(),
421            url,
422            first,
423            last,
424            commits: (cl.commit_idxs.len() + coauthored.len()) as u32,
425            bot: is_bot(cl),
426            group,
427            members: 1,
428            member_names: Vec::new(),
429            m0,
430            months,
431            co_months,
432            co_commits: coauthored.len() as u32,
433            month_groups,
434        });
435    }
436    out
437}