Skip to main content

contributor_graphs/
identity.rs

1use crate::model::{month_index, Commit, Contributor};
2use std::collections::HashMap;
3
/// One contributor identity: every commit signature (name/email pair) that
/// has been attributed to the same person, plus optional GitHub metadata.
#[derive(Debug, Default, Clone)]
pub struct Cluster {
    /// Distinct commit e-mail addresses seen for this identity.
    pub emails: Vec<String>,
    /// Distinct author names seen for this identity.
    pub names: Vec<String>,
    /// Indices into the commit slice the cluster was built from.
    pub commit_idxs: Vec<usize>,
    /// GitHub login, when one has been resolved.
    pub login: Option<String>,
    /// Avatar image URL, when known.
    pub avatar_url: Option<String>,
    /// Display name from the GitHub user profile.
    pub profile_name: Option<String>,
    /// Affiliation from the GitHub profile `company` field.
    pub affiliation: Option<String>,
}
17
/// Minimal disjoint-set (union-find) forest: element `i`'s parent is stored
/// at index `i`, and a root is its own parent.
struct Dsu(Vec<usize>);

impl Dsu {
    /// Create an empty forest.
    fn new() -> Self {
        Dsu(Vec::new())
    }

    /// Add a new singleton set and return its id.
    fn make(&mut self) -> usize {
        let id = self.0.len();
        self.0.push(id);
        id
    }

    /// Return the representative of `x`'s set, pointing every node on the
    /// walked path directly at that representative.
    ///
    /// Implemented iteratively so that an arbitrarily long parent chain
    /// cannot overflow the call stack.
    ///
    /// # Panics
    /// Panics if `x` was not returned by [`Dsu::make`].
    fn find(&mut self, x: usize) -> usize {
        // Pass 1: walk up to the root.
        let mut root = x;
        while self.0[root] != root {
            root = self.0[root];
        }
        // Pass 2: re-point every node on the path at the root.
        let mut cur = x;
        while self.0[cur] != root {
            let next = self.0[cur];
            self.0[cur] = root;
            cur = next;
        }
        root
    }

    /// Merge the sets containing `a` and `b`; `a`'s root becomes the root of
    /// the combined set.
    fn union(&mut self, a: usize, b: usize) {
        let (ra, rb) = (self.find(a), self.find(b));
        if ra != rb {
            self.0[rb] = ra;
        }
    }
}
42
/// Normalise an author name for comparison: surrounding whitespace is
/// dropped, inner whitespace runs collapse to one space, and the result is
/// lowercased.
fn norm_name(name: &str) -> String {
    let mut collapsed = String::with_capacity(name.len());
    for word in name.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed.to_lowercase()
}
49
50/// Group commits into identity clusters. Commits sharing an email always
51/// merge; commits sharing a normalised author name merge unless disabled.
52pub fn cluster_commits(commits: &[Commit], merge_names: bool) -> Vec<Cluster> {
53    let mut dsu = Dsu::new();
54    let mut by_email: HashMap<&str, usize> = HashMap::new();
55    let mut by_name: HashMap<String, usize> = HashMap::new();
56    let mut commit_node: Vec<usize> = Vec::with_capacity(commits.len());
57
58    for c in commits {
59        let key: &str = if c.email.is_empty() {
60            &c.name
61        } else {
62            &c.email
63        };
64        let node = match by_email.get(key) {
65            Some(&n) => n,
66            None => {
67                let n = dsu.make();
68                by_email.insert(key, n);
69                n
70            }
71        };
72        if merge_names {
73            let nn = norm_name(&c.name);
74            if !nn.is_empty() {
75                match by_name.get(&nn) {
76                    Some(&other) => dsu.union(node, other),
77                    None => {
78                        by_name.insert(nn, node);
79                    }
80                }
81            }
82        }
83        commit_node.push(node);
84    }
85
86    let mut clusters: Vec<Cluster> = Vec::new();
87    let mut root_to_cluster: HashMap<usize, usize> = HashMap::new();
88    for (i, c) in commits.iter().enumerate() {
89        let root = dsu.find(commit_node[i]);
90        let ci = *root_to_cluster.entry(root).or_insert_with(|| {
91            clusters.push(Cluster::default());
92            clusters.len() - 1
93        });
94        let cl = &mut clusters[ci];
95        if !c.email.is_empty() && !cl.emails.iter().any(|e| e == &c.email) {
96            cl.emails.push(c.email.clone());
97        }
98        if !c.name.is_empty() && !cl.names.iter().any(|n| n == &c.name) {
99            cl.names.push(c.name.clone());
100        }
101        cl.commit_idxs.push(i);
102    }
103    clusters
104}
105
106/// Merge clusters that resolved to the same GitHub login.
107pub fn merge_by_login(clusters: Vec<Cluster>) -> Vec<Cluster> {
108    let mut by_login: HashMap<String, usize> = HashMap::new();
109    let mut out: Vec<Cluster> = Vec::new();
110    for cl in clusters {
111        if let Some(login) = cl.login.clone() {
112            let key = login.to_lowercase();
113            if let Some(&i) = by_login.get(&key) {
114                merge_into(&mut out[i], cl);
115                continue;
116            }
117            by_login.insert(key, out.len());
118        }
119        out.push(cl);
120    }
121    out
122}
123
124/// Apply a manual identity file: each TSV row lists a canonical name followed
125/// by aliases. Any cluster whose name, email, or login matches any field is
126/// merged, and the first field becomes the display name.
127pub fn apply_identity_file(clusters: Vec<Cluster>, rows: &[Vec<String>]) -> Vec<Cluster> {
128    let mut clusters: Vec<Option<Cluster>> = clusters.into_iter().map(Some).collect();
129    for row in rows {
130        if row.is_empty() {
131            continue;
132        }
133        let canonical = &row[0];
134        let matches: Vec<usize> = clusters
135            .iter()
136            .enumerate()
137            .filter_map(|(i, c)| {
138                let c = c.as_ref()?;
139                let hit = row.iter().any(|alias| cluster_matches(c, alias));
140                hit.then_some(i)
141            })
142            .collect();
143        if matches.is_empty() {
144            continue;
145        }
146        let target = matches[0];
147        for &i in matches.iter().skip(1) {
148            let donor = clusters[i].take().unwrap();
149            let t = clusters[target].as_mut().unwrap();
150            merge_into(t, donor);
151        }
152        let t = clusters[target].as_mut().unwrap();
153        // Force the canonical display name to sort first.
154        t.names.retain(|n| n != canonical);
155        t.names.insert(0, canonical.clone());
156    }
157    clusters.into_iter().flatten().collect()
158}
159
160pub fn cluster_matches(c: &Cluster, needle: &str) -> bool {
161    let n = needle.trim().to_lowercase();
162    if n.is_empty() {
163        return false;
164    }
165    c.emails.iter().any(|e| e.to_lowercase() == n)
166        || c.names.iter().any(|name| name.to_lowercase() == n)
167        || c.login.as_deref().is_some_and(|l| l.to_lowercase() == n)
168}
169
170fn merge_into(target: &mut Cluster, donor: Cluster) {
171    for e in donor.emails {
172        if !target.emails.contains(&e) {
173            target.emails.push(e);
174        }
175    }
176    for n in donor.names {
177        if !target.names.contains(&n) {
178            target.names.push(n);
179        }
180    }
181    target.commit_idxs.extend(donor.commit_idxs);
182    if target.login.is_none() {
183        target.login = donor.login;
184    }
185    if target.avatar_url.is_none() {
186        target.avatar_url = donor.avatar_url;
187    }
188    if target.profile_name.is_none() {
189        target.profile_name = donor.profile_name;
190    }
191    if target.affiliation.is_none() {
192        target.affiliation = donor.affiliation;
193    }
194}
195
/// Lowercase author names / logins treated as automation accounts.
/// `is_bot` compares the lowercased candidate against these entries for
/// exact equality.
const BOT_NAMES: &[&str] = &[
    "github-actions",
    "github actions",
    "dependabot",
    "renovate",
    "renovate bot",
    "greenkeeper",
    "snyk-bot",
    "travis ci user",
    "travis ci",
    "travis",
    "runner",
    "nf-core-bot",
    "semantic-release-bot",
    "allcontributors",
    "pre-commit-ci",
    "imgbot",
    "codecov",
    "whitesource",
    "deepsource",
    "pyup.io bot",
    "pyup-bot",
    "mergify",
    "copilot",
];
221
222pub fn is_bot(cl: &Cluster) -> bool {
223    let hit = |s: &str| {
224        let l = s.to_lowercase();
225        l.contains("[bot]") || BOT_NAMES.contains(&l.as_str())
226    };
227    cl.names.iter().any(|n| hit(n))
228        || cl.login.as_deref().is_some_and(hit)
229        || cl.emails.iter().any(|e| {
230            e.contains("[bot]@") || e.starts_with("actions@github.com") || e.contains("dependabot")
231        })
232}
233
234/// Pick the best human-readable display name for a cluster: the most frequent
235/// author name, preferring "Firstname Lastname"-style over login-style names.
236fn display_name(cl: &Cluster, commits: &[Commit]) -> String {
237    let mut freq: HashMap<&str, (u32, usize)> = HashMap::new();
238    for (order, &i) in cl.commit_idxs.iter().enumerate() {
239        let name = commits[i].name.as_str();
240        if name.is_empty() {
241            continue;
242        }
243        let e = freq.entry(name).or_insert((0, order));
244        e.0 += 1;
245    }
246    let score = |name: &str, count: u32| {
247        let mut s = count as f64;
248        if name.contains(' ') {
249            s *= 3.0; // prefer full names over handles
250        }
251        if name.chars().next().is_some_and(|c| c.is_uppercase()) {
252            s *= 1.5;
253        }
254        s
255    };
256    freq.iter()
257        .max_by(|(a, (ca, oa)), (b, (cb, ob))| {
258            score(a, *ca)
259                .partial_cmp(&score(b, *cb))
260                .unwrap()
261                .then(ob.cmp(oa)) // earlier-seen wins ties
262        })
263        .map(|(n, _)| n.to_string())
264        .unwrap_or_else(|| {
265            cl.login.clone().unwrap_or_else(|| {
266                cl.names
267                    .first()
268                    .cloned()
269                    .unwrap_or_else(|| "unknown".into())
270            })
271        })
272}
273
274/// Build final contributors with stats and monthly activity bins.
275pub fn build_contributors(
276    clusters: &[Cluster],
277    commits: &[Commit],
278    groups: &[(String, String)],
279) -> Vec<Contributor> {
280    let mut out = Vec::with_capacity(clusters.len());
281    for cl in clusters {
282        if cl.commit_idxs.is_empty() {
283            continue;
284        }
285        let mut first = i64::MAX;
286        let mut last = i64::MIN;
287        for &i in &cl.commit_idxs {
288            first = first.min(commits[i].ts);
289            last = last.max(commits[i].ts);
290        }
291        let m0 = month_index(first);
292        let m1 = month_index(last);
293        // Clamp the span so a single corrupt/extreme commit date can't trigger
294        // a huge allocation (commits outside the window are simply not binned).
295        let mut months = vec![0u32; (m1 - m0 + 1).clamp(1, 6000) as usize];
296        for &i in &cl.commit_idxs {
297            let mi = month_index(commits[i].ts) - m0;
298            if let Some(slot) = months.get_mut(mi as usize) {
299                *slot += 1;
300            }
301        }
302        let name = cl
303            .profile_name
304            .clone()
305            .filter(|n| !n.trim().is_empty())
306            .unwrap_or_else(|| display_name(cl, commits));
307        // Manual group mapping wins over auto-detected affiliation.
308        let group = groups
309            .iter()
310            .find(|(matcher, _)| cluster_matches(cl, matcher))
311            .map(|(_, g)| g.clone())
312            .or_else(|| cl.affiliation.clone());
313        let url = cl.login.as_ref().map(|l| format!("https://github.com/{l}"));
314        out.push(Contributor {
315            name,
316            login: cl.login.clone(),
317            avatar: cl.avatar_url.clone(),
318            url,
319            first,
320            last,
321            commits: cl.commit_idxs.len() as u32,
322            bot: is_bot(cl),
323            group,
324            members: 1,
325            member_names: Vec::new(),
326            m0,
327            months,
328        });
329    }
330    out
331}