1use crate::model::{month_index, month_start_ts, Commit, Contributor, GroupRule};
2use std::collections::HashMap;
3
/// One contributor identity: the email addresses and name spellings that
/// have been grouped together, the commits attributed to them, and optional
/// profile metadata attached to the identity.
#[derive(Clone, Debug, Default)]
pub struct Cluster {
    /// Email addresses collected for this identity.
    pub emails: Vec<String>,
    /// Name spellings collected for this identity.
    pub names: Vec<String>,
    /// Indices into the commit slice of commits authored by this identity.
    pub commit_idxs: Vec<usize>,
    /// Indices into the commit slice of commits where this identity is
    /// listed as a co-author.
    pub coauthored_idxs: Vec<usize>,
    /// Account login, when known. Used as a case-insensitive merge key and
    /// to build the `https://github.com/{login}` profile URL.
    pub login: Option<String>,
    /// Avatar image URL, when known.
    pub avatar_url: Option<String>,
    /// Profile display name; preferred over commit names when not blank.
    pub profile_name: Option<String>,
    /// Affiliation used as the group when no group rule matches.
    pub affiliation: Option<String>,
}
20
/// Disjoint-set forest over dense `usize` node ids.
///
/// `self.0[x]` is the parent of node `x`; a node that is its own parent is
/// the representative (root) of its set.
struct Dsu(Vec<usize>);

impl Dsu {
    /// Creates an empty forest.
    fn new() -> Self {
        Dsu(Vec::new())
    }

    /// Adds a new singleton set and returns its node id (ids are handed out
    /// sequentially starting at 0).
    fn make(&mut self) -> usize {
        let id = self.0.len();
        self.0.push(id);
        id
    }

    /// Returns the root of the set containing `x`, compressing the path so
    /// every visited node points directly at the root afterwards.
    ///
    /// Implemented iteratively: `union` always hangs the second root under
    /// the first, so repeatedly attaching older roots beneath newer nodes
    /// can build a parent chain as long as the number of nodes, and a
    /// recursive walk over such a chain can exhaust the stack.
    ///
    /// # Panics
    /// Panics if `x` was not returned by [`Dsu::make`].
    fn find(&mut self, x: usize) -> usize {
        // Pass 1: locate the root.
        let mut root = x;
        while self.0[root] != root {
            root = self.0[root];
        }
        // Pass 2: re-point every node on the path at the root.
        let mut cur = x;
        while self.0[cur] != root {
            let next = self.0[cur];
            self.0[cur] = root;
            cur = next;
        }
        root
    }

    /// Merges the sets containing `a` and `b`; the root of `a`'s set becomes
    /// the root of the combined set.
    fn union(&mut self, a: usize, b: usize) {
        let (ra, rb) = (self.find(a), self.find(b));
        if ra != rb {
            self.0[rb] = ra;
        }
    }
}
45
/// Normalizes a personal name for comparison: surrounding whitespace is
/// dropped, internal whitespace runs collapse to a single space, and the
/// result is lowercased.
fn norm_name(name: &str) -> String {
    let mut collapsed = String::with_capacity(name.len());
    for word in name.split_whitespace() {
        // Words are never empty, so an empty buffer means "first word".
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed.to_lowercase()
}
52
53fn node_for(
56 dsu: &mut Dsu,
57 by_email: &mut HashMap<String, usize>,
58 by_name: &mut HashMap<String, usize>,
59 merge_names: bool,
60 name: &str,
61 email: &str,
62) -> usize {
63 let key = if email.is_empty() { name } else { email };
64 let node = match by_email.get(key) {
65 Some(&n) => n,
66 None => {
67 let n = dsu.make();
68 by_email.insert(key.to_string(), n);
69 n
70 }
71 };
72 if merge_names {
73 let nn = norm_name(name);
74 if !nn.is_empty() {
75 match by_name.get(&nn) {
76 Some(&other) => dsu.union(node, other),
77 None => {
78 by_name.insert(nn, node);
79 }
80 }
81 }
82 }
83 node
84}
85
86fn add_identity(cl: &mut Cluster, name: &str, email: &str) {
87 if !email.is_empty() && !cl.emails.iter().any(|e| e == email) {
88 cl.emails.push(email.to_string());
89 }
90 if !name.is_empty() && !cl.names.iter().any(|n| n == name) {
91 cl.names.push(name.to_string());
92 }
93}
94
95fn cluster_index(
96 dsu: &mut Dsu,
97 map: &mut HashMap<usize, usize>,
98 clusters: &mut Vec<Cluster>,
99 node: usize,
100) -> usize {
101 let root = dsu.find(node);
102 *map.entry(root).or_insert_with(|| {
103 clusters.push(Cluster::default());
104 clusters.len() - 1
105 })
106}
107
108pub fn cluster_commits(commits: &[Commit], merge_names: bool) -> Vec<Cluster> {
113 let mut dsu = Dsu::new();
114 let mut by_email: HashMap<String, usize> = HashMap::new();
115 let mut by_name: HashMap<String, usize> = HashMap::new();
116 let mut author_node: Vec<usize> = Vec::with_capacity(commits.len());
117 let mut coauthor_nodes: Vec<Vec<usize>> = Vec::with_capacity(commits.len());
118
119 for c in commits {
120 author_node.push(node_for(
121 &mut dsu,
122 &mut by_email,
123 &mut by_name,
124 merge_names,
125 &c.name,
126 &c.email,
127 ));
128 let cns = c
129 .coauthors
130 .iter()
131 .map(|(n, e)| node_for(&mut dsu, &mut by_email, &mut by_name, merge_names, n, e))
132 .collect();
133 coauthor_nodes.push(cns);
134 }
135
136 let mut clusters: Vec<Cluster> = Vec::new();
137 let mut root_to_cluster: HashMap<usize, usize> = HashMap::new();
138 for (i, c) in commits.iter().enumerate() {
139 let ci_a = cluster_index(
140 &mut dsu,
141 &mut root_to_cluster,
142 &mut clusters,
143 author_node[i],
144 );
145 add_identity(&mut clusters[ci_a], &c.name, &c.email);
146 clusters[ci_a].commit_idxs.push(i);
147 for (k, (n, e)) in c.coauthors.iter().enumerate() {
148 let ci_c = cluster_index(
149 &mut dsu,
150 &mut root_to_cluster,
151 &mut clusters,
152 coauthor_nodes[i][k],
153 );
154 add_identity(&mut clusters[ci_c], n, e);
155 if ci_c != ci_a && clusters[ci_c].coauthored_idxs.last() != Some(&i) {
158 clusters[ci_c].coauthored_idxs.push(i);
159 }
160 }
161 }
162 clusters
163}
164
165pub fn merge_by_login(clusters: Vec<Cluster>) -> Vec<Cluster> {
167 let mut by_login: HashMap<String, usize> = HashMap::new();
168 let mut out: Vec<Cluster> = Vec::new();
169 for cl in clusters {
170 if let Some(login) = cl.login.clone() {
171 let key = login.to_lowercase();
172 if let Some(&i) = by_login.get(&key) {
173 merge_into(&mut out[i], cl);
174 continue;
175 }
176 by_login.insert(key, out.len());
177 }
178 out.push(cl);
179 }
180 out
181}
182
183pub fn apply_identity_file(clusters: Vec<Cluster>, rows: &[Vec<String>]) -> Vec<Cluster> {
187 let mut clusters: Vec<Option<Cluster>> = clusters.into_iter().map(Some).collect();
188 for row in rows {
189 if row.is_empty() {
190 continue;
191 }
192 let canonical = &row[0];
193 let matches: Vec<usize> = clusters
194 .iter()
195 .enumerate()
196 .filter_map(|(i, c)| {
197 let c = c.as_ref()?;
198 let hit = row.iter().any(|alias| cluster_matches(c, alias));
199 hit.then_some(i)
200 })
201 .collect();
202 if matches.is_empty() {
203 continue;
204 }
205 let target = matches[0];
206 for &i in matches.iter().skip(1) {
207 let donor = clusters[i].take().unwrap();
208 let t = clusters[target].as_mut().unwrap();
209 merge_into(t, donor);
210 }
211 let t = clusters[target].as_mut().unwrap();
212 t.names.retain(|n| n != canonical);
214 t.names.insert(0, canonical.clone());
215 }
216 clusters.into_iter().flatten().collect()
217}
218
219pub fn cluster_matches(c: &Cluster, needle: &str) -> bool {
220 let n = needle.trim().to_lowercase();
221 if n.is_empty() {
222 return false;
223 }
224 c.emails.iter().any(|e| e.to_lowercase() == n)
225 || c.names.iter().any(|name| name.to_lowercase() == n)
226 || c.login.as_deref().is_some_and(|l| l.to_lowercase() == n)
227}
228
229fn merge_into(target: &mut Cluster, donor: Cluster) {
230 for e in donor.emails {
231 if !target.emails.contains(&e) {
232 target.emails.push(e);
233 }
234 }
235 for n in donor.names {
236 if !target.names.contains(&n) {
237 target.names.push(n);
238 }
239 }
240 target.commit_idxs.extend(donor.commit_idxs);
241 target.coauthored_idxs.extend(donor.coauthored_idxs);
242 if target.login.is_none() {
243 target.login = donor.login;
244 }
245 if target.avatar_url.is_none() {
246 target.avatar_url = donor.avatar_url;
247 }
248 if target.profile_name.is_none() {
249 target.profile_name = donor.profile_name;
250 }
251 if target.affiliation.is_none() {
252 target.affiliation = donor.affiliation;
253 }
254}
255
/// Names and logins treated as automation accounts.
///
/// Entries are lowercase and are compared for exact equality against the
/// lowercased name or login (see `is_bot`).
const BOT_NAMES: &[&str] = &[
    "github-actions", "github actions",
    "dependabot",
    "renovate", "renovate bot",
    "greenkeeper",
    "snyk-bot",
    "travis ci user", "travis ci", "travis",
    "runner",
    "nf-core-bot",
    "semantic-release-bot",
    "allcontributors",
    "pre-commit-ci",
    "imgbot",
    "codecov",
    "whitesource",
    "deepsource",
    "pyup.io bot", "pyup-bot",
    "mergify",
    "copilot",
];
281
282pub fn is_bot(cl: &Cluster) -> bool {
283 let hit = |s: &str| {
284 let l = s.to_lowercase();
285 l.contains("[bot]") || BOT_NAMES.contains(&l.as_str())
286 };
287 cl.names.iter().any(|n| hit(n))
288 || cl.login.as_deref().is_some_and(hit)
289 || cl.emails.iter().any(|e| {
290 e.contains("[bot]@") || e.starts_with("actions@github.com") || e.contains("dependabot")
291 })
292}
293
294fn display_name(cl: &Cluster, commits: &[Commit]) -> String {
297 let mut freq: HashMap<&str, (u32, usize)> = HashMap::new();
298 for (order, &i) in cl.commit_idxs.iter().enumerate() {
299 let name = commits[i].name.as_str();
300 if name.is_empty() {
301 continue;
302 }
303 let e = freq.entry(name).or_insert((0, order));
304 e.0 += 1;
305 }
306 let score = |name: &str, count: u32| {
307 let mut s = count as f64;
308 if name.contains(' ') {
309 s *= 3.0; }
311 if name.chars().next().is_some_and(|c| c.is_uppercase()) {
312 s *= 1.5;
313 }
314 s
315 };
316 freq.iter()
317 .max_by(|(a, (ca, oa)), (b, (cb, ob))| {
318 score(a, *ca)
319 .partial_cmp(&score(b, *cb))
320 .unwrap()
321 .then(ob.cmp(oa)) })
323 .map(|(n, _)| n.to_string())
324 .unwrap_or_else(|| {
325 cl.login.clone().unwrap_or_else(|| {
326 cl.names
327 .first()
328 .cloned()
329 .unwrap_or_else(|| "unknown".into())
330 })
331 })
332}
333
334pub fn build_contributors(
338 clusters: &[Cluster],
339 commits: &[Commit],
340 groups: &[GroupRule],
341 count_coauthors: bool,
342) -> Vec<Contributor> {
343 let mut out = Vec::with_capacity(clusters.len());
344 for cl in clusters {
345 let coauthored: &[usize] = if count_coauthors {
346 &cl.coauthored_idxs
347 } else {
348 &[]
349 };
350 if cl.commit_idxs.is_empty() && coauthored.is_empty() {
351 continue;
352 }
353 let mut first = i64::MAX;
354 let mut last = i64::MIN;
355 for &i in cl.commit_idxs.iter().chain(coauthored.iter()) {
356 first = first.min(commits[i].ts);
357 last = last.max(commits[i].ts);
358 }
359 let m0 = month_index(first);
360 let m1 = month_index(last);
361 let len = (m1 - m0 + 1).clamp(1, 6000) as usize;
364 let mut months = vec![0u32; len];
365 let mut co_months = vec![0u32; if coauthored.is_empty() { 0 } else { len }];
367 for &i in &cl.commit_idxs {
368 if let Some(slot) = months.get_mut((month_index(commits[i].ts) - m0) as usize) {
369 *slot += 1;
370 }
371 }
372 for &i in coauthored {
373 let mi = (month_index(commits[i].ts) - m0) as usize;
374 if let Some(slot) = months.get_mut(mi) {
375 *slot += 1;
376 }
377 if let Some(slot) = co_months.get_mut(mi) {
378 *slot += 1;
379 }
380 }
381 let name = cl
382 .profile_name
383 .clone()
384 .filter(|n| !n.trim().is_empty())
385 .unwrap_or_else(|| display_name(cl, commits));
386 let matching: Vec<&GroupRule> = groups
391 .iter()
392 .filter(|r| cluster_matches(cl, &r.matcher))
393 .collect();
394 let (group, month_groups) = if matching.is_empty() {
395 (cl.affiliation.clone(), None)
396 } else if !matching.iter().any(|r| r.dated()) {
397 (Some(matching[0].group.clone()), None)
398 } else {
399 let active_at = |ts: i64| -> Option<&str> {
400 matching
401 .iter()
402 .filter(|r| r.covers(ts))
403 .max_by_key(|r| r.since.unwrap_or(i64::MIN))
404 .map(|r| r.group.as_str())
405 };
406 let mg: Vec<Option<String>> = (0..len)
407 .map(|mi| active_at(month_start_ts(m0 + mi as i32)).map(str::to_string))
408 .collect();
409 let primary = matching
410 .iter()
411 .max_by_key(|r| r.since.unwrap_or(i64::MIN))
412 .map(|r| r.group.clone());
413 let month_groups = mg.iter().any(|g| g.is_some()).then_some(mg);
414 (primary, month_groups)
415 };
416 let url = cl.login.as_ref().map(|l| format!("https://github.com/{l}"));
417 out.push(Contributor {
418 name,
419 login: cl.login.clone(),
420 avatar: cl.avatar_url.clone(),
421 url,
422 first,
423 last,
424 commits: (cl.commit_idxs.len() + coauthored.len()) as u32,
425 bot: is_bot(cl),
426 group,
427 members: 1,
428 member_names: Vec::new(),
429 m0,
430 months,
431 co_months,
432 co_commits: coauthored.len() as u32,
433 month_groups,
434 });
435 }
436 out
437}