1use crate::model::{month_index, Commit, Contributor};
2use std::collections::HashMap;
3
/// One contributor identity: a set of commits attributed to the same author,
/// plus optional profile metadata.
///
/// The clustering functions in this file fill `emails`, `names` and
/// `commit_idxs`; the `Option` fields start as `None` (via `Default`) and are
/// only carried along when clusters are merged.
#[derive(Debug, Default, Clone)]
pub struct Cluster {
    /// Distinct non-empty author emails seen on the cluster's commits.
    pub emails: Vec<String>,
    /// Distinct non-empty author names seen on the cluster's commits.
    pub names: Vec<String>,
    /// Indices into the commit slice the cluster was built from.
    pub commit_idxs: Vec<usize>,
    /// Account login, if known; used to build the `https://github.com/{login}` URL
    /// and to merge clusters that share a login.
    pub login: Option<String>,
    /// Avatar URL, if known; exported as the contributor's `avatar`.
    pub avatar_url: Option<String>,
    /// Profile display name, if known; preferred over commit author names when not blank.
    pub profile_name: Option<String>,
    /// Affiliation, if known; used as the contributor's group when no group rule matches.
    pub affiliation: Option<String>,
}
17
/// Disjoint-set forest (union-find) over dense `usize` node ids.
///
/// The wrapped vector stores the parent of every node; a node that is its own
/// parent is the root (representative) of its set.
struct Dsu(Vec<usize>);

impl Dsu {
    /// Creates an empty forest.
    fn new() -> Self {
        Dsu(Vec::new())
    }

    /// Adds a new singleton set and returns its node id (ids are assigned 0, 1, 2, ...).
    fn make(&mut self) -> usize {
        let id = self.0.len();
        self.0.push(id);
        id
    }

    /// Returns the root of the set containing `x` and re-points every node on
    /// the walked path directly at that root.
    ///
    /// This is deliberately iterative: `union` does no rank/size balancing, so
    /// a parent chain can be as long as the number of nodes, and a recursive
    /// walk over such a chain can overflow the stack.
    ///
    /// # Panics
    ///
    /// Panics if `x` is not an id previously returned by [`Dsu::make`].
    fn find(&mut self, x: usize) -> usize {
        // Pass 1: locate the root.
        let mut root = x;
        while self.0[root] != root {
            root = self.0[root];
        }
        // Pass 2: compress the path from `x` up to the root.
        let mut cur = x;
        while cur != root {
            let next = self.0[cur];
            self.0[cur] = root;
            cur = next;
        }
        root
    }

    /// Merges the sets containing `a` and `b`; the root of `a`'s set becomes
    /// the root of the merged set.
    fn union(&mut self, a: usize, b: usize) {
        let (ra, rb) = (self.find(a), self.find(b));
        if ra != rb {
            self.0[rb] = ra;
        }
    }
}
42
/// Normalizes an author name for comparison: surrounding whitespace is
/// dropped, inner whitespace runs collapse to a single space, and the result
/// is lowercased.
fn norm_name(name: &str) -> String {
    let mut collapsed = String::with_capacity(name.len());
    for word in name.split_whitespace() {
        // Words are never empty, so an empty buffer means this is the first word.
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed.to_lowercase()
}
49
50pub fn cluster_commits(commits: &[Commit], merge_names: bool) -> Vec<Cluster> {
53 let mut dsu = Dsu::new();
54 let mut by_email: HashMap<&str, usize> = HashMap::new();
55 let mut by_name: HashMap<String, usize> = HashMap::new();
56 let mut commit_node: Vec<usize> = Vec::with_capacity(commits.len());
57
58 for c in commits {
59 let key: &str = if c.email.is_empty() {
60 &c.name
61 } else {
62 &c.email
63 };
64 let node = match by_email.get(key) {
65 Some(&n) => n,
66 None => {
67 let n = dsu.make();
68 by_email.insert(key, n);
69 n
70 }
71 };
72 if merge_names {
73 let nn = norm_name(&c.name);
74 if !nn.is_empty() {
75 match by_name.get(&nn) {
76 Some(&other) => dsu.union(node, other),
77 None => {
78 by_name.insert(nn, node);
79 }
80 }
81 }
82 }
83 commit_node.push(node);
84 }
85
86 let mut clusters: Vec<Cluster> = Vec::new();
87 let mut root_to_cluster: HashMap<usize, usize> = HashMap::new();
88 for (i, c) in commits.iter().enumerate() {
89 let root = dsu.find(commit_node[i]);
90 let ci = *root_to_cluster.entry(root).or_insert_with(|| {
91 clusters.push(Cluster::default());
92 clusters.len() - 1
93 });
94 let cl = &mut clusters[ci];
95 if !c.email.is_empty() && !cl.emails.iter().any(|e| e == &c.email) {
96 cl.emails.push(c.email.clone());
97 }
98 if !c.name.is_empty() && !cl.names.iter().any(|n| n == &c.name) {
99 cl.names.push(c.name.clone());
100 }
101 cl.commit_idxs.push(i);
102 }
103 clusters
104}
105
106pub fn merge_by_login(clusters: Vec<Cluster>) -> Vec<Cluster> {
108 let mut by_login: HashMap<String, usize> = HashMap::new();
109 let mut out: Vec<Cluster> = Vec::new();
110 for cl in clusters {
111 if let Some(login) = cl.login.clone() {
112 let key = login.to_lowercase();
113 if let Some(&i) = by_login.get(&key) {
114 merge_into(&mut out[i], cl);
115 continue;
116 }
117 by_login.insert(key, out.len());
118 }
119 out.push(cl);
120 }
121 out
122}
123
124pub fn apply_identity_file(clusters: Vec<Cluster>, rows: &[Vec<String>]) -> Vec<Cluster> {
128 let mut clusters: Vec<Option<Cluster>> = clusters.into_iter().map(Some).collect();
129 for row in rows {
130 if row.is_empty() {
131 continue;
132 }
133 let canonical = &row[0];
134 let matches: Vec<usize> = clusters
135 .iter()
136 .enumerate()
137 .filter_map(|(i, c)| {
138 let c = c.as_ref()?;
139 let hit = row.iter().any(|alias| cluster_matches(c, alias));
140 hit.then_some(i)
141 })
142 .collect();
143 if matches.is_empty() {
144 continue;
145 }
146 let target = matches[0];
147 for &i in matches.iter().skip(1) {
148 let donor = clusters[i].take().unwrap();
149 let t = clusters[target].as_mut().unwrap();
150 merge_into(t, donor);
151 }
152 let t = clusters[target].as_mut().unwrap();
153 t.names.retain(|n| n != canonical);
155 t.names.insert(0, canonical.clone());
156 }
157 clusters.into_iter().flatten().collect()
158}
159
160pub fn cluster_matches(c: &Cluster, needle: &str) -> bool {
161 let n = needle.trim().to_lowercase();
162 if n.is_empty() {
163 return false;
164 }
165 c.emails.iter().any(|e| e.to_lowercase() == n)
166 || c.names.iter().any(|name| name.to_lowercase() == n)
167 || c.login.as_deref().is_some_and(|l| l.to_lowercase() == n)
168}
169
170fn merge_into(target: &mut Cluster, donor: Cluster) {
171 for e in donor.emails {
172 if !target.emails.contains(&e) {
173 target.emails.push(e);
174 }
175 }
176 for n in donor.names {
177 if !target.names.contains(&n) {
178 target.names.push(n);
179 }
180 }
181 target.commit_idxs.extend(donor.commit_idxs);
182 if target.login.is_none() {
183 target.login = donor.login;
184 }
185 if target.avatar_url.is_none() {
186 target.avatar_url = donor.avatar_url;
187 }
188 if target.profile_name.is_none() {
189 target.profile_name = donor.profile_name;
190 }
191 if target.affiliation.is_none() {
192 target.affiliation = donor.affiliation;
193 }
194}
195
/// Lowercase author names / logins that are treated as automation accounts.
///
/// Entries are matched against the whole lowercased name or login (exact
/// equality, not substring). Kept in alphabetical order; lookups do not depend
/// on the order.
const BOT_NAMES: &[&str] = &[
    "allcontributors",
    "codecov",
    "copilot",
    "deepsource",
    "dependabot",
    "github actions",
    "github-actions",
    "greenkeeper",
    "imgbot",
    "mergify",
    "nf-core-bot",
    "pre-commit-ci",
    "pyup-bot",
    "pyup.io bot",
    "renovate",
    "renovate bot",
    "runner",
    "semantic-release-bot",
    "snyk-bot",
    "travis",
    "travis ci",
    "travis ci user",
    "whitesource",
];
221
222pub fn is_bot(cl: &Cluster) -> bool {
223 let hit = |s: &str| {
224 let l = s.to_lowercase();
225 l.contains("[bot]") || BOT_NAMES.contains(&l.as_str())
226 };
227 cl.names.iter().any(|n| hit(n))
228 || cl.login.as_deref().is_some_and(hit)
229 || cl.emails.iter().any(|e| {
230 e.contains("[bot]@") || e.starts_with("actions@github.com") || e.contains("dependabot")
231 })
232}
233
234fn display_name(cl: &Cluster, commits: &[Commit]) -> String {
237 let mut freq: HashMap<&str, (u32, usize)> = HashMap::new();
238 for (order, &i) in cl.commit_idxs.iter().enumerate() {
239 let name = commits[i].name.as_str();
240 if name.is_empty() {
241 continue;
242 }
243 let e = freq.entry(name).or_insert((0, order));
244 e.0 += 1;
245 }
246 let score = |name: &str, count: u32| {
247 let mut s = count as f64;
248 if name.contains(' ') {
249 s *= 3.0; }
251 if name.chars().next().is_some_and(|c| c.is_uppercase()) {
252 s *= 1.5;
253 }
254 s
255 };
256 freq.iter()
257 .max_by(|(a, (ca, oa)), (b, (cb, ob))| {
258 score(a, *ca)
259 .partial_cmp(&score(b, *cb))
260 .unwrap()
261 .then(ob.cmp(oa)) })
263 .map(|(n, _)| n.to_string())
264 .unwrap_or_else(|| {
265 cl.login.clone().unwrap_or_else(|| {
266 cl.names
267 .first()
268 .cloned()
269 .unwrap_or_else(|| "unknown".into())
270 })
271 })
272}
273
274pub fn build_contributors(
276 clusters: &[Cluster],
277 commits: &[Commit],
278 groups: &[(String, String)],
279) -> Vec<Contributor> {
280 let mut out = Vec::with_capacity(clusters.len());
281 for cl in clusters {
282 if cl.commit_idxs.is_empty() {
283 continue;
284 }
285 let mut first = i64::MAX;
286 let mut last = i64::MIN;
287 for &i in &cl.commit_idxs {
288 first = first.min(commits[i].ts);
289 last = last.max(commits[i].ts);
290 }
291 let m0 = month_index(first);
292 let m1 = month_index(last);
293 let mut months = vec![0u32; (m1 - m0 + 1).clamp(1, 6000) as usize];
296 for &i in &cl.commit_idxs {
297 let mi = month_index(commits[i].ts) - m0;
298 if let Some(slot) = months.get_mut(mi as usize) {
299 *slot += 1;
300 }
301 }
302 let name = cl
303 .profile_name
304 .clone()
305 .filter(|n| !n.trim().is_empty())
306 .unwrap_or_else(|| display_name(cl, commits));
307 let group = groups
309 .iter()
310 .find(|(matcher, _)| cluster_matches(cl, matcher))
311 .map(|(_, g)| g.clone())
312 .or_else(|| cl.affiliation.clone());
313 let url = cl.login.as_ref().map(|l| format!("https://github.com/{l}"));
314 out.push(Contributor {
315 name,
316 login: cl.login.clone(),
317 avatar: cl.avatar_url.clone(),
318 url,
319 first,
320 last,
321 commits: cl.commit_idxs.len() as u32,
322 bot: is_bot(cl),
323 group,
324 members: 1,
325 member_names: Vec::new(),
326 m0,
327 months,
328 });
329 }
330 out
331}