1use crate::cache::Caches;
2use crate::identity::Cluster;
3use crate::model::{Commit, Contributor};
4use base64::Engine;
5use std::collections::HashMap;
6use std::io::IsTerminal;
7use std::process::Command;
8use std::sync::atomic::{AtomicUsize, Ordering};
9use std::sync::Mutex;
10use std::time::Duration;
11
/// Upper bound on the number of worker threads spawned for one batch of
/// HTTP requests; smaller batches spawn one thread per item instead.
const THREADS: usize = 8;

/// `(name, company)` pair extracted from a GitHub user profile. Either part is
/// `None` when the profile does not provide a usable value.
type Profile = (Option<String>, Option<String>);
16
/// HTTP client used for GitHub REST API lookups and avatar downloads.
pub struct GhClient {
    /// `ureq` agent reused for every request made through this client.
    agent: ureq::Agent,
    /// Optional API token; when present it is sent as a `Bearer` credential
    /// on JSON API requests.
    token: Option<String>,
}
21
/// Locates a GitHub token for API access.
///
/// Sources are tried in order and the first non-blank value wins:
/// 1. the `GITHUB_TOKEN` environment variable,
/// 2. the `GH_TOKEN` environment variable,
/// 3. the standard output of `gh auth token` (only if the command exits
///    successfully).
///
/// Surrounding whitespace is stripped from the value. Returns `None` when no
/// source yields a token, including when the `gh` binary cannot be run.
pub fn find_token() -> Option<String> {
    /// Trims `raw` and keeps it only if something is left.
    fn non_blank(raw: &str) -> Option<String> {
        let trimmed = raw.trim();
        if trimmed.is_empty() {
            None
        } else {
            Some(trimmed.to_string())
        }
    }

    let from_env = ["GITHUB_TOKEN", "GH_TOKEN"]
        .iter()
        .find_map(|name| std::env::var(*name).ok().and_then(|value| non_blank(&value)));
    if from_env.is_some() {
        return from_env;
    }

    // Fall back to the GitHub CLI's stored credentials.
    let output = Command::new("gh").args(["auth", "token"]).output().ok()?;
    if !output.status.success() {
        return None;
    }
    non_blank(&String::from_utf8_lossy(&output.stdout))
}
41
/// Extracts the GitHub account encoded in a `users.noreply.github.com`
/// commit address.
///
/// Two address shapes are recognised:
/// * `ID+LOGIN@users.noreply.github.com` yields `(Some(ID), LOGIN)`; if the
///   part before `+` is not a valid `u64` the id is `None` but the login is
///   still returned.
/// * `LOGIN@users.noreply.github.com` yields `(None, LOGIN)`.
///
/// Returns `None` for any other address, and also when the login part is
/// empty (e.g. `@users.noreply.github.com` or `123+@users.noreply.github.com`),
/// since an empty login cannot identify an account.
pub fn parse_noreply(email: &str) -> Option<(Option<u64>, String)> {
    let local = email.strip_suffix("@users.noreply.github.com")?;
    let (id, login) = match local.split_once('+') {
        Some((id, login)) => (id.parse::<u64>().ok(), login),
        None => (None, local),
    };
    // An empty login would otherwise propagate as `Some("")` to callers.
    if login.is_empty() {
        return None;
    }
    Some((id, login.to_string()))
}
54
55impl GhClient {
56 pub fn new(token: Option<String>) -> Self {
57 let agent = ureq::AgentBuilder::new()
58 .timeout(Duration::from_secs(30))
59 .user_agent("contributor-graphs (https://github.com/ewels/contributor-graphs)")
60 .build();
61 GhClient { agent, token }
62 }
63
64 pub fn has_token(&self) -> bool {
65 self.token.is_some()
66 }
67
68 fn get_json(&self, url: &str) -> Option<serde_json::Value> {
69 let mut req = self
70 .agent
71 .get(url)
72 .set("Accept", "application/vnd.github+json")
73 .set("X-GitHub-Api-Version", "2022-11-28");
74 if let Some(t) = &self.token {
75 req = req.set("Authorization", &format!("Bearer {t}"));
76 }
77 match req.call() {
78 Ok(resp) => resp.into_json().ok(),
79 Err(ureq::Error::Status(code, _)) => {
80 if code == 403 || code == 429 {
81 eprintln!(" warning: GitHub API rate limited (HTTP {code})");
82 }
83 None
84 }
85 Err(_) => None,
86 }
87 }
88
89 fn commit_author(&self, slug: &str, sha: &str) -> Option<(String, String)> {
91 let v = self.get_json(&format!(
92 "https://api.github.com/repos/{slug}/commits/{sha}"
93 ))?;
94 let author = v.get("author")?;
95 let login = author.get("login")?.as_str()?.to_string();
96 let avatar = author
97 .get("avatar_url")
98 .and_then(|a| a.as_str())
99 .map(String::from)
100 .unwrap_or_else(|| format!("https://avatars.githubusercontent.com/{login}"));
101 Some((login, avatar))
102 }
103
104 fn user_profile(&self, login: &str) -> Profile {
106 let Some(v) = self.get_json(&format!("https://api.github.com/users/{login}")) else {
107 return (None, None);
108 };
109 let name = v
110 .get("name")
111 .and_then(|n| n.as_str())
112 .map(str::trim)
113 .filter(|n| !n.is_empty())
114 .map(String::from);
115 let company = v
116 .get("company")
117 .and_then(|c| c.as_str())
118 .and_then(normalize_company);
119 (name, company)
120 }
121
122 pub fn list_owner_repos(&self, owner: &str) -> Vec<String> {
127 for kind in ["orgs", "users"] {
128 let mut slugs = Vec::new();
129 let mut page = 1;
130 let mut reached = false;
131 loop {
132 let url =
133 format!("https://api.github.com/{kind}/{owner}/repos?per_page=100&page={page}");
134 let Some(v) = self.get_json(&url) else { break };
135 reached = true;
136 let Some(arr) = v.as_array() else { break };
137 let count = arr.len();
138 for repo in arr {
139 if repo.get("fork").and_then(|f| f.as_bool()).unwrap_or(false) {
140 continue;
141 }
142 if let Some(full) = repo.get("full_name").and_then(|n| n.as_str()) {
143 slugs.push(full.to_string());
144 }
145 }
146 if count < 100 {
147 break;
148 }
149 page += 1;
150 }
151 if reached && !slugs.is_empty() {
152 return slugs;
153 }
154 }
155 Vec::new()
156 }
157
158 pub fn fetch_bytes(&self, url: &str) -> Option<(Vec<u8>, String)> {
159 let resp = self.agent.get(url).call().ok()?;
160 let ct = resp.content_type().to_string();
161 let mut buf = Vec::new();
162 use std::io::Read;
163 resp.into_reader()
164 .take(4 * 1024 * 1024)
165 .read_to_end(&mut buf)
166 .ok()?;
167 Some((buf, ct))
168 }
169}
170
171pub fn enrich_clusters(
176 clusters: &mut [Cluster],
177 commits: &[Commit],
178 source_slugs: &[Option<String>],
179 client: &GhClient,
180 caches: &mut Caches,
181 verbose: bool,
182) {
183 let slug_of = |c: &Commit| -> Option<&str> {
184 source_slugs.get(c.src as usize).and_then(|s| s.as_deref())
185 };
186 let mut need_api: Vec<(usize, String, String)> = Vec::new();
187 for (i, cl) in clusters.iter_mut().enumerate() {
188 for email in &cl.emails {
189 if let Some((id, login)) = parse_noreply(email) {
190 cl.avatar_url = Some(match id {
191 Some(id) => format!("https://avatars.githubusercontent.com/u/{id}?v=4"),
192 None => format!("https://avatars.githubusercontent.com/{login}"),
193 });
194 cl.login = Some(login);
195 break;
196 }
197 }
198 if cl.login.is_none() {
199 let rep = cl
202 .commit_idxs
203 .iter()
204 .filter(|&&i| slug_of(&commits[i]).is_some())
205 .max_by_key(|&&i| commits[i].ts);
206 if let Some(&idx) = rep {
207 let slug = slug_of(&commits[idx]).unwrap().to_string();
208 need_api.push((i, slug, commits[idx].sha.clone()));
209 }
210 }
211 }
212
213 let mut from_cache = 0usize;
215 need_api.retain(|(idx, _, sha)| match caches.author(sha) {
216 Some(a) => {
217 clusters[*idx].login = Some(a.login);
218 clusters[*idx].avatar_url = Some(a.avatar_url);
219 from_cache += 1;
220 false
221 }
222 None => true,
223 });
224
225 if need_api.is_empty() || !client.has_token() {
226 if !need_api.is_empty() && verbose {
227 eprintln!(
228 " no GitHub token found ({} identities left unresolved) — run `gh auth login` to enable lookups",
229 need_api.len()
230 );
231 }
232 if from_cache > 0 && verbose {
233 eprintln!(" resolved {from_cache} identities from cache");
234 }
235 return;
236 }
237
238 let cursor = AtomicUsize::new(0);
239 let results: Mutex<HashMap<usize, (String, String)>> = Mutex::new(HashMap::new());
240 let pb = crate::progress::bar(
241 "resolving identities",
242 need_api.len(),
243 verbose && std::io::stderr().is_terminal(),
244 );
245 std::thread::scope(|s| {
246 for _ in 0..THREADS.min(need_api.len()) {
247 s.spawn(|| loop {
248 let i = cursor.fetch_add(1, Ordering::Relaxed);
249 let Some((cluster_idx, slug, sha)) = need_api.get(i) else {
250 break;
251 };
252 if let Some(found) = client.commit_author(slug, sha) {
253 results.lock().unwrap().insert(*cluster_idx, found);
254 }
255 pb.inc(1);
256 });
257 }
258 });
259 pb.finish_and_clear();
260
261 let results = results.into_inner().unwrap();
262 let resolved = results.len();
263 let sha_of: HashMap<usize, &str> = need_api
266 .iter()
267 .map(|(idx, _, sha)| (*idx, sha.as_str()))
268 .collect();
269 for (idx, (login, avatar)) in results {
270 if let Some(sha) = sha_of.get(&idx) {
271 caches.put_author(sha.to_string(), login.clone(), avatar.clone());
272 }
273 clusters[idx].login = Some(login);
274 clusters[idx].avatar_url = Some(avatar);
275 }
276 if verbose {
277 eprintln!(
278 " resolved {resolved}/{} identities via GitHub API{}",
279 need_api.len(),
280 if from_cache > 0 {
281 format!(" ({from_cache} more from cache)")
282 } else {
283 String::new()
284 }
285 );
286 }
287}
288
289pub fn fetch_repo_description(client: &GhClient, slug: &str) -> Option<String> {
291 let v = client.get_json(&format!("https://api.github.com/repos/{slug}"))?;
292 v.get("description")
293 .and_then(|d| d.as_str())
294 .map(str::trim)
295 .filter(|d| !d.is_empty())
296 .map(String::from)
297}
298
299pub fn fetch_avatar(
302 client: &GhClient,
303 caches: &mut Caches,
304 login: &str,
305 size: u32,
306) -> Option<String> {
307 let key = format!("owner:{login}:{size}");
308 if let Some(data) = caches.avatar(&key) {
309 return Some(data);
310 }
311 let url = format!("https://avatars.githubusercontent.com/{login}?s={size}");
312 let (bytes, ct) = client.fetch_bytes(&url)?;
313 let ct = if ct.starts_with("image/") {
314 ct
315 } else {
316 "image/png".into()
317 };
318 let b64 = base64::engine::general_purpose::STANDARD.encode(&bytes);
319 let data = format!("data:{ct};base64,{b64}");
320 caches.put_avatar(key, data.clone());
321 Some(data)
322}
323
/// Reduces a free-form profile "company" string to a single organisation
/// name.
///
/// Steps, in order:
/// 1. Trim, then cut at the first occurrence of each of `" | "`, `" / "`,
///    `";"`, `" · "` and `","` (applied one after another).
/// 2. Strip leading `@` characters.
/// 3. Cut at the first `" @"`, so only the first of several mentions is kept.
/// 4. Strip trailing `.`, `,`, `;`, `|` and `/`.
///
/// Returns `None` if nothing is left or the result is longer than 60
/// characters.
pub fn normalize_company(raw: &str) -> Option<String> {
    // Keep only what precedes the first list-style separator of each kind.
    let head = [" | ", " / ", ";", " · ", ","]
        .iter()
        .fold(raw.trim(), |rest, sep| {
            rest.split_once(*sep).map_or(rest, |(before, _)| before)
        });

    let org = head.trim().trim_start_matches('@').trim();
    // "first @second" -> "first"
    let org = org.split_once(" @").map_or(org, |(first, _)| first);
    let org = org
        .trim()
        .trim_end_matches(|c: char| matches!(c, '.' | ',' | ';' | '|' | '/'))
        .trim();

    let acceptable = !org.is_empty() && org.chars().count() <= 60;
    if acceptable {
        Some(org.to_owned())
    } else {
        None
    }
}
346
347pub fn fetch_profiles(
350 clusters: &mut [Cluster],
351 client: &GhClient,
352 caches: &mut Caches,
353 verbose: bool,
354) {
355 let mut logins: Vec<String> = clusters.iter().filter_map(|c| c.login.clone()).collect();
356 logins.sort();
357 logins.dedup();
358 if logins.is_empty() {
359 return;
360 }
361
362 let mut profiles: HashMap<String, Profile> = HashMap::new();
364 let mut to_fetch: Vec<String> = Vec::new();
365 for login in logins {
366 match caches.profile(&login) {
367 Some(p) => {
368 profiles.insert(login, p);
369 }
370 None => to_fetch.push(login),
371 }
372 }
373 let from_cache = profiles.len();
374
375 if !to_fetch.is_empty() && client.has_token() {
376 let cursor = AtomicUsize::new(0);
377 let results: Mutex<HashMap<String, Profile>> = Mutex::new(HashMap::new());
378 let pb = crate::progress::bar(
379 "fetching profiles",
380 to_fetch.len(),
381 verbose && std::io::stderr().is_terminal(),
382 );
383 std::thread::scope(|s| {
384 for _ in 0..THREADS.min(to_fetch.len()) {
385 s.spawn(|| loop {
386 let i = cursor.fetch_add(1, Ordering::Relaxed);
387 let Some(login) = to_fetch.get(i) else { break };
388 let profile = client.user_profile(login);
389 results.lock().unwrap().insert(login.clone(), profile);
390 pb.inc(1);
391 });
392 }
393 });
394 pb.finish_and_clear();
395 for (login, (name, company)) in results.into_inner().unwrap() {
396 caches.put_profile(login.clone(), name.clone(), company.clone());
397 profiles.insert(login, (name, company));
398 }
399 }
400
401 let with_company = profiles.values().filter(|(_, c)| c.is_some()).count();
402 for cl in clusters.iter_mut() {
403 if let Some(login) = &cl.login {
404 if let Some((name, company)) = profiles.get(login) {
405 cl.profile_name = name.clone();
406 cl.affiliation = company.clone();
407 }
408 }
409 }
410 if verbose {
411 eprintln!(
412 " fetched {} profiles ({} with an affiliation, {from_cache} from cache)",
413 profiles.len(),
414 with_company,
415 );
416 }
417}
418
419pub fn embed_avatars(
422 contributors: &mut [Contributor],
423 client: &GhClient,
424 caches: &mut Caches,
425 size: u32,
426 verbose: bool,
427) {
428 let mut urls: Vec<String> = Vec::new();
429 for c in contributors.iter() {
430 if let Some(u) = &c.avatar {
431 if !u.starts_with("data:") && !urls.contains(u) {
432 urls.push(u.clone());
433 }
434 }
435 }
436 if urls.is_empty() {
437 return;
438 }
439
440 let key_of = |u: &str| format!("{u}|{size}");
442 let mut embedded: HashMap<String, String> = HashMap::new();
443 let mut to_fetch: Vec<String> = Vec::new();
444 for u in urls {
445 match caches.avatar(&key_of(&u)) {
446 Some(data) => {
447 embedded.insert(u, data);
448 }
449 None => to_fetch.push(u),
450 }
451 }
452 let from_cache = embedded.len();
453
454 if !to_fetch.is_empty() {
455 let sized: Vec<String> = to_fetch
456 .iter()
457 .map(|u| {
458 if u.contains('?') {
459 format!("{u}&s={size}")
460 } else {
461 format!("{u}?s={size}")
462 }
463 })
464 .collect();
465 let cursor = AtomicUsize::new(0);
466 let results: Mutex<HashMap<String, String>> = Mutex::new(HashMap::new());
467 let pb = crate::progress::bar(
468 "embedding avatars",
469 to_fetch.len(),
470 verbose && std::io::stderr().is_terminal(),
471 );
472 std::thread::scope(|s| {
473 for _ in 0..THREADS.min(to_fetch.len()) {
474 s.spawn(|| loop {
475 let i = cursor.fetch_add(1, Ordering::Relaxed);
476 let (Some(orig), Some(fetch_url)) = (to_fetch.get(i), sized.get(i)) else {
477 break;
478 };
479 if let Some((bytes, ct)) = client.fetch_bytes(fetch_url) {
480 let ct = if ct.starts_with("image/") {
481 ct
482 } else {
483 "image/png".into()
484 };
485 let b64 = base64::engine::general_purpose::STANDARD.encode(&bytes);
486 results
487 .lock()
488 .unwrap()
489 .insert(orig.clone(), format!("data:{ct};base64,{b64}"));
490 }
491 pb.inc(1);
492 });
493 }
494 });
495 pb.finish_and_clear();
496 for (url, data) in results.into_inner().unwrap() {
497 caches.put_avatar(key_of(&url), data.clone());
498 embedded.insert(url, data);
499 }
500 }
501
502 let n = embedded.len();
503 for c in contributors.iter_mut() {
504 if let Some(u) = &c.avatar {
505 if let Some(data) = embedded.get(u) {
506 c.avatar = Some(data.clone());
507 }
508 }
509 }
510 if verbose {
511 eprintln!(" embedded {n} avatars as data URIs ({from_cache} from cache)");
512 }
513}