Skip to main content

contributor_graphs/
github.rs

1use crate::cache::Caches;
2use crate::identity::Cluster;
3use crate::model::{Commit, Contributor};
4use base64::Engine;
5use std::collections::HashMap;
6use std::io::IsTerminal;
7use std::process::Command;
8use std::sync::atomic::{AtomicUsize, Ordering};
9use std::sync::Mutex;
10use std::time::Duration;
11
/// Upper bound on the number of worker threads spawned for each batch of
/// parallel lookups in this module (fewer are used when there is less work).
const THREADS: usize = 8;

/// A GitHub user profile reduced to (display name, affiliation).
type Profile = (Option<String>, Option<String>);

/// HTTP client for talking to GitHub, with an optional API token.
pub struct GhClient {
    /// `ureq` agent every request is issued through.
    agent: ureq::Agent,
    /// API token; when present, JSON API requests send it as a `Bearer`
    /// `Authorization` header.
    token: Option<String>,
}
21
/// Locate a GitHub token.
///
/// Checks `$GITHUB_TOKEN` and then `$GH_TOKEN`, taking the first one whose
/// trimmed value is non-empty. If neither is usable, runs `gh auth token`
/// and uses its trimmed stdout, provided the command succeeds and prints
/// something. Returns `None` when no token can be found.
pub fn find_token() -> Option<String> {
    let from_env = ["GITHUB_TOKEN", "GH_TOKEN"]
        .iter()
        .filter_map(|name| std::env::var(name).ok())
        .map(|raw| raw.trim().to_string())
        .find(|candidate| !candidate.is_empty());
    if from_env.is_some() {
        return from_env;
    }

    // Fall back to the GitHub CLI; a missing binary simply yields `None`.
    let output = Command::new("gh").args(["auth", "token"]).output().ok()?;
    if !output.status.success() {
        return None;
    }
    let printed = String::from_utf8_lossy(&output.stdout).trim().to_string();
    if printed.is_empty() {
        None
    } else {
        Some(printed)
    }
}
41
/// Parse a GitHub noreply address into `(numeric user id, login)`.
///
/// Two address shapes are recognised:
/// - `12345+login@users.noreply.github.com` → `(Some(12345), "login")`
/// - the older `login@users.noreply.github.com` → `(None, "login")`
///
/// The domain is compared ASCII case-insensitively, since mail domains are
/// not case-sensitive. If the part before `+` is not a valid `u64`, the id
/// is `None` and the part after `+` is still used as the login.
///
/// Returns `None` for any other address, and for noreply addresses whose
/// login part is empty (an empty login cannot identify an account).
pub fn parse_noreply(email: &str) -> Option<(Option<u64>, String)> {
    const NOREPLY_DOMAIN: &str = "users.noreply.github.com";

    let (local, domain) = email.rsplit_once('@')?;
    if !domain.eq_ignore_ascii_case(NOREPLY_DOMAIN) {
        return None;
    }
    let (id, login) = match local.split_once('+') {
        Some((id, login)) => (id.parse::<u64>().ok(), login),
        None => (None, local),
    };
    if login.is_empty() {
        return None;
    }
    Some((id, login.to_string()))
}
54
55impl GhClient {
56    pub fn new(token: Option<String>) -> Self {
57        let agent = ureq::AgentBuilder::new()
58            .timeout(Duration::from_secs(30))
59            .user_agent("contributor-graphs (https://github.com/ewels/contributor-graphs)")
60            .build();
61        GhClient { agent, token }
62    }
63
64    pub fn has_token(&self) -> bool {
65        self.token.is_some()
66    }
67
68    fn get_json(&self, url: &str) -> Option<serde_json::Value> {
69        let mut req = self
70            .agent
71            .get(url)
72            .set("Accept", "application/vnd.github+json")
73            .set("X-GitHub-Api-Version", "2022-11-28");
74        if let Some(t) = &self.token {
75            req = req.set("Authorization", &format!("Bearer {t}"));
76        }
77        match req.call() {
78            Ok(resp) => resp.into_json().ok(),
79            Err(ureq::Error::Status(code, _)) => {
80                if code == 403 || code == 429 {
81                    eprintln!("  warning: GitHub API rate limited (HTTP {code})");
82                }
83                None
84            }
85            Err(_) => None,
86        }
87    }
88
89    /// Resolve the GitHub login + avatar for the author of a commit.
90    fn commit_author(&self, slug: &str, sha: &str) -> Option<(String, String)> {
91        let v = self.get_json(&format!(
92            "https://api.github.com/repos/{slug}/commits/{sha}"
93        ))?;
94        let author = v.get("author")?;
95        let login = author.get("login")?.as_str()?.to_string();
96        let avatar = author
97            .get("avatar_url")
98            .and_then(|a| a.as_str())
99            .map(String::from)
100            .unwrap_or_else(|| format!("https://avatars.githubusercontent.com/{login}"));
101        Some((login, avatar))
102    }
103
104    /// Fetch a user profile: (display name, company/affiliation).
105    fn user_profile(&self, login: &str) -> Profile {
106        let Some(v) = self.get_json(&format!("https://api.github.com/users/{login}")) else {
107            return (None, None);
108        };
109        let name = v
110            .get("name")
111            .and_then(|n| n.as_str())
112            .map(str::trim)
113            .filter(|n| !n.is_empty())
114            .map(String::from);
115        let company = v
116            .get("company")
117            .and_then(|c| c.as_str())
118            .and_then(normalize_company);
119        (name, company)
120    }
121
122    /// List every repository under a GitHub org or user, returning `owner/repo`
123    /// slugs. Forks are skipped; archived repos are kept. Tries the org endpoint
124    /// first and falls back to the user endpoint, so it works for either kind of
125    /// account. Returns an empty vec if the owner can't be listed.
126    pub fn list_owner_repos(&self, owner: &str) -> Vec<String> {
127        for kind in ["orgs", "users"] {
128            let mut slugs = Vec::new();
129            let mut page = 1;
130            let mut reached = false;
131            loop {
132                let url =
133                    format!("https://api.github.com/{kind}/{owner}/repos?per_page=100&page={page}");
134                let Some(v) = self.get_json(&url) else { break };
135                reached = true;
136                let Some(arr) = v.as_array() else { break };
137                let count = arr.len();
138                for repo in arr {
139                    if repo.get("fork").and_then(|f| f.as_bool()).unwrap_or(false) {
140                        continue;
141                    }
142                    if let Some(full) = repo.get("full_name").and_then(|n| n.as_str()) {
143                        slugs.push(full.to_string());
144                    }
145                }
146                if count < 100 {
147                    break;
148                }
149                page += 1;
150            }
151            if reached && !slugs.is_empty() {
152                return slugs;
153            }
154        }
155        Vec::new()
156    }
157
158    pub fn fetch_bytes(&self, url: &str) -> Option<(Vec<u8>, String)> {
159        let resp = self.agent.get(url).call().ok()?;
160        let ct = resp.content_type().to_string();
161        let mut buf = Vec::new();
162        use std::io::Read;
163        resp.into_reader()
164            .take(4 * 1024 * 1024)
165            .read_to_end(&mut buf)
166            .ok()?;
167        Some((buf, ct))
168    }
169}
170
171/// Fill in `login` / `avatar_url` on clusters. Noreply emails resolve
172/// offline; the rest are looked up via the commits API (one representative
173/// commit per cluster), in parallel. `source_slugs` maps a commit's `src`
174/// index to the `owner/repo` slug to query (or `None` for non-GitHub sources).
175pub fn enrich_clusters(
176    clusters: &mut [Cluster],
177    commits: &[Commit],
178    source_slugs: &[Option<String>],
179    client: &GhClient,
180    caches: &mut Caches,
181    verbose: bool,
182) {
183    let slug_of = |c: &Commit| -> Option<&str> {
184        source_slugs.get(c.src as usize).and_then(|s| s.as_deref())
185    };
186    let mut need_api: Vec<(usize, String, String)> = Vec::new();
187    for (i, cl) in clusters.iter_mut().enumerate() {
188        for email in &cl.emails {
189            if let Some((id, login)) = parse_noreply(email) {
190                cl.avatar_url = Some(match id {
191                    Some(id) => format!("https://avatars.githubusercontent.com/u/{id}?v=4"),
192                    None => format!("https://avatars.githubusercontent.com/{login}"),
193                });
194                cl.login = Some(login);
195                break;
196            }
197        }
198        if cl.login.is_none() {
199            // Use the most recent commit that came from a GitHub source: old
200            // commits are more likely to have stale email → account mappings.
201            let rep = cl
202                .commit_idxs
203                .iter()
204                .filter(|&&i| slug_of(&commits[i]).is_some())
205                .max_by_key(|&&i| commits[i].ts);
206            if let Some(&idx) = rep {
207                let slug = slug_of(&commits[idx]).unwrap().to_string();
208                need_api.push((i, slug, commits[idx].sha.clone()));
209            }
210        }
211    }
212
213    // A commit's author never changes, so resolved SHAs are cached forever.
214    let mut from_cache = 0usize;
215    need_api.retain(|(idx, _, sha)| match caches.author(sha) {
216        Some(a) => {
217            clusters[*idx].login = Some(a.login);
218            clusters[*idx].avatar_url = Some(a.avatar_url);
219            from_cache += 1;
220            false
221        }
222        None => true,
223    });
224
225    if need_api.is_empty() || !client.has_token() {
226        if !need_api.is_empty() && verbose {
227            eprintln!(
228                "  no GitHub token found ({} identities left unresolved) — run `gh auth login` to enable lookups",
229                need_api.len()
230            );
231        }
232        if from_cache > 0 && verbose {
233            eprintln!("  resolved {from_cache} identities from cache");
234        }
235        return;
236    }
237
238    let cursor = AtomicUsize::new(0);
239    let results: Mutex<HashMap<usize, (String, String)>> = Mutex::new(HashMap::new());
240    let pb = crate::progress::bar(
241        "resolving identities",
242        need_api.len(),
243        verbose && std::io::stderr().is_terminal(),
244    );
245    std::thread::scope(|s| {
246        for _ in 0..THREADS.min(need_api.len()) {
247            s.spawn(|| loop {
248                let i = cursor.fetch_add(1, Ordering::Relaxed);
249                let Some((cluster_idx, slug, sha)) = need_api.get(i) else {
250                    break;
251                };
252                if let Some(found) = client.commit_author(slug, sha) {
253                    results.lock().unwrap().insert(*cluster_idx, found);
254                }
255                pb.inc(1);
256            });
257        }
258    });
259    pb.finish_and_clear();
260
261    let results = results.into_inner().unwrap();
262    let resolved = results.len();
263    // Map cluster index back to its representative SHA so resolved authors can
264    // be cached against the (immutable) commit.
265    let sha_of: HashMap<usize, &str> = need_api
266        .iter()
267        .map(|(idx, _, sha)| (*idx, sha.as_str()))
268        .collect();
269    for (idx, (login, avatar)) in results {
270        if let Some(sha) = sha_of.get(&idx) {
271            caches.put_author(sha.to_string(), login.clone(), avatar.clone());
272        }
273        clusters[idx].login = Some(login);
274        clusters[idx].avatar_url = Some(avatar);
275    }
276    if verbose {
277        eprintln!(
278            "  resolved {resolved}/{} identities via GitHub API{}",
279            need_api.len(),
280            if from_cache > 0 {
281                format!(" ({from_cache} more from cache)")
282            } else {
283                String::new()
284            }
285        );
286    }
287}
288
289/// Fetch a repository's description from the GitHub API, if it has one.
290pub fn fetch_repo_description(client: &GhClient, slug: &str) -> Option<String> {
291    let v = client.get_json(&format!("https://api.github.com/repos/{slug}"))?;
292    v.get("description")
293        .and_then(|d| d.as_str())
294        .map(str::trim)
295        .filter(|d| !d.is_empty())
296        .map(String::from)
297}
298
299/// Fetch a GitHub avatar (e.g. an org/owner) and return it as a data URI,
300/// caching the result by login and size.
301pub fn fetch_avatar(
302    client: &GhClient,
303    caches: &mut Caches,
304    login: &str,
305    size: u32,
306) -> Option<String> {
307    let key = format!("owner:{login}:{size}");
308    if let Some(data) = caches.avatar(&key) {
309        return Some(data);
310    }
311    let url = format!("https://avatars.githubusercontent.com/{login}?s={size}");
312    let (bytes, ct) = client.fetch_bytes(&url)?;
313    let ct = if ct.starts_with("image/") {
314        ct
315    } else {
316        "image/png".into()
317    };
318    let b64 = base64::engine::general_purpose::STANDARD.encode(&bytes);
319    let data = format!("data:{ct};base64,{b64}");
320    caches.put_avatar(key, data.clone());
321    Some(data)
322}
323
/// Turn the free-text GitHub `company` field into a usable group name.
///
/// Steps, in order:
/// 1. Keep only the text before the first multi-affiliation separator
///    (`" | "`, `" / "`, `";"`, `" · "`, `","`), so
///    "Seqera | SciLifeLab" → "Seqera".
/// 2. Drop leading `@` characters ("@seqeralabs" → "seqeralabs").
/// 3. Drop a trailing org mention ("QBiC @qbicsoftware" → "QBiC").
/// 4. Strip trailing `.`, `,`, `;`, `|`, `/` and surrounding whitespace.
///
/// Returns `None` when nothing is left or the result is longer than 60
/// characters.
pub fn normalize_company(raw: &str) -> Option<String> {
    const SEPARATORS: [&str; 5] = [" | ", " / ", ";", " · ", ","];
    const TRAILING_NOISE: [char; 5] = ['.', ',', ';', '|', '/'];
    const MAX_CHARS: usize = 60;

    // Multiple affiliations: each separator in turn cuts the text at its
    // first occurrence, leaving only the first affiliation.
    let first_affiliation = SEPARATORS
        .iter()
        .copied()
        .fold(raw.trim(), |rest, sep| rest.split(sep).next().unwrap_or(rest));

    let without_at = first_affiliation.trim().trim_start_matches('@').trim();
    // "Company @githuborg" → "Company".
    let without_org = without_at.split(" @").next().unwrap_or(without_at);
    let cleaned = without_org.trim().trim_end_matches(TRAILING_NOISE).trim();

    let usable = !cleaned.is_empty() && cleaned.chars().count() <= MAX_CHARS;
    usable.then(|| cleaned.to_string())
}
346
347/// Fetch GitHub user profiles for every resolved login: improves display
348/// names ("phue" → "Patrick Hüther") and yields company affiliations.
349pub fn fetch_profiles(
350    clusters: &mut [Cluster],
351    client: &GhClient,
352    caches: &mut Caches,
353    verbose: bool,
354) {
355    let mut logins: Vec<String> = clusters.iter().filter_map(|c| c.login.clone()).collect();
356    logins.sort();
357    logins.dedup();
358    if logins.is_empty() {
359        return;
360    }
361
362    // Cached profiles skip the API; only the misses are fetched.
363    let mut profiles: HashMap<String, Profile> = HashMap::new();
364    let mut to_fetch: Vec<String> = Vec::new();
365    for login in logins {
366        match caches.profile(&login) {
367            Some(p) => {
368                profiles.insert(login, p);
369            }
370            None => to_fetch.push(login),
371        }
372    }
373    let from_cache = profiles.len();
374
375    if !to_fetch.is_empty() && client.has_token() {
376        let cursor = AtomicUsize::new(0);
377        let results: Mutex<HashMap<String, Profile>> = Mutex::new(HashMap::new());
378        let pb = crate::progress::bar(
379            "fetching profiles",
380            to_fetch.len(),
381            verbose && std::io::stderr().is_terminal(),
382        );
383        std::thread::scope(|s| {
384            for _ in 0..THREADS.min(to_fetch.len()) {
385                s.spawn(|| loop {
386                    let i = cursor.fetch_add(1, Ordering::Relaxed);
387                    let Some(login) = to_fetch.get(i) else { break };
388                    let profile = client.user_profile(login);
389                    results.lock().unwrap().insert(login.clone(), profile);
390                    pb.inc(1);
391                });
392            }
393        });
394        pb.finish_and_clear();
395        for (login, (name, company)) in results.into_inner().unwrap() {
396            caches.put_profile(login.clone(), name.clone(), company.clone());
397            profiles.insert(login, (name, company));
398        }
399    }
400
401    let with_company = profiles.values().filter(|(_, c)| c.is_some()).count();
402    for cl in clusters.iter_mut() {
403        if let Some(login) = &cl.login {
404            if let Some((name, company)) = profiles.get(login) {
405                cl.profile_name = name.clone();
406                cl.affiliation = company.clone();
407            }
408        }
409    }
410    if verbose {
411        eprintln!(
412            "  fetched {} profiles ({} with an affiliation, {from_cache} from cache)",
413            profiles.len(),
414            with_company,
415        );
416    }
417}
418
419/// Replace remote avatar URLs with embedded data URIs so the outputs are
420/// fully self-contained (and render in places that block remote images).
421pub fn embed_avatars(
422    contributors: &mut [Contributor],
423    client: &GhClient,
424    caches: &mut Caches,
425    size: u32,
426    verbose: bool,
427) {
428    let mut urls: Vec<String> = Vec::new();
429    for c in contributors.iter() {
430        if let Some(u) = &c.avatar {
431            if !u.starts_with("data:") && !urls.contains(u) {
432                urls.push(u.clone());
433            }
434        }
435    }
436    if urls.is_empty() {
437        return;
438    }
439
440    // Cache by avatar URL and size. Cached images skip the download.
441    let key_of = |u: &str| format!("{u}|{size}");
442    let mut embedded: HashMap<String, String> = HashMap::new();
443    let mut to_fetch: Vec<String> = Vec::new();
444    for u in urls {
445        match caches.avatar(&key_of(&u)) {
446            Some(data) => {
447                embedded.insert(u, data);
448            }
449            None => to_fetch.push(u),
450        }
451    }
452    let from_cache = embedded.len();
453
454    if !to_fetch.is_empty() {
455        let sized: Vec<String> = to_fetch
456            .iter()
457            .map(|u| {
458                if u.contains('?') {
459                    format!("{u}&s={size}")
460                } else {
461                    format!("{u}?s={size}")
462                }
463            })
464            .collect();
465        let cursor = AtomicUsize::new(0);
466        let results: Mutex<HashMap<String, String>> = Mutex::new(HashMap::new());
467        let pb = crate::progress::bar(
468            "embedding avatars",
469            to_fetch.len(),
470            verbose && std::io::stderr().is_terminal(),
471        );
472        std::thread::scope(|s| {
473            for _ in 0..THREADS.min(to_fetch.len()) {
474                s.spawn(|| loop {
475                    let i = cursor.fetch_add(1, Ordering::Relaxed);
476                    let (Some(orig), Some(fetch_url)) = (to_fetch.get(i), sized.get(i)) else {
477                        break;
478                    };
479                    if let Some((bytes, ct)) = client.fetch_bytes(fetch_url) {
480                        let ct = if ct.starts_with("image/") {
481                            ct
482                        } else {
483                            "image/png".into()
484                        };
485                        let b64 = base64::engine::general_purpose::STANDARD.encode(&bytes);
486                        results
487                            .lock()
488                            .unwrap()
489                            .insert(orig.clone(), format!("data:{ct};base64,{b64}"));
490                    }
491                    pb.inc(1);
492                });
493            }
494        });
495        pb.finish_and_clear();
496        for (url, data) in results.into_inner().unwrap() {
497            caches.put_avatar(key_of(&url), data.clone());
498            embedded.insert(url, data);
499        }
500    }
501
502    let n = embedded.len();
503    for c in contributors.iter_mut() {
504        if let Some(u) = &c.avatar {
505            if let Some(data) = embedded.get(u) {
506                c.avatar = Some(data.clone());
507            }
508        }
509    }
510    if verbose {
511        eprintln!("  embedded {n} avatars as data URIs ({from_cache} from cache)");
512    }
513}