Skip to main content

contributor_graphs/
github.rs

1use crate::cache::Caches;
2use crate::identity::Cluster;
3use crate::model::{Commit, Contributor};
4use base64::Engine;
5use std::collections::HashMap;
6use std::process::Command;
7use std::sync::atomic::{AtomicUsize, Ordering};
8use std::sync::Mutex;
9use std::time::Duration;
10
11const THREADS: usize = 8; // upper bound on worker threads spawned for each parallel fetch in this file
12
13/// A GitHub user profile reduced to (display name, affiliation).
14type Profile = (Option<String>, Option<String>); // either element is `None` when the value is missing or the lookup failed
15
16pub struct GhClient { // HTTP client used for GitHub API calls and avatar downloads
17    agent: ureq::Agent, // built in `new` with a 30 s timeout and the project's user agent
18    token: Option<String>, // sent as a bearer token by `get_json` when present
19}
20
/// Locate a GitHub API token.
///
/// `$GITHUB_TOKEN` is consulted first, then `$GH_TOKEN`; the first one whose
/// trimmed value is non-empty wins. If neither is usable, `gh auth token` is
/// run and its trimmed stdout is used, provided the command exits
/// successfully and prints something. Returns `None` when no source yields a
/// token, including when `gh` cannot be spawned.
pub fn find_token() -> Option<String> {
    let from_env = ["GITHUB_TOKEN", "GH_TOKEN"]
        .iter()
        .filter_map(|name| std::env::var(name).ok())
        .map(|value| value.trim().to_string())
        .find(|value| !value.is_empty());
    if from_env.is_some() {
        return from_env;
    }

    // Fall back to the GitHub CLI's stored credentials.
    let output = Command::new("gh").args(["auth", "token"]).output().ok()?;
    if !output.status.success() {
        return None;
    }
    let token = String::from_utf8_lossy(&output.stdout).trim().to_string();
    if token.is_empty() {
        None
    } else {
        Some(token)
    }
}
40
/// Parse a GitHub noreply address into `(numeric user id, login)`.
///
/// Two shapes are recognised:
/// - `12345+login@users.noreply.github.com` → `(Some(12345), "login")`
/// - the older `login@users.noreply.github.com` → `(None, "login")`
///
/// The domain is compared case-insensitively, since mail domains are not
/// case-sensitive. If the part before `+` is not a valid `u64`, the id is
/// reported as `None` and the part after `+` is still used as the login.
///
/// Returns `None` when the address is not on the noreply domain or when the
/// login would be empty (e.g. `@users.noreply.github.com`), so callers never
/// receive an empty login.
pub fn parse_noreply(email: &str) -> Option<(Option<u64>, String)> {
    const NOREPLY_DOMAIN: &str = "users.noreply.github.com";

    let (local, domain) = email.rsplit_once('@')?;
    if !domain.eq_ignore_ascii_case(NOREPLY_DOMAIN) {
        return None;
    }
    let (id, login) = match local.split_once('+') {
        Some((id, login)) => (id.parse::<u64>().ok(), login),
        None => (None, local),
    };
    // An empty login cannot identify an account or form an avatar URL.
    if login.is_empty() {
        return None;
    }
    Some((id, login.to_string()))
}
53
54impl GhClient {
55    pub fn new(token: Option<String>) -> Self {
56        let agent = ureq::AgentBuilder::new()
57            .timeout(Duration::from_secs(30))
58            .user_agent("contributor-graphs (https://github.com/ewels/contributor-graphs)")
59            .build();
60        GhClient { agent, token }
61    }
62
63    pub fn has_token(&self) -> bool {
64        self.token.is_some()
65    }
66
67    fn get_json(&self, url: &str) -> Option<serde_json::Value> {
68        let mut req = self
69            .agent
70            .get(url)
71            .set("Accept", "application/vnd.github+json")
72            .set("X-GitHub-Api-Version", "2022-11-28");
73        if let Some(t) = &self.token {
74            req = req.set("Authorization", &format!("Bearer {t}"));
75        }
76        match req.call() {
77            Ok(resp) => resp.into_json().ok(),
78            Err(ureq::Error::Status(code, _)) => {
79                if code == 403 || code == 429 {
80                    eprintln!("  warning: GitHub API rate limited (HTTP {code})");
81                }
82                None
83            }
84            Err(_) => None,
85        }
86    }
87
88    /// Resolve the GitHub login + avatar for the author of a commit.
89    fn commit_author(&self, slug: &str, sha: &str) -> Option<(String, String)> {
90        let v = self.get_json(&format!(
91            "https://api.github.com/repos/{slug}/commits/{sha}"
92        ))?;
93        let author = v.get("author")?;
94        let login = author.get("login")?.as_str()?.to_string();
95        let avatar = author
96            .get("avatar_url")
97            .and_then(|a| a.as_str())
98            .map(String::from)
99            .unwrap_or_else(|| format!("https://avatars.githubusercontent.com/{login}"));
100        Some((login, avatar))
101    }
102
103    /// Fetch a user profile: (display name, company/affiliation).
104    fn user_profile(&self, login: &str) -> Profile {
105        let Some(v) = self.get_json(&format!("https://api.github.com/users/{login}")) else {
106            return (None, None);
107        };
108        let name = v
109            .get("name")
110            .and_then(|n| n.as_str())
111            .map(str::trim)
112            .filter(|n| !n.is_empty())
113            .map(String::from);
114        let company = v
115            .get("company")
116            .and_then(|c| c.as_str())
117            .and_then(normalize_company);
118        (name, company)
119    }
120
121    /// List every repository under a GitHub org or user, returning `owner/repo`
122    /// slugs. Forks are skipped; archived repos are kept. Tries the org endpoint
123    /// first and falls back to the user endpoint, so it works for either kind of
124    /// account. Returns an empty vec if the owner can't be listed.
125    pub fn list_owner_repos(&self, owner: &str) -> Vec<String> {
126        for kind in ["orgs", "users"] {
127            let mut slugs = Vec::new();
128            let mut page = 1;
129            let mut reached = false;
130            loop {
131                let url =
132                    format!("https://api.github.com/{kind}/{owner}/repos?per_page=100&page={page}");
133                let Some(v) = self.get_json(&url) else { break };
134                reached = true;
135                let Some(arr) = v.as_array() else { break };
136                let count = arr.len();
137                for repo in arr {
138                    if repo.get("fork").and_then(|f| f.as_bool()).unwrap_or(false) {
139                        continue;
140                    }
141                    if let Some(full) = repo.get("full_name").and_then(|n| n.as_str()) {
142                        slugs.push(full.to_string());
143                    }
144                }
145                if count < 100 {
146                    break;
147                }
148                page += 1;
149            }
150            if reached && !slugs.is_empty() {
151                return slugs;
152            }
153        }
154        Vec::new()
155    }
156
157    pub fn fetch_bytes(&self, url: &str) -> Option<(Vec<u8>, String)> {
158        let resp = self.agent.get(url).call().ok()?;
159        let ct = resp.content_type().to_string();
160        let mut buf = Vec::new();
161        use std::io::Read;
162        resp.into_reader()
163            .take(4 * 1024 * 1024)
164            .read_to_end(&mut buf)
165            .ok()?;
166        Some((buf, ct))
167    }
168}
169
170/// Fill in `login` / `avatar_url` on clusters. Noreply emails resolve
171/// offline; the rest are looked up via the commits API (one representative
172/// commit per cluster), in parallel. `source_slugs` maps a commit's `src`
173/// index to the `owner/repo` slug to query (or `None` for non-GitHub sources).
174pub fn enrich_clusters(
175    clusters: &mut [Cluster],
176    commits: &[Commit],
177    source_slugs: &[Option<String>],
178    client: &GhClient,
179    caches: &mut Caches,
180    verbose: bool,
181) {
182    let slug_of = |c: &Commit| -> Option<&str> {
183        source_slugs.get(c.src as usize).and_then(|s| s.as_deref())
184    };
185    let mut need_api: Vec<(usize, String, String)> = Vec::new();
186    for (i, cl) in clusters.iter_mut().enumerate() {
187        for email in &cl.emails {
188            if let Some((id, login)) = parse_noreply(email) {
189                cl.avatar_url = Some(match id {
190                    Some(id) => format!("https://avatars.githubusercontent.com/u/{id}?v=4"),
191                    None => format!("https://avatars.githubusercontent.com/{login}"),
192                });
193                cl.login = Some(login);
194                break;
195            }
196        }
197        if cl.login.is_none() {
198            // Use the most recent commit that came from a GitHub source: old
199            // commits are more likely to have stale email → account mappings.
200            let rep = cl
201                .commit_idxs
202                .iter()
203                .filter(|&&i| slug_of(&commits[i]).is_some())
204                .max_by_key(|&&i| commits[i].ts);
205            if let Some(&idx) = rep {
206                let slug = slug_of(&commits[idx]).unwrap().to_string();
207                need_api.push((i, slug, commits[idx].sha.clone()));
208            }
209        }
210    }
211
212    // A commit's author never changes, so resolved SHAs are cached forever.
213    let mut from_cache = 0usize;
214    need_api.retain(|(idx, _, sha)| match caches.author(sha) {
215        Some(a) => {
216            clusters[*idx].login = Some(a.login);
217            clusters[*idx].avatar_url = Some(a.avatar_url);
218            from_cache += 1;
219            false
220        }
221        None => true,
222    });
223
224    if need_api.is_empty() || !client.has_token() {
225        if !need_api.is_empty() && verbose {
226            eprintln!(
227                "  no GitHub token found ({} identities left unresolved) — run `gh auth login` to enable lookups",
228                need_api.len()
229            );
230        }
231        if from_cache > 0 && verbose {
232            eprintln!("  resolved {from_cache} identities from cache");
233        }
234        return;
235    }
236
237    let cursor = AtomicUsize::new(0);
238    let results: Mutex<HashMap<usize, (String, String)>> = Mutex::new(HashMap::new());
239    std::thread::scope(|s| {
240        for _ in 0..THREADS.min(need_api.len()) {
241            s.spawn(|| loop {
242                let i = cursor.fetch_add(1, Ordering::Relaxed);
243                let Some((cluster_idx, slug, sha)) = need_api.get(i) else {
244                    break;
245                };
246                if let Some(found) = client.commit_author(slug, sha) {
247                    results.lock().unwrap().insert(*cluster_idx, found);
248                }
249            });
250        }
251    });
252
253    let results = results.into_inner().unwrap();
254    let resolved = results.len();
255    // Map cluster index back to its representative SHA so resolved authors can
256    // be cached against the (immutable) commit.
257    let sha_of: HashMap<usize, &str> = need_api
258        .iter()
259        .map(|(idx, _, sha)| (*idx, sha.as_str()))
260        .collect();
261    for (idx, (login, avatar)) in results {
262        if let Some(sha) = sha_of.get(&idx) {
263            caches.put_author(sha.to_string(), login.clone(), avatar.clone());
264        }
265        clusters[idx].login = Some(login);
266        clusters[idx].avatar_url = Some(avatar);
267    }
268    if verbose {
269        eprintln!(
270            "  resolved {resolved}/{} identities via GitHub API{}",
271            need_api.len(),
272            if from_cache > 0 {
273                format!(" ({from_cache} more from cache)")
274            } else {
275                String::new()
276            }
277        );
278    }
279}
280
281/// Fetch a repository's description from the GitHub API, if it has one.
282pub fn fetch_repo_description(client: &GhClient, slug: &str) -> Option<String> {
283    let v = client.get_json(&format!("https://api.github.com/repos/{slug}"))?;
284    v.get("description")
285        .and_then(|d| d.as_str())
286        .map(str::trim)
287        .filter(|d| !d.is_empty())
288        .map(String::from)
289}
290
291/// Fetch a GitHub avatar (e.g. an org/owner) and return it as a data URI,
292/// caching the result by login and size.
293pub fn fetch_avatar(
294    client: &GhClient,
295    caches: &mut Caches,
296    login: &str,
297    size: u32,
298) -> Option<String> {
299    let key = format!("owner:{login}:{size}");
300    if let Some(data) = caches.avatar(&key) {
301        return Some(data);
302    }
303    let url = format!("https://avatars.githubusercontent.com/{login}?s={size}");
304    let (bytes, ct) = client.fetch_bytes(&url)?;
305    let ct = if ct.starts_with("image/") {
306        ct
307    } else {
308        "image/png".into()
309    };
310    let b64 = base64::engine::general_purpose::STANDARD.encode(&bytes);
311    let data = format!("data:{ct};base64,{b64}");
312    caches.put_avatar(key, data.clone());
313    Some(data)
314}
315
/// Turn the free-text GitHub `company` field into a usable group name.
///
/// Steps, in order:
/// 1. Keep only the first affiliation, cutting at each of ` | `, ` / `, `;`,
///    ` · ` and `,` in turn ("Seqera | SciLifeLab" → "Seqera").
/// 2. Strip leading `@` signs ("@seqeralabs" → "seqeralabs").
/// 3. Drop a trailing org mention ("QBiC @qbicsoftware" → "QBiC").
/// 4. Trim trailing `.`, `,`, `;`, `|`, `/` and surrounding whitespace.
///
/// Returns `None` when nothing is left or the result is longer than 60
/// characters.
pub fn normalize_company(raw: &str) -> Option<String> {
    const SEPARATORS: [&str; 5] = [" | ", " / ", ";", " · ", ","];
    const TRAILING_PUNCTUATION: [char; 5] = ['.', ',', ';', '|', '/'];
    const MAX_CHARS: usize = 60;

    let mut name = raw.trim();
    for sep in SEPARATORS {
        if let Some((head, _)) = name.split_once(sep) {
            name = head;
        }
    }

    let name = name.trim().trim_start_matches('@').trim();
    let name = name.split_once(" @").map_or(name, |(head, _)| head);
    let name = name.trim().trim_end_matches(TRAILING_PUNCTUATION).trim();

    let usable = !name.is_empty() && name.chars().count() <= MAX_CHARS;
    usable.then(|| name.to_string())
}
338
339/// Fetch GitHub user profiles for every resolved login: improves display
340/// names ("phue" → "Patrick Hüther") and yields company affiliations.
341pub fn fetch_profiles(
342    clusters: &mut [Cluster],
343    client: &GhClient,
344    caches: &mut Caches,
345    verbose: bool,
346) {
347    let mut logins: Vec<String> = clusters.iter().filter_map(|c| c.login.clone()).collect();
348    logins.sort();
349    logins.dedup();
350    if logins.is_empty() {
351        return;
352    }
353
354    // Cached profiles skip the API; only the misses are fetched.
355    let mut profiles: HashMap<String, Profile> = HashMap::new();
356    let mut to_fetch: Vec<String> = Vec::new();
357    for login in logins {
358        match caches.profile(&login) {
359            Some(p) => {
360                profiles.insert(login, p);
361            }
362            None => to_fetch.push(login),
363        }
364    }
365    let from_cache = profiles.len();
366
367    if !to_fetch.is_empty() && client.has_token() {
368        let cursor = AtomicUsize::new(0);
369        let results: Mutex<HashMap<String, Profile>> = Mutex::new(HashMap::new());
370        std::thread::scope(|s| {
371            for _ in 0..THREADS.min(to_fetch.len()) {
372                s.spawn(|| loop {
373                    let i = cursor.fetch_add(1, Ordering::Relaxed);
374                    let Some(login) = to_fetch.get(i) else { break };
375                    let profile = client.user_profile(login);
376                    results.lock().unwrap().insert(login.clone(), profile);
377                });
378            }
379        });
380        for (login, (name, company)) in results.into_inner().unwrap() {
381            caches.put_profile(login.clone(), name.clone(), company.clone());
382            profiles.insert(login, (name, company));
383        }
384    }
385
386    let with_company = profiles.values().filter(|(_, c)| c.is_some()).count();
387    for cl in clusters.iter_mut() {
388        if let Some(login) = &cl.login {
389            if let Some((name, company)) = profiles.get(login) {
390                cl.profile_name = name.clone();
391                cl.affiliation = company.clone();
392            }
393        }
394    }
395    if verbose {
396        eprintln!(
397            "  fetched {} profiles ({} with an affiliation, {from_cache} from cache)",
398            profiles.len(),
399            with_company,
400        );
401    }
402}
403
404/// Replace remote avatar URLs with embedded data URIs so the outputs are
405/// fully self-contained (and render in places that block remote images).
406pub fn embed_avatars(
407    contributors: &mut [Contributor],
408    client: &GhClient,
409    caches: &mut Caches,
410    size: u32,
411    verbose: bool,
412) {
413    let mut urls: Vec<String> = Vec::new();
414    for c in contributors.iter() {
415        if let Some(u) = &c.avatar {
416            if !u.starts_with("data:") && !urls.contains(u) {
417                urls.push(u.clone());
418            }
419        }
420    }
421    if urls.is_empty() {
422        return;
423    }
424
425    // Cache by avatar URL and size. Cached images skip the download.
426    let key_of = |u: &str| format!("{u}|{size}");
427    let mut embedded: HashMap<String, String> = HashMap::new();
428    let mut to_fetch: Vec<String> = Vec::new();
429    for u in urls {
430        match caches.avatar(&key_of(&u)) {
431            Some(data) => {
432                embedded.insert(u, data);
433            }
434            None => to_fetch.push(u),
435        }
436    }
437    let from_cache = embedded.len();
438
439    if !to_fetch.is_empty() {
440        let sized: Vec<String> = to_fetch
441            .iter()
442            .map(|u| {
443                if u.contains('?') {
444                    format!("{u}&s={size}")
445                } else {
446                    format!("{u}?s={size}")
447                }
448            })
449            .collect();
450        let cursor = AtomicUsize::new(0);
451        let results: Mutex<HashMap<String, String>> = Mutex::new(HashMap::new());
452        std::thread::scope(|s| {
453            for _ in 0..THREADS.min(to_fetch.len()) {
454                s.spawn(|| loop {
455                    let i = cursor.fetch_add(1, Ordering::Relaxed);
456                    let (Some(orig), Some(fetch_url)) = (to_fetch.get(i), sized.get(i)) else {
457                        break;
458                    };
459                    if let Some((bytes, ct)) = client.fetch_bytes(fetch_url) {
460                        let ct = if ct.starts_with("image/") {
461                            ct
462                        } else {
463                            "image/png".into()
464                        };
465                        let b64 = base64::engine::general_purpose::STANDARD.encode(&bytes);
466                        results
467                            .lock()
468                            .unwrap()
469                            .insert(orig.clone(), format!("data:{ct};base64,{b64}"));
470                    }
471                });
472            }
473        });
474        for (url, data) in results.into_inner().unwrap() {
475            caches.put_avatar(key_of(&url), data.clone());
476            embedded.insert(url, data);
477        }
478    }
479
480    let n = embedded.len();
481    for c in contributors.iter_mut() {
482        if let Some(u) = &c.avatar {
483            if let Some(data) = embedded.get(u) {
484                c.avatar = Some(data.clone());
485            }
486        }
487    }
488    if verbose {
489        eprintln!("  embedded {n} avatars as data URIs ({from_cache} from cache)");
490    }
491}