Skip to main content

contributor_graphs/
github.rs

1use crate::identity::Cluster;
2use crate::model::{Commit, Contributor};
3use base64::Engine;
4use std::collections::HashMap;
5use std::process::Command;
6use std::sync::atomic::{AtomicUsize, Ordering};
7use std::sync::Mutex;
8use std::time::Duration;
9
/// Upper bound on the number of worker threads spawned for one batch of
/// HTTP requests; fewer are started when the batch has fewer items.
const THREADS: usize = 8;
11
/// A GitHub user profile reduced to `(display name, affiliation)`.
///
/// Either element is `None` when the corresponding field is absent or blank,
/// and both are `None` when the profile request itself failed.
type Profile = (Option<String>, Option<String>);
14
/// HTTP client used for GitHub API lookups and avatar downloads.
pub struct GhClient {
    /// `ureq` agent reused for every request made by this client.
    agent: ureq::Agent,
    /// Optional API token; when present, `get_json` sends it as a
    /// `Bearer` credential in the `Authorization` header.
    token: Option<String>,
}
19
/// Locate a GitHub token.
///
/// The environment variables `GITHUB_TOKEN` and `GH_TOKEN` are consulted in
/// that order; the first one whose trimmed value is non-empty wins. If
/// neither yields a token, `gh auth token` is run and its trimmed standard
/// output is used, provided the command exits successfully and prints
/// something. Returns `None` when no source produces a token.
pub fn find_token() -> Option<String> {
    let from_env = ["GITHUB_TOKEN", "GH_TOKEN"].iter().find_map(|name| {
        let value = std::env::var(*name).ok()?;
        let value = value.trim();
        if value.is_empty() {
            None
        } else {
            Some(value.to_string())
        }
    });
    if from_env.is_some() {
        return from_env;
    }

    // Fall back to the GitHub CLI; a missing binary or failed run means "no token".
    let output = Command::new("gh").args(["auth", "token"]).output().ok()?;
    if !output.status.success() {
        return None;
    }
    let token = String::from_utf8_lossy(&output.stdout).trim().to_string();
    if token.is_empty() {
        None
    } else {
        Some(token)
    }
}
39
/// Parse a GitHub noreply address into `(numeric user id, login)`.
///
/// Two forms are recognised:
/// - `12345+login@users.noreply.github.com` → `(Some(12345), "login")`
/// - the older `login@users.noreply.github.com` → `(None, "login")`
///
/// If the part before `+` is not a valid `u64`, the id is `None` but the
/// login is still returned.
///
/// Returns `None` when `email` does not end with the noreply domain, or when
/// the login part is empty (`@users.noreply.github.com`, `123+@…`): an empty
/// login cannot identify an account or form an avatar URL.
pub fn parse_noreply(email: &str) -> Option<(Option<u64>, String)> {
    let local = email.strip_suffix("@users.noreply.github.com")?;
    let (id, login) = match local.split_once('+') {
        Some((id, login)) => (id.parse::<u64>().ok(), login),
        None => (None, local),
    };
    if login.is_empty() {
        return None;
    }
    Some((id, login.to_string()))
}
52
53impl GhClient {
54    pub fn new(token: Option<String>) -> Self {
55        let agent = ureq::AgentBuilder::new()
56            .timeout(Duration::from_secs(30))
57            .user_agent("contributor-graphs (https://github.com/ewels/contributor-graphs)")
58            .build();
59        GhClient { agent, token }
60    }
61
62    pub fn has_token(&self) -> bool {
63        self.token.is_some()
64    }
65
66    fn get_json(&self, url: &str) -> Option<serde_json::Value> {
67        let mut req = self
68            .agent
69            .get(url)
70            .set("Accept", "application/vnd.github+json")
71            .set("X-GitHub-Api-Version", "2022-11-28");
72        if let Some(t) = &self.token {
73            req = req.set("Authorization", &format!("Bearer {t}"));
74        }
75        match req.call() {
76            Ok(resp) => resp.into_json().ok(),
77            Err(ureq::Error::Status(code, _)) => {
78                if code == 403 || code == 429 {
79                    eprintln!("  warning: GitHub API rate limited (HTTP {code})");
80                }
81                None
82            }
83            Err(_) => None,
84        }
85    }
86
87    /// Resolve the GitHub login + avatar for the author of a commit.
88    fn commit_author(&self, slug: &str, sha: &str) -> Option<(String, String)> {
89        let v = self.get_json(&format!(
90            "https://api.github.com/repos/{slug}/commits/{sha}"
91        ))?;
92        let author = v.get("author")?;
93        let login = author.get("login")?.as_str()?.to_string();
94        let avatar = author
95            .get("avatar_url")
96            .and_then(|a| a.as_str())
97            .map(String::from)
98            .unwrap_or_else(|| format!("https://avatars.githubusercontent.com/{login}"));
99        Some((login, avatar))
100    }
101
102    /// Fetch a user profile: (display name, company/affiliation).
103    fn user_profile(&self, login: &str) -> Profile {
104        let Some(v) = self.get_json(&format!("https://api.github.com/users/{login}")) else {
105            return (None, None);
106        };
107        let name = v
108            .get("name")
109            .and_then(|n| n.as_str())
110            .map(str::trim)
111            .filter(|n| !n.is_empty())
112            .map(String::from);
113        let company = v
114            .get("company")
115            .and_then(|c| c.as_str())
116            .and_then(normalize_company);
117        (name, company)
118    }
119
120    pub fn fetch_bytes(&self, url: &str) -> Option<(Vec<u8>, String)> {
121        let resp = self.agent.get(url).call().ok()?;
122        let ct = resp.content_type().to_string();
123        let mut buf = Vec::new();
124        use std::io::Read;
125        resp.into_reader()
126            .take(4 * 1024 * 1024)
127            .read_to_end(&mut buf)
128            .ok()?;
129        Some((buf, ct))
130    }
131}
132
133/// Fill in `login` / `avatar_url` on clusters. Noreply emails resolve
134/// offline; the rest are looked up via the commits API (one representative
135/// commit per cluster), in parallel.
136pub fn enrich_clusters(
137    clusters: &mut [Cluster],
138    commits: &[Commit],
139    slug: &str,
140    client: &GhClient,
141    verbose: bool,
142) {
143    let mut need_api: Vec<(usize, String)> = Vec::new();
144    for (i, cl) in clusters.iter_mut().enumerate() {
145        for email in &cl.emails {
146            if let Some((id, login)) = parse_noreply(email) {
147                cl.avatar_url = Some(match id {
148                    Some(id) => format!("https://avatars.githubusercontent.com/u/{id}?v=4"),
149                    None => format!("https://avatars.githubusercontent.com/{login}"),
150                });
151                cl.login = Some(login);
152                break;
153            }
154        }
155        if cl.login.is_none() {
156            // Use the most recent commit: old commits are more likely to have
157            // stale email → account mappings.
158            if let Some(&idx) = cl.commit_idxs.iter().max_by_key(|&&i| commits[i].ts) {
159                need_api.push((i, commits[idx].sha.clone()));
160            }
161        }
162    }
163
164    if need_api.is_empty() || !client.has_token() {
165        if !need_api.is_empty() && verbose {
166            eprintln!(
167                "  no GitHub token found ({} identities left unresolved) — run `gh auth login` to enable lookups",
168                need_api.len()
169            );
170        }
171        return;
172    }
173
174    let cursor = AtomicUsize::new(0);
175    let results: Mutex<HashMap<usize, (String, String)>> = Mutex::new(HashMap::new());
176    std::thread::scope(|s| {
177        for _ in 0..THREADS.min(need_api.len()) {
178            s.spawn(|| loop {
179                let i = cursor.fetch_add(1, Ordering::Relaxed);
180                let Some((cluster_idx, sha)) = need_api.get(i) else {
181                    break;
182                };
183                if let Some(found) = client.commit_author(slug, sha) {
184                    results.lock().unwrap().insert(*cluster_idx, found);
185                }
186            });
187        }
188    });
189
190    let results = results.into_inner().unwrap();
191    let resolved = results.len();
192    for (idx, (login, avatar)) in results {
193        clusters[idx].login = Some(login);
194        clusters[idx].avatar_url = Some(avatar);
195    }
196    if verbose {
197        eprintln!(
198            "  resolved {resolved}/{} identities via GitHub API",
199            need_api.len()
200        );
201    }
202}
203
204/// Fetch a GitHub avatar (e.g. an org/owner) and return it as a data URI.
205pub fn fetch_avatar(client: &GhClient, login: &str, size: u32) -> Option<String> {
206    let url = format!("https://avatars.githubusercontent.com/{login}?s={size}");
207    let (bytes, ct) = client.fetch_bytes(&url)?;
208    let ct = if ct.starts_with("image/") {
209        ct
210    } else {
211        "image/png".into()
212    };
213    let b64 = base64::engine::general_purpose::STANDARD.encode(&bytes);
214    Some(format!("data:{ct};base64,{b64}"))
215}
216
/// Turn the free-text GitHub `company` field into a usable group name.
///
/// Steps, in order:
/// 1. keep only the first affiliation when several are listed
///    ("Seqera | SciLifeLab" → "Seqera");
/// 2. drop leading `@` characters ("@seqeralabs" → "seqeralabs");
/// 3. drop a trailing org handle ("QBiC @qbicsoftware" → "QBiC");
/// 4. strip trailing punctuation and surrounding whitespace.
///
/// Returns `None` when nothing is left or the result exceeds 60 characters.
pub fn normalize_company(raw: &str) -> Option<String> {
    const SEPARATORS: [&str; 5] = [" | ", " / ", ";", " · ", ","];
    const TRAILING: [char; 5] = ['.', ',', ';', '|', '/'];

    // Each separator is applied to what the previous ones left over.
    let primary = SEPARATORS.iter().fold(raw.trim(), |rest, sep| {
        rest.split_once(*sep).map_or(rest, |(head, _)| head)
    });
    let primary = primary.trim().trim_start_matches('@').trim();
    let name = primary.split_once(" @").map_or(primary, |(head, _)| head);
    let name = name.trim().trim_end_matches(TRAILING).trim();

    let usable = !name.is_empty() && name.chars().count() <= 60;
    if usable {
        Some(name.to_string())
    } else {
        None
    }
}
239
240/// Fetch GitHub user profiles for every resolved login: improves display
241/// names ("phue" → "Patrick Hüther") and yields company affiliations.
242pub fn fetch_profiles(clusters: &mut [Cluster], client: &GhClient, verbose: bool) {
243    if !client.has_token() {
244        return;
245    }
246    let mut logins: Vec<String> = clusters.iter().filter_map(|c| c.login.clone()).collect();
247    logins.sort();
248    logins.dedup();
249    if logins.is_empty() {
250        return;
251    }
252
253    let cursor = AtomicUsize::new(0);
254    let results: Mutex<HashMap<String, Profile>> = Mutex::new(HashMap::new());
255    std::thread::scope(|s| {
256        for _ in 0..THREADS.min(logins.len()) {
257            s.spawn(|| loop {
258                let i = cursor.fetch_add(1, Ordering::Relaxed);
259                let Some(login) = logins.get(i) else { break };
260                let profile = client.user_profile(login);
261                results.lock().unwrap().insert(login.clone(), profile);
262            });
263        }
264    });
265
266    let results = results.into_inner().unwrap();
267    let with_company = results.values().filter(|(_, c)| c.is_some()).count();
268    for cl in clusters.iter_mut() {
269        if let Some(login) = &cl.login {
270            if let Some((name, company)) = results.get(login) {
271                cl.profile_name = name.clone();
272                cl.affiliation = company.clone();
273            }
274        }
275    }
276    if verbose {
277        eprintln!(
278            "  fetched {} profiles ({} with an affiliation)",
279            results.len(),
280            with_company
281        );
282    }
283}
284
285/// Replace remote avatar URLs with embedded data URIs so the outputs are
286/// fully self-contained (and render in places that block remote images).
287pub fn embed_avatars(
288    contributors: &mut [Contributor],
289    client: &GhClient,
290    size: u32,
291    verbose: bool,
292) {
293    let mut urls: Vec<String> = Vec::new();
294    for c in contributors.iter() {
295        if let Some(u) = &c.avatar {
296            if !u.starts_with("data:") && !urls.contains(u) {
297                urls.push(u.clone());
298            }
299        }
300    }
301    if urls.is_empty() {
302        return;
303    }
304
305    let sized: Vec<String> = urls
306        .iter()
307        .map(|u| {
308            if u.contains('?') {
309                format!("{u}&s={size}")
310            } else {
311                format!("{u}?s={size}")
312            }
313        })
314        .collect();
315
316    let cursor = AtomicUsize::new(0);
317    let results: Mutex<HashMap<String, String>> = Mutex::new(HashMap::new());
318    std::thread::scope(|s| {
319        for _ in 0..THREADS.min(urls.len()) {
320            s.spawn(|| loop {
321                let i = cursor.fetch_add(1, Ordering::Relaxed);
322                let (Some(orig), Some(fetch_url)) = (urls.get(i), sized.get(i)) else {
323                    break;
324                };
325                if let Some((bytes, ct)) = client.fetch_bytes(fetch_url) {
326                    let ct = if ct.starts_with("image/") {
327                        ct
328                    } else {
329                        "image/png".into()
330                    };
331                    let b64 = base64::engine::general_purpose::STANDARD.encode(&bytes);
332                    results
333                        .lock()
334                        .unwrap()
335                        .insert(orig.clone(), format!("data:{ct};base64,{b64}"));
336                }
337            });
338        }
339    });
340
341    let results = results.into_inner().unwrap();
342    let n = results.len();
343    for c in contributors.iter_mut() {
344        if let Some(u) = &c.avatar {
345            if let Some(data) = results.get(u) {
346                c.avatar = Some(data.clone());
347            }
348        }
349    }
350    if verbose {
351        eprintln!("  embedded {n} avatars as data URIs");
352    }
353}