1use crate::cache::Caches;
2use crate::identity::Cluster;
3use crate::model::{Commit, Contributor};
4use base64::Engine;
5use std::collections::HashMap;
6use std::process::Command;
7use std::sync::atomic::{AtomicUsize, Ordering};
8use std::sync::Mutex;
9use std::time::Duration;
10
/// Upper bound on the number of worker threads spawned for a batch of HTTP lookups.
const THREADS: usize = 8;

/// A user's `(display name, company)` pair as read from a profile; either part may be absent.
type Profile = (Option<String>, Option<String>);
15
/// Small HTTP client for the GitHub REST API and avatar downloads.
pub struct GhClient {
    /// Agent through which every request of this client is issued.
    agent: ureq::Agent,
    /// Optional API token; when present it is sent as a bearer credential on JSON API calls.
    token: Option<String>,
}
20
/// Looks for a GitHub token, first in the environment and then via the `gh` CLI.
///
/// `GITHUB_TOKEN` is consulted before `GH_TOKEN`; a variable that is unset or
/// blank after trimming is skipped. If neither yields a token, `gh auth token`
/// is run and its trimmed standard output is used when the command succeeds.
/// Returns `None` when no non-empty token can be obtained.
pub fn find_token() -> Option<String> {
    /// Trims `raw` and returns it as an owned string unless nothing is left.
    fn non_empty(raw: &str) -> Option<String> {
        let trimmed = raw.trim();
        if trimmed.is_empty() {
            None
        } else {
            Some(trimmed.to_string())
        }
    }

    let from_env = ["GITHUB_TOKEN", "GH_TOKEN"]
        .iter()
        .find_map(|&name| std::env::var(name).ok().and_then(|value| non_empty(&value)));

    // Only shell out to `gh` when the environment did not provide a token.
    from_env.or_else(|| {
        let output = Command::new("gh").args(["auth", "token"]).output().ok()?;
        if !output.status.success() {
            return None;
        }
        non_empty(&String::from_utf8_lossy(&output.stdout))
    })
}
40
/// Extracts the GitHub identity encoded in a `users.noreply.github.com` address.
///
/// Two forms are recognised:
/// * `<id>+<login>@users.noreply.github.com` yields `(Some(id), login)`;
///   when the part before `+` is not a valid `u64` the id is `None`.
/// * `<login>@users.noreply.github.com` yields `(None, login)`.
///
/// Returns `None` for any other address, and also when the login part is
/// empty, so callers never receive an empty login.
pub fn parse_noreply(email: &str) -> Option<(Option<u64>, String)> {
    let local = email.strip_suffix("@users.noreply.github.com")?;
    let (id, login) = match local.split_once('+') {
        Some((id, login)) => (id.parse::<u64>().ok(), login),
        None => (None, local),
    };
    // An empty login cannot identify an account; treat it as "not a noreply address".
    if login.is_empty() {
        return None;
    }
    Some((id, login.to_string()))
}
53
54impl GhClient {
55 pub fn new(token: Option<String>) -> Self {
56 let agent = ureq::AgentBuilder::new()
57 .timeout(Duration::from_secs(30))
58 .user_agent("contributor-graphs (https://github.com/ewels/contributor-graphs)")
59 .build();
60 GhClient { agent, token }
61 }
62
63 pub fn has_token(&self) -> bool {
64 self.token.is_some()
65 }
66
67 fn get_json(&self, url: &str) -> Option<serde_json::Value> {
68 let mut req = self
69 .agent
70 .get(url)
71 .set("Accept", "application/vnd.github+json")
72 .set("X-GitHub-Api-Version", "2022-11-28");
73 if let Some(t) = &self.token {
74 req = req.set("Authorization", &format!("Bearer {t}"));
75 }
76 match req.call() {
77 Ok(resp) => resp.into_json().ok(),
78 Err(ureq::Error::Status(code, _)) => {
79 if code == 403 || code == 429 {
80 eprintln!(" warning: GitHub API rate limited (HTTP {code})");
81 }
82 None
83 }
84 Err(_) => None,
85 }
86 }
87
88 fn commit_author(&self, slug: &str, sha: &str) -> Option<(String, String)> {
90 let v = self.get_json(&format!(
91 "https://api.github.com/repos/{slug}/commits/{sha}"
92 ))?;
93 let author = v.get("author")?;
94 let login = author.get("login")?.as_str()?.to_string();
95 let avatar = author
96 .get("avatar_url")
97 .and_then(|a| a.as_str())
98 .map(String::from)
99 .unwrap_or_else(|| format!("https://avatars.githubusercontent.com/{login}"));
100 Some((login, avatar))
101 }
102
103 fn user_profile(&self, login: &str) -> Profile {
105 let Some(v) = self.get_json(&format!("https://api.github.com/users/{login}")) else {
106 return (None, None);
107 };
108 let name = v
109 .get("name")
110 .and_then(|n| n.as_str())
111 .map(str::trim)
112 .filter(|n| !n.is_empty())
113 .map(String::from);
114 let company = v
115 .get("company")
116 .and_then(|c| c.as_str())
117 .and_then(normalize_company);
118 (name, company)
119 }
120
121 pub fn list_owner_repos(&self, owner: &str) -> Vec<String> {
126 for kind in ["orgs", "users"] {
127 let mut slugs = Vec::new();
128 let mut page = 1;
129 let mut reached = false;
130 loop {
131 let url =
132 format!("https://api.github.com/{kind}/{owner}/repos?per_page=100&page={page}");
133 let Some(v) = self.get_json(&url) else { break };
134 reached = true;
135 let Some(arr) = v.as_array() else { break };
136 let count = arr.len();
137 for repo in arr {
138 if repo.get("fork").and_then(|f| f.as_bool()).unwrap_or(false) {
139 continue;
140 }
141 if let Some(full) = repo.get("full_name").and_then(|n| n.as_str()) {
142 slugs.push(full.to_string());
143 }
144 }
145 if count < 100 {
146 break;
147 }
148 page += 1;
149 }
150 if reached && !slugs.is_empty() {
151 return slugs;
152 }
153 }
154 Vec::new()
155 }
156
157 pub fn fetch_bytes(&self, url: &str) -> Option<(Vec<u8>, String)> {
158 let resp = self.agent.get(url).call().ok()?;
159 let ct = resp.content_type().to_string();
160 let mut buf = Vec::new();
161 use std::io::Read;
162 resp.into_reader()
163 .take(4 * 1024 * 1024)
164 .read_to_end(&mut buf)
165 .ok()?;
166 Some((buf, ct))
167 }
168}
169
170pub fn enrich_clusters(
175 clusters: &mut [Cluster],
176 commits: &[Commit],
177 source_slugs: &[Option<String>],
178 client: &GhClient,
179 caches: &mut Caches,
180 verbose: bool,
181) {
182 let slug_of = |c: &Commit| -> Option<&str> {
183 source_slugs.get(c.src as usize).and_then(|s| s.as_deref())
184 };
185 let mut need_api: Vec<(usize, String, String)> = Vec::new();
186 for (i, cl) in clusters.iter_mut().enumerate() {
187 for email in &cl.emails {
188 if let Some((id, login)) = parse_noreply(email) {
189 cl.avatar_url = Some(match id {
190 Some(id) => format!("https://avatars.githubusercontent.com/u/{id}?v=4"),
191 None => format!("https://avatars.githubusercontent.com/{login}"),
192 });
193 cl.login = Some(login);
194 break;
195 }
196 }
197 if cl.login.is_none() {
198 let rep = cl
201 .commit_idxs
202 .iter()
203 .filter(|&&i| slug_of(&commits[i]).is_some())
204 .max_by_key(|&&i| commits[i].ts);
205 if let Some(&idx) = rep {
206 let slug = slug_of(&commits[idx]).unwrap().to_string();
207 need_api.push((i, slug, commits[idx].sha.clone()));
208 }
209 }
210 }
211
212 let mut from_cache = 0usize;
214 need_api.retain(|(idx, _, sha)| match caches.author(sha) {
215 Some(a) => {
216 clusters[*idx].login = Some(a.login);
217 clusters[*idx].avatar_url = Some(a.avatar_url);
218 from_cache += 1;
219 false
220 }
221 None => true,
222 });
223
224 if need_api.is_empty() || !client.has_token() {
225 if !need_api.is_empty() && verbose {
226 eprintln!(
227 " no GitHub token found ({} identities left unresolved) — run `gh auth login` to enable lookups",
228 need_api.len()
229 );
230 }
231 if from_cache > 0 && verbose {
232 eprintln!(" resolved {from_cache} identities from cache");
233 }
234 return;
235 }
236
237 let cursor = AtomicUsize::new(0);
238 let results: Mutex<HashMap<usize, (String, String)>> = Mutex::new(HashMap::new());
239 std::thread::scope(|s| {
240 for _ in 0..THREADS.min(need_api.len()) {
241 s.spawn(|| loop {
242 let i = cursor.fetch_add(1, Ordering::Relaxed);
243 let Some((cluster_idx, slug, sha)) = need_api.get(i) else {
244 break;
245 };
246 if let Some(found) = client.commit_author(slug, sha) {
247 results.lock().unwrap().insert(*cluster_idx, found);
248 }
249 });
250 }
251 });
252
253 let results = results.into_inner().unwrap();
254 let resolved = results.len();
255 let sha_of: HashMap<usize, &str> = need_api
258 .iter()
259 .map(|(idx, _, sha)| (*idx, sha.as_str()))
260 .collect();
261 for (idx, (login, avatar)) in results {
262 if let Some(sha) = sha_of.get(&idx) {
263 caches.put_author(sha.to_string(), login.clone(), avatar.clone());
264 }
265 clusters[idx].login = Some(login);
266 clusters[idx].avatar_url = Some(avatar);
267 }
268 if verbose {
269 eprintln!(
270 " resolved {resolved}/{} identities via GitHub API{}",
271 need_api.len(),
272 if from_cache > 0 {
273 format!(" ({from_cache} more from cache)")
274 } else {
275 String::new()
276 }
277 );
278 }
279}
280
281pub fn fetch_repo_description(client: &GhClient, slug: &str) -> Option<String> {
283 let v = client.get_json(&format!("https://api.github.com/repos/{slug}"))?;
284 v.get("description")
285 .and_then(|d| d.as_str())
286 .map(str::trim)
287 .filter(|d| !d.is_empty())
288 .map(String::from)
289}
290
291pub fn fetch_avatar(
294 client: &GhClient,
295 caches: &mut Caches,
296 login: &str,
297 size: u32,
298) -> Option<String> {
299 let key = format!("owner:{login}:{size}");
300 if let Some(data) = caches.avatar(&key) {
301 return Some(data);
302 }
303 let url = format!("https://avatars.githubusercontent.com/{login}?s={size}");
304 let (bytes, ct) = client.fetch_bytes(&url)?;
305 let ct = if ct.starts_with("image/") {
306 ct
307 } else {
308 "image/png".into()
309 };
310 let b64 = base64::engine::general_purpose::STANDARD.encode(&bytes);
311 let data = format!("data:{ct};base64,{b64}");
312 caches.put_avatar(key, data.clone());
313 Some(data)
314}
315
/// Reduces a free-form profile "company" string to a single organisation name.
///
/// Processing steps, in order:
/// 1. Trim, then cut at the first occurrence of each list separator
///    (`" | "`, `" / "`, `";"`, `" · "`, `","`), applied one after another.
/// 2. Strip leading `@` characters.
/// 3. Cut at the first `" @"`, dropping any further mentions.
/// 4. Strip trailing `.`, `,`, `;`, `|` and `/` characters.
///
/// Returns `None` when nothing is left or the result is longer than 60 characters.
pub fn normalize_company(raw: &str) -> Option<String> {
    const SEPARATORS: [&str; 5] = [" | ", " / ", ";", " · ", ","];
    const TRAILING_NOISE: [char; 5] = ['.', ',', ';', '|', '/'];
    const MAX_CHARS: usize = 60;

    /// Everything before the first `sep` in `text`, or all of `text` when `sep` is absent.
    fn before<'a>(text: &'a str, sep: &str) -> &'a str {
        text.split_once(sep).map_or(text, |(head, _)| head)
    }

    let first_entry = SEPARATORS
        .iter()
        .fold(raw.trim(), |rest, &sep| before(rest, sep));
    let without_at = first_entry.trim().trim_start_matches('@').trim();
    let first_mention = before(without_at, " @");
    let company = first_mention
        .trim()
        .trim_end_matches(TRAILING_NOISE)
        .trim();

    let usable = !company.is_empty() && company.chars().count() <= MAX_CHARS;
    usable.then(|| company.to_string())
}
338
339pub fn fetch_profiles(
342 clusters: &mut [Cluster],
343 client: &GhClient,
344 caches: &mut Caches,
345 verbose: bool,
346) {
347 let mut logins: Vec<String> = clusters.iter().filter_map(|c| c.login.clone()).collect();
348 logins.sort();
349 logins.dedup();
350 if logins.is_empty() {
351 return;
352 }
353
354 let mut profiles: HashMap<String, Profile> = HashMap::new();
356 let mut to_fetch: Vec<String> = Vec::new();
357 for login in logins {
358 match caches.profile(&login) {
359 Some(p) => {
360 profiles.insert(login, p);
361 }
362 None => to_fetch.push(login),
363 }
364 }
365 let from_cache = profiles.len();
366
367 if !to_fetch.is_empty() && client.has_token() {
368 let cursor = AtomicUsize::new(0);
369 let results: Mutex<HashMap<String, Profile>> = Mutex::new(HashMap::new());
370 std::thread::scope(|s| {
371 for _ in 0..THREADS.min(to_fetch.len()) {
372 s.spawn(|| loop {
373 let i = cursor.fetch_add(1, Ordering::Relaxed);
374 let Some(login) = to_fetch.get(i) else { break };
375 let profile = client.user_profile(login);
376 results.lock().unwrap().insert(login.clone(), profile);
377 });
378 }
379 });
380 for (login, (name, company)) in results.into_inner().unwrap() {
381 caches.put_profile(login.clone(), name.clone(), company.clone());
382 profiles.insert(login, (name, company));
383 }
384 }
385
386 let with_company = profiles.values().filter(|(_, c)| c.is_some()).count();
387 for cl in clusters.iter_mut() {
388 if let Some(login) = &cl.login {
389 if let Some((name, company)) = profiles.get(login) {
390 cl.profile_name = name.clone();
391 cl.affiliation = company.clone();
392 }
393 }
394 }
395 if verbose {
396 eprintln!(
397 " fetched {} profiles ({} with an affiliation, {from_cache} from cache)",
398 profiles.len(),
399 with_company,
400 );
401 }
402}
403
404pub fn embed_avatars(
407 contributors: &mut [Contributor],
408 client: &GhClient,
409 caches: &mut Caches,
410 size: u32,
411 verbose: bool,
412) {
413 let mut urls: Vec<String> = Vec::new();
414 for c in contributors.iter() {
415 if let Some(u) = &c.avatar {
416 if !u.starts_with("data:") && !urls.contains(u) {
417 urls.push(u.clone());
418 }
419 }
420 }
421 if urls.is_empty() {
422 return;
423 }
424
425 let key_of = |u: &str| format!("{u}|{size}");
427 let mut embedded: HashMap<String, String> = HashMap::new();
428 let mut to_fetch: Vec<String> = Vec::new();
429 for u in urls {
430 match caches.avatar(&key_of(&u)) {
431 Some(data) => {
432 embedded.insert(u, data);
433 }
434 None => to_fetch.push(u),
435 }
436 }
437 let from_cache = embedded.len();
438
439 if !to_fetch.is_empty() {
440 let sized: Vec<String> = to_fetch
441 .iter()
442 .map(|u| {
443 if u.contains('?') {
444 format!("{u}&s={size}")
445 } else {
446 format!("{u}?s={size}")
447 }
448 })
449 .collect();
450 let cursor = AtomicUsize::new(0);
451 let results: Mutex<HashMap<String, String>> = Mutex::new(HashMap::new());
452 std::thread::scope(|s| {
453 for _ in 0..THREADS.min(to_fetch.len()) {
454 s.spawn(|| loop {
455 let i = cursor.fetch_add(1, Ordering::Relaxed);
456 let (Some(orig), Some(fetch_url)) = (to_fetch.get(i), sized.get(i)) else {
457 break;
458 };
459 if let Some((bytes, ct)) = client.fetch_bytes(fetch_url) {
460 let ct = if ct.starts_with("image/") {
461 ct
462 } else {
463 "image/png".into()
464 };
465 let b64 = base64::engine::general_purpose::STANDARD.encode(&bytes);
466 results
467 .lock()
468 .unwrap()
469 .insert(orig.clone(), format!("data:{ct};base64,{b64}"));
470 }
471 });
472 }
473 });
474 for (url, data) in results.into_inner().unwrap() {
475 caches.put_avatar(key_of(&url), data.clone());
476 embedded.insert(url, data);
477 }
478 }
479
480 let n = embedded.len();
481 for c in contributors.iter_mut() {
482 if let Some(u) = &c.avatar {
483 if let Some(data) = embedded.get(u) {
484 c.avatar = Some(data.clone());
485 }
486 }
487 }
488 if verbose {
489 eprintln!(" embedded {n} avatars as data URIs ({from_cache} from cache)");
490 }
491}