Skip to main content

web_capture/
github.rs

1//! GitHub repository-page capture helpers.
2//!
3//! Plain repository pages are mostly application HTML. For text and markdown
4//! output, the compact content users expect is available more reliably through
5//! the GitHub REST API: repository details, the root file listing, and README.
6
7use anyhow::{anyhow, Context};
8use base64::{engine::general_purpose::STANDARD, Engine as _};
9use reqwest::header::{HeaderMap, HeaderValue, ACCEPT, AUTHORIZATION, USER_AGENT};
10use reqwest::StatusCode;
11use serde::de::DeserializeOwned;
12use serde::Deserialize;
13use url::Url;
14
/// Base URL that every GitHub REST API endpoint in this module is built from.
const GITHUB_API_BASE: &str = "https://api.github.com";
/// Value sent in the `User-Agent` header of each outgoing request.
const GITHUB_USER_AGENT: &str = "web-capture";
17
/// Owner and repository name extracted from a GitHub repository URL.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GithubRepositoryUrl {
    /// Account (user or organization) segment taken from the URL path.
    pub owner: String,
    /// Repository segment taken from the URL path.
    pub repo: String,
    /// `owner/repo`.
    pub full_name: String,
    /// Normalized `https://github.com/owner/repo` form of the input URL.
    pub html_url: String,
}
25
/// Repository details copied from the GitHub API repository response.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GithubRepositoryMetadata {
    /// `full_name` as reported by the API.
    pub full_name: String,
    /// `html_url` as reported by the API.
    pub html_url: String,
    /// Repository description, when the API returned one.
    pub description: Option<String>,
    /// Primary language, when the API returned one.
    pub language: Option<String>,
    /// Star count; `0` when the API response omitted it.
    pub stargazers_count: u64,
    /// Fork count; `0` when the API response omitted it.
    pub forks_count: u64,
    /// Open issue count; `0` when the API response omitted it.
    pub open_issues_count: u64,
    /// `spdx_id` of the license object, when present.
    pub license_spdx_id: Option<String>,
    /// Repository topics; empty when the API response omitted them.
    pub topics: Vec<String>,
}
38
/// README information for a repository.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GithubReadme {
    /// File name; `"README"` when the API response omitted it.
    pub name: String,
    /// Path of the file; falls back to `name` when the API response omitted it.
    pub path: String,
    /// `html_url` from the API response, when present.
    pub html_url: Option<String>,
    /// Decoded README text, or `None` when no content could be obtained.
    pub content: Option<String>,
}
46
/// One entry of the repository's root directory listing.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GithubTreeEntry {
    /// Entry name.
    pub name: String,
    /// Entry path as reported by the API.
    pub path: String,
    /// The API's `type` value; this module special-cases `"dir"` and `"file"`.
    pub kind: String,
    /// Size reported by the API, when present.
    pub size: Option<u64>,
    /// Link to the entry; built from owner, repo, branch, and path when the
    /// API did not supply one.
    pub html_url: String,
}
55
/// Everything captured for one repository: details, README, and root listing.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GithubRepositorySnapshot {
    /// Normalized repository URL derived from the URL the caller supplied.
    pub source_url: String,
    /// Repository details returned by the API.
    pub repository: GithubRepositoryMetadata,
    /// Default branch reported by the API; `"main"` when it was omitted.
    pub default_branch: String,
    /// README, or `None` when the README request returned 404.
    pub readme: Option<GithubReadme>,
    /// Root directory entries, directories first, then ordered by name.
    pub tree: Vec<GithubTreeEntry>,
}
64
/// Subset of the `/repos/{owner}/{repo}` JSON response that this module reads.
///
/// `full_name` and `html_url` are required for deserialization to succeed;
/// every other field tolerates being absent or `null`.
#[derive(Debug, Deserialize)]
struct RepositoryApiResponse {
    full_name: String,
    html_url: String,
    description: Option<String>,
    default_branch: Option<String>,
    language: Option<String>,
    stargazers_count: Option<u64>,
    forks_count: Option<u64>,
    open_issues_count: Option<u64>,
    license: Option<RepositoryLicenseApiResponse>,
    topics: Option<Vec<String>>,
}
78
/// The `license` object nested in the repository response; only `spdx_id` is read.
#[derive(Debug, Deserialize)]
struct RepositoryLicenseApiResponse {
    spdx_id: Option<String>,
}
83
/// Subset of the `/repos/{owner}/{repo}/readme` JSON response that this module reads.
///
/// All fields are optional so a sparse response still deserializes.
#[derive(Debug, Deserialize)]
struct ReadmeApiResponse {
    name: Option<String>,
    path: Option<String>,
    html_url: Option<String>,
    // Used to fetch the text directly when `content` is not base64-encoded.
    download_url: Option<String>,
    // Inline file content; decoded only when `encoding` is `"base64"`.
    content: Option<String>,
    encoding: Option<String>,
}
93
/// One element of the `/repos/{owner}/{repo}/contents` JSON array.
#[derive(Debug, Deserialize)]
struct ContentsApiResponse {
    name: String,
    path: String,
    // The JSON key is `type`, which is a Rust keyword, hence the rename.
    #[serde(rename = "type")]
    kind: String,
    size: Option<u64>,
    html_url: Option<String>,
}
103
104/// Parse a plain GitHub repository URL.
105///
106/// URLs for subpages such as `/issues`, `/tree/...`, or `/blob/...` are not
107/// treated as repository snapshots because those pages have their own capture
108/// semantics.
109#[must_use]
110pub fn parse_github_repository_url(url: &str) -> Option<GithubRepositoryUrl> {
111    let parsed = Url::parse(url).ok()?;
112    let host = parsed.host_str()?.to_ascii_lowercase();
113    if host != "github.com" && host != "www.github.com" {
114        return None;
115    }
116
117    let parts: Vec<_> = parsed
118        .path_segments()?
119        .filter(|segment| !segment.is_empty())
120        .collect();
121    if parts.len() != 2 {
122        return None;
123    }
124
125    let owner = parts[0].to_string();
126    let repo = parts[1].to_string();
127    if owner.is_empty() || repo.is_empty() {
128        return None;
129    }
130
131    Some(GithubRepositoryUrl {
132        full_name: format!("{owner}/{repo}"),
133        html_url: format!("https://github.com/{owner}/{repo}"),
134        owner,
135        repo,
136    })
137}
138
139#[must_use]
140pub fn is_github_repository_url(url: &str) -> bool {
141    parse_github_repository_url(url).is_some()
142}
143
144#[must_use]
145pub fn github_repository_text_filename(url: &str) -> Option<String> {
146    parse_github_repository_url(url).map(|repo| format!("{}-{}.txt", repo.owner, repo.repo))
147}
148
149pub async fn fetch_github_repository_snapshot(
150    url: &str,
151) -> anyhow::Result<GithubRepositorySnapshot> {
152    let parsed = parse_github_repository_url(url)
153        .ok_or_else(|| anyhow!("Not a GitHub repository URL: {url}"))?;
154
155    let repository: RepositoryApiResponse = fetch_github_json(&format!(
156        "{GITHUB_API_BASE}/repos/{}/{}",
157        parsed.owner, parsed.repo
158    ))
159    .await?
160    .ok_or_else(|| anyhow!("Repository was not returned by the GitHub API"))?;
161
162    let default_branch = repository
163        .default_branch
164        .clone()
165        .unwrap_or_else(|| "main".to_string());
166
167    let (readme, tree) = tokio::try_join!(
168        fetch_github_readme(&parsed, &default_branch),
169        fetch_github_root_tree(&parsed, &default_branch)
170    )?;
171
172    Ok(GithubRepositorySnapshot {
173        source_url: parsed.html_url,
174        repository: GithubRepositoryMetadata {
175            full_name: repository.full_name,
176            html_url: repository.html_url,
177            description: repository.description,
178            language: repository.language,
179            stargazers_count: repository.stargazers_count.unwrap_or_default(),
180            forks_count: repository.forks_count.unwrap_or_default(),
181            open_issues_count: repository.open_issues_count.unwrap_or_default(),
182            license_spdx_id: repository.license.and_then(|license| license.spdx_id),
183            topics: repository.topics.unwrap_or_default(),
184        },
185        default_branch,
186        readme,
187        tree,
188    })
189}
190
191#[must_use]
192pub fn format_github_repository_markdown(snapshot: &GithubRepositorySnapshot) -> String {
193    let mut lines = vec![
194        format!("# {}", snapshot.repository.full_name),
195        String::new(),
196    ];
197    if let Some(description) = &snapshot.repository.description {
198        lines.push(format!("> {description}"));
199        lines.push(String::new());
200    }
201
202    lines.extend([
203        "## Repository".to_string(),
204        String::new(),
205        format!("- URL: {}", repository_url(snapshot)),
206        format!("- Default branch: `{}`", snapshot.default_branch),
207    ]);
208    push_optional_line(
209        &mut lines,
210        snapshot
211            .repository
212            .language
213            .as_ref()
214            .map(|language| format!("- Primary language: {language}")),
215    );
216    lines.push(format!("- Stars: {}", snapshot.repository.stargazers_count));
217    lines.push(format!("- Forks: {}", snapshot.repository.forks_count));
218    lines.push(format!(
219        "- Open issues: {}",
220        snapshot.repository.open_issues_count
221    ));
222    push_optional_line(
223        &mut lines,
224        snapshot
225            .repository
226            .license_spdx_id
227            .as_ref()
228            .map(|license| format!("- License: {license}")),
229    );
230    if !snapshot.repository.topics.is_empty() {
231        lines.push(format!(
232            "- Topics: {}",
233            snapshot.repository.topics.join(", ")
234        ));
235    }
236
237    lines.extend([String::new(), "## Files".to_string(), String::new()]);
238    append_tree_markdown(&mut lines, &snapshot.tree);
239
240    let readme_path = snapshot
241        .readme
242        .as_ref()
243        .map_or("README", |readme| readme.path.as_str());
244    lines.extend([String::new(), format!("## {readme_path}"), String::new()]);
245    append_readme_content(&mut lines, snapshot.readme.as_ref());
246
247    lines.join("\n")
248}
249
250#[must_use]
251pub fn format_github_repository_text(snapshot: &GithubRepositorySnapshot) -> String {
252    let mut lines = vec![format!("Repository: {}", snapshot.repository.full_name)];
253    if let Some(description) = &snapshot.repository.description {
254        lines.push(format!("Description: {description}"));
255    }
256    lines.extend([
257        format!("URL: {}", repository_url(snapshot)),
258        format!("Default branch: {}", snapshot.default_branch),
259    ]);
260    push_optional_line(
261        &mut lines,
262        snapshot
263            .repository
264            .language
265            .as_ref()
266            .map(|language| format!("Primary language: {language}")),
267    );
268    lines.push(format!("Stars: {}", snapshot.repository.stargazers_count));
269    lines.push(format!("Forks: {}", snapshot.repository.forks_count));
270    lines.push(format!(
271        "Open issues: {}",
272        snapshot.repository.open_issues_count
273    ));
274    push_optional_line(
275        &mut lines,
276        snapshot
277            .repository
278            .license_spdx_id
279            .as_ref()
280            .map(|license| format!("License: {license}")),
281    );
282    if !snapshot.repository.topics.is_empty() {
283        lines.push(format!("Topics: {}", snapshot.repository.topics.join(", ")));
284    }
285
286    lines.extend([String::new(), "Files:".to_string()]);
287    append_tree_text(&mut lines, &snapshot.tree);
288
289    let readme_path = snapshot
290        .readme
291        .as_ref()
292        .map_or("README", |readme| readme.path.as_str());
293    lines.extend([String::new(), format!("{readme_path}:"), String::new()]);
294    append_readme_content(&mut lines, snapshot.readme.as_ref());
295
296    lines.join("\n")
297}
298
299async fn fetch_github_readme(
300    parsed: &GithubRepositoryUrl,
301    default_branch: &str,
302) -> anyhow::Result<Option<GithubReadme>> {
303    let readme: Option<ReadmeApiResponse> = fetch_optional_github_json(&format!(
304        "{GITHUB_API_BASE}/repos/{}/{}/readme?ref={default_branch}",
305        parsed.owner, parsed.repo
306    ))
307    .await?;
308
309    let Some(readme) = readme else {
310        return Ok(None);
311    };
312
313    let content = if readme.encoding.as_deref() == Some("base64") {
314        readme
315            .content
316            .as_deref()
317            .map(decode_base64_text)
318            .transpose()?
319    } else if let Some(download_url) = readme.download_url.as_deref() {
320        fetch_optional_github_text(download_url).await?
321    } else {
322        None
323    };
324
325    let name = readme.name.unwrap_or_else(|| "README".to_string());
326    let path = readme.path.unwrap_or_else(|| name.clone());
327    Ok(Some(GithubReadme {
328        name,
329        path,
330        html_url: readme.html_url,
331        content,
332    }))
333}
334
335async fn fetch_github_root_tree(
336    parsed: &GithubRepositoryUrl,
337    default_branch: &str,
338) -> anyhow::Result<Vec<GithubTreeEntry>> {
339    let contents: Option<Vec<ContentsApiResponse>> = fetch_optional_github_json(&format!(
340        "{GITHUB_API_BASE}/repos/{}/{}/contents?ref={default_branch}",
341        parsed.owner, parsed.repo
342    ))
343    .await?;
344
345    let mut tree: Vec<_> = contents
346        .unwrap_or_default()
347        .into_iter()
348        .map(|item| {
349            let html_url = item.html_url.unwrap_or_else(|| {
350                let kind = if item.kind == "dir" { "tree" } else { "blob" };
351                format!(
352                    "https://github.com/{}/{}/{kind}/{default_branch}/{}",
353                    parsed.owner, parsed.repo, item.path
354                )
355            });
356            GithubTreeEntry {
357                name: item.name,
358                path: item.path,
359                kind: item.kind,
360                size: item.size,
361                html_url,
362            }
363        })
364        .collect();
365    tree.sort_by(
366        |a, b| match (a.kind.as_str() == "dir", b.kind.as_str() == "dir") {
367            (true, false) => std::cmp::Ordering::Less,
368            (false, true) => std::cmp::Ordering::Greater,
369            _ => a.name.cmp(&b.name),
370        },
371    );
372    Ok(tree)
373}
374
375async fn fetch_github_json<T>(url: &str) -> anyhow::Result<Option<T>>
376where
377    T: DeserializeOwned,
378{
379    fetch_github_json_with_optional_not_found(url, false).await
380}
381
382async fn fetch_optional_github_json<T>(url: &str) -> anyhow::Result<Option<T>>
383where
384    T: DeserializeOwned,
385{
386    fetch_github_json_with_optional_not_found(url, true).await
387}
388
389async fn fetch_github_json_with_optional_not_found<T>(
390    url: &str,
391    optional: bool,
392) -> anyhow::Result<Option<T>>
393where
394    T: DeserializeOwned,
395{
396    let response = reqwest::Client::new()
397        .get(url)
398        .headers(github_headers("application/vnd.github+json"))
399        .send()
400        .await
401        .with_context(|| format!("Requesting {url}"))?;
402    if optional && response.status() == StatusCode::NOT_FOUND {
403        return Ok(None);
404    }
405    let status = response.status();
406    let body = response
407        .text()
408        .await
409        .with_context(|| format!("Reading response body from {url}"))?;
410    if !status.is_success() {
411        anyhow::bail!("GitHub API {status}: {body}");
412    }
413    Ok(Some(serde_json::from_str(&body).with_context(|| {
414        format!("Parsing GitHub JSON from {url}")
415    })?))
416}
417
418async fn fetch_optional_github_text(url: &str) -> anyhow::Result<Option<String>> {
419    let response = reqwest::Client::new()
420        .get(url)
421        .headers(github_headers("text/plain"))
422        .send()
423        .await
424        .with_context(|| format!("Requesting {url}"))?;
425    if response.status() == StatusCode::NOT_FOUND {
426        return Ok(None);
427    }
428    let status = response.status();
429    let text = response
430        .text()
431        .await
432        .with_context(|| format!("Reading text response from {url}"))?;
433    if !status.is_success() {
434        anyhow::bail!("GitHub raw {status}: {text}");
435    }
436    Ok(Some(text))
437}
438
439fn github_headers(accept: &str) -> HeaderMap {
440    let mut headers = HeaderMap::new();
441    headers.insert(
442        ACCEPT,
443        HeaderValue::from_str(accept).unwrap_or_else(|_| HeaderValue::from_static("*/*")),
444    );
445    headers.insert(USER_AGENT, HeaderValue::from_static(GITHUB_USER_AGENT));
446    headers.insert(
447        "X-GitHub-Api-Version",
448        HeaderValue::from_static("2022-11-28"),
449    );
450    if let Ok(token) = std::env::var("GITHUB_TOKEN").or_else(|_| std::env::var("GH_TOKEN")) {
451        if let Ok(value) = HeaderValue::from_str(&format!("Bearer {token}")) {
452            headers.insert(AUTHORIZATION, value);
453        }
454    }
455    headers
456}
457
458fn decode_base64_text(content: &str) -> anyhow::Result<String> {
459    let stripped: String = content.chars().filter(|ch| !ch.is_whitespace()).collect();
460    let bytes = STANDARD
461        .decode(stripped)
462        .context("Decoding GitHub README base64 content")?;
463    Ok(String::from_utf8_lossy(&bytes).into_owned())
464}
465
466fn repository_url(snapshot: &GithubRepositorySnapshot) -> &str {
467    if snapshot.repository.html_url.is_empty() {
468        &snapshot.source_url
469    } else {
470        &snapshot.repository.html_url
471    }
472}
473
/// Append `line` to `lines` when it is `Some`; do nothing for `None`.
fn push_optional_line(lines: &mut Vec<String>, line: Option<String>) {
    // An `Option` iterates over zero or one item.
    lines.extend(line);
}
479
480fn append_tree_markdown(lines: &mut Vec<String>, tree: &[GithubTreeEntry]) {
481    if tree.is_empty() {
482        lines.push("- No root files returned by the GitHub API.".to_string());
483        return;
484    }
485
486    for item in tree {
487        let label = if item.kind == "dir" {
488            format!("{}/", item.name)
489        } else {
490            item.name.clone()
491        };
492        let suffix = if item.kind == "file" {
493            item.size
494                .map_or_else(String::new, |size| format!(" ({})", format_bytes(size)))
495        } else {
496            String::new()
497        };
498        lines.push(format!("- [{label}]({}){suffix}", item.html_url));
499    }
500}
501
502fn append_tree_text(lines: &mut Vec<String>, tree: &[GithubTreeEntry]) {
503    if tree.is_empty() {
504        lines.push("- No root files returned by the GitHub API.".to_string());
505        return;
506    }
507
508    for item in tree {
509        let label = if item.kind == "dir" {
510            format!("{}/", item.name)
511        } else {
512            item.name.clone()
513        };
514        let suffix = if item.kind == "file" {
515            item.size
516                .map_or_else(String::new, |size| format!(" ({})", format_bytes(size)))
517        } else {
518            String::new()
519        };
520        lines.push(format!("- {label}{suffix}"));
521    }
522}
523
524fn append_readme_content(lines: &mut Vec<String>, readme: Option<&GithubReadme>) {
525    if let Some(content) = readme.and_then(|readme| readme.content.as_deref()) {
526        lines.push(content.trim_end().to_string());
527    } else {
528        lines.push("README content was not returned by the GitHub API.".to_string());
529    }
530    lines.push(String::new());
531}
532
/// Format a byte count for display: whole bytes below 1024, otherwise KB or
/// MB with one decimal place (1 KB = 1024 B, 1 MB = 1024 KB).
///
/// The unit is chosen after rounding, so a size just below 1 MB that rounds
/// up is shown as `1.0 MB` rather than `1024.0 KB`.
fn format_bytes(size: u64) -> String {
    const KB: u64 = 1024;
    const MB: u64 = KB * KB;

    if size < KB {
        return format!("{size} B");
    }
    if size < MB {
        let (whole, tenth) = round_to_tenths(size, KB);
        // `whole == KB` means rounding carried into the next unit; fall
        // through and report the value in MB instead.
        if whole < KB {
            return format!("{whole}.{tenth} KB");
        }
    }
    format_scaled_bytes(size, MB, "MB")
}

/// Format `size / unit` rounded to one decimal place, followed by `suffix`.
fn format_scaled_bytes(size: u64, unit: u64, suffix: &str) -> String {
    let (whole, tenth) = round_to_tenths(size, unit);
    format!("{whole}.{tenth} {suffix}")
}

/// Split `size / unit` into its whole part and a single rounded decimal digit,
/// carrying into the whole part when the digit rounds up to ten.
fn round_to_tenths(size: u64, unit: u64) -> (u64, u64) {
    let whole = size / unit;
    // Adding half a unit before dividing rounds to the nearest tenth.
    let tenth = ((size % unit) * 10 + unit / 2) / unit;
    if tenth == 10 {
        (whole + 1, 0)
    } else {
        (whole, tenth)
    }
}
551}