1use anyhow::{anyhow, Context};
8use base64::{engine::general_purpose::STANDARD, Engine as _};
9use reqwest::header::{HeaderMap, HeaderValue, ACCEPT, AUTHORIZATION, USER_AGENT};
10use reqwest::StatusCode;
11use serde::de::DeserializeOwned;
12use serde::Deserialize;
13use url::Url;
14
/// Base URL that the GitHub REST API endpoint URLs built in this file start with.
const GITHUB_API_BASE: &str = "https://api.github.com";
/// Value sent in the `User-Agent` header of outgoing requests.
const GITHUB_USER_AGENT: &str = "web-capture";
17
/// Owner and repository name extracted from a GitHub repository URL.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct GithubRepositoryUrl {
    /// Account (user or organization) segment of the URL path.
    pub owner: String,
    /// Repository segment of the URL path.
    pub repo: String,
    /// `owner` and `repo` joined as `{owner}/{repo}`.
    pub full_name: String,
    /// Normalized `https://github.com/{owner}/{repo}` page URL.
    pub html_url: String,
}
25
/// Repository details copied out of the GitHub repository API response.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct GithubRepositoryMetadata {
    /// Repository name in `owner/repo` form as reported by the API.
    pub full_name: String,
    /// Repository page URL as reported by the API.
    pub html_url: String,
    /// Repository description, when one is set.
    pub description: Option<String>,
    /// Primary language, when the API reports one.
    pub language: Option<String>,
    /// Star count; `0` when the API omits the field.
    pub stargazers_count: u64,
    /// Fork count; `0` when the API omits the field.
    pub forks_count: u64,
    /// Open issue count; `0` when the API omits the field.
    pub open_issues_count: u64,
    /// SPDX identifier of the license, when the API reports one.
    pub license_spdx_id: Option<String>,
    /// Repository topics; empty when the API omits the field.
    pub topics: Vec<String>,
}
38
/// README file of a repository together with its text, when it could be obtained.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct GithubReadme {
    /// File name of the README.
    pub name: String,
    /// Path of the README inside the repository.
    pub path: String,
    /// Web page URL of the README, when the API reports one.
    pub html_url: Option<String>,
    /// README text; `None` when no content was returned.
    pub content: Option<String>,
}
46
/// One entry of the repository's root directory listing.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct GithubTreeEntry {
    /// Entry name.
    pub name: String,
    /// Entry path inside the repository.
    pub path: String,
    /// Entry type string from the API; this file special-cases `"dir"` and `"file"`.
    pub kind: String,
    /// Size in bytes, when the API reports one.
    pub size: Option<u64>,
    /// Web page URL of the entry.
    pub html_url: String,
}
55
56#[derive(Debug, Clone, PartialEq, Eq)]
57pub struct GithubRepositorySnapshot {
58 pub source_url: String,
59 pub repository: GithubRepositoryMetadata,
60 pub default_branch: String,
61 pub readme: Option<GithubReadme>,
62 pub tree: Vec<GithubTreeEntry>,
63}
64
65#[derive(Debug, Deserialize)]
66struct RepositoryApiResponse {
67 full_name: String,
68 html_url: String,
69 description: Option<String>,
70 default_branch: Option<String>,
71 language: Option<String>,
72 stargazers_count: Option<u64>,
73 forks_count: Option<u64>,
74 open_issues_count: Option<u64>,
75 license: Option<RepositoryLicenseApiResponse>,
76 topics: Option<Vec<String>>,
77}
78
79#[derive(Debug, Deserialize)]
80struct RepositoryLicenseApiResponse {
81 spdx_id: Option<String>,
82}
83
84#[derive(Debug, Deserialize)]
85struct ReadmeApiResponse {
86 name: Option<String>,
87 path: Option<String>,
88 html_url: Option<String>,
89 download_url: Option<String>,
90 content: Option<String>,
91 encoding: Option<String>,
92}
93
94#[derive(Debug, Deserialize)]
95struct ContentsApiResponse {
96 name: String,
97 path: String,
98 #[serde(rename = "type")]
99 kind: String,
100 size: Option<u64>,
101 html_url: Option<String>,
102}
103
104#[must_use]
110pub fn parse_github_repository_url(url: &str) -> Option<GithubRepositoryUrl> {
111 let parsed = Url::parse(url).ok()?;
112 let host = parsed.host_str()?.to_ascii_lowercase();
113 if host != "github.com" && host != "www.github.com" {
114 return None;
115 }
116
117 let parts: Vec<_> = parsed
118 .path_segments()?
119 .filter(|segment| !segment.is_empty())
120 .collect();
121 if parts.len() != 2 {
122 return None;
123 }
124
125 let owner = parts[0].to_string();
126 let repo = parts[1].to_string();
127 if owner.is_empty() || repo.is_empty() {
128 return None;
129 }
130
131 Some(GithubRepositoryUrl {
132 full_name: format!("{owner}/{repo}"),
133 html_url: format!("https://github.com/{owner}/{repo}"),
134 owner,
135 repo,
136 })
137}
138
139#[must_use]
140pub fn is_github_repository_url(url: &str) -> bool {
141 parse_github_repository_url(url).is_some()
142}
143
144#[must_use]
145pub fn github_repository_text_filename(url: &str) -> Option<String> {
146 parse_github_repository_url(url).map(|repo| format!("{}-{}.txt", repo.owner, repo.repo))
147}
148
149pub async fn fetch_github_repository_snapshot(
150 url: &str,
151) -> anyhow::Result<GithubRepositorySnapshot> {
152 let parsed = parse_github_repository_url(url)
153 .ok_or_else(|| anyhow!("Not a GitHub repository URL: {url}"))?;
154
155 let repository: RepositoryApiResponse = fetch_github_json(&format!(
156 "{GITHUB_API_BASE}/repos/{}/{}",
157 parsed.owner, parsed.repo
158 ))
159 .await?
160 .ok_or_else(|| anyhow!("Repository was not returned by the GitHub API"))?;
161
162 let default_branch = repository
163 .default_branch
164 .clone()
165 .unwrap_or_else(|| "main".to_string());
166
167 let (readme, tree) = tokio::try_join!(
168 fetch_github_readme(&parsed, &default_branch),
169 fetch_github_root_tree(&parsed, &default_branch)
170 )?;
171
172 Ok(GithubRepositorySnapshot {
173 source_url: parsed.html_url,
174 repository: GithubRepositoryMetadata {
175 full_name: repository.full_name,
176 html_url: repository.html_url,
177 description: repository.description,
178 language: repository.language,
179 stargazers_count: repository.stargazers_count.unwrap_or_default(),
180 forks_count: repository.forks_count.unwrap_or_default(),
181 open_issues_count: repository.open_issues_count.unwrap_or_default(),
182 license_spdx_id: repository.license.and_then(|license| license.spdx_id),
183 topics: repository.topics.unwrap_or_default(),
184 },
185 default_branch,
186 readme,
187 tree,
188 })
189}
190
191#[must_use]
192pub fn format_github_repository_markdown(snapshot: &GithubRepositorySnapshot) -> String {
193 let mut lines = vec![
194 format!("# {}", snapshot.repository.full_name),
195 String::new(),
196 ];
197 if let Some(description) = &snapshot.repository.description {
198 lines.push(format!("> {description}"));
199 lines.push(String::new());
200 }
201
202 lines.extend([
203 "## Repository".to_string(),
204 String::new(),
205 format!("- URL: {}", repository_url(snapshot)),
206 format!("- Default branch: `{}`", snapshot.default_branch),
207 ]);
208 push_optional_line(
209 &mut lines,
210 snapshot
211 .repository
212 .language
213 .as_ref()
214 .map(|language| format!("- Primary language: {language}")),
215 );
216 lines.push(format!("- Stars: {}", snapshot.repository.stargazers_count));
217 lines.push(format!("- Forks: {}", snapshot.repository.forks_count));
218 lines.push(format!(
219 "- Open issues: {}",
220 snapshot.repository.open_issues_count
221 ));
222 push_optional_line(
223 &mut lines,
224 snapshot
225 .repository
226 .license_spdx_id
227 .as_ref()
228 .map(|license| format!("- License: {license}")),
229 );
230 if !snapshot.repository.topics.is_empty() {
231 lines.push(format!(
232 "- Topics: {}",
233 snapshot.repository.topics.join(", ")
234 ));
235 }
236
237 lines.extend([String::new(), "## Files".to_string(), String::new()]);
238 append_tree_markdown(&mut lines, &snapshot.tree);
239
240 let readme_path = snapshot
241 .readme
242 .as_ref()
243 .map_or("README", |readme| readme.path.as_str());
244 lines.extend([String::new(), format!("## {readme_path}"), String::new()]);
245 append_readme_content(&mut lines, snapshot.readme.as_ref());
246
247 lines.join("\n")
248}
249
250#[must_use]
251pub fn format_github_repository_text(snapshot: &GithubRepositorySnapshot) -> String {
252 let mut lines = vec![format!("Repository: {}", snapshot.repository.full_name)];
253 if let Some(description) = &snapshot.repository.description {
254 lines.push(format!("Description: {description}"));
255 }
256 lines.extend([
257 format!("URL: {}", repository_url(snapshot)),
258 format!("Default branch: {}", snapshot.default_branch),
259 ]);
260 push_optional_line(
261 &mut lines,
262 snapshot
263 .repository
264 .language
265 .as_ref()
266 .map(|language| format!("Primary language: {language}")),
267 );
268 lines.push(format!("Stars: {}", snapshot.repository.stargazers_count));
269 lines.push(format!("Forks: {}", snapshot.repository.forks_count));
270 lines.push(format!(
271 "Open issues: {}",
272 snapshot.repository.open_issues_count
273 ));
274 push_optional_line(
275 &mut lines,
276 snapshot
277 .repository
278 .license_spdx_id
279 .as_ref()
280 .map(|license| format!("License: {license}")),
281 );
282 if !snapshot.repository.topics.is_empty() {
283 lines.push(format!("Topics: {}", snapshot.repository.topics.join(", ")));
284 }
285
286 lines.extend([String::new(), "Files:".to_string()]);
287 append_tree_text(&mut lines, &snapshot.tree);
288
289 let readme_path = snapshot
290 .readme
291 .as_ref()
292 .map_or("README", |readme| readme.path.as_str());
293 lines.extend([String::new(), format!("{readme_path}:"), String::new()]);
294 append_readme_content(&mut lines, snapshot.readme.as_ref());
295
296 lines.join("\n")
297}
298
299async fn fetch_github_readme(
300 parsed: &GithubRepositoryUrl,
301 default_branch: &str,
302) -> anyhow::Result<Option<GithubReadme>> {
303 let readme: Option<ReadmeApiResponse> = fetch_optional_github_json(&format!(
304 "{GITHUB_API_BASE}/repos/{}/{}/readme?ref={default_branch}",
305 parsed.owner, parsed.repo
306 ))
307 .await?;
308
309 let Some(readme) = readme else {
310 return Ok(None);
311 };
312
313 let content = if readme.encoding.as_deref() == Some("base64") {
314 readme
315 .content
316 .as_deref()
317 .map(decode_base64_text)
318 .transpose()?
319 } else if let Some(download_url) = readme.download_url.as_deref() {
320 fetch_optional_github_text(download_url).await?
321 } else {
322 None
323 };
324
325 let name = readme.name.unwrap_or_else(|| "README".to_string());
326 let path = readme.path.unwrap_or_else(|| name.clone());
327 Ok(Some(GithubReadme {
328 name,
329 path,
330 html_url: readme.html_url,
331 content,
332 }))
333}
334
335async fn fetch_github_root_tree(
336 parsed: &GithubRepositoryUrl,
337 default_branch: &str,
338) -> anyhow::Result<Vec<GithubTreeEntry>> {
339 let contents: Option<Vec<ContentsApiResponse>> = fetch_optional_github_json(&format!(
340 "{GITHUB_API_BASE}/repos/{}/{}/contents?ref={default_branch}",
341 parsed.owner, parsed.repo
342 ))
343 .await?;
344
345 let mut tree: Vec<_> = contents
346 .unwrap_or_default()
347 .into_iter()
348 .map(|item| {
349 let html_url = item.html_url.unwrap_or_else(|| {
350 let kind = if item.kind == "dir" { "tree" } else { "blob" };
351 format!(
352 "https://github.com/{}/{}/{kind}/{default_branch}/{}",
353 parsed.owner, parsed.repo, item.path
354 )
355 });
356 GithubTreeEntry {
357 name: item.name,
358 path: item.path,
359 kind: item.kind,
360 size: item.size,
361 html_url,
362 }
363 })
364 .collect();
365 tree.sort_by(
366 |a, b| match (a.kind.as_str() == "dir", b.kind.as_str() == "dir") {
367 (true, false) => std::cmp::Ordering::Less,
368 (false, true) => std::cmp::Ordering::Greater,
369 _ => a.name.cmp(&b.name),
370 },
371 );
372 Ok(tree)
373}
374
375async fn fetch_github_json<T>(url: &str) -> anyhow::Result<Option<T>>
376where
377 T: DeserializeOwned,
378{
379 fetch_github_json_with_optional_not_found(url, false).await
380}
381
382async fn fetch_optional_github_json<T>(url: &str) -> anyhow::Result<Option<T>>
383where
384 T: DeserializeOwned,
385{
386 fetch_github_json_with_optional_not_found(url, true).await
387}
388
389async fn fetch_github_json_with_optional_not_found<T>(
390 url: &str,
391 optional: bool,
392) -> anyhow::Result<Option<T>>
393where
394 T: DeserializeOwned,
395{
396 let response = reqwest::Client::new()
397 .get(url)
398 .headers(github_headers("application/vnd.github+json"))
399 .send()
400 .await
401 .with_context(|| format!("Requesting {url}"))?;
402 if optional && response.status() == StatusCode::NOT_FOUND {
403 return Ok(None);
404 }
405 let status = response.status();
406 let body = response
407 .text()
408 .await
409 .with_context(|| format!("Reading response body from {url}"))?;
410 if !status.is_success() {
411 anyhow::bail!("GitHub API {status}: {body}");
412 }
413 Ok(Some(serde_json::from_str(&body).with_context(|| {
414 format!("Parsing GitHub JSON from {url}")
415 })?))
416}
417
418async fn fetch_optional_github_text(url: &str) -> anyhow::Result<Option<String>> {
419 let response = reqwest::Client::new()
420 .get(url)
421 .headers(github_headers("text/plain"))
422 .send()
423 .await
424 .with_context(|| format!("Requesting {url}"))?;
425 if response.status() == StatusCode::NOT_FOUND {
426 return Ok(None);
427 }
428 let status = response.status();
429 let text = response
430 .text()
431 .await
432 .with_context(|| format!("Reading text response from {url}"))?;
433 if !status.is_success() {
434 anyhow::bail!("GitHub raw {status}: {text}");
435 }
436 Ok(Some(text))
437}
438
439fn github_headers(accept: &str) -> HeaderMap {
440 let mut headers = HeaderMap::new();
441 headers.insert(
442 ACCEPT,
443 HeaderValue::from_str(accept).unwrap_or_else(|_| HeaderValue::from_static("*/*")),
444 );
445 headers.insert(USER_AGENT, HeaderValue::from_static(GITHUB_USER_AGENT));
446 headers.insert(
447 "X-GitHub-Api-Version",
448 HeaderValue::from_static("2022-11-28"),
449 );
450 if let Ok(token) = std::env::var("GITHUB_TOKEN").or_else(|_| std::env::var("GH_TOKEN")) {
451 if let Ok(value) = HeaderValue::from_str(&format!("Bearer {token}")) {
452 headers.insert(AUTHORIZATION, value);
453 }
454 }
455 headers
456}
457
458fn decode_base64_text(content: &str) -> anyhow::Result<String> {
459 let stripped: String = content.chars().filter(|ch| !ch.is_whitespace()).collect();
460 let bytes = STANDARD
461 .decode(stripped)
462 .context("Decoding GitHub README base64 content")?;
463 Ok(String::from_utf8_lossy(&bytes).into_owned())
464}
465
466fn repository_url(snapshot: &GithubRepositorySnapshot) -> &str {
467 if snapshot.repository.html_url.is_empty() {
468 &snapshot.source_url
469 } else {
470 &snapshot.repository.html_url
471 }
472}
473
/// Appends `line` to `lines` when it is `Some`; does nothing for `None`.
fn push_optional_line(lines: &mut Vec<String>, line: Option<String>) {
    // An `Option` iterates over zero or one item.
    lines.extend(line);
}
479
480fn append_tree_markdown(lines: &mut Vec<String>, tree: &[GithubTreeEntry]) {
481 if tree.is_empty() {
482 lines.push("- No root files returned by the GitHub API.".to_string());
483 return;
484 }
485
486 for item in tree {
487 let label = if item.kind == "dir" {
488 format!("{}/", item.name)
489 } else {
490 item.name.clone()
491 };
492 let suffix = if item.kind == "file" {
493 item.size
494 .map_or_else(String::new, |size| format!(" ({})", format_bytes(size)))
495 } else {
496 String::new()
497 };
498 lines.push(format!("- [{label}]({}){suffix}", item.html_url));
499 }
500}
501
502fn append_tree_text(lines: &mut Vec<String>, tree: &[GithubTreeEntry]) {
503 if tree.is_empty() {
504 lines.push("- No root files returned by the GitHub API.".to_string());
505 return;
506 }
507
508 for item in tree {
509 let label = if item.kind == "dir" {
510 format!("{}/", item.name)
511 } else {
512 item.name.clone()
513 };
514 let suffix = if item.kind == "file" {
515 item.size
516 .map_or_else(String::new, |size| format!(" ({})", format_bytes(size)))
517 } else {
518 String::new()
519 };
520 lines.push(format!("- {label}{suffix}"));
521 }
522}
523
524fn append_readme_content(lines: &mut Vec<String>, readme: Option<&GithubReadme>) {
525 if let Some(content) = readme.and_then(|readme| readme.content.as_deref()) {
526 lines.push(content.trim_end().to_string());
527 } else {
528 lines.push("README content was not returned by the GitHub API.".to_string());
529 }
530 lines.push(String::new());
531}
532
533fn format_bytes(size: u64) -> String {
534 if size < 1024 {
535 return format!("{size} B");
536 }
537 if size < 1024 * 1024 {
538 return format_scaled_bytes(size, 1024, "KB");
539 }
540 format_scaled_bytes(size, 1024 * 1024, "MB")
541}
542
/// Formats `size / unit` with one decimal digit, rounded half up, followed by
/// a space and `suffix`. Uses integer arithmetic only.
fn format_scaled_bytes(size: u64, unit: u64, suffix: &str) -> String {
    // Remainder expressed in tenths of `unit`, rounded to nearest: 0..=10.
    let rounded_tenths = ((size % unit) * 10 + unit / 2) / unit;
    // A value of 10 means the fraction rounded up to the next whole unit.
    let whole = size / unit + rounded_tenths / 10;
    let tenth = rounded_tenths % 10;
    format!("{whole}.{tenth} {suffix}")
}