use anyhow::{Context, Result};
use async_trait::async_trait;
use super::{SiteContent, SiteMetadata, SiteProvider};
use crate::http_client::AcceleratedClient;
/// Names of repository sub-pages (issue tracker, pull requests, file views, …).
///
/// `is_repo_root_url` looks up the path segment that follows `{owner}/{repo}`
/// in this list; a hit means the URL is not a repository root. The list is
/// only used for membership tests, so the order of entries is irrelevant
/// (kept alphabetical for easy scanning). It must not contain `tree`, which
/// is the one sub-path that is still treated as a repository view.
const DEEP_PATH_SEGMENTS: &[&str] = &[
    "actions",
    "blob",
    "commit",
    "commits",
    "compare",
    "discussions",
    "insights",
    "issues",
    "packages",
    "pull",
    "releases",
    "security",
    "settings",
    "wiki",
];
/// Stateless [`SiteProvider`] for GitHub repository URLs.
///
/// Extraction returns the repository README as the page content.
pub struct GitHubProvider;
#[async_trait]
impl SiteProvider for GitHubProvider {
    /// Short identifier of this provider.
    fn name(&self) -> &'static str {
        "github"
    }

    /// Returns `true` only for repository-root URLs on `github.com`.
    ///
    /// Accepted shapes are `github.com/{owner}/{repo}` and
    /// `github.com/{owner}/{repo}/tree/...`, with or without a scheme and an
    /// optional `www.` prefix. Matching is case-insensitive; the query string
    /// and the fragment are ignored.
    fn matches(&self, url: &str) -> bool {
        let lowered = url.to_lowercase();
        // Neither the query nor the fragment changes which page is addressed.
        let normalized = lowered
            .split(|c: char| c == '?' || c == '#')
            .next()
            .unwrap_or(&lowered);

        // Compare the host exactly. A plain substring test would also accept
        // other sites such as `gist.github.com`, `docs.github.com` or
        // `example.com/github.com/...`, whose paths are not `{owner}/{repo}`.
        let without_scheme = normalized
            .split_once("://")
            .map_or(normalized, |(_, rest)| rest);
        let host = without_scheme.split('/').next().unwrap_or("");
        if host != "github.com" && host != "www.github.com" {
            return false;
        }

        is_repo_root_url(normalized)
    }

    /// Fetches the README of the repository named in `url`.
    ///
    /// `_cookies` and `_prefetched_html` are not used by this provider.
    ///
    /// # Errors
    ///
    /// Propagates any error from `extract_repo_readme`.
    async fn extract(
        &self,
        url: &str,
        client: &AcceleratedClient,
        _cookies: Option<&str>,
        _prefetched_html: Option<&[u8]>,
    ) -> Result<SiteContent> {
        extract_repo_readme(url, client).await
    }
}
/// Reports whether `normalized` addresses a repository root.
///
/// Only the text after the first `github.com/` is examined, and empty path
/// segments (doubled or trailing slashes) are skipped. The result is `true`
/// for exactly `{owner}/{repo}` and for `{owner}/{repo}/tree/...`.
///
/// Comparisons are case-sensitive and nothing is stripped here: the caller
/// (`GitHubProvider::matches`) passes a lowercased URL with the query removed.
fn is_repo_root_url(normalized: &str) -> bool {
    let path = match normalized.split_once("github.com/") {
        Some((_, rest)) => rest,
        None => return false,
    };
    let mut parts = path.split('/').filter(|part| !part.is_empty());

    // Both the owner and the repository name must be present.
    if parts.next().is_none() || parts.next().is_none() {
        return false;
    }

    match parts.next() {
        // Nothing after `{owner}/{repo}`: the repository root itself.
        None => true,
        // Known sub-pages are never a root. This arm is currently redundant
        // because only `tree` is accepted below and it is not in the list.
        Some(page) if DEEP_PATH_SEGMENTS.contains(&page) => false,
        // `tree/{branch}` is still a view of the repository.
        Some(page) => page == "tree",
    }
}
/// Extracts `(owner, repo)` from a GitHub URL.
///
/// The query string and the fragment are discarded first. The owner and the
/// repository are then the two `/`-separated parts that follow the first part
/// containing `github.com` (compared case-insensitively). Any further path
/// parts, such as `tree/main`, are ignored.
///
/// # Errors
///
/// Fails when no part contains `github.com`, or when the owner or repository
/// part is missing or empty.
fn parse_repo_url(url: &str) -> Result<(String, String)> {
    // Strip `#fragment` as well as `?query`: otherwise `.../repo#readme`
    // yields the repository name `repo#readme`, which corrupts the API URL
    // built from it.
    let path = url
        .split(|c: char| c == '?' || c == '#')
        .next()
        .unwrap_or(url);

    let mut parts = path.split('/');
    // Advance past the host part; the next two parts are owner and repo.
    parts
        .find(|part| part.to_lowercase().contains("github.com"))
        .context("URL does not contain github.com")?;

    let owner = parts
        .next()
        .filter(|part| !part.is_empty())
        .context("Could not extract owner from URL")?;
    let repo = parts
        .next()
        .filter(|part| !part.is_empty())
        .context("Could not extract repo from URL")?;

    Ok((owner.to_string(), repo.to_string()))
}
/// Builds the [`SiteContent`] for the repository that `url` points at.
///
/// The README text becomes the markdown body. The metadata carries
/// `{owner}/{repo}` as the title and `https://github.com/{owner}/{repo}` as
/// the canonical URL; all other optional fields are left empty.
///
/// # Errors
///
/// Fails when `url` cannot be parsed by `parse_repo_url` or when
/// `fetch_readme` fails.
async fn extract_repo_readme(url: &str, client: &AcceleratedClient) -> Result<SiteContent> {
    let (owner, repo) = parse_repo_url(url)?;
    let markdown = fetch_readme(client, &owner, &repo).await?;

    // `owner/repo` is both the title and the tail of the canonical URL.
    let slug = format!("{owner}/{repo}");
    let canonical_url = format!("https://github.com/{slug}");

    Ok(SiteContent {
        markdown,
        metadata: SiteMetadata {
            author: None,
            title: Some(slug),
            published: None,
            platform: "GitHub".to_string(),
            canonical_url,
            media_urls: Vec::new(),
            engagement: None,
        },
    })
}
/// Downloads the README of `owner/repo` through the GitHub REST API.
///
/// The request asks for the raw media type, so the response body is the
/// README text itself rather than a JSON envelope.
///
/// # Errors
///
/// Fails when the request cannot be sent, when the API answers with a
/// non-success status (for example 404 for a repository without a README, or
/// 403 when rate-limited), or when the body cannot be read.
async fn fetch_readme(client: &AcceleratedClient, owner: &str, repo: &str) -> Result<String> {
    let api_url = format!("https://api.github.com/repos/{owner}/{repo}/readme");
    tracing::debug!("Fetching README from GitHub: {}", api_url);

    let response = client
        .inner()
        .get(&api_url)
        .header("User-Agent", "nab/0.5.0")
        .header("Accept", "application/vnd.github.raw+json")
        .send()
        .await
        .context("Failed to fetch README from GitHub API")?
        // Without this check the JSON error body of a 4xx/5xx response would
        // be returned to the caller as if it were the README.
        .error_for_status()
        .with_context(|| format!("GitHub API returned an error status for {api_url}"))?;

    response
        .text()
        .await
        .context("Failed to read README response body")
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that the provider accepts every URL in `urls`.
    fn assert_all_match(urls: &[&str]) {
        for &url in urls {
            assert!(GitHubProvider.matches(url), "expected a match for {}", url);
        }
    }

    /// Asserts that the provider rejects every URL in `urls`.
    fn assert_none_match(urls: &[&str]) {
        for &url in urls {
            assert!(!GitHubProvider.matches(url), "expected no match for {}", url);
        }
    }

    #[test]
    fn does_not_match_github_issues_urls() {
        assert_none_match(&[
            "https://github.com/rust-lang/rust/issues/12345",
            "https://GITHUB.COM/owner/repo/ISSUES/999",
        ]);
    }

    #[test]
    fn does_not_match_github_pull_request_urls() {
        assert_none_match(&[
            "https://github.com/rust-lang/rust/pull/67890",
            "https://github.com/owner/repo/pull/1",
        ]);
    }

    #[test]
    fn matches_github_repo_root_urls() {
        assert_all_match(&[
            "https://github.com/rust-lang/rust",
            "https://github.com/owner/repo",
            "https://github.com/owner/repo/",
            "https://github.com/owner/repo/tree/main",
            "https://github.com/owner/repo/tree/feature/my-branch",
        ]);
    }

    #[test]
    fn does_not_match_non_repo_github_urls() {
        assert_none_match(&[
            "https://github.com/owner/repo/commits",
            "https://github.com/owner/repo/blob/main/src/lib.rs",
            "https://github.com/owner/repo/releases",
            "https://github.com/owner/repo/actions",
            "https://github.com/owner/repo/wiki",
            "https://youtube.com/watch?v=abc",
        ]);
    }

    #[test]
    fn does_not_match_github_user_profile_urls() {
        assert_none_match(&["https://github.com/rust-lang", "https://github.com/"]);
    }

    #[test]
    fn is_repo_root_url_recognises_owner_repo() {
        for &url in &["github.com/owner/repo", "github.com/owner/repo/"] {
            assert!(is_repo_root_url(url), "expected a repo root: {}", url);
        }
    }

    #[test]
    fn is_repo_root_url_accepts_tree_path() {
        for &url in &[
            "github.com/owner/repo/tree/main",
            "github.com/owner/repo/tree/v1.0.0",
        ] {
            assert!(is_repo_root_url(url), "expected a repo root: {}", url);
        }
    }

    #[test]
    fn is_repo_root_url_rejects_deep_segments() {
        for &url in &[
            "github.com/owner/repo/issues/1",
            "github.com/owner/repo/blob/main/Cargo.toml",
            "github.com/owner",
        ] {
            assert!(!is_repo_root_url(url), "expected no repo root: {}", url);
        }
    }

    #[test]
    fn parse_repo_url_extracts_owner_and_repo() {
        let parsed = parse_repo_url("https://github.com/rust-lang/rust").unwrap();
        assert_eq!(parsed.0, "rust-lang");
        assert_eq!(parsed.1, "rust");
    }

    #[test]
    fn parse_repo_url_strips_query_and_tree_suffix() {
        let parsed =
            parse_repo_url("https://github.com/owner/my-repo/tree/main?tab=readme").unwrap();
        assert_eq!(parsed.0, "owner");
        assert_eq!(parsed.1, "my-repo");
    }
}