use reqwest::blocking::{Client, Response};
use std::time::Duration;
/// Overall request timeout, in seconds, applied to the HTTP client built by `http_client`.
const DEFAULT_TIMEOUT_SECS: u64 = 90;
/// Maximum number of characters (not bytes) of a response body that `read_response_text` keeps.
const MAX_BODY_CHARS: usize = 240_000;
/// Identifies which fetch strategy produced a [`FetchedUrlBody`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum GithubFetchKind {
/// The body came from the `api.github.com/repos/{owner}/{repo}/readme` endpoint.
ReadmeApi,
/// The body came from a `raw.githubusercontent.com` file URL.
RawUserContent,
/// The body came from a plain GET on the original URL (non-GitHub URL or fallback).
PlainHttp,
}
/// Result of `fetch_url_for_ingest`: the downloaded text plus how and from where it was obtained.
#[derive(Debug, Clone)]
pub struct FetchedUrlBody {
/// Response body text, truncated to at most `MAX_BODY_CHARS` characters.
pub text: String,
/// Which fetch strategy produced `text`.
pub kind: GithubFetchKind,
/// The URL that was requested: the rewritten GitHub URL, or the original URL for plain HTTP.
pub resolved_url: String,
}
/// Builds the blocking HTTP client used for every request in this module.
///
/// The client carries a `DEFAULT_TIMEOUT_SECS` timeout and a
/// `Kowalski/<crate version>` user agent.
///
/// # Errors
///
/// Returns the stringified builder error if the client cannot be constructed.
fn http_client() -> Result<Client, String> {
    let timeout = Duration::from_secs(DEFAULT_TIMEOUT_SECS);
    let agent = concat!("Kowalski/", env!("CARGO_PKG_VERSION"));
    match Client::builder().timeout(timeout).user_agent(agent).build() {
        Ok(client) => Ok(client),
        Err(err) => Err(err.to_string()),
    }
}
/// Reads the `GITHUB_TOKEN` environment variable.
///
/// Returns `None` when the variable is unset, is not valid Unicode, or holds
/// only whitespace. The value is returned as-is (not trimmed).
fn github_token() -> Option<String> {
    match std::env::var("GITHUB_TOKEN") {
        Ok(value) if !value.trim().is_empty() => Some(value),
        _ => None,
    }
}
/// Adds an `Authorization: Bearer <token>` header to `req` when `github_token`
/// yields a token; otherwise returns `req` untouched.
///
/// The token is trimmed of surrounding whitespace before being placed in the header.
fn apply_github_auth(req: reqwest::blocking::RequestBuilder) -> reqwest::blocking::RequestBuilder {
    match github_token() {
        Some(token) => req.header("Authorization", format!("Bearer {}", token.trim())),
        None => req,
    }
}
/// Extracts the path portion of a `github.com` URL (everything after the host).
///
/// Surrounding whitespace, any query string (`?...`), any fragment (`#...`) and
/// trailing slashes are removed before matching, so
/// `https://github.com/o/r/?tab=readme#top` yields `o/r`.
///
/// Returns `None` when `url` does not start with one of the recognised
/// `http(s)://[www.]github.com/` prefixes.
fn github_tail(url: &str) -> Option<String> {
    const PREFIXES: [&str; 4] = [
        "https://github.com/",
        "http://github.com/",
        "https://www.github.com/",
        "http://www.github.com/",
    ];
    let trimmed = url.trim();
    // Query strings and fragments are not part of the owner/repo/path segments;
    // leaving them in would leak into the repo name or file path of the
    // rewritten API/raw URL.
    let without_suffix = trimmed
        .split(|c: char| c == '?' || c == '#')
        .next()
        .unwrap_or(trimmed);
    let u = without_suffix.trim_end_matches('/');
    PREFIXES
        .iter()
        .find_map(|prefix| u.strip_prefix(*prefix))
        .map(str::to_owned)
}
/// Builds the GitHub REST API URL for the README of `owner/repo`.
fn readme_api_url(owner: &str, repo: &str) -> String {
    ["https://api.github.com/repos", owner, repo, "readme"].join("/")
}
/// Builds the `raw.githubusercontent.com` URL for `path` at `git_ref` in `owner/repo`.
fn blob_url_to_raw(owner: &str, repo: &str, git_ref: &str, path: &str) -> String {
    format!(
        "https://raw.githubusercontent.com/{}/{}/{}/{}",
        owner, repo, git_ref, path
    )
}
/// Maps a `github.com` page URL to a directly fetchable GitHub resource.
///
/// * `github.com/{owner}/{repo}` resolves to [`ResolvedGithub::Readme`].
/// * `github.com/{owner}/{repo}/blob/{ref}/{path...}` and the equivalent
///   `/raw/` form resolve to [`ResolvedGithub::RawFile`]; the first segment
///   after `blob`/`raw` is taken as the ref and the remainder as the path.
///
/// Returns `None` for non-GitHub URLs and for any other GitHub URL shape.
pub fn resolve_github_fetch(url: &str) -> Option<ResolvedGithub> {
    let tail = github_tail(url)?;
    // Empty segments (from doubled slashes) are ignored.
    let parts: Vec<&str> = tail.split('/').filter(|part| !part.is_empty()).collect();
    match parts.as_slice() {
        [owner, repo] => Some(ResolvedGithub::Readme {
            owner: (*owner).to_string(),
            repo: (*repo).to_string(),
        }),
        // At least one path segment must follow the ref.
        [owner, repo, "blob" | "raw", git_ref, file_parts @ ..] if !file_parts.is_empty() => {
            Some(ResolvedGithub::RawFile {
                owner: (*owner).to_string(),
                repo: (*repo).to_string(),
                git_ref: (*git_ref).to_string(),
                path: file_parts.join("/"),
            })
        }
        _ => None,
    }
}
/// A GitHub page URL rewritten into a resource that can be fetched directly.
#[derive(Debug, Clone)]
pub enum ResolvedGithub {
/// The README of `owner/repo`, fetched through the GitHub REST API.
Readme { owner: String, repo: String },
/// A single file, fetched from `raw.githubusercontent.com`.
RawFile {
owner: String,
repo: String,
// First URL segment after `blob`/`raw`.
git_ref: String,
// Remaining URL segments joined with `/`.
path: String,
},
}
impl ResolvedGithub {
    /// Sends the GET request for this resource using `client`.
    ///
    /// README requests ask for the raw representation via the
    /// `application/vnd.github.raw+json` media type. Both variants go through
    /// `apply_github_auth`.
    ///
    /// # Errors
    ///
    /// Returns the stringified transport error if the request cannot be sent.
    /// Non-success HTTP statuses are *not* treated as errors here.
    fn fetch(&self, client: &Client) -> Result<Response, String> {
        let target = self.resolved_url_display();
        let request = match self {
            ResolvedGithub::Readme { .. } => client
                .get(&target)
                .header("Accept", "application/vnd.github.raw+json"),
            ResolvedGithub::RawFile { .. } => client.get(&target),
        };
        apply_github_auth(request)
            .send()
            .map_err(|err| err.to_string())
    }

    /// Returns the URL that [`fetch`](Self::fetch) requests for this resource.
    fn resolved_url_display(&self) -> String {
        match self {
            ResolvedGithub::RawFile {
                owner,
                repo,
                git_ref,
                path,
            } => blob_url_to_raw(owner, repo, git_ref, path),
            ResolvedGithub::Readme { owner, repo } => readme_api_url(owner, repo),
        }
    }

    /// Returns the [`GithubFetchKind`] tag corresponding to this variant.
    fn fetch_kind(&self) -> GithubFetchKind {
        match self {
            ResolvedGithub::RawFile { .. } => GithubFetchKind::RawUserContent,
            ResolvedGithub::Readme { .. } => GithubFetchKind::ReadmeApi,
        }
    }
}
/// Converts a response into its body text, capped at `MAX_BODY_CHARS` characters.
///
/// # Errors
///
/// * `"HTTP <code> <reason>"` when the status is not a success status.
/// * The stringified reqwest error when the body cannot be read or decoded.
fn read_response_text(resp: Response) -> Result<String, String> {
    let status = resp.status();
    if !status.is_success() {
        return Err(format!(
            "HTTP {} {}",
            status.as_u16(),
            status.canonical_reason().unwrap_or("")
        ));
    }
    // NOTE(review): the whole body is read before truncation, so the cap
    // limits the returned text, not the amount downloaded.
    let body = resp.text().map_err(|err| err.to_string())?;
    Ok(body.chars().take(MAX_BODY_CHARS).collect())
}
pub fn fetch_url_for_ingest(original_url: &str) -> Result<FetchedUrlBody, String> {
let client = http_client()?;
if let Some(resolved) = resolve_github_fetch(original_url) {
let kind = resolved.fetch_kind();
let resolved_url = resolved.resolved_url_display();
if let Ok(resp) = resolved.fetch(&client) {
if let Ok(text) = read_response_text(resp) {
return Ok(FetchedUrlBody {
text,
kind,
resolved_url,
});
}
}
}
let resp = client.get(original_url).send().map_err(|e| e.to_string())?;
let text = read_response_text(resp)?;
Ok(FetchedUrlBody {
text,
kind: GithubFetchKind::PlainHttp,
resolved_url: original_url.to_string(),
})
}
// Unit tests for URL resolution; none of them perform network I/O.
#[cfg(test)]
mod tests {
use super::*;
// A bare `owner/repo` URL resolves to the README variant.
#[test]
fn readme_repo_only() {
let r = resolve_github_fetch("https://github.com/octocat/Hello-World").unwrap();
match r {
ResolvedGithub::Readme { owner, repo } => {
assert_eq!(owner, "octocat");
assert_eq!(repo, "Hello-World");
}
_ => panic!("expected Readme"),
}
}
// A trailing slash after the repo name does not prevent resolution.
#[test]
fn readme_trailing_slash() {
assert!(resolve_github_fetch("https://github.com/foo/bar/").is_some());
}
// A `/blob/{ref}/{file}` URL resolves to the raw-file variant with all parts split out.
#[test]
fn blob_to_raw_resolution() {
let r = resolve_github_fetch(
"https://github.com/rust-lang/rust/blob/master/README.md",
)
.unwrap();
match r {
ResolvedGithub::RawFile {
owner,
repo,
git_ref,
path,
} => {
assert_eq!(owner, "rust-lang");
assert_eq!(repo, "rust");
assert_eq!(git_ref, "master");
assert_eq!(path, "README.md");
}
_ => panic!("expected RawFile"),
}
}
// Path segments after the ref are re-joined with `/`.
#[test]
fn nested_blob_path() {
let r = resolve_github_fetch(
"https://github.com/o/r/blob/main/docs/guide.md",
)
.unwrap();
match r {
ResolvedGithub::RawFile { path, .. } => assert_eq!(path, "docs/guide.md"),
_ => panic!("expected RawFile"),
}
}
// URLs on other hosts are not resolved.
#[test]
fn non_github_returns_none() {
assert!(resolve_github_fetch("https://example.com/page").is_none());
}
// The display URL of a raw-file resolution points at raw.githubusercontent.com.
#[test]
fn raw_display_url() {
let r = resolve_github_fetch(
"https://github.com/a/b/blob/v1.0/Cargo.toml",
)
.unwrap();
assert_eq!(
r.resolved_url_display(),
"https://raw.githubusercontent.com/a/b/v1.0/Cargo.toml"
);
}
}