1use crate::client::FetchOptions;
7use crate::error::FetchError;
8use crate::fetchers::default::{read_full_body, transport_request};
9use crate::fetchers::Fetcher;
10use crate::types::{FetchRequest, FetchResponse};
11use crate::DEFAULT_USER_AGENT;
12use async_trait::async_trait;
13use reqwest::header::{HeaderMap, HeaderValue, ACCEPT, USER_AGENT};
14use serde::Deserialize;
15use std::time::Duration;
16use url::Url;
17
/// Timeout value handed to `transport_request` for the GitHub API call.
const API_TIMEOUT: Duration = Duration::from_secs(10);

/// Host name of the GitHub REST API; passed to `transport_request` together with the port below.
// NOTE(review): presumably the host/port the transport is expected to connect to — confirm
// against `transport_request`.
const GITHUB_API_HOST: &str = "api.github.com";
/// Port passed to `transport_request` alongside [`GITHUB_API_HOST`] (443, the standard HTTPS port).
const GITHUB_API_PORT: u16 = 443;

/// Largest reported file size, in bytes (1 MiB), whose content is rendered inline.
/// Files reporting a larger size get a metadata-only response.
const MAX_INLINE_SIZE: u64 = 1_048_576;
26
/// Fetcher for `https://github.com/{owner}/{repo}/blob/{ref}/{path}` URLs.
///
/// Instead of downloading the HTML page, it asks the GitHub contents API for
/// the file and renders the result as Markdown (file info plus a fenced code
/// block with the file's text when it is small enough and valid UTF-8).
///
/// The type is a stateless unit struct, so it is cheap to copy and safe to
/// print in diagnostics.
#[derive(Debug, Clone, Copy)]
pub struct GitHubCodeFetcher;
32
33impl GitHubCodeFetcher {
34 pub fn new() -> Self {
35 Self
36 }
37
38 fn parse_url(url: &Url) -> Option<ParsedBlobUrl> {
40 if url.host_str() != Some("github.com") {
41 return None;
42 }
43
44 let segments: Vec<&str> = url.path_segments().map(|s| s.collect()).unwrap_or_default();
45
46 if segments.len() < 5 {
48 return None;
49 }
50
51 let owner = segments[0];
52 let repo = segments[1];
53 let kind = segments[2];
54 let git_ref = segments[3];
55
56 if owner.is_empty() || repo.is_empty() || git_ref.is_empty() {
57 return None;
58 }
59
60 if kind != "blob" {
61 return None;
62 }
63
64 let reserved = [
66 "settings",
67 "explore",
68 "trending",
69 "collections",
70 "events",
71 "sponsors",
72 "notifications",
73 "marketplace",
74 "pulls",
75 "issues",
76 "codespaces",
77 "features",
78 "enterprise",
79 "organizations",
80 "pricing",
81 "about",
82 "team",
83 "security",
84 "login",
85 "join",
86 ];
87 if reserved.contains(&owner) {
88 return None;
89 }
90
91 let file_path = segments[4..].join("/");
93 if file_path.is_empty() {
94 return None;
95 }
96
97 Some(ParsedBlobUrl {
98 owner: owner.to_string(),
99 repo: repo.to_string(),
100 git_ref: git_ref.to_string(),
101 path: file_path,
102 })
103 }
104}
105
106impl Default for GitHubCodeFetcher {
107 fn default() -> Self {
108 Self::new()
109 }
110}
111
/// Components extracted from a GitHub blob URL by `GitHubCodeFetcher::parse_url`.
///
/// All values are stored exactly as they appear in the URL's path segments.
#[derive(Debug, Clone, PartialEq, Eq)]
struct ParsedBlobUrl {
    /// Account (user or organization) segment of the URL.
    owner: String,
    /// Repository name segment.
    repo: String,
    /// Single path segment following `blob` (branch, tag or commit).
    git_ref: String,
    /// Remaining path segments joined with `/`.
    path: String,
}
118
/// Subset of the JSON object returned by the GitHub contents API that this
/// fetcher deserializes and uses.
#[derive(Debug, Deserialize)]
struct GitHubContents {
    /// File name; used for language detection from its extension.
    name: String,
    /// Path as reported by the API; used as the heading of the rendered output.
    path: String,
    /// Size reported by the API, rendered as a byte count and compared against `MAX_INLINE_SIZE`.
    size: u64,
    /// The JSON `type` field; anything other than `"file"` is reported as an error.
    #[serde(rename = "type")]
    content_type: String,
    /// Base64-encoded file body (may contain whitespace); absent means metadata-only output.
    content: Option<String>,
    /// Link emitted as the `URL` line of the file info, when present.
    html_url: Option<String>,
}
129
130#[async_trait]
131impl Fetcher for GitHubCodeFetcher {
132 fn name(&self) -> &'static str {
133 "github_code"
134 }
135
136 fn matches(&self, url: &Url) -> bool {
137 Self::parse_url(url).is_some()
138 }
139
140 async fn fetch(
141 &self,
142 request: &FetchRequest,
143 options: &FetchOptions,
144 ) -> Result<FetchResponse, FetchError> {
145 let url = Url::parse(&request.url).map_err(|_| FetchError::InvalidUrlScheme)?;
146
147 let parsed = Self::parse_url(&url)
148 .ok_or_else(|| FetchError::FetcherError("Not a valid GitHub blob URL".to_string()))?;
149
150 let user_agent = options.user_agent.as_deref().unwrap_or(DEFAULT_USER_AGENT);
151 let ua_header = HeaderValue::from_str(user_agent)
152 .unwrap_or_else(|_| HeaderValue::from_static(DEFAULT_USER_AGENT));
153 let accept_header = HeaderValue::from_static("application/vnd.github+json");
154
155 let api_url = format!(
157 "https://api.github.com/repos/{}/{}/contents/{}?ref={}",
158 parsed.owner, parsed.repo, parsed.path, parsed.git_ref
159 );
160 let parsed_api_url = Url::parse(&api_url).map_err(|_| FetchError::InvalidUrlScheme)?;
161 options.validate_url(&parsed_api_url)?;
163
164 let mut headers = HeaderMap::new();
165 headers.insert(USER_AGENT, ua_header);
166 headers.insert(ACCEPT, accept_header);
167
168 let response = transport_request(
170 parsed_api_url,
171 reqwest::Method::GET,
172 headers,
173 options,
174 API_TIMEOUT,
175 GITHUB_API_HOST,
176 GITHUB_API_PORT,
177 )
178 .await?;
179
180 let status_code = response.status;
181 if !(200..300).contains(&status_code) {
182 let error_msg = if status_code == 404 {
183 format!(
184 "{}/{}:{} {} not found",
185 parsed.owner, parsed.repo, parsed.git_ref, parsed.path
186 )
187 } else if status_code == 403 {
188 "GitHub API rate limit exceeded".to_string()
189 } else {
190 format!("GitHub API error: HTTP {}", status_code)
191 };
192 return Ok(FetchResponse {
193 url: request.url.clone(),
194 status_code,
195 error: Some(error_msg),
196 ..Default::default()
197 });
198 }
199
200 let body = read_full_body(response, options).await?;
201 let contents: GitHubContents = serde_json::from_slice(&body)
202 .map_err(|e| FetchError::FetcherError(format!("Failed to parse contents: {}", e)))?;
203
204 if contents.content_type != "file" {
206 return Ok(FetchResponse {
207 url: request.url.clone(),
208 status_code: 200,
209 format: Some("github_file".to_string()),
210 error: Some(format!("Path is a {} (not a file)", contents.content_type)),
211 ..Default::default()
212 });
213 }
214
215 if contents.size > MAX_INLINE_SIZE || contents.content.is_none() {
217 let content = format_metadata_only(&parsed, &contents);
218 return Ok(FetchResponse {
219 url: request.url.clone(),
220 status_code: 200,
221 content_type: Some("text/markdown".to_string()),
222 format: Some("github_file".to_string()),
223 content: Some(content),
224 size: Some(contents.size),
225 ..Default::default()
226 });
227 }
228
229 let raw_content = contents.content.as_deref().and_then(decode_base64_content);
231
232 let (file_content, is_binary) = match raw_content {
233 Some(bytes) => match String::from_utf8(bytes) {
234 Ok(text) => (Some(text), false),
235 Err(_) => (None, true),
236 },
237 None => (None, true),
238 };
239
240 if is_binary {
241 let content = format_metadata_only(&parsed, &contents);
242 return Ok(FetchResponse {
243 url: request.url.clone(),
244 status_code: 200,
245 content_type: Some("text/markdown".to_string()),
246 format: Some("github_file".to_string()),
247 content: Some(content),
248 size: Some(contents.size),
249 error: Some("Binary file — metadata only".to_string()),
250 ..Default::default()
251 });
252 }
253
254 let lang = detect_language(&contents.name);
255 let content = format_file_response(&parsed, &contents, file_content.as_deref(), lang);
256
257 Ok(FetchResponse {
258 url: request.url.clone(),
259 status_code: 200,
260 content_type: Some("text/markdown".to_string()),
261 format: Some("github_file".to_string()),
262 content: Some(content),
263 size: Some(contents.size),
264 ..Default::default()
265 })
266 }
267}
268
269fn decode_base64_content(encoded: &str) -> Option<Vec<u8>> {
271 let cleaned: String = encoded.chars().filter(|c| !c.is_whitespace()).collect();
272 base64_decode(&cleaned)
273}
274
/// Decodes standard-alphabet base64 (`A-Z a-z 0-9 + /` with `=` padding).
///
/// The input must already be free of whitespace. An empty input decodes to an
/// empty vector.
///
/// Returns `None` when:
/// - the length is not a multiple of four,
/// - a byte outside the alphabet is present, or
/// - `=` appears anywhere other than as `xx==` / `xxx=` at the very end.
fn base64_decode(input: &str) -> Option<Vec<u8>> {
    /// Maps one alphabet byte to its 6-bit value. `=` is deliberately not
    /// accepted here; padding is handled by the caller.
    fn sextet(c: u8) -> Option<u8> {
        match c {
            b'A'..=b'Z' => Some(c - b'A'),
            b'a'..=b'z' => Some(c - b'a' + 26),
            b'0'..=b'9' => Some(c - b'0' + 52),
            b'+' => Some(62),
            b'/' => Some(63),
            _ => None,
        }
    }

    let bytes = input.as_bytes();
    let quads = bytes.chunks_exact(4);
    if !quads.remainder().is_empty() {
        return None;
    }

    let quad_count = bytes.len() / 4;
    let mut result = Vec::with_capacity(quad_count * 3);

    for (index, quad) in quads.enumerate() {
        // Number of data characters in this quad. A stray `=` in any other
        // position is left to `sextet`, which rejects it.
        let data_len = match (quad[2], quad[3]) {
            (b'=', b'=') => 2,
            (_, b'=') => 3,
            _ => 4,
        };

        // Padding is only legal in the final quad.
        if data_len < 4 && index + 1 != quad_count {
            return None;
        }

        let a = sextet(quad[0])?;
        let b = sextet(quad[1])?;
        // Shifts intentionally drop the high bits that belong to the previous byte.
        result.push((a << 2) | (b >> 4));

        if data_len >= 3 {
            let c = sextet(quad[2])?;
            result.push((b << 4) | (c >> 2));

            if data_len == 4 {
                let d = sextet(quad[3])?;
                result.push((c << 6) | d);
            }
        }
    }

    Some(result)
}
313
/// Guesses a Markdown code-fence language tag from a file name.
///
/// The text after the last `.` is matched case-insensitively against a table
/// of known extensions. A name without any `.` has no extension and yields
/// `None`, except for the conventional bare file name `Dockerfile`
/// (case-insensitive), which maps to `"dockerfile"`.
///
/// Returns `None` for unknown extensions.
fn detect_language(filename: &str) -> Option<&'static str> {
    let ext = match filename.rsplit_once('.') {
        Some((_, ext)) => ext,
        // No dot: the whole name must not be mistaken for an extension
        // (a file called `c`, `go` or `sh` says nothing about its language).
        None => {
            return filename
                .eq_ignore_ascii_case("dockerfile")
                .then_some("dockerfile");
        }
    };

    match ext.to_ascii_lowercase().as_str() {
        "rs" => Some("rust"),
        "py" => Some("python"),
        "js" => Some("javascript"),
        "ts" => Some("typescript"),
        "tsx" => Some("tsx"),
        "jsx" => Some("jsx"),
        "rb" => Some("ruby"),
        "go" => Some("go"),
        "java" => Some("java"),
        "kt" | "kts" => Some("kotlin"),
        "swift" => Some("swift"),
        "c" => Some("c"),
        "cpp" | "cc" | "cxx" => Some("cpp"),
        "h" | "hpp" => Some("cpp"),
        "cs" => Some("csharp"),
        "php" => Some("php"),
        "sh" | "bash" => Some("bash"),
        "zsh" => Some("zsh"),
        "fish" => Some("fish"),
        "yml" | "yaml" => Some("yaml"),
        "json" => Some("json"),
        "toml" => Some("toml"),
        "xml" => Some("xml"),
        "html" | "htm" => Some("html"),
        "css" => Some("css"),
        "scss" | "sass" => Some("scss"),
        "sql" => Some("sql"),
        "md" | "markdown" => Some("markdown"),
        "dockerfile" => Some("dockerfile"),
        "tf" => Some("terraform"),
        "ex" | "exs" => Some("elixir"),
        "erl" => Some("erlang"),
        "hs" => Some("haskell"),
        "ml" | "mli" => Some("ocaml"),
        "r" => Some("r"),
        "scala" => Some("scala"),
        "lua" => Some("lua"),
        "zig" => Some("zig"),
        "nim" => Some("nim"),
        "v" => Some("v"),
        "dart" => Some("dart"),
        "proto" => Some("protobuf"),
        "graphql" | "gql" => Some("graphql"),
        _ => None,
    }
}
364
365fn format_metadata_only(parsed: &ParsedBlobUrl, contents: &GitHubContents) -> String {
366 let lang = detect_language(&contents.name);
367 let mut out = String::new();
368 out.push_str(&format!("# {}\n\n", contents.path));
369 out.push_str("## File Info\n\n");
370 out.push_str(&format!(
371 "- **Repository:** {}/{}\n",
372 parsed.owner, parsed.repo
373 ));
374 out.push_str(&format!("- **Ref:** {}\n", parsed.git_ref));
375 out.push_str(&format!("- **Size:** {} bytes\n", contents.size));
376 if let Some(lang) = lang {
377 out.push_str(&format!("- **Language:** {}\n", lang));
378 }
379 if let Some(url) = &contents.html_url {
380 out.push_str(&format!("- **URL:** {}\n", url));
381 }
382 out
383}
384
385fn format_file_response(
386 parsed: &ParsedBlobUrl,
387 contents: &GitHubContents,
388 file_content: Option<&str>,
389 lang: Option<&str>,
390) -> String {
391 let mut out = String::new();
392
393 out.push_str(&format!("# {}\n\n", contents.path));
394 out.push_str("## File Info\n\n");
395 out.push_str(&format!(
396 "- **Repository:** {}/{}\n",
397 parsed.owner, parsed.repo
398 ));
399 out.push_str(&format!("- **Ref:** {}\n", parsed.git_ref));
400 out.push_str(&format!("- **Size:** {} bytes\n", contents.size));
401 if let Some(lang) = lang {
402 out.push_str(&format!("- **Language:** {}\n", lang));
403 }
404 if let Some(url) = &contents.html_url {
405 out.push_str(&format!("- **URL:** {}\n", url));
406 }
407
408 if let Some(content) = file_content {
409 let lang_hint = lang.unwrap_or("");
410 out.push_str(&format!(
411 "\n## Content\n\n```{}\n{}\n```\n",
412 lang_hint, content
413 ));
414 }
415
416 out
417}
418
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `raw` as a URL and runs it through the fetcher's blob-URL parser.
    fn parse(raw: &str) -> Option<ParsedBlobUrl> {
        let url = Url::parse(raw).expect("test URL must be syntactically valid");
        GitHubCodeFetcher::parse_url(&url)
    }

    #[test]
    fn test_parse_blob_url() {
        let blob = parse("https://github.com/owner/repo/blob/main/src/lib.rs")
            .expect("blob URL should be accepted");
        assert_eq!(blob.owner, "owner");
        assert_eq!(blob.repo, "repo");
        assert_eq!(blob.git_ref, "main");
        assert_eq!(blob.path, "src/lib.rs");
    }

    #[test]
    fn test_parse_blob_url_nested_path() {
        let blob = parse("https://github.com/owner/repo/blob/v1.0.0/crates/core/src/main.rs")
            .expect("nested blob URL should be accepted");
        assert_eq!(blob.git_ref, "v1.0.0");
        assert_eq!(blob.path, "crates/core/src/main.rs");
    }

    #[test]
    fn test_rejects_non_blob() {
        assert!(parse("https://github.com/owner/repo/tree/main/src").is_none());
    }

    #[test]
    fn test_rejects_too_few_segments() {
        assert!(parse("https://github.com/owner/repo/blob/main").is_none());
    }

    #[test]
    fn test_rejects_non_github() {
        assert!(parse("https://gitlab.com/owner/repo/blob/main/file.rs").is_none());
    }

    #[test]
    fn test_rejects_reserved_owner() {
        assert!(parse("https://github.com/settings/repo/blob/main/file.rs").is_none());
    }

    #[test]
    fn test_fetcher_matches() {
        let fetcher = GitHubCodeFetcher::new();
        let cases = [
            ("https://github.com/rust-lang/rust/blob/master/Cargo.toml", true),
            ("https://github.com/rust-lang/rust", false),
            ("https://github.com/rust-lang/rust/issues/1", false),
        ];

        for (raw, expected) in cases {
            let url = Url::parse(raw).unwrap();
            assert_eq!(fetcher.matches(&url), expected, "unexpected match result for {}", raw);
        }
    }

    #[test]
    fn test_detect_language() {
        let cases = [
            ("main.rs", Some("rust")),
            ("app.py", Some("python")),
            ("index.tsx", Some("tsx")),
            ("Cargo.toml", Some("toml")),
            ("unknown.xyz", None),
            ("Dockerfile", Some("dockerfile")),
        ];

        for (filename, expected) in cases {
            assert_eq!(detect_language(filename), expected, "wrong language for {}", filename);
        }
    }

    #[test]
    fn test_format_file_response() {
        let location = ParsedBlobUrl {
            owner: "owner".to_string(),
            repo: "repo".to_string(),
            git_ref: "main".to_string(),
            path: "src/lib.rs".to_string(),
        };
        let metadata = GitHubContents {
            name: "lib.rs".to_string(),
            path: "src/lib.rs".to_string(),
            size: 42,
            content_type: "file".to_string(),
            content: None,
            html_url: Some("https://github.com/owner/repo/blob/main/src/lib.rs".to_string()),
        };

        let rendered =
            format_file_response(&location, &metadata, Some("fn main() {}"), Some("rust"));

        let expected_fragments = [
            "# src/lib.rs",
            "**Repository:** owner/repo",
            "**Language:** rust",
            "```rust\nfn main() {}\n```",
        ];
        for fragment in expected_fragments {
            assert!(rendered.contains(fragment));
        }
    }

    #[test]
    fn test_base64_decode() {
        // Valid padded input, the empty string, and a length that is not a multiple of four.
        assert_eq!(base64_decode("SGVsbG8="), Some(b"Hello".to_vec()));
        assert_eq!(base64_decode(""), Some(Vec::new()));
        assert_eq!(base64_decode("abc"), None);
    }
}