Skip to main content

fetchkit/fetchers/
github_code.rs

1//! GitHub source file fetcher
2//!
3//! Handles GitHub blob URLs, returning raw source file content with language
4//! metadata via the GitHub API, optimized for LLM consumption.
5
6use crate::client::FetchOptions;
7use crate::error::FetchError;
8use crate::fetchers::default::{read_full_body, transport_request};
9use crate::fetchers::Fetcher;
10use crate::types::{FetchRequest, FetchResponse};
11use crate::DEFAULT_USER_AGENT;
12use async_trait::async_trait;
13use reqwest::header::{HeaderMap, HeaderValue, ACCEPT, USER_AGENT};
14use serde::Deserialize;
15use std::time::Duration;
16use url::Url;
17
/// Timeout handed to `transport_request` for the GitHub API call.
const API_TIMEOUT: Duration = Duration::from_secs(10);

/// GitHub API host and port (DNS-pinned per request).
const GITHUB_API_HOST: &str = "api.github.com";
/// HTTPS port, passed to `transport_request` together with the host above.
const GITHUB_API_PORT: u16 = 443;

/// Max file size we'll return inline (1 MiB = 1,048,576 bytes, matching GitHub contents API limit)
const MAX_INLINE_SIZE: u64 = 1_048_576;
26
/// GitHub source file fetcher
///
/// Matches `https://github.com/{owner}/{repo}/blob/{ref}/{path}` and returns
/// raw file content with language metadata.
///
/// The type is a stateless unit struct, so it derives `Debug`, `Clone` and
/// `Copy` to be printable in diagnostics and trivially duplicable.
#[derive(Debug, Clone, Copy)]
pub struct GitHubCodeFetcher;
32
33impl GitHubCodeFetcher {
34    pub fn new() -> Self {
35        Self
36    }
37
38    /// Extract owner, repo, ref, and path from a GitHub blob URL
39    fn parse_url(url: &Url) -> Option<ParsedBlobUrl> {
40        if url.host_str() != Some("github.com") {
41            return None;
42        }
43
44        let segments: Vec<&str> = url.path_segments().map(|s| s.collect()).unwrap_or_default();
45
46        // Minimum: /{owner}/{repo}/blob/{ref}/{path} = 5+ segments
47        if segments.len() < 5 {
48            return None;
49        }
50
51        let owner = segments[0];
52        let repo = segments[1];
53        let kind = segments[2];
54        let git_ref = segments[3];
55
56        if owner.is_empty() || repo.is_empty() || git_ref.is_empty() {
57            return None;
58        }
59
60        if kind != "blob" {
61            return None;
62        }
63
64        // Exclude reserved owner paths
65        let reserved = [
66            "settings",
67            "explore",
68            "trending",
69            "collections",
70            "events",
71            "sponsors",
72            "notifications",
73            "marketplace",
74            "pulls",
75            "issues",
76            "codespaces",
77            "features",
78            "enterprise",
79            "organizations",
80            "pricing",
81            "about",
82            "team",
83            "security",
84            "login",
85            "join",
86        ];
87        if reserved.contains(&owner) {
88            return None;
89        }
90
91        // Path is everything after the ref
92        let file_path = segments[4..].join("/");
93        if file_path.is_empty() {
94            return None;
95        }
96
97        Some(ParsedBlobUrl {
98            owner: owner.to_string(),
99            repo: repo.to_string(),
100            git_ref: git_ref.to_string(),
101            path: file_path,
102        })
103    }
104}
105
106impl Default for GitHubCodeFetcher {
107    fn default() -> Self {
108        Self::new()
109    }
110}
111
/// Components extracted from a GitHub blob URL of the form
/// `https://github.com/{owner}/{repo}/blob/{ref}/{path}`.
#[derive(Debug, Clone, PartialEq, Eq)]
struct ParsedBlobUrl {
    /// Repository owner (first path segment).
    owner: String,
    /// Repository name (second path segment).
    repo: String,
    /// Git ref (fourth path segment, right after `blob`).
    git_ref: String,
    /// File path inside the repository: the remaining segments joined with `/`.
    path: String,
}
118
/// Subset of the JSON object returned by the GitHub contents API request
/// that this fetcher deserializes.
#[derive(Debug, Deserialize)]
struct GitHubContents {
    /// File name; used for extension-based language detection.
    name: String,
    /// Path as reported by the API; used as the heading of the formatted output.
    path: String,
    /// Size reported by the API, printed as bytes and compared against `MAX_INLINE_SIZE`.
    size: u64,
    /// JSON field `type`; anything other than `"file"` is reported as "not a file".
    #[serde(rename = "type")]
    content_type: String,
    /// Encoded file body, if present; the fetcher base64-decodes it after stripping whitespace.
    content: Option<String>,
    /// Link printed as the "URL" entry in the output, when present.
    html_url: Option<String>,
}
129
130#[async_trait]
131impl Fetcher for GitHubCodeFetcher {
132    fn name(&self) -> &'static str {
133        "github_code"
134    }
135
136    fn matches(&self, url: &Url) -> bool {
137        Self::parse_url(url).is_some()
138    }
139
140    async fn fetch(
141        &self,
142        request: &FetchRequest,
143        options: &FetchOptions,
144    ) -> Result<FetchResponse, FetchError> {
145        let url = Url::parse(&request.url).map_err(|_| FetchError::InvalidUrlScheme)?;
146
147        let parsed = Self::parse_url(&url)
148            .ok_or_else(|| FetchError::FetcherError("Not a valid GitHub blob URL".to_string()))?;
149
150        let user_agent = options.user_agent.as_deref().unwrap_or(DEFAULT_USER_AGENT);
151        let ua_header = HeaderValue::from_str(user_agent)
152            .unwrap_or_else(|_| HeaderValue::from_static(DEFAULT_USER_AGENT));
153        let accept_header = HeaderValue::from_static("application/vnd.github+json");
154
155        // Fetch file contents via GitHub API
156        let api_url = format!(
157            "https://api.github.com/repos/{}/{}/contents/{}?ref={}",
158            parsed.owner, parsed.repo, parsed.path, parsed.git_ref
159        );
160        let parsed_api_url = Url::parse(&api_url).map_err(|_| FetchError::InvalidUrlScheme)?;
161        // THREAT[TM-INPUT]: enforce host/port policy on the GitHub API subrequest (PR #131).
162        options.validate_url(&parsed_api_url)?;
163
164        let mut headers = HeaderMap::new();
165        headers.insert(USER_AGENT, ua_header);
166        headers.insert(ACCEPT, accept_header);
167
168        // THREAT[TM-SSRF-010]: single-hop request, redirects not followed.
169        let response = transport_request(
170            parsed_api_url,
171            reqwest::Method::GET,
172            headers,
173            options,
174            API_TIMEOUT,
175            GITHUB_API_HOST,
176            GITHUB_API_PORT,
177        )
178        .await?;
179
180        let status_code = response.status;
181        if !(200..300).contains(&status_code) {
182            let error_msg = if status_code == 404 {
183                format!(
184                    "{}/{}:{} {} not found",
185                    parsed.owner, parsed.repo, parsed.git_ref, parsed.path
186                )
187            } else if status_code == 403 {
188                "GitHub API rate limit exceeded".to_string()
189            } else {
190                format!("GitHub API error: HTTP {}", status_code)
191            };
192            return Ok(FetchResponse {
193                url: request.url.clone(),
194                status_code,
195                error: Some(error_msg),
196                ..Default::default()
197            });
198        }
199
200        let body = read_full_body(response, options).await?;
201        let contents: GitHubContents = serde_json::from_slice(&body)
202            .map_err(|e| FetchError::FetcherError(format!("Failed to parse contents: {}", e)))?;
203
204        // Handle directories (content_type == "dir")
205        if contents.content_type != "file" {
206            return Ok(FetchResponse {
207                url: request.url.clone(),
208                status_code: 200,
209                format: Some("github_file".to_string()),
210                error: Some(format!("Path is a {} (not a file)", contents.content_type)),
211                ..Default::default()
212            });
213        }
214
215        // Handle binary/large files — return metadata only
216        if contents.size > MAX_INLINE_SIZE || contents.content.is_none() {
217            let content = format_metadata_only(&parsed, &contents);
218            return Ok(FetchResponse {
219                url: request.url.clone(),
220                status_code: 200,
221                content_type: Some("text/markdown".to_string()),
222                format: Some("github_file".to_string()),
223                content: Some(content),
224                size: Some(contents.size),
225                ..Default::default()
226            });
227        }
228
229        // Decode base64 content
230        let raw_content = contents.content.as_deref().and_then(decode_base64_content);
231
232        let (file_content, is_binary) = match raw_content {
233            Some(bytes) => match String::from_utf8(bytes) {
234                Ok(text) => (Some(text), false),
235                Err(_) => (None, true),
236            },
237            None => (None, true),
238        };
239
240        if is_binary {
241            let content = format_metadata_only(&parsed, &contents);
242            return Ok(FetchResponse {
243                url: request.url.clone(),
244                status_code: 200,
245                content_type: Some("text/markdown".to_string()),
246                format: Some("github_file".to_string()),
247                content: Some(content),
248                size: Some(contents.size),
249                error: Some("Binary file — metadata only".to_string()),
250                ..Default::default()
251            });
252        }
253
254        let lang = detect_language(&contents.name);
255        let content = format_file_response(&parsed, &contents, file_content.as_deref(), lang);
256
257        Ok(FetchResponse {
258            url: request.url.clone(),
259            status_code: 200,
260            content_type: Some("text/markdown".to_string()),
261            format: Some("github_file".to_string()),
262            content: Some(content),
263            size: Some(contents.size),
264            ..Default::default()
265        })
266    }
267}
268
269/// Decode base64 with whitespace (GitHub API includes newlines in base64)
270fn decode_base64_content(encoded: &str) -> Option<Vec<u8>> {
271    let cleaned: String = encoded.chars().filter(|c| !c.is_whitespace()).collect();
272    base64_decode(&cleaned)
273}
274
/// Decode standard-alphabet base64 (`A-Z a-z 0-9 + /`) with `=` padding.
///
/// The input must contain no whitespace and its length must be a multiple of
/// four. Padding is accepted only at the very end of the input, as `xx==` or
/// `xxx=`; a `=` anywhere else makes the input invalid.
///
/// Returns `None` for any malformed input and `Some(vec![])` for the empty
/// string.
fn base64_decode(input: &str) -> Option<Vec<u8>> {
    /// Map one alphabet byte to its 6-bit value; `=` is deliberately not accepted here.
    fn sextet(c: u8) -> Option<u8> {
        match c {
            b'A'..=b'Z' => Some(c - b'A'),
            b'a'..=b'z' => Some(c - b'a' + 26),
            b'0'..=b'9' => Some(c - b'0' + 52),
            b'+' => Some(62),
            b'/' => Some(63),
            _ => None,
        }
    }

    let bytes = input.as_bytes();
    let quads = bytes.chunks_exact(4);
    if !quads.remainder().is_empty() {
        return None;
    }
    let quad_count = quads.len();

    let mut result = Vec::with_capacity(quad_count * 3);

    for (index, quad) in quads.enumerate() {
        // Number of trailing `=` in this quad (0, 1 or 2).
        let padding = match (quad[2], quad[3]) {
            (b'=', b'=') => 2,
            (_, b'=') => 1,
            _ => 0,
        };
        // Only the final quad may be padded.
        if padding > 0 && index + 1 != quad_count {
            return None;
        }

        let a = sextet(quad[0])?;
        let b = sextet(quad[1])?;
        result.push((a << 2) | (b >> 4));

        if padding < 2 {
            let c = sextet(quad[2])?;
            result.push((b << 4) | (c >> 2));

            if padding == 0 {
                let d = sextet(quad[3])?;
                result.push((c << 6) | d);
            }
        }
    }

    Some(result)
}
313
/// Simple language detection from file extension.
///
/// The extension is the text after the last `.` and is matched
/// case-insensitively. A name without any `.` has no extension and is only
/// recognised when it is `Dockerfile` (any ASCII case), so extensionless
/// files that happen to be called `go`, `r`, `c`, ... are not mistaken for
/// source files of that language.
///
/// Returns the language identifier used for the code-fence hint, or `None`
/// when the file type is unknown.
fn detect_language(filename: &str) -> Option<&'static str> {
    let ext = match filename.rsplit_once('.') {
        Some((_, ext)) => ext,
        None => {
            return if filename.eq_ignore_ascii_case("dockerfile") {
                Some("dockerfile")
            } else {
                None
            };
        }
    };

    match ext.to_ascii_lowercase().as_str() {
        "rs" => Some("rust"),
        "py" => Some("python"),
        "js" => Some("javascript"),
        "ts" => Some("typescript"),
        "tsx" => Some("tsx"),
        "jsx" => Some("jsx"),
        "rb" => Some("ruby"),
        "go" => Some("go"),
        "java" => Some("java"),
        "kt" | "kts" => Some("kotlin"),
        "swift" => Some("swift"),
        "c" => Some("c"),
        "cpp" | "cc" | "cxx" => Some("cpp"),
        // Headers are reported as C++.
        "h" | "hpp" => Some("cpp"),
        "cs" => Some("csharp"),
        "php" => Some("php"),
        "sh" | "bash" => Some("bash"),
        "zsh" => Some("zsh"),
        "fish" => Some("fish"),
        "yml" | "yaml" => Some("yaml"),
        "json" => Some("json"),
        "toml" => Some("toml"),
        "xml" => Some("xml"),
        "html" | "htm" => Some("html"),
        "css" => Some("css"),
        "scss" | "sass" => Some("scss"),
        "sql" => Some("sql"),
        "md" | "markdown" => Some("markdown"),
        "dockerfile" => Some("dockerfile"),
        "tf" => Some("terraform"),
        "ex" | "exs" => Some("elixir"),
        "erl" => Some("erlang"),
        "hs" => Some("haskell"),
        "ml" | "mli" => Some("ocaml"),
        "r" => Some("r"),
        "scala" => Some("scala"),
        "lua" => Some("lua"),
        "zig" => Some("zig"),
        "nim" => Some("nim"),
        "v" => Some("v"),
        "dart" => Some("dart"),
        "proto" => Some("protobuf"),
        "graphql" | "gql" => Some("graphql"),
        _ => None,
    }
}
364
365fn format_metadata_only(parsed: &ParsedBlobUrl, contents: &GitHubContents) -> String {
366    let lang = detect_language(&contents.name);
367    let mut out = String::new();
368    out.push_str(&format!("# {}\n\n", contents.path));
369    out.push_str("## File Info\n\n");
370    out.push_str(&format!(
371        "- **Repository:** {}/{}\n",
372        parsed.owner, parsed.repo
373    ));
374    out.push_str(&format!("- **Ref:** {}\n", parsed.git_ref));
375    out.push_str(&format!("- **Size:** {} bytes\n", contents.size));
376    if let Some(lang) = lang {
377        out.push_str(&format!("- **Language:** {}\n", lang));
378    }
379    if let Some(url) = &contents.html_url {
380        out.push_str(&format!("- **URL:** {}\n", url));
381    }
382    out
383}
384
385fn format_file_response(
386    parsed: &ParsedBlobUrl,
387    contents: &GitHubContents,
388    file_content: Option<&str>,
389    lang: Option<&str>,
390) -> String {
391    let mut out = String::new();
392
393    out.push_str(&format!("# {}\n\n", contents.path));
394    out.push_str("## File Info\n\n");
395    out.push_str(&format!(
396        "- **Repository:** {}/{}\n",
397        parsed.owner, parsed.repo
398    ));
399    out.push_str(&format!("- **Ref:** {}\n", parsed.git_ref));
400    out.push_str(&format!("- **Size:** {} bytes\n", contents.size));
401    if let Some(lang) = lang {
402        out.push_str(&format!("- **Language:** {}\n", lang));
403    }
404    if let Some(url) = &contents.html_url {
405        out.push_str(&format!("- **URL:** {}\n", url));
406    }
407
408    if let Some(content) = file_content {
409        let lang_hint = lang.unwrap_or("");
410        out.push_str(&format!(
411            "\n## Content\n\n```{}\n{}\n```\n",
412            lang_hint, content
413        ));
414    }
415
416    out
417}
418
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `raw` as a URL and run it through the blob-URL parser.
    fn parse(raw: &str) -> Option<ParsedBlobUrl> {
        let url = Url::parse(raw).unwrap();
        GitHubCodeFetcher::parse_url(&url)
    }

    #[test]
    fn test_parse_blob_url() {
        let blob = parse("https://github.com/owner/repo/blob/main/src/lib.rs").unwrap();
        assert_eq!(blob.owner, "owner");
        assert_eq!(blob.repo, "repo");
        assert_eq!(blob.git_ref, "main");
        assert_eq!(blob.path, "src/lib.rs");
    }

    #[test]
    fn test_parse_blob_url_nested_path() {
        let blob =
            parse("https://github.com/owner/repo/blob/v1.0.0/crates/core/src/main.rs").unwrap();
        assert_eq!(blob.git_ref, "v1.0.0");
        assert_eq!(blob.path, "crates/core/src/main.rs");
    }

    #[test]
    fn test_rejects_non_blob() {
        // `tree` URLs point at directories, not files.
        assert!(parse("https://github.com/owner/repo/tree/main/src").is_none());
    }

    #[test]
    fn test_rejects_too_few_segments() {
        // A ref without a file path is not enough.
        assert!(parse("https://github.com/owner/repo/blob/main").is_none());
    }

    #[test]
    fn test_rejects_non_github() {
        assert!(parse("https://gitlab.com/owner/repo/blob/main/file.rs").is_none());
    }

    #[test]
    fn test_rejects_reserved_owner() {
        assert!(parse("https://github.com/settings/repo/blob/main/file.rs").is_none());
    }

    #[test]
    fn test_fetcher_matches() {
        let fetcher = GitHubCodeFetcher::new();

        // (url, whether the fetcher should claim it)
        let cases = [
            ("https://github.com/rust-lang/rust/blob/master/Cargo.toml", true),
            ("https://github.com/rust-lang/rust", false),
            ("https://github.com/rust-lang/rust/issues/1", false),
        ];
        for (raw, expected) in cases {
            let url = Url::parse(raw).unwrap();
            assert_eq!(fetcher.matches(&url), expected);
        }
    }

    #[test]
    fn test_detect_language() {
        // (file name, expected language)
        let cases = [
            ("main.rs", Some("rust")),
            ("app.py", Some("python")),
            ("index.tsx", Some("tsx")),
            ("Cargo.toml", Some("toml")),
            ("unknown.xyz", None),
            ("Dockerfile", Some("dockerfile")),
        ];
        for (file_name, expected) in cases {
            assert_eq!(detect_language(file_name), expected);
        }
    }

    #[test]
    fn test_format_file_response() {
        let blob = ParsedBlobUrl {
            owner: "owner".to_string(),
            repo: "repo".to_string(),
            git_ref: "main".to_string(),
            path: "src/lib.rs".to_string(),
        };
        let api_entry = GitHubContents {
            name: "lib.rs".to_string(),
            path: "src/lib.rs".to_string(),
            size: 42,
            content_type: "file".to_string(),
            content: None,
            html_url: Some("https://github.com/owner/repo/blob/main/src/lib.rs".to_string()),
        };

        let rendered = format_file_response(&blob, &api_entry, Some("fn main() {}"), Some("rust"));

        let expected_fragments = [
            "# src/lib.rs",
            "**Repository:** owner/repo",
            "**Language:** rust",
            "```rust\nfn main() {}\n```",
        ];
        for fragment in expected_fragments {
            assert!(rendered.contains(fragment));
        }
    }

    #[test]
    fn test_base64_decode() {
        // "Hello" in base64
        assert_eq!(base64_decode("SGVsbG8="), Some(b"Hello".to_vec()));
        // Empty input decodes to no bytes.
        assert_eq!(base64_decode(""), Some(vec![]));
        // Length is not a multiple of four.
        assert_eq!(base64_decode("abc"), None);
    }
}