Skip to main content

codesearch/daemon/
github.rs

1//! GitHub auto-discovery: list repos from orgs/users, clone missing ones.
2//!
3//! All errors are non-fatal — GitHub failure never blocks the daemon.
4//! Missing tokens, API errors, and clone failures are logged and skipped.
5
6use std::collections::HashSet;
7use std::path::{Path, PathBuf};
8
9use anyhow::{Context, Result};
10use serde::Deserialize;
11use tracing::{error, info, warn};
12
13use super::{GitHubConfig, GitHubSource, OwnerKind};
14
/// Minimal GitHub repo response (only fields we need).
///
/// One element of the JSON array parsed by `GitHubClient::list_repos`.
#[derive(Debug, Deserialize)]
struct GitHubRepo {
    /// Repository name; matched against exclude patterns and joined onto the
    /// source's `clone_base` to form the local checkout path.
    name: String,
    /// URL passed to `clone_repo` when the repo is missing locally.
    clone_url: String,
    /// Repo is archived; dropped by `filter_repos` when `skip_archived` is set.
    archived: bool,
    /// Repo is a fork; dropped by `filter_repos` when `skip_forks` is set.
    fork: bool,
}
23
24/// GitHub API client with bearer token auth.
25struct GitHubClient {
26    client: reqwest::Client,
27    token: String,
28}
29
30impl GitHubClient {
31    fn new(token: String) -> Result<Self> {
32        let client = reqwest::Client::builder()
33            .user_agent("codesearch-daemon")
34            .build()
35            .context("Failed to build HTTP client")?;
36        Ok(Self { client, token })
37    }
38
39    /// List all repos for a source (paginated, 100 per page).
40    async fn list_repos(&self, source: &GitHubSource) -> Result<Vec<GitHubRepo>> {
41        let base_url = match source.kind {
42            OwnerKind::Org => format!("https://api.github.com/orgs/{}/repos", source.owner),
43            OwnerKind::User => format!("https://api.github.com/users/{}/repos", source.owner),
44        };
45
46        let mut all_repos = Vec::new();
47        let mut page = 1u32;
48
49        loop {
50            let resp = self
51                .client
52                .get(&base_url)
53                .query(&[
54                    ("per_page", "100"),
55                    ("page", &page.to_string()),
56                ])
57                .header("Authorization", format!("Bearer {}", self.token))
58                .header("X-GitHub-Api-Version", "2022-11-28")
59                .header("Accept", "application/vnd.github+json")
60                .send()
61                .await
62                .with_context(|| format!("GitHub API request failed (page {})", page))?;
63
64            // Check rate limit before processing
65            if let Some(remaining) = resp
66                .headers()
67                .get("x-ratelimit-remaining")
68                .and_then(|v| v.to_str().ok())
69                .and_then(|v| v.parse::<u32>().ok())
70            {
71                if remaining == 0 {
72                    warn!("GitHub API rate limit exhausted, stopping pagination");
73                    break;
74                }
75            }
76
77            let status = resp.status();
78            if !status.is_success() {
79                let body = resp.text().await.unwrap_or_default();
80                return Err(anyhow::anyhow!(
81                    "GitHub API returned {}: {}",
82                    status,
83                    body
84                ));
85            }
86
87            let repos: Vec<GitHubRepo> = resp
88                .json()
89                .await
90                .context("Failed to parse GitHub repo list")?;
91
92            let count = repos.len();
93            all_repos.extend(repos);
94
95            // Last page when we get fewer than 100
96            if count < 100 {
97                break;
98            }
99            page += 1;
100        }
101
102        Ok(all_repos)
103    }
104}
105
106/// Resolve token from token_file (with ~ expansion) or GITHUB_TOKEN env var.
107fn resolve_token(config: &GitHubConfig) -> Option<String> {
108    // Try token_file first
109    if let Some(ref path) = config.token_file {
110        let expanded = shellexpand::tilde(path);
111        match std::fs::read_to_string(expanded.as_ref()) {
112            Ok(token) => {
113                let token = token.trim().to_string();
114                if !token.is_empty() {
115                    return Some(token);
116                }
117                warn!("Token file {} is empty", path);
118            }
119            Err(e) => {
120                warn!("Failed to read token file {}: {}", path, e);
121            }
122        }
123    }
124
125    // Fall back to env var
126    match std::env::var("GITHUB_TOKEN") {
127        Ok(token) if !token.is_empty() => Some(token),
128        _ => None,
129    }
130}
131
/// Glob-style matcher where `*` is the only special character.
///
/// A pattern without `*` must equal `name` exactly. Otherwise the text before
/// the first `*` must be a prefix of `name`, the text after the last `*` must
/// be a suffix of what remains, and every literal in between must occur in
/// order (each one located at its leftmost position after the previous one).
/// `*` itself matches any run of characters, including none, so patterns such
/// as `"*.wiki"`, `"legacy-*"` and `"test-*-old"` work as expected.
fn matches_pattern(name: &str, pattern: &str) -> bool {
    // Split off the literal in front of the first wildcard, if there is one.
    let (head, after_first_star) = match pattern.split_once('*') {
        Some(pieces) => pieces,
        None => return name == pattern,
    };

    // The leading literal is anchored at the start of the name.
    let mut rest = match name.strip_prefix(head) {
        Some(tail) => tail,
        None => return false,
    };

    // Separate the trailing literal (anchored at the end) from the inner ones.
    let (inner, tail_literal) = match after_first_star.rsplit_once('*') {
        Some((inner, tail)) => (inner, tail),
        None => ("", after_first_star),
    };

    // Inner literals float: consume the name up to and including each match.
    for literal in inner.split('*').filter(|lit| !lit.is_empty()) {
        match rest.find(literal) {
            Some(at) => rest = &rest[at + literal.len()..],
            None => return false,
        }
    }

    // Whatever is left must still end with the trailing literal.
    rest.ends_with(tail_literal)
}
174
175/// Check if a repo name matches any exclude pattern.
176fn is_excluded(name: &str, patterns: &[String]) -> bool {
177    patterns.iter().any(|p| matches_pattern(name, p))
178}
179
180/// Filter repos based on source configuration.
181fn filter_repos(repos: Vec<GitHubRepo>, source: &GitHubSource) -> Vec<GitHubRepo> {
182    repos
183        .into_iter()
184        .filter(|r| {
185            if source.skip_archived && r.archived {
186                return false;
187            }
188            if source.skip_forks && r.fork {
189                return false;
190            }
191            if is_excluded(&r.name, &source.exclude) {
192                return false;
193            }
194            true
195        })
196        .collect()
197}
198
199/// Clone a repo using gix (blocking, runs in spawn_blocking).
200async fn clone_repo(clone_url: &str, dest: &Path, token: &str) -> Result<()> {
201    let url_with_auth = clone_url.replacen("https://", &format!("https://x-access-token:{}@", token), 1);
202    let dest = dest.to_path_buf();
203    let url = url_with_auth.clone();
204
205    tokio::task::spawn_blocking(move || -> Result<()> {
206        // Ensure parent directory exists
207        if let Some(parent) = dest.parent() {
208            std::fs::create_dir_all(parent)
209                .with_context(|| format!("Failed to create directory {}", parent.display()))?;
210        }
211
212        let mut prepare = gix::prepare_clone(gix::url::parse(url.as_str().into())?, &dest)
213            .with_context(|| format!("Failed to prepare clone to {}", dest.display()))?;
214
215        let (mut checkout, _outcome) = prepare
216            .fetch_then_checkout(gix::progress::Discard, &gix::interrupt::IS_INTERRUPTED)
217            .with_context(|| format!("Failed to fetch {}", dest.display()))?;
218
219        let (_repo, _outcome) = checkout
220            .main_worktree(gix::progress::Discard, &gix::interrupt::IS_INTERRUPTED)
221            .with_context(|| format!("Failed to checkout {}", dest.display()))?;
222
223        Ok(())
224    })
225    .await
226    .context("Clone task panicked")?
227}
228
229/// Resolve all repos from GitHub sources + explicit list.
230///
231/// Returns a deduplicated list of repo paths. All GitHub errors are non-fatal.
232pub async fn resolve_all_repos(
233    explicit: Vec<PathBuf>,
234    github_config: Option<&GitHubConfig>,
235) -> Vec<PathBuf> {
236    let mut all_paths: Vec<PathBuf> = explicit;
237    let mut seen = HashSet::new();
238
239    let config = match github_config {
240        Some(c) if !c.sources.is_empty() => c,
241        _ => {
242            // No GitHub config or no sources — just return explicit repos
243            return all_paths;
244        }
245    };
246
247    let token = match resolve_token(config) {
248        Some(t) => t,
249        None => {
250            warn!("No GitHub token available — skipping repo discovery (set token_file or GITHUB_TOKEN)");
251            return all_paths;
252        }
253    };
254
255    let client = match GitHubClient::new(token.clone()) {
256        Ok(c) => c,
257        Err(e) => {
258            error!("Failed to create GitHub client: {}", e);
259            return all_paths;
260        }
261    };
262
263    for source in &config.sources {
264        info!(
265            "Discovering repos from {} {} (clone_base: {})",
266            match source.kind {
267                OwnerKind::Org => "org",
268                OwnerKind::User => "user",
269            },
270            source.owner,
271            source.clone_base.display()
272        );
273
274        let repos = match client.list_repos(source).await {
275            Ok(r) => r,
276            Err(e) => {
277                error!("Failed to list repos for {}: {}", source.owner, e);
278                continue;
279            }
280        };
281
282        let total = repos.len();
283        let filtered = filter_repos(repos, source);
284        info!(
285            "Found {} repos for {} ({} after filtering)",
286            total,
287            source.owner,
288            filtered.len()
289        );
290
291        // Expand ~ in clone_base
292        let clone_base_str = source.clone_base.to_string_lossy();
293        let expanded = shellexpand::tilde(&clone_base_str);
294        let clone_base = PathBuf::from(expanded.as_ref());
295
296        for repo in &filtered {
297            let local_path = clone_base.join(&repo.name);
298
299            if local_path.exists() {
300                info!("Found local clone: {}", local_path.display());
301                all_paths.push(local_path);
302            } else if source.auto_clone {
303                info!("Cloning {} → {}", repo.name, local_path.display());
304                match clone_repo(&repo.clone_url, &local_path, &token).await {
305                    Ok(()) => {
306                        info!("Cloned {}", repo.name);
307                        all_paths.push(local_path);
308                    }
309                    Err(e) => {
310                        error!("Failed to clone {}: {}", repo.name, e);
311                    }
312                }
313            } else {
314                info!("Skipping {} (not cloned, auto_clone=false)", repo.name);
315            }
316        }
317    }
318
319    // Deduplicate by canonical path
320    all_paths.retain(|p| {
321        let key = p.canonicalize().unwrap_or_else(|_| p.clone());
322        seen.insert(key)
323    });
324
325    all_paths
326}
327
#[cfg(test)]
mod tests {
    use super::*;

    /// Check `matches_pattern` against a table of `(name, pattern, expected)` rows.
    fn check_cases(cases: &[(&str, &str, bool)]) {
        for &(name, pattern, expected) in cases {
            assert_eq!(
                matches_pattern(name, pattern),
                expected,
                "name={:?} pattern={:?}",
                name,
                pattern
            );
        }
    }

    #[test]
    fn test_matches_pattern_exact() {
        check_cases(&[("foo", "foo", true), ("foo", "bar", false)]);
    }

    #[test]
    fn test_matches_pattern_suffix_wildcard() {
        check_cases(&[
            ("legacy-api", "legacy-*", true),
            ("legacy-", "legacy-*", true),
            ("new-api", "legacy-*", false),
        ]);
    }

    #[test]
    fn test_matches_pattern_prefix_wildcard() {
        check_cases(&[
            ("repo.wiki", "*.wiki", true),
            (".wiki", "*.wiki", true),
            ("repo.git", "*.wiki", false),
        ]);
    }

    #[test]
    fn test_matches_pattern_middle_wildcard() {
        check_cases(&[
            ("test-foo-old", "test-*-old", true),
            ("test--old", "test-*-old", true),
            ("test-foo-new", "test-*-old", false),
        ]);
    }

    #[test]
    fn test_matches_pattern_star_only() {
        check_cases(&[("anything", "*", true), ("", "*", true)]);
    }

    #[test]
    fn test_is_excluded() {
        let patterns: Vec<String> = ["*.wiki", "legacy-*"]
            .iter()
            .map(|p| p.to_string())
            .collect();

        for &(name, expected) in &[
            ("repo.wiki", true),
            ("legacy-api", true),
            ("codesearch", false),
        ] {
            assert_eq!(is_excluded(name, &patterns), expected, "name={:?}", name);
        }
    }
}