use super::{
language_matches, license_matches, CodeReference, MiningError, MiningFilters, MiningQuery,
ReferenceMiner,
};
use async_trait::async_trait;
use car_engine::{RateLimit, RateLimiter};
use serde::Deserialize;
use std::collections::HashMap;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use tokio::sync::Mutex;
/// Default REST API root; used as the initial value of `GitHubSource::api_base`.
const API_BASE: &str = "https://api.github.com";
/// Path of the code-search endpoint, appended to the API base when searching.
const SEARCH_PATH: &str = "/search/code";
/// Key under which this source registers and acquires permits on the rate limiter.
const RL_KEY: &str = "github_search";
/// Rate limit installed under [`RL_KEY`] when the caller does not supply a limiter:
/// at most 30 calls per 60-second interval.
///
/// NOTE(review): GitHub documents a lower per-minute quota for the code-search
/// endpoint than for the other search endpoints; confirm this budget is intended.
fn default_rate_limit() -> RateLimit {
    let max_calls = 30;
    let interval_secs = 60.0;
    RateLimit {
        max_calls,
        interval_secs,
    }
}
/// Reference miner backed by the GitHub REST API (code search plus repository
/// metadata for license lookups).
pub struct GitHubSource {
    /// HTTP client used for every request issued by this source.
    client: reqwest::Client,
    /// Token sent as a bearer credential on each request.
    token: String,
    /// Limiter consulted (under `RL_KEY`) before each outgoing request.
    limiter: Arc<RateLimiter>,
    /// `true` while the default rate limit still has to be installed on `limiter`;
    /// cleared on first use or when a caller-supplied limiter is injected.
    configure_limiter: AtomicBool,
    /// API root that endpoint paths are appended to (overridable, e.g. for tests).
    api_base: String,
    /// Memoized license lookups keyed by repository full name (`owner/repo`).
    /// A stored `None` means no usable SPDX identifier was obtained for that repo.
    license_cache: Arc<Mutex<HashMap<String, Option<String>>>>,
}
impl GitHubSource {
    /// Builds a source from the token stored in the `GITHUB_TOKEN` environment variable.
    ///
    /// # Errors
    /// Returns [`MiningError::Unavailable`] when the variable cannot be read, and
    /// forwards any error produced by [`GitHubSource::with_token`].
    pub fn from_env() -> Result<Self, MiningError> {
        let token = std::env::var("GITHUB_TOKEN")
            .map_err(|_| MiningError::Unavailable("GITHUB_TOKEN not set".into()))?;
        Self::with_token(token)
    }

    /// Builds a source that authenticates with `token`, talks to the default API
    /// base, and owns a fresh rate limiter that is configured lazily on first search.
    ///
    /// # Errors
    /// Returns [`MiningError::Other`] when the HTTP client cannot be constructed.
    pub fn with_token(token: String) -> Result<Self, MiningError> {
        let limiter = Arc::new(RateLimiter::new());
        let client = reqwest::Client::builder()
            .user_agent("car-agents/reference-miner")
            .build()
            .map_err(|e| MiningError::Other(anyhow::anyhow!(e)))?;
        Ok(Self {
            client,
            token,
            limiter,
            configure_limiter: AtomicBool::new(true),
            api_base: API_BASE.to_string(),
            license_cache: Arc::new(Mutex::new(HashMap::new())),
        })
    }

    /// Installs the default rate limit under `RL_KEY` the first time it is called.
    ///
    /// The flag is swapped to `false` atomically, so only one caller performs the
    /// installation; it is already `false` when a custom limiter was injected.
    async fn ensure_rate_limit_configured(&self) {
        if self.configure_limiter.swap(false, Ordering::AcqRel) {
            self.limiter.set_limit(RL_KEY, default_rate_limit()).await;
        }
    }

    /// Replaces the API base URL (builder style), e.g. to point at a mock server.
    pub fn with_api_base(mut self, base: impl Into<String>) -> Self {
        self.api_base = base.into();
        self
    }

    /// Replaces the rate limiter (builder style).
    ///
    /// The default limit is *not* installed on the supplied limiter; presumably the
    /// caller has already configured the `"github_search"` key on it.
    pub fn with_rate_limiter(mut self, limiter: Arc<RateLimiter>) -> Self {
        self.limiter = limiter;
        self.configure_limiter.store(false, Ordering::Release);
        self
    }

    /// Looks up the SPDX license identifier of the repository `full_name`
    /// (`owner/repo`), consulting and updating the in-memory cache.
    ///
    /// Returns `None` when the repository has no usable SPDX id (missing, empty, or
    /// `NOASSERTION`) or when the lookup failed.
    ///
    /// Only authoritative answers are cached: a successful response that parsed, or
    /// a 404. Transient failures (transport errors, rate limiting, server errors,
    /// unreadable bodies) are reported as `None` but left out of the cache so a later
    /// call can retry instead of treating the repository as unlicensed forever.
    async fn fetch_license(&self, full_name: &str) -> Option<String> {
        // The guard is a temporary, so the lock is released before any network I/O.
        let cached = self.license_cache.lock().await.get(full_name).cloned();
        if let Some(hit) = cached {
            return hit;
        }

        let url = format!("{}/repos/{}", self.api_base, full_name);
        self.limiter.acquire(RL_KEY).await;
        let resp = self
            .client
            .get(&url)
            .bearer_auth(&self.token)
            .header("Accept", "application/vnd.github+json")
            .send()
            .await;

        // Outer `Some` = authoritative answer (safe to cache); outer `None` = transient.
        let outcome: Option<Option<String>> = match resp {
            Ok(r) if r.status().is_success() => match r.json::<RepoMeta>().await {
                Ok(meta) => Some(
                    meta.license
                        .and_then(|l| l.spdx_id)
                        .filter(|s| !s.is_empty() && s != "NOASSERTION"),
                ),
                Err(_) => None,
            },
            Ok(r) if r.status() == reqwest::StatusCode::NOT_FOUND => Some(None),
            _ => None,
        };

        // Transient failure: report "unknown" without poisoning the cache.
        let spdx = outcome?;
        self.license_cache
            .lock()
            .await
            .insert(full_name.to_string(), spdx.clone());
        spdx
    }
}
#[async_trait]
impl ReferenceMiner for GitHubSource {
    /// Identifier of this miner.
    fn name(&self) -> &str {
        "github"
    }

    /// Runs a GitHub code search for `query` and converts the hits into
    /// [`CodeReference`]s, applying the language and license filters client-side.
    ///
    /// Scores decrease linearly with GitHub's own ranking (`1.0` for the first hit).
    /// The `commit` of each reference is the commit SHA embedded in the hit's
    /// `html_url`; the item's `sha` field (the file's blob SHA in GitHub's response)
    /// is used only as a fallback when the URL does not have the expected shape.
    ///
    /// # Errors
    /// * [`MiningError::InvalidQuery`] – blank query, or GitHub answered 422.
    /// * [`MiningError::RateLimited`] – GitHub answered 403 or 429.
    /// * [`MiningError::Unavailable`] – transport failure or any other non-2xx status.
    /// * [`MiningError::Other`] – the response body could not be decoded.
    async fn search(&self, query: &MiningQuery) -> Result<Vec<CodeReference>, MiningError> {
        if query.query.trim().is_empty() {
            return Err(MiningError::InvalidQuery("empty query".into()));
        }
        self.ensure_rate_limit_configured().await;

        let filters = &query.filters;
        let q = build_search_q(&query.query, filters);
        // GitHub accepts 1..=100 results per page.
        let per_page = filters.max_results.clamp(1, 100).to_string();
        let url = format!("{}{}", self.api_base, SEARCH_PATH);

        self.limiter.acquire(RL_KEY).await;
        let resp = self
            .client
            .get(&url)
            .bearer_auth(&self.token)
            // The text-match media type makes GitHub include `text_matches` fragments.
            .header("Accept", "application/vnd.github.v3.text-match+json")
            .query(&[("q", q.as_str()), ("per_page", per_page.as_str())])
            .send()
            .await
            .map_err(|e| MiningError::Unavailable(format!("github request failed: {e}")))?;

        let status = resp.status();
        if status == reqwest::StatusCode::FORBIDDEN
            || status == reqwest::StatusCode::TOO_MANY_REQUESTS
        {
            return Err(MiningError::RateLimited(format!(
                "github returned {} (rate limit)",
                status.as_u16()
            )));
        }
        if status == reqwest::StatusCode::UNPROCESSABLE_ENTITY {
            return Err(MiningError::InvalidQuery(format!(
                "github rejected query: {q}"
            )));
        }
        if !status.is_success() {
            let body = resp.text().await.unwrap_or_default();
            return Err(MiningError::Unavailable(format!(
                "github {}: {}",
                status.as_u16(),
                truncate(&body, 200)
            )));
        }

        let payload: SearchResponse = resp
            .json()
            .await
            .map_err(|e| MiningError::Other(anyhow::anyhow!("github body parse: {e}")))?;

        // Resolve each distinct repository's license once, concurrently.
        let mut unique_repos: Vec<String> = payload
            .items
            .iter()
            .map(|i| i.repository.full_name.clone())
            .collect();
        unique_repos.sort();
        unique_repos.dedup();
        let license_map: HashMap<String, Option<String>> =
            futures::future::join_all(unique_repos.into_iter().map(|name| async {
                let l = self.fetch_license(&name).await;
                (name, l)
            }))
            .await
            .into_iter()
            .collect();

        // `max(1)` avoids a zero divisor when the result set is empty.
        let total = payload.items.len().max(1) as f32;
        let mut out = Vec::new();
        for (rank, item) in payload.items.into_iter().enumerate() {
            if !language_matches(&filters.languages, &item.path) {
                continue;
            }
            let license = license_map
                .get(&item.repository.full_name)
                .cloned()
                .flatten();
            if !license_matches(&filters.license_allowlist, license.as_deref()) {
                continue;
            }

            let score = 1.0 - (rank as f32 / total);
            let snippet = item
                .text_matches
                .iter()
                .map(|m| m.fragment.clone())
                .collect::<Vec<_>>()
                .join("\n---\n");
            let snippet = if snippet.is_empty() {
                format!("(no text_matches fragment — see {})", item.html_url)
            } else {
                snippet
            };

            // `item.sha` identifies the file blob, not a commit; the commit the hit was
            // indexed at is carried in the blob URL.
            let commit = commit_from_html_url(&item.html_url, &item.repository.full_name)
                .map(str::to_owned)
                .unwrap_or_else(|| item.sha.clone());

            out.push(CodeReference {
                repo: format!("github.com/{}", item.repository.full_name),
                commit,
                path: item.path,
                snippet,
                score,
                license,
                why_relevant: format!(
                    "github rank {} for query \"{}\"",
                    rank + 1,
                    truncate(&query.query, 60)
                ),
            });
        }

        out.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap_or(std::cmp::Ordering::Equal));
        // `max_results == 0` is treated as "no cap" on the returned list.
        if filters.max_results > 0 {
            out.truncate(filters.max_results);
        }
        Ok(out)
    }
}

/// Extracts the commit SHA from a code-search `html_url` shaped like
/// `https://<host>/<owner>/<repo>/blob/<commit>/<path>`.
///
/// Returns `None` unless the segment after `/blob/` is a full hexadecimal object id
/// (40 or 64 characters), so branch names and unexpected URL shapes are rejected.
fn commit_from_html_url<'a>(html_url: &'a str, full_name: &str) -> Option<&'a str> {
    let marker = format!("/{}/blob/", full_name);
    let (_, rest) = html_url.split_once(marker.as_str())?;
    let (reference, _) = rest.split_once('/')?;
    let is_object_id = matches!(reference.len(), 40 | 64)
        && reference.bytes().all(|b| b.is_ascii_hexdigit());
    if is_object_id {
        Some(reference)
    } else {
        None
    }
}
/// Builds the `q` parameter for the code-search request.
///
/// The free-text query is used verbatim; when the filters list at least one
/// language, a `language:` qualifier for the *first* entry is appended after a
/// single space. Any further languages are not added to the query string.
fn build_search_q(query: &str, filters: &MiningFilters) -> String {
    match filters.languages.first() {
        Some(lang) => format!("{} language:{}", query, lang),
        None => query.to_owned(),
    }
}
/// Shortens `s` to at most `n` bytes, appending `…` when anything was cut.
///
/// Strings that already fit are returned unchanged. Otherwise the cut point is
/// the largest char boundary that is `<= n`, so a multi-byte character is never
/// split.
fn truncate(s: &str, n: usize) -> String {
    if s.len() <= n {
        return s.to_owned();
    }
    // Offset 0 is always a char boundary, so the search cannot come up empty.
    let cut = (0..=n)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(0);
    format!("{}…", &s[..cut])
}
/// Subset of the code-search response body that this module reads.
#[derive(Deserialize)]
struct SearchResponse {
    /// Search hits in the order returned by the server; empty when the field is absent.
    #[serde(default)]
    items: Vec<SearchItem>,
}
/// One hit in a code-search response.
#[derive(Deserialize)]
struct SearchItem {
    /// Path of the matching file within its repository.
    path: String,
    /// Object id reported for the hit.
    /// NOTE(review): GitHub's code-search docs suggest this is the file's blob SHA
    /// rather than a commit SHA — confirm before treating it as a commit.
    sha: String,
    /// Browser URL of the matching file.
    html_url: String,
    /// Repository that contains the file.
    repository: RepoRef,
    /// Matching fragments; empty when the server sends none.
    #[serde(default)]
    text_matches: Vec<TextMatch>,
}
/// Repository reference embedded in a search hit.
#[derive(Deserialize)]
struct RepoRef {
    /// Repository name used in `/repos/{full_name}` lookups and as the license-map key.
    full_name: String,
}
/// One text-match entry attached to a search hit.
#[derive(Deserialize)]
struct TextMatch {
    /// Excerpt of the file surrounding the match; joined into the reference snippet.
    fragment: String,
}
/// Subset of the repository metadata response used for license lookups.
#[derive(Deserialize)]
struct RepoMeta {
    /// License information, if the response carries any.
    license: Option<LicenseMeta>,
}
/// License object nested inside the repository metadata.
#[derive(Deserialize)]
struct LicenseMeta {
    /// SPDX identifier; callers discard empty values and `NOASSERTION`.
    spdx_id: Option<String>,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A configured language is appended as a `language:` qualifier.
    #[test]
    fn build_q_appends_language() {
        let filters = MiningFilters {
            languages: vec!["rust".into()],
            ..Default::default()
        };
        let built = build_search_q("token bucket", &filters);
        assert_eq!(built, "token bucket language:rust");
    }

    /// With no language filter the query is passed through untouched.
    #[test]
    fn build_q_without_language() {
        let filters = MiningFilters::default();
        let built = build_search_q("foo bar", &filters);
        assert_eq!(built, "foo bar");
    }
}