// car-search 0.8.0
//
// External code discovery + indexing for Common Agent Runtime
// Documentation
//! `GitHubSource` — mine references via GitHub's code search API.
//!
//! Uses `https://api.github.com/search/code` with the `text-match` media type
//! so we can extract `text_matches` fragments without a second fetch.
//! Auth via `GITHUB_TOKEN` env var (required — the endpoint rejects
//! unauthenticated requests).
//!
//! Rate limiting reuses [`car_engine::RateLimiter`] with a conservative default
//! matching GitHub's documented 30 search requests/minute for authenticated
//! users.

use super::{
    language_matches, license_matches, CodeReference, MiningError, MiningFilters, MiningQuery,
    ReferenceMiner,
};
use async_trait::async_trait;
use car_engine::{RateLimit, RateLimiter};
use serde::Deserialize;
use std::collections::HashMap;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use tokio::sync::Mutex;

/// Default root URL of the GitHub REST API. Stored per instance in
/// `GitHubSource::api_base`, which `with_api_base` can override.
const API_BASE: &str = "https://api.github.com";
/// Path of the code-search endpoint; appended to the API base URL.
const SEARCH_PATH: &str = "/search/code";
/// Tool key used for the shared `RateLimiter`.
const RL_KEY: &str = "github_search";

/// Rate limit registered under `RL_KEY` when the source owns its limiter:
/// 30 calls per 60-second window, matching GitHub's authenticated search limit.
fn default_rate_limit() -> RateLimit {
    let max_calls = 30;
    let interval_secs = 60.0;
    RateLimit {
        max_calls,
        interval_secs,
    }
}

/// A miner backed by GitHub's code search API.
///
/// Construct with [`GitHubSource::from_env`] or [`GitHubSource::with_token`],
/// then optionally customise with `with_api_base` / `with_rate_limiter`.
pub struct GitHubSource {
    /// HTTP client used for every request (search and repo metadata).
    client: reqwest::Client,
    /// Token sent as a bearer credential on every request.
    token: String,
    /// Limiter consulted under `RL_KEY` before each outgoing request. Owned
    /// by default; may be replaced by a shared instance.
    limiter: Arc<RateLimiter>,
    /// When `true`, ensure the default rate limit is registered on `limiter`
    /// before first use. We defer this to the first async call because
    /// `RateLimiter::set_limit` is async and we don't want to require a
    /// runtime at construction time.
    configure_limiter: AtomicBool,
    /// Base URL that endpoint paths are appended to; defaults to `API_BASE`.
    api_base: String,
    /// Cache of repo full_name → Option<spdx_id>. `None` means we asked and
    /// the repo had no detected license; absent means we haven't asked.
    license_cache: Arc<Mutex<HashMap<String, Option<String>>>>,
}

impl GitHubSource {
    /// Build a `GitHubSource` reading `GITHUB_TOKEN` from the environment.
    ///
    /// # Errors
    /// Returns `MiningError::Unavailable` if the variable cannot be read,
    /// because the search endpoint rejects unauthenticated requests. Also
    /// propagates any error from [`GitHubSource::with_token`].
    pub fn from_env() -> Result<Self, MiningError> {
        let token = std::env::var("GITHUB_TOKEN")
            .map_err(|_| MiningError::Unavailable("GITHUB_TOKEN not set".into()))?;
        Self::with_token(token)
    }

    /// Build a `GitHubSource` that authenticates with the given token.
    ///
    /// The instance starts with its own private `RateLimiter`; the default
    /// limit is registered lazily on the first search.
    ///
    /// # Errors
    /// Returns `MiningError::Other` if the HTTP client cannot be built.
    pub fn with_token(token: String) -> Result<Self, MiningError> {
        let limiter = Arc::new(RateLimiter::new());
        let client = reqwest::Client::builder()
            .user_agent("car-agents/reference-miner")
            .build()
            .map_err(|e| MiningError::Other(anyhow::anyhow!(e)))?;
        Ok(Self {
            client,
            token,
            limiter,
            configure_limiter: AtomicBool::new(true),
            api_base: API_BASE.to_string(),
            license_cache: Arc::new(Mutex::new(HashMap::new())),
        })
    }

    /// Register the default limit under `RL_KEY` exactly once, and only when
    /// this instance still owns its limiter.
    async fn ensure_rate_limit_configured(&self) {
        // Only the first caller actually configures; subsequent callers no-op.
        if self.configure_limiter.swap(false, Ordering::AcqRel) {
            self.limiter.set_limit(RL_KEY, default_rate_limit()).await;
        }
    }

    /// Override the API base URL. Used by tests to point at a mock server.
    pub fn with_api_base(mut self, base: impl Into<String>) -> Self {
        self.api_base = base.into();
        self
    }

    /// Share an externally-owned `RateLimiter` (e.g. the engine's global one).
    /// The caller is responsible for configuring limits on the provided
    /// instance; if `RL_KEY` isn't configured there, no limiting is applied.
    pub fn with_rate_limiter(mut self, limiter: Arc<RateLimiter>) -> Self {
        self.limiter = limiter;
        // The shared limiter is the caller's responsibility; don't overwrite
        // their configuration on first use. We hold `self` by value, so the
        // flag can be written without an atomic operation.
        *self.configure_limiter.get_mut() = false;
        self
    }

    /// Look up the SPDX license id of `full_name` (`owner/repo`).
    ///
    /// Returns `None` when the repo has no usable license id (missing, empty,
    /// or `NOASSERTION`) and also when the lookup itself fails.
    ///
    /// Only definitive answers are cached. A failed request, a non-success
    /// status, or an unparseable body is *not* recorded, so a transient
    /// failure (for instance a rate-limit response) cannot mark the repo as
    /// "no license" for the rest of this instance's lifetime; the next call
    /// simply retries.
    async fn fetch_license(&self, full_name: &str) -> Option<String> {
        {
            // Scope the guard so the lock is released before any network I/O.
            let cache = self.license_cache.lock().await;
            if let Some(known) = cache.get(full_name) {
                return known.clone();
            }
        }

        let url = format!("{}/repos/{}", self.api_base, full_name);
        self.limiter.acquire(RL_KEY).await;
        let sent = self
            .client
            .get(&url)
            .bearer_auth(&self.token)
            .header("Accept", "application/vnd.github+json")
            .send()
            .await;

        // Anything short of a successful, parseable response is "unknown",
        // not "unlicensed": report `None` but leave the cache untouched.
        let resp = match sent {
            Ok(r) if r.status().is_success() => r,
            _ => return None,
        };
        let meta = match resp.json::<RepoMeta>().await {
            Ok(m) => m,
            Err(_) => return None,
        };

        let spdx = meta
            .license
            .and_then(|l| l.spdx_id)
            .filter(|s| !s.is_empty() && s != "NOASSERTION");
        self.license_cache
            .lock()
            .await
            .insert(full_name.to_string(), spdx.clone());
        spdx
    }
}

#[async_trait]
impl ReferenceMiner for GitHubSource {
    fn name(&self) -> &str {
        "github"
    }

    async fn search(&self, query: &MiningQuery) -> Result<Vec<CodeReference>, MiningError> {
        if query.query.trim().is_empty() {
            return Err(MiningError::InvalidQuery("empty query".into()));
        }
        self.ensure_rate_limit_configured().await;
        let filters = &query.filters;
        let q = build_search_q(&query.query, filters);
        let per_page = filters.max_results.clamp(1, 100);
        let url = format!("{}{}", self.api_base, SEARCH_PATH);

        self.limiter.acquire(RL_KEY).await;
        let resp = self
            .client
            .get(&url)
            .bearer_auth(&self.token)
            .header("Accept", "application/vnd.github.v3.text-match+json")
            .query(&[("q", q.as_str()), ("per_page", &per_page.to_string())])
            .send()
            .await
            .map_err(|e| MiningError::Unavailable(format!("github request failed: {e}")))?;

        let status = resp.status();
        if status == reqwest::StatusCode::FORBIDDEN || status == reqwest::StatusCode::TOO_MANY_REQUESTS {
            // Secondary / primary rate limit.
            return Err(MiningError::RateLimited(format!(
                "github returned {} (rate limit)",
                status.as_u16()
            )));
        }
        if status == reqwest::StatusCode::UNPROCESSABLE_ENTITY {
            return Err(MiningError::InvalidQuery(format!(
                "github rejected query: {q}"
            )));
        }
        if !status.is_success() {
            let body = resp.text().await.unwrap_or_default();
            return Err(MiningError::Unavailable(format!(
                "github {}: {}",
                status.as_u16(),
                truncate(&body, 200)
            )));
        }

        let payload: SearchResponse = resp
            .json()
            .await
            .map_err(|e| MiningError::Other(anyhow::anyhow!("github body parse: {e}")))?;

        // Resolve licenses in parallel, but bounded — one lookup per unique repo.
        let mut unique_repos: Vec<String> = payload
            .items
            .iter()
            .map(|i| i.repository.full_name.clone())
            .collect();
        unique_repos.sort();
        unique_repos.dedup();
        let license_map: HashMap<String, Option<String>> =
            futures::future::join_all(unique_repos.into_iter().map(|name| async {
                let l = self.fetch_license(&name).await;
                (name, l)
            }))
            .await
            .into_iter()
            .collect();

        let total = payload.items.len().max(1) as f32;
        let mut out = Vec::new();
        for (rank, item) in payload.items.into_iter().enumerate() {
            if !language_matches(&filters.languages, &item.path) {
                continue;
            }
            let license = license_map
                .get(&item.repository.full_name)
                .cloned()
                .unwrap_or(None);
            if !license_matches(&filters.license_allowlist, license.as_deref()) {
                continue;
            }
            // GitHub doesn't expose a numeric score we can compare across queries.
            // Fall back to a positional score derived from the returned order,
            // which is already relevance-ranked by the API.
            let score = 1.0 - (rank as f32 / total);
            let snippet = item
                .text_matches
                .iter()
                .map(|m| m.fragment.clone())
                .collect::<Vec<_>>()
                .join("\n---\n");
            let snippet = if snippet.is_empty() {
                format!("(no text_matches fragment — see {})", item.html_url)
            } else {
                snippet
            };
            let commit = item.sha.clone();
            out.push(CodeReference {
                repo: format!("github.com/{}", item.repository.full_name),
                commit,
                path: item.path,
                snippet,
                score,
                license,
                why_relevant: format!(
                    "github rank {} for query \"{}\"",
                    rank + 1,
                    truncate(&query.query, 60)
                ),
            });
        }
        out.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap_or(std::cmp::Ordering::Equal));
        if filters.max_results > 0 {
            out.truncate(filters.max_results);
        }
        Ok(out)
    }
}

/// Assemble the `q` parameter sent to GitHub: the raw query text, followed by
/// a `language:` qualifier when the filters name at least one language.
fn build_search_q(query: &str, filters: &MiningFilters) -> String {
    // GitHub's code search accepts a `language:` qualifier. Only the first
    // allowed language is used (multiple language qualifiers aren't ANDed
    // usefully).
    match filters.languages.first() {
        Some(lang) => format!("{} language:{}", query, lang),
        None => query.to_string(),
    }
}

/// Shorten `s` to at most `n` bytes of its original text.
///
/// Strings that already fit are returned unchanged. Otherwise the cut is
/// moved back to the nearest UTF-8 character boundary at or below `n` (so a
/// multi-byte character is never split) and a single `…` is appended to show
/// that text was dropped.
fn truncate(s: &str, n: usize) -> String {
    if s.len() <= n {
        return s.to_string();
    }
    // Index 0 is always a boundary, so the search cannot come up empty; the
    // fallback only keeps the expression total.
    let end = (0..=n)
        .rev()
        .find(|&i| s.is_char_boundary(i))
        .unwrap_or(0);
    let mut out = String::with_capacity(end + '…'.len_utf8());
    out.push_str(&s[..end]);
    out.push('…');
    out
}

/// Body of a code-search response; only the fields this module reads are
/// declared.
#[derive(Deserialize)]
struct SearchResponse {
    /// Search hits, in the order returned. A missing field deserializes as
    /// an empty list.
    #[serde(default)]
    items: Vec<SearchItem>,
}

/// One hit from the code-search response.
#[derive(Deserialize)]
struct SearchItem {
    /// File path of the hit; used for language filtering and reported as
    /// `CodeReference::path`.
    path: String,
    /// SHA reported for the hit; copied into `CodeReference::commit`.
    sha: String,
    /// Link to the hit; only used in the placeholder snippet when no text
    /// fragments are present.
    html_url: String,
    /// Repository the hit belongs to.
    repository: RepoRef,
    /// Matched fragments. A missing field deserializes as an empty list.
    #[serde(default)]
    text_matches: Vec<TextMatch>,
}

/// Repository reference embedded in a search hit.
#[derive(Deserialize)]
struct RepoRef {
    /// Repository name as used in `{api_base}/repos/{full_name}` and as the
    /// license-cache key.
    full_name: String,
}

/// One entry of a hit's `text_matches` array.
#[derive(Deserialize)]
struct TextMatch {
    /// Text fragment containing the match; fragments are joined to form the
    /// reference snippet.
    fragment: String,
}

/// Subset of the repository metadata response read by the license lookup.
#[derive(Deserialize)]
struct RepoMeta {
    /// License block of the repository, if the response carries one.
    license: Option<LicenseMeta>,
}

/// License block inside the repository metadata response.
#[derive(Deserialize)]
struct LicenseMeta {
    /// SPDX identifier of the license. Empty strings and `NOASSERTION` are
    /// treated as "no license" by the lookup code.
    spdx_id: Option<String>,
}

#[cfg(test)]
mod tests {
    use super::*;

    /// The first allowed language is appended as a `language:` qualifier.
    #[test]
    fn build_q_appends_language() {
        let filters = MiningFilters {
            languages: vec!["rust".into()],
            ..Default::default()
        };
        let built = build_search_q("token bucket", &filters);
        assert_eq!(built, "token bucket language:rust");
    }

    /// With no language filter the query text is passed through untouched.
    #[test]
    fn build_q_without_language() {
        let filters = MiningFilters::default();
        let built = build_search_q("foo bar", &filters);
        assert_eq!(built, "foo bar");
    }
}