car-search 0.25.0

External code discovery + indexing for Common Agent Runtime
//! ReferenceMiner agent — discover analogous code across external repositories.
//!
//! Given a natural-language query, returns ranked `CodeReference` hits from
//! configured sources (GitHub search, local clone roots, pre-indexed corpora).
//! Designed to feed the GATHER phase of analysis agents and the paper_to_code
//! mission: answer "how do others solve this?" with concrete evidence.
//!
//! This module defines the trait and data types. Concrete implementations
//! live in submodules: [`github`], [`local`], [`multi`], and [`scoring`].

use async_trait::async_trait;
use serde::{Deserialize, Serialize};

pub mod github;
pub mod local;
pub mod multi;
pub mod scoring;

pub use github::GitHubSource;
pub use local::LocalCloneSource;
pub use multi::MultiSource;
pub use scoring::score_with_llm;

/// Decides whether `path` belongs to any language named in `languages`
/// (the list carried by `MiningFilters::languages`).
///
/// Matching is by file extension only, and both the language name and the
/// extension are compared ASCII-case-insensitively. Well-known language names
/// map to their conventional extensions; any other name is compared against
/// the extension verbatim (so `"toml"` matches `Cargo.toml`).
///
/// An empty `languages` slice imposes no restriction and always yields `true`.
pub(crate) fn language_matches(languages: &[String], path: &str) -> bool {
    if languages.is_empty() {
        return true;
    }

    // A path without a (UTF-8) extension is treated as having the empty extension.
    let lowered = std::path::Path::new(path)
        .extension()
        .and_then(|os| os.to_str())
        .map(str::to_ascii_lowercase)
        .unwrap_or_default();
    let file_ext = lowered.as_str();

    languages.iter().any(|name| {
        let wanted = name.to_ascii_lowercase();
        let accepted: &[&str] = match wanted.as_str() {
            "rust" => &["rs"],
            "typescript" => &["ts", "tsx"],
            "javascript" => &["js", "jsx", "mjs", "cjs"],
            "python" => &["py"],
            "go" => &["go"],
            "c" => &["c", "h"],
            "cpp" | "c++" => &["cpp", "cc", "cxx", "hpp", "hh", "hxx"],
            "java" => &["java"],
            "ruby" => &["rb"],
            "swift" => &["swift"],
            "kotlin" => &["kt", "kts"],
            // Unknown name: treat it as a literal extension.
            other => return file_ext == other,
        };
        accepted.contains(&file_ext)
    })
}

/// Checks a reference's license against the configured allowlist.
///
/// * An empty `allowlist` means "no restriction": every reference passes,
///   including one whose license is unknown.
/// * With a non-empty `allowlist`, an unknown license (`None`) is rejected,
///   and a known one passes only if it equals some allowlist entry, ignoring
///   ASCII case. The identifier is compared as a whole string.
pub(crate) fn license_matches(allowlist: &[String], license: Option<&str>) -> bool {
    allowlist.is_empty()
        || license.map_or(false, |id| {
            allowlist.iter().any(|entry| entry.eq_ignore_ascii_case(id))
        })
}

/// A single code hit returned by a `ReferenceMiner`.
///
/// Carries enough provenance that downstream agents can:
///   - attribute evidence correctly (never confuse foreign code with project code),
///   - honor license constraints,
///   - re-fetch the exact snippet by (repo, commit, path).
///
/// The type derives `Serialize`/`Deserialize`, so the field names below are
/// also the keys of its serialized form.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CodeReference {
    /// Origin repository identifier, e.g. `github.com/owner/name` or a local path.
    pub repo: String,
    /// Commit SHA or ref the snippet was extracted at. Required for reproducibility.
    pub commit: String,
    /// Path within the repo.
    pub path: String,
    /// The snippet itself. Should be small enough to fit curated evidence budgets.
    pub snippet: String,
    /// Relevance score in [0.0, 1.0]. Higher is better.
    ///
    /// The range is a convention for producers; the `f32` type itself does
    /// not enforce it.
    pub score: f32,
    /// SPDX-style license identifier if known (e.g. "MIT", "Apache-2.0"),
    /// or `None` when the source did not disclose one.
    pub license: Option<String>,
    /// One-sentence rationale for why this hit matched the query. Useful
    /// for both LLM consumers and human auditing.
    pub why_relevant: String,
}

/// Where a `ReferenceMiner` should look.
///
/// Serialized in snake_case (`"remote"`, `"local"`, `"all"`). The `Default`
/// value is [`MiningScope::All`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum MiningScope {
    /// Public code search (e.g. GitHub).
    Remote,
    /// User-configured local clone roots only. No network.
    Local,
    /// Both remote and local.
    All,
}

impl Default for MiningScope {
    fn default() -> Self {
        MiningScope::All
    }
}

/// Filters applied before scoring.
///
/// Keep this struct narrow — filters that require the full snippet belong in
/// the scorer, not here.
///
/// The derived `Default` is two empty lists (no language or license
/// restriction) and `max_results == 0`. No field carries `#[serde(default)]`,
/// so a serialized `MiningFilters` is expected to spell out all three fields.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct MiningFilters {
    /// Restrict to these programming languages (by conventional name,
    /// e.g. "rust", "typescript"). Empty means no restriction.
    pub languages: Vec<String>,
    /// Allowed SPDX license identifiers. Empty means no restriction.
    /// Implementations SHOULD drop hits whose license is unknown when this
    /// list is non-empty — unknown license is not a match.
    pub license_allowlist: Vec<String>,
    /// Maximum results to return after scoring.
    ///
    /// NOTE(review): the derived default is `0`; whether a source treats `0`
    /// as "no limit" or as "return nothing" is not visible in this module —
    /// confirm against the concrete sources.
    pub max_results: usize,
}

/// Query passed to a `ReferenceMiner`.
///
/// Only `query` is required when deserializing; `scope` and `filters` fall
/// back to their `Default` values when absent.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MiningQuery {
    /// The natural-language description of the code being looked for.
    pub query: String,
    /// Which kind of sources to consult. Defaults to `MiningScope::All`.
    #[serde(default)]
    pub scope: MiningScope,
    /// Language/license restrictions and the result cap. Defaults to
    /// `MiningFilters::default()`.
    #[serde(default)]
    pub filters: MiningFilters,
}

/// Errors a miner can produce. Kept as a single enum so consumers handle one
/// type at the boundary rather than per-source variants.
///
/// The `String` payload of the first three variants is a free-form detail
/// that is appended to the variant's fixed message prefix.
#[derive(Debug, thiserror::Error)]
pub enum MiningError {
    /// A rate limit was exceeded.
    #[error("rate limit exceeded: {0}")]
    RateLimited(String),
    /// The source could not be reached or used.
    #[error("source unavailable: {0}")]
    Unavailable(String),
    /// The query itself was rejected as invalid.
    #[error("invalid query: {0}")]
    InvalidQuery(String),
    /// Any other failure. `transparent` forwards the message and source of
    /// the wrapped `anyhow::Error`, and `#[from]` provides the
    /// `From<anyhow::Error>` conversion used by `?`.
    #[error(transparent)]
    Other(#[from] anyhow::Error),
}

/// A source of code references. Implementations should be lock-free and
/// cancellation-safe; the mining agent may call multiple sources concurrently.
///
/// The `Send + Sync` supertraits require every implementation to be usable
/// from, and shareable between, threads. `#[async_trait]` is what allows the
/// `async fn` in this trait.
#[async_trait]
pub trait ReferenceMiner: Send + Sync {
    /// Human-readable name for logs and provenance tagging.
    fn name(&self) -> &str;

    /// Execute a mining query. Results should already be scored and filtered
    /// according to `query.filters`, sorted descending by `score`.
    ///
    /// # Errors
    ///
    /// Returns a [`MiningError`] when the query cannot be served; see that
    /// enum for the failure categories.
    async fn search(&self, query: &MiningQuery) -> Result<Vec<CodeReference>, MiningError>;
}