// car-ast 0.14.0
//
// Tree-sitter AST parsing for code-aware inference
// Documentation
//! Cross-repository symbol index — aggregates multiple `ProjectIndex`
//! instances so symbol lookup and call-graph queries can span foreign code
//! while keeping provenance intact.
//!
//! Each hit carries a `RepoProvenance` tag so callers (e.g. the GATHER phase
//! of analysis agents) never confuse the host project's own symbols with
//! external code.

use std::collections::HashMap;
use std::path::Path;
use std::sync::OnceLock;

use crate::index::{CrossReference, IndexedSymbol, ProjectIndex};

/// Identifier for a repository in the global index.
///
/// Use `RepoId::host()` for the user's own project (the default in legacy
/// single-repo flows) and `RepoId::foreign(repo, commit)` for an external
/// checkout indexed at a specific commit.
///
/// The derived ordering places `Host` before every `Foreign` id and orders
/// foreign ids by `repo`, then `commit`, so ids can be used directly as a
/// deterministic sort key or as keys of an ordered map.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum RepoId {
    /// The user's own project.
    Host,
    /// An external repository pinned to a specific commit.
    Foreign {
        /// Repository name or locator, as supplied by the caller.
        repo: String,
        /// Commit the checkout was indexed at, as supplied by the caller.
        commit: String,
    },
}

impl RepoId {
    pub fn host() -> Self {
        RepoId::Host
    }
    pub fn foreign(repo: impl Into<String>, commit: impl Into<String>) -> Self {
        RepoId::Foreign {
            repo: repo.into(),
            commit: commit.into(),
        }
    }
    pub fn is_host(&self) -> bool {
        matches!(self, RepoId::Host)
    }
    pub fn is_foreign(&self) -> bool {
        matches!(self, RepoId::Foreign { .. })
    }
}

/// Provenance attached to every cross-repo hit.
#[derive(Debug, Clone)]
pub struct RepoProvenance {
    pub repo: RepoId,
}

/// A symbol hit spanning multiple repos. Always tagged with provenance.
///
/// The hit borrows the symbol from the index that produced it, so it cannot
/// outlive that index.
#[derive(Debug, Clone)]
pub struct GlobalHit<'a> {
    /// Repository the symbol was found in.
    pub provenance: RepoProvenance,
    /// The matched symbol, borrowed from the owning repo's index.
    pub symbol: &'a IndexedSymbol,
}

/// A cross-reference spanning multiple repos.
///
/// The value borrows the reference from the index that produced it, so it
/// cannot outlive that index.
#[derive(Debug, Clone)]
pub struct GlobalReference<'a> {
    /// Repository whose index reported the reference.
    pub provenance: RepoProvenance,
    /// The cross-reference itself, borrowed from that repo's index.
    pub reference: &'a CrossReference,
}

/// An "implementation" match — a symbol that plausibly implements a named
/// concept (trait, interface, design idea). Derived heuristically from
/// symbol name + signature.
#[derive(Debug, Clone)]
pub struct ImplementationHit<'a> {
    /// Repository the symbol was found in.
    pub provenance: RepoProvenance,
    /// The matched symbol, borrowed from the owning repo's index.
    pub symbol: &'a IndexedSymbol,
    /// Short rationale for why this matched the concept.
    pub evidence: String,
}

/// Cross-repo symbol index.
///
/// Maintains a lazy top-level name index so `find` and `find_fuzzy` don't
/// linear-scan every symbol of every repo on every call. The cache is built
/// on first lookup and reset by `insert`. `find_implementations` does not
/// use the cache; it scans the registered repos directly because it also
/// matches against signatures.
#[derive(Debug, Default)]
pub struct GlobalIndex {
    /// Registered per-repository indexes, keyed by repository id.
    repos: HashMap<RepoId, ProjectIndex>,
    /// Lazily built map from lowercased symbol name to the `(repo, symbol
    /// key)` pairs that carry that name.
    name_cache: OnceLock<HashMap<String, Vec<(RepoId, String)>>>,
}

impl GlobalIndex {
    /// Create an empty index with no registered repositories.
    pub fn new() -> Self {
        Self {
            repos: HashMap::new(),
            name_cache: OnceLock::new(),
        }
    }

    /// Register an already-built `ProjectIndex` under a given `RepoId`.
    /// Re-registering with the same `RepoId` replaces the previous entry
    /// and invalidates the name cache.
    ///
    /// Every mutation of `repos` must go through this method so the lazy
    /// name cache can never describe a stale set of repositories.
    pub fn insert(&mut self, id: RepoId, index: ProjectIndex) {
        self.repos.insert(id, index);
        // Drop the cached name map; it is rebuilt on the next lookup.
        self.name_cache = OnceLock::new();
    }

    /// Build-or-return the name cache. Map is `lowercased_name →
    /// [(repo, symbol key)]` — values point back to entries in the
    /// underlying `ProjectIndex::symbols` map of the named repo.
    fn ensure_name_cache(&self) -> &HashMap<String, Vec<(RepoId, String)>> {
        self.name_cache.get_or_init(|| {
            let mut map: HashMap<String, Vec<(RepoId, String)>> = HashMap::new();
            for (repo_id, idx) in &self.repos {
                for (key, sym) in &idx.symbols {
                    map.entry(sym.symbol.name.to_lowercase())
                        .or_default()
                        .push((repo_id.clone(), key.clone()));
                }
            }
            map
        })
    }

    /// Look up a symbol in one repo's ProjectIndex by its canonical key.
    /// Returns `None` when either the repo or the key is unknown.
    fn get_symbol(&self, id: &RepoId, key: &str) -> Option<&IndexedSymbol> {
        self.repos.get(id).and_then(|p| p.symbols.get(key))
    }

    /// Sort key for a repository: the host sorts before any foreign repo,
    /// and foreign repos sort by `repo`, then `commit`.
    fn repo_sort_key(id: &RepoId) -> (bool, &str, &str) {
        match id {
            RepoId::Host => (false, "", ""),
            RepoId::Foreign { repo, commit } => (true, repo.as_str(), commit.as_str()),
        }
    }

    /// Convenience: build and register the host project index from a path.
    ///
    /// Goes through `insert`, so a name cache built by an earlier lookup is
    /// invalidated and the new symbols are visible to `find` / `find_fuzzy`.
    pub fn add_host(&mut self, root: &Path) {
        self.insert(RepoId::Host, ProjectIndex::build(root));
    }

    /// Convenience: build and register a foreign repo index from a path.
    ///
    /// Goes through `insert`, so a name cache built by an earlier lookup is
    /// invalidated and the new symbols are visible to `find` / `find_fuzzy`.
    pub fn add_foreign(&mut self, repo: impl Into<String>, commit: impl Into<String>, root: &Path) {
        self.insert(RepoId::foreign(repo, commit), ProjectIndex::build(root));
    }

    /// Iterate over the ids of all registered repositories, in unspecified
    /// order (they are stored in a `HashMap`).
    pub fn repos(&self) -> impl Iterator<Item = &RepoId> {
        self.repos.keys()
    }

    /// Exact-name lookup across all registered repos. Uses the lazy name
    /// cache to narrow candidates by lowercased name, then keeps only exact
    /// case-sensitive matches. The order of the returned hits is unspecified.
    pub fn find(&self, name: &str) -> Vec<GlobalHit<'_>> {
        let cache = self.ensure_name_cache();
        let Some(entries) = cache.get(&name.to_lowercase()) else {
            return Vec::new();
        };
        entries
            .iter()
            .filter_map(|(id, key)| {
                let sym = self.get_symbol(id, key)?;
                // The cache is keyed case-insensitively; re-check the exact spelling.
                (sym.symbol.name == name).then(|| GlobalHit {
                    provenance: RepoProvenance { repo: id.clone() },
                    symbol: sym,
                })
            })
            .collect()
    }

    /// Fuzzy case-insensitive lookup across all registered repos: returns
    /// every symbol whose lowercased name contains the lowercased `query`.
    /// Scans the distinct cached names rather than every symbol. An empty
    /// query matches every symbol. The order of the returned hits is
    /// unspecified.
    pub fn find_fuzzy(&self, query: &str) -> Vec<GlobalHit<'_>> {
        let cache = self.ensure_name_cache();
        let q_lower = query.to_lowercase();
        let mut out = Vec::new();
        for (name_lower, entries) in cache.iter() {
            if !name_lower.contains(&q_lower) {
                continue;
            }
            out.extend(entries.iter().filter_map(|(id, key)| {
                self.get_symbol(id, key).map(|sym| GlobalHit {
                    provenance: RepoProvenance { repo: id.clone() },
                    symbol: sym,
                })
            }));
        }
        out
    }

    /// Callers of a symbol name across all repos, each tagged with the repo
    /// whose index reported it.
    pub fn callers_of(&self, symbol_name: &str) -> Vec<GlobalReference<'_>> {
        let mut out = Vec::new();
        for (id, idx) in &self.repos {
            for r in idx.callers_of(symbol_name) {
                out.push(GlobalReference {
                    provenance: RepoProvenance { repo: id.clone() },
                    reference: r,
                });
            }
        }
        out
    }

    /// Callees of a symbol name across all repos, each tagged with the repo
    /// whose index reported it.
    pub fn callees_of(&self, symbol_name: &str) -> Vec<GlobalReference<'_>> {
        let mut out = Vec::new();
        for (id, idx) in &self.repos {
            for r in idx.callees_of(symbol_name) {
                out.push(GlobalReference {
                    provenance: RepoProvenance { repo: id.clone() },
                    reference: r,
                });
            }
        }
        out
    }

    /// Find symbols that plausibly implement a named concept. The concept
    /// matcher is deliberately dumb (case-insensitive substring over name
    /// and signature, with no scoring) — the intent is evidence gathering,
    /// not semantic reasoning. LLM scoring happens downstream.
    ///
    /// An empty `concept` yields no hits. Each hit's `evidence` says whether
    /// the name, the signature, or both matched.
    ///
    /// Results are ordered host first, then by foreign repo (`repo`, then
    /// `commit`), then by symbol name, then by the symbol's key within its
    /// repo, so the output is identical across runs.
    pub fn find_implementations(&self, concept: &str) -> Vec<ImplementationHit<'_>> {
        let needle = concept.to_lowercase();
        if needle.is_empty() {
            return Vec::new();
        }
        // Keep each hit's symbol key alongside it: keys are unique within a
        // repo, which makes the final ordering total.
        let mut keyed = Vec::new();
        for (id, idx) in &self.repos {
            for (key, sym) in &idx.symbols {
                let name_hit = sym.symbol.name.to_lowercase().contains(&needle);
                let sig_hit = sym.symbol.signature.to_lowercase().contains(&needle);
                let evidence = match (name_hit, sig_hit) {
                    (false, false) => continue,
                    (true, true) => format!("name and signature both match '{concept}'"),
                    (true, false) => format!("name matches '{concept}'"),
                    (false, true) => format!("signature matches '{concept}'"),
                };
                keyed.push((
                    key,
                    ImplementationHit {
                        provenance: RepoProvenance { repo: id.clone() },
                        symbol: sym,
                        evidence,
                    },
                ));
            }
        }
        // `repos` and `symbols` iterate in arbitrary order, so every level
        // of the ordering has to be spelled out for the result to be stable.
        keyed.sort_by(|(key_a, a), (key_b, b)| {
            Self::repo_sort_key(&a.provenance.repo)
                .cmp(&Self::repo_sort_key(&b.provenance.repo))
                .then_with(|| a.symbol.symbol.name.cmp(&b.symbol.symbol.name))
                .then_with(|| key_a.cmp(key_b))
        });
        keyed.into_iter().map(|(_, hit)| hit).collect()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    /// Write `body` to `dir/rel`, creating any missing parent directories.
    fn write(dir: &Path, rel: &str, body: &str) {
        let target = dir.join(rel);
        if let Some(parent) = target.parent() {
            fs::create_dir_all(parent).unwrap();
        }
        fs::write(target, body.as_bytes()).unwrap();
    }

    /// Lay out a temporary `host/src/a.rs` and `foreign/src/b.rs` with the
    /// given sources and index both. The `TempDir` is returned so the files
    /// stay on disk for as long as the caller keeps it alive.
    fn host_and_foreign(host_src: &str, foreign_src: &str) -> (tempfile::TempDir, GlobalIndex) {
        let tmp = tempfile::tempdir().unwrap();
        let host = tmp.path().join("host");
        let foreign = tmp.path().join("foreign");
        fs::create_dir_all(&host).unwrap();
        fs::create_dir_all(&foreign).unwrap();

        write(&host, "src/a.rs", host_src);
        write(&foreign, "src/b.rs", foreign_src);

        let mut index = GlobalIndex::new();
        index.add_host(&host);
        index.add_foreign("github.com/foo/bar", "abc123", &foreign);
        (tmp, index)
    }

    #[test]
    fn find_spans_host_and_foreign() {
        let (_tmp, g) = host_and_foreign(
            "pub fn parse_query() {}\n",
            "pub fn parse_query() {}\npub fn other() {}\n",
        );

        let hits = g.find("parse_query");
        assert_eq!(
            hits.len(),
            2,
            "should find parse_query in both host and foreign"
        );
        assert!(hits.iter().any(|h| h.provenance.repo.is_host()));
        assert!(hits.iter().any(|h| h.provenance.repo.is_foreign()));
    }

    #[test]
    fn find_implementations_orders_host_first() {
        let (_tmp, g) = host_and_foreign("pub fn cache_get() {}\n", "pub fn cache_set() {}\n");

        let hits = g.find_implementations("cache");
        assert!(hits.len() >= 2);
        assert!(
            hits[0].provenance.repo.is_host(),
            "host hits must sort before foreign hits"
        );
        assert!(hits.iter().all(|h| !h.evidence.is_empty()));
    }

    #[test]
    fn empty_concept_returns_nothing() {
        let hits = GlobalIndex::new().find_implementations("");
        assert!(hits.is_empty());
    }
}