car-search 0.16.1

External code discovery + indexing for Common Agent Runtime
Documentation
//! Indexer agent — persist foreign code knowledge into the memory graph.
//!
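//! Consumes `CodeReference` items (typically produced by `ReferenceMiner`),
//! parses each snippet via `car-ast`, and inserts the resulting symbols as
//! `MemKind::CodeSymbol` nodes under `Partition::Foreign { source_repo, commit }`.
//! A snippet that `car-ast` cannot parse is stored whole as a single
//! `MemKind::Fact` node instead, and the reference is recorded in
//! `IndexReport::failures`.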
//!
//! Idempotency is inherited from `MemoryGraph::insert_foreign` — re-ingesting
//! the same (repo, commit, key) returns the existing node rather than creating
//! a duplicate.

use crate::reference_miner::CodeReference;
use car_ast::parse_file;
#[cfg(test)]
use car_memgine::Partition;
use car_memgine::{ContentType, FactMetadata, MemKind, MemNode, MemoryGraph, NodeIndex};
use chrono::Utc;

/// Outcome of indexing a batch of `CodeReference`s, as returned by
/// `Indexer::ingest`.
#[derive(Debug, Default, Clone)]
pub struct IndexReport {
    /// Total references considered (the length of the input slice).
    pub references_seen: usize,
    /// Number of references that parsed successfully.
    pub references_parsed: usize,
    /// Number of nodes handed to the graph: one per parsed symbol plus one
    /// per raw-snippet fallback node. The counter is bumped on every insert
    /// call, so a re-ingested reference is counted again even when the graph
    /// dedupes it and returns the existing node.
    pub symbols_inserted: usize,
    /// Node indices returned by the graph, in the order the inserts were
    /// made. Contains one entry per insert call, so it has the same length
    /// as `symbols_inserted` and may repeat an index when the graph dedupes.
    pub node_ids: Vec<NodeIndex>,
    /// Per-reference parse failures as `(path, reason)` pairs.
    pub failures: Vec<(String, String)>,
}

/// Indexer agent. Stateless — the backing state lives in the `MemoryGraph`.
///
/// Being a field-less marker type, it derives the cheap common traits so it
/// can be debug-printed, copied freely, and embedded in other types that
/// derive `Debug`/`Clone` themselves.
#[derive(Debug, Clone, Copy)]
pub struct Indexer;

impl Indexer {
    pub fn new() -> Self {
        Self
    }

    /// Ingest references into the graph. Returns a report describing what
    /// was parsed and inserted.
    pub fn ingest(&self, graph: &mut MemoryGraph, references: &[CodeReference]) -> IndexReport {
        let mut report = IndexReport::default();
        report.references_seen = references.len();

        for r in references {
            let parsed = match parse_file(&r.snippet, &r.path) {
                Some(p) => p,
                None => {
                    // Parsing failed — store the raw snippet so the knowledge
                    // isn't lost. On success we rely on parsed symbols and
                    // skip the snippet node to avoid duplicating content.
                    let nix =
                        graph.insert_foreign(r.repo.clone(), r.commit.clone(), snippet_node(r));
                    report.node_ids.push(nix);
                    report.symbols_inserted += 1;
                    report
                        .failures
                        .push((r.path.clone(), "unsupported language".into()));
                    continue;
                }
            };
            report.references_parsed += 1;

            for sym in parsed.all_symbols() {
                let node = MemNode {
                    kind: MemKind::CodeSymbol,
                    layer: 2,
                    key: format!("{}::{}", r.path, sym.name),
                    value: if sym.signature.is_empty() {
                        sym.name.clone()
                    } else {
                        sym.signature.clone()
                    },
                    fact_id: Some(format!(
                        "foreign::{}::{}::{}::{}",
                        r.repo, r.commit, r.path, sym.name
                    )),
                    scope: foreign_scope(&r.repo, &r.commit),
                    authority: license_authority(&r.license),
                    is_constraint: false,
                    created_at: Utc::now(),
                    expires_at: None,
                    content_type: ContentType::Code(car_memgine::CodeLanguage::Unknown),
                    metadata: FactMetadata::default(),
                };
                let nix = graph.insert_foreign(r.repo.clone(), r.commit.clone(), node);
                report.node_ids.push(nix);
                report.symbols_inserted += 1;
            }
        }

        report
    }
}

impl Default for Indexer {
    fn default() -> Self {
        Self::new()
    }
}

/// Builds the fallback node for a reference whose snippet could not be
/// parsed: a layer-2 `MemKind::Fact` keyed by the reference's path, carrying
/// the raw snippet text as its value.
///
/// The node's `fact_id` is `foreign::{repo}::{commit}::{path}`; scope and
/// authority come from `foreign_scope` and `license_authority`.
fn snippet_node(r: &CodeReference) -> MemNode {
    let fact_id = format!("foreign::{}::{}::{}", r.repo, r.commit, r.path);
    let scope = foreign_scope(&r.repo, &r.commit);
    let authority = license_authority(&r.license);
    // The snippet failed to parse, so its language is recorded as unknown.
    let language = car_memgine::CodeLanguage::Unknown;

    MemNode {
        kind: MemKind::Fact,
        layer: 2,
        key: r.path.clone(),
        value: r.snippet.clone(),
        fact_id: Some(fact_id),
        scope,
        authority,
        is_constraint: false,
        created_at: Utc::now(),
        expires_at: None,
        content_type: ContentType::Code(language),
        metadata: Default::default(),
    }
}

/// Renders the scope label for a foreign repo at a given commit, in the form
/// `foreign:<repo>@<commit>`.
fn foreign_scope(repo: &str, commit: &str) -> String {
    ["foreign:", repo, "@", commit].concat()
}

/// Renders the authority label for externally sourced code:
/// `external:<license>` when a license string is present, otherwise
/// `external:unknown-license`.
fn license_authority(license: &Option<String>) -> String {
    license.as_ref().map_or_else(
        || String::from("external:unknown-license"),
        |spdx| format!("external:{spdx}"),
    )
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::reference_miner::CodeReference;

    /// A small Rust reference declaring one function (`hello`) and one
    /// struct (`Greeter`).
    fn rust_ref() -> CodeReference {
        CodeReference {
            repo: "github.com/acme/widget".into(),
            commit: "abc123".into(),
            path: "src/lib.rs".into(),
            snippet: "pub fn hello() -> &'static str { \"hi\" }\npub struct Greeter;".into(),
            score: 0.9,
            license: Some("MIT".into()),
            why_relevant: "matches greeting query".into(),
        }
    }

    #[test]
    fn ingests_symbols_into_foreign_partition() {
        let mut g = MemoryGraph::new();
        let report = Indexer::new().ingest(&mut g, &[rust_ref()]);
        assert_eq!(report.references_seen, 1);
        assert_eq!(report.references_parsed, 1);
        assert!(report.failures.is_empty());
        // A successful parse writes symbol nodes only — no snippet node.
        assert!(
            report.symbols_inserted >= 2,
            "expected at least the `hello` and `Greeter` symbols"
        );
        // Every insert records exactly one node index.
        assert_eq!(report.node_ids.len(), report.symbols_inserted);
        for nix in &report.node_ids {
            match g.partition_of(*nix) {
                Partition::Foreign { source_repo, .. } => {
                    assert_eq!(source_repo, "github.com/acme/widget");
                }
                _ => panic!("indexer wrote outside Foreign partition"),
            }
        }
    }

    #[test]
    fn ingestion_is_idempotent() {
        let mut g = MemoryGraph::new();
        let r = rust_ref();
        let first = Indexer::new().ingest(&mut g, &[r.clone()]);
        let node_count_after_first = g.inner.node_count();
        let second = Indexer::new().ingest(&mut g, &[r]);
        assert_eq!(
            g.inner.node_count(),
            node_count_after_first,
            "re-ingesting the same reference must not grow the graph"
        );
        assert_eq!(first.node_ids.len(), second.node_ids.len());
        // The report counts insert calls, so it is the same on both passes.
        assert_eq!(first.symbols_inserted, second.symbols_inserted);
    }

    #[test]
    fn unsupported_language_stores_raw_snippet() {
        let mut g = MemoryGraph::new();
        let r = CodeReference {
            path: "doc/notes.xyz".into(),
            snippet: "some notes in an unknown format".into(),
            ..rust_ref()
        };
        let report = Indexer::new().ingest(&mut g, &[r]);
        assert_eq!(report.references_seen, 1);
        assert_eq!(report.references_parsed, 0);
        // The single fallback snippet node is counted and its index recorded.
        assert_eq!(report.symbols_inserted, 1);
        assert_eq!(report.node_ids.len(), 1);
        assert_eq!(report.failures.len(), 1);
        let (path, reason) = &report.failures[0];
        assert_eq!(path.as_str(), "doc/notes.xyz");
        assert_eq!(reason.as_str(), "unsupported language");
    }
}