// agent-source-repository 0.1.0

pub(crate) mod build;
mod cache;

use std::collections::{HashMap, HashSet};
use std::fmt;
use std::path::Path;

use anyhow::{bail, Context, Result};

use crate::bm25::Bm25Index;
use crate::encoder::{SemanticIndex, StaticEncoder};
use crate::exact::ExactIndex;
use crate::graph::DependencyGraph;
use crate::model::{Chunk, IndexStats, SearchResult};
use crate::search::{search_bm25, search_hybrid, HybridSearchContext};
use crate::source_tree::SourceTree;
use crate::tokens::tokenize;
use build::{build_bm25_index_from_path, build_index_from_path};

/// State required to answer hybrid (semantic + BM25) queries.
struct HybridSearchBackend {
    /// Encoder used to embed text when querying the semantic index.
    encoder: StaticEncoder,
    /// Semantic index produced together with the BM25 index at build time.
    semantic_index: SemanticIndex,
}

/// Which ranking backend a `SourceIndex` uses in addition to exact matching.
enum SearchBackend {
    /// Semantic + BM25 search; boxed so the enum itself stays small.
    Hybrid(Box<HybridSearchBackend>),
    /// Lexical search only; no encoder or semantic index is held.
    Bm25Only,
}

/// Failure modes of `SourceIndex::from_source_semantic`.
#[derive(Debug)]
pub enum SemanticIndexBuildError {
    /// The semantic encoder could not be loaded.
    SemanticUnavailable(anyhow::Error),
    /// Resolving the source tree or building the index failed.
    Index(anyhow::Error),
}

impl fmt::Display for SemanticIndexBuildError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::SemanticUnavailable(err) => write!(f, "semantic model unavailable: {err:#}"),
            Self::Index(err) => write!(f, "index build failed: {err:#}"),
        }
    }
}

// Marker impl: the wrapped error is already rendered by `Display`, so no
// `source()` override is provided.
impl std::error::Error for SemanticIndexBuildError {}

/// Searchable index over the chunks of a source tree.
///
/// Always carries a BM25 index and an exact-match index; a semantic index is
/// present only when the backend is hybrid.
pub struct SourceIndex {
    /// Lexical index used by both the hybrid and the BM25-only backends.
    bm25_index: Bm25Index,
    /// Exact-match index whose hits are fused into every search result list.
    exact_index: ExactIndex,
    /// Ranking backend selected when the index was built.
    backend: SearchBackend,
    /// All indexed chunks; the mappings below store positions into this vector.
    chunks: Vec<Chunk>,
    /// Chunk positions grouped by `Chunk::file_path`.
    file_mapping: HashMap<String, Vec<usize>>,
    /// Chunk positions grouped by `Chunk::language` (chunks without a language are absent).
    language_mapping: HashMap<String, Vec<usize>>,
    /// Dependency graph produced during indexing (empty for snapshot-built indexes).
    graph: DependencyGraph,
}

impl SourceIndex {
    /// Builds an index from `source_path_or_git_url`, preferring hybrid search.
    ///
    /// The source is resolved through `SourceTree::from_source`. No encoder is
    /// supplied, so the default semantic model is attempted first and the
    /// index falls back to BM25-only search when that attempt fails.
    ///
    /// # Errors
    /// Returns an error when the source cannot be resolved or indexing fails.
    pub fn from_source(source_path_or_git_url: &str, include_text_files: bool) -> Result<Self> {
        SourceTree::from_source(source_path_or_git_url, None)
            .and_then(|tree| Self::from_source_tree(tree, None, None, None, include_text_files))
    }

    /// Builds a BM25-only index from `source_path_or_git_url`.
    ///
    /// The source is resolved through `SourceTree::from_source` and the build
    /// is allowed to use the BM25 index cache.
    ///
    /// # Errors
    /// Returns an error when the source cannot be resolved or indexing fails.
    pub fn from_source_bm25(
        source_path_or_git_url: &str,
        include_text_files: bool,
    ) -> Result<Self> {
        SourceTree::from_source(source_path_or_git_url, None)
            .and_then(|tree| Self::from_source_tree_bm25(tree, None, None, include_text_files, true))
    }

    /// Builds a hybrid index and reports why it failed instead of falling back.
    ///
    /// The encoder is loaded first, so an unavailable model is reported as
    /// `SemanticUnavailable` before the source is touched.
    ///
    /// # Errors
    /// * `SemanticIndexBuildError::SemanticUnavailable` when the encoder cannot be loaded.
    /// * `SemanticIndexBuildError::Index` when resolving the source or building the index fails.
    pub fn from_source_semantic(
        source_path_or_git_url: &str,
        include_text_files: bool,
    ) -> std::result::Result<Self, SemanticIndexBuildError> {
        let encoder = match StaticEncoder::load(None) {
            Ok(loaded) => loaded,
            Err(cause) => return Err(SemanticIndexBuildError::SemanticUnavailable(cause)),
        };

        let tree = match SourceTree::from_source(source_path_or_git_url, None) {
            Ok(tree) => tree,
            Err(cause) => return Err(SemanticIndexBuildError::Index(cause)),
        };

        match Self::from_source_tree(tree, Some(encoder), None, None, include_text_files) {
            Ok(index) => Ok(index),
            Err(cause) => Err(SemanticIndexBuildError::Index(cause)),
        }
    }

    /// Builds an index from a local `path`, preferring hybrid search.
    ///
    /// No encoder is supplied, so the default semantic model is attempted and
    /// the index falls back to BM25-only search when that attempt fails.
    ///
    /// # Errors
    /// Returns an error when the source tree cannot be opened or indexing fails.
    pub fn from_path(path: impl AsRef<Path>, include_text_files: bool) -> Result<Self> {
        let tree = SourceTree::from_path(path)?;
        let no_encoder = None;
        Self::from_source_tree(tree, no_encoder, None, None, include_text_files)
    }

    /// Builds a BM25-only index from a local `path`, with caching permitted.
    ///
    /// # Errors
    /// Returns an error when the source tree cannot be opened or indexing fails.
    pub fn from_path_bm25(path: impl AsRef<Path>, include_text_files: bool) -> Result<Self> {
        let tree = SourceTree::from_path(path)?;
        let use_cache = true;
        Self::from_source_tree_bm25(tree, None, None, include_text_files, use_cache)
    }

    /// Builds a BM25/exact index from a local `path` while bypassing the
    /// user-home source index cache entirely.
    ///
    /// ASR relies on this constructor so that agent-facing commands neither
    /// write outside ASR_HOME nor mutate or depend on repository-local
    /// generated state.
    ///
    /// # Errors
    /// Returns an error when the source tree cannot be opened or indexing fails.
    pub fn from_path_bm25_uncached(
        path: impl AsRef<Path>,
        include_text_files: bool,
    ) -> Result<Self> {
        let tree = SourceTree::from_path(path)?;
        let use_cache = false;
        Self::from_source_tree_bm25(tree, None, None, include_text_files, use_cache)
    }

    /// Builds a BM25/exact index directly from a persisted ASR chunk snapshot.
    ///
    /// The constructor is deliberately lexical-only: it lets `asr search`
    /// query the chunks produced by `asr repo index` without reading an
    /// arbitrary path from the CLI and without needing semantic model files.
    /// The resulting index carries an empty dependency graph.
    ///
    /// # Errors
    /// Fails with "No indexed chunks available" when `chunks` is empty.
    pub fn from_chunks_bm25(chunks: Vec<Chunk>) -> Result<Self> {
        if chunks.is_empty() {
            bail!("No indexed chunks available");
        }

        // Tokenise the enriched text of every chunk, one document per chunk.
        let mut tokenized: Vec<Vec<String>> = Vec::with_capacity(chunks.len());
        for chunk in &chunks {
            let enriched = build::enrich_for_bm25(chunk);
            tokenized.push(tokenize(&enriched));
        }

        let bm25_index = Bm25Index::new(&tokenized);
        let (file_mapping, language_mapping) = build_mappings(&chunks);
        // Must be built before `chunks` is moved into the struct below.
        let exact_index = ExactIndex::new(&chunks);

        let index = Self {
            bm25_index,
            exact_index,
            backend: SearchBackend::Bm25Only,
            chunks,
            file_mapping,
            language_mapping,
            graph: DependencyGraph::new(),
        };
        Ok(index)
    }

    fn from_source_tree(
        source: SourceTree,
        encoder: Option<StaticEncoder>,
        extensions: Option<&HashSet<String>>,
        ignore: Option<&HashSet<String>>,
        include_text_files: bool,
    ) -> Result<Self> {
        let path = source.root().to_path_buf();
        let (backend, bm25_index, chunks, graph) = match encoder {
            Some(encoder) => {
                let (bm25_index, semantic_index, chunks, graph) = build_index_from_path(
                    &path,
                    &encoder,
                    extensions,
                    ignore,
                    include_text_files,
                    &path,
                )?;
                (
                    SearchBackend::Hybrid(Box::new(HybridSearchBackend {
                        encoder,
                        semantic_index,
                    })),
                    bm25_index,
                    chunks,
                    graph,
                )
            }
            None => {
                let try_semantic = || -> Result<_> {
                    let encoder = StaticEncoder::load(None)?;
                    let (bm25_index, semantic_index, chunks, graph) = build_index_from_path(
                        &path,
                        &encoder,
                        extensions,
                        ignore,
                        include_text_files,
                        &path,
                    )?;
                    Ok((
                        SearchBackend::Hybrid(Box::new(HybridSearchBackend {
                            encoder,
                            semantic_index,
                        })),
                        bm25_index,
                        chunks,
                        graph,
                    ))
                };
                match try_semantic() {
                    Ok(result) => result,
                    Err(err) => {
                        log::warn!(
                            "semantic model unavailable; falling back to BM25-only search: {err:#}"
                        );
                        return Self::from_source_tree_bm25(
                            source,
                            extensions,
                            ignore,
                            include_text_files,
                            true,
                        );
                    }
                }
            }
        };

        let (file_mapping, language_mapping) = build_mappings(&chunks);
        Ok(Self {
            bm25_index,
            exact_index: ExactIndex::new(&chunks),
            backend,
            chunks,
            file_mapping,
            language_mapping,
            graph,
        })
    }

    /// Builds a BM25-only index over `source`, optionally going through the cache.
    ///
    /// The cache is consulted and refreshed only when `use_cache` is set, the
    /// source tree is not temporary, and neither `extensions` nor `ignore`
    /// restricts the file set. A failure to store the cache entry is ignored.
    fn from_source_tree_bm25(
        source: SourceTree,
        extensions: Option<&HashSet<String>>,
        ignore: Option<&HashSet<String>>,
        include_text_files: bool,
        use_cache: bool,
    ) -> Result<Self> {
        let root = source.root();
        let cache_enabled =
            use_cache && !source.is_temporary() && extensions.is_none() && ignore.is_none();

        let cache_hit = if cache_enabled {
            cache::load_bm25(root, include_text_files)
        } else {
            None
        };

        let (bm25_index, chunks, graph) = match cache_hit {
            Some(cached) => (cached.bm25_index, cached.chunks, cached.graph),
            None => {
                let (bm25_index, chunks, graph) = build_bm25_index_from_path(
                    root,
                    extensions,
                    ignore,
                    include_text_files,
                    root,
                )?;
                if cache_enabled {
                    let manifest = cache::build_manifest(root, include_text_files);
                    // Best effort: a failed cache write must not fail the build.
                    let _ = cache::store_bm25(
                        root,
                        include_text_files,
                        manifest,
                        &bm25_index,
                        &chunks,
                        &graph,
                    );
                }
                (bm25_index, chunks, graph)
            }
        };

        let (file_mapping, language_mapping) = build_mappings(&chunks);
        let exact_index = ExactIndex::new(&chunks);
        Ok(Self {
            bm25_index,
            exact_index,
            backend: SearchBackend::Bm25Only,
            chunks,
            file_mapping,
            language_mapping,
            graph,
        })
    }

    /// Builds an index from the git repository at `url`, preferring hybrid search.
    ///
    /// `ref_` is handed to `SourceTree::from_git` unchanged. No encoder is
    /// supplied, so the index falls back to BM25-only search when the
    /// semantic build fails.
    ///
    /// # Errors
    /// Returns an error when the repository cannot be obtained or indexing fails.
    pub fn from_git(url: &str, ref_: Option<&str>, include_text_files: bool) -> Result<Self> {
        let checkout = SourceTree::from_git(url, ref_)?;
        let no_encoder = None;
        Self::from_source_tree(checkout, no_encoder, None, None, include_text_files)
    }

    /// Runs `query` against the index and returns at most `top_k` results.
    ///
    /// Results of the configured backend (hybrid or BM25-only) are fused with
    /// exact-match hits before being returned.
    ///
    /// * `alpha` is forwarded to the hybrid backend and is unused for
    ///   BM25-only indexes.
    /// * `filter_languages` / `filter_paths` restrict the candidate chunks to
    ///   the union of chunks matching any listed language or file path. `None`
    ///   or an empty slice means "no filter".
    ///
    /// Returns an empty vector when the index has no chunks, when `query` is
    /// blank, or when a non-empty filter was supplied but matches no chunk.
    pub fn search(
        &self,
        query: &str,
        top_k: usize,
        alpha: Option<f64>,
        filter_languages: Option<&[String]>,
        filter_paths: Option<&[String]>,
    ) -> Vec<SearchResult> {
        if self.chunks.is_empty() || query.trim().is_empty() {
            return Vec::new();
        }

        let selector = self.get_selector(filter_languages, filter_paths);

        // A filter that matches nothing must yield nothing. Without this guard
        // the missing selector would be read as "search everything" and the
        // caller would get results from outside the requested scope.
        let filter_requested = filter_languages.map_or(false, |langs| !langs.is_empty())
            || filter_paths.map_or(false, |paths| !paths.is_empty());
        let nothing_selected = selector.as_ref().map_or(true, |ids| ids.is_empty());
        if filter_requested && nothing_selected {
            return Vec::new();
        }

        let selector_ref = selector.as_deref();

        let backend_results = match &self.backend {
            SearchBackend::Hybrid(backend) => search_hybrid(
                query,
                HybridSearchContext {
                    encoder: &backend.encoder,
                    semantic_index: &backend.semantic_index,
                    bm25_index: &self.bm25_index,
                    chunks: &self.chunks,
                    graph: Some(&self.graph),
                    file_mapping: &self.file_mapping,
                },
                top_k,
                alpha,
                selector_ref,
            ),
            SearchBackend::Bm25Only => {
                search_bm25(query, &self.bm25_index, &self.chunks, top_k, selector_ref)
            }
        };

        fuse_exact_results(
            query,
            &self.exact_index,
            &self.chunks,
            top_k,
            selector_ref,
            backend_results,
        )
    }

    /// Returns up to `top_k` chunks semantically closest to `source`.
    ///
    /// Candidates are restricted to chunks sharing `source`'s language when
    /// that language is present in the index. Chunks equal to `source` are
    /// dropped from the results, and each score is `1.0 - distance`.
    ///
    /// # Errors
    /// * The index is BM25-only (see [`Self::supports_find_related`]).
    /// * Encoding `source.content` fails.
    pub fn find_related(&self, source: &Chunk, top_k: usize) -> Result<Vec<SearchResult>> {
        let selector = source
            .language
            .as_ref()
            .and_then(|lang| self.language_mapping.get(lang))
            .map(|indices| indices.as_slice());

        let backend = match &self.backend {
            SearchBackend::Hybrid(backend) => backend,
            SearchBackend::Bm25Only => {
                bail!("find-related requires a semantic index, but this index is BM25-only")
            }
        };

        let query_embedding = backend
            .encoder
            .encode_single(&source.content)
            .context("failed to encode source chunk for related search")?;

        // Ask for one extra neighbour because `source` itself is expected to be
        // among the hits; saturate so `usize::MAX` cannot overflow.
        let neighbours =
            backend
                .semantic_index
                .query(&query_embedding, top_k.saturating_add(1), selector);

        let results: Vec<SearchResult> = neighbours
            .into_iter()
            // Resolve each position once; an out-of-range position is skipped
            // rather than panicking.
            .filter_map(|(idx, dist)| self.chunks.get(idx).map(|chunk| (chunk, dist)))
            .filter(|(chunk, _)| **chunk != *source)
            .take(top_k)
            .map(|(chunk, dist)| SearchResult {
                chunk: chunk.clone(),
                score: (1.0 - dist) as f64,
                match_lines: vec![],
            })
            .collect();

        Ok(results)
    }

    /// Reports whether this index has a semantic backend, which `find_related` requires.
    pub fn supports_find_related(&self) -> bool {
        match self.backend {
            SearchBackend::Hybrid(_) => true,
            SearchBackend::Bm25Only => false,
        }
    }

    /// Summarises the index: number of distinct files, number of chunks, and
    /// chunk counts per language (chunks without a language are not counted
    /// in the per-language map).
    pub fn stats(&self) -> IndexStats {
        let languages = self
            .chunks
            .iter()
            .filter_map(|chunk| chunk.language.as_ref())
            .fold(
                HashMap::new(),
                |mut counts: HashMap<String, usize>, language| {
                    *counts.entry(language.clone()).or_insert(0) += 1;
                    counts
                },
            );

        IndexStats {
            indexed_files: self.file_mapping.len(),
            total_chunks: self.chunks.len(),
            languages,
        }
    }

    /// Returns all indexed chunks in index order.
    pub fn chunks(&self) -> &[Chunk] {
        &self.chunks
    }
    /// Returns the dependency graph stored with this index.
    pub fn graph(&self) -> &DependencyGraph {
        &self.graph
    }
    /// Looks up the chunk for `file_path` and `line` by delegating to
    /// `chunk_lookup::resolve_chunk`; `None` when it resolves nothing.
    pub fn chunk_at(&self, file_path: &str, line: usize) -> Option<&Chunk> {
        crate::chunk_lookup::resolve_chunk(&self.chunks, file_path, line)
    }

    /// Collects the chunk positions selected by the given filters.
    ///
    /// The result is the sorted, de-duplicated union of every chunk whose
    /// language is listed in `filter_languages` or whose file path is listed
    /// in `filter_paths`. Returns `None` when no position was collected, which
    /// covers both "no filters given" and "filters matched nothing".
    fn get_selector(
        &self,
        filter_languages: Option<&[String]>,
        filter_paths: Option<&[String]>,
    ) -> Option<Vec<usize>> {
        let by_language = filter_languages
            .into_iter()
            .flatten()
            .filter_map(|language| self.language_mapping.get(language));
        let by_path = filter_paths
            .into_iter()
            .flatten()
            .filter_map(|path| self.file_mapping.get(path));

        let mut selected: Vec<usize> = by_language.chain(by_path).flatten().copied().collect();
        if selected.is_empty() {
            return None;
        }

        selected.sort();
        selected.dedup();
        Some(selected)
    }
}

/// Merges exact-match hits with the backend's ranked results.
///
/// Up to `2 * top_k` exact hits are fetched (restricted by `selector`). When
/// there are none, `backend_results` is returned untouched. Otherwise results
/// are keyed by `(file_path, start_line, end_line)`; exact hits are inserted
/// first, and a later result for the same chunk is folded into the existing
/// entry:
///
/// * a strictly higher score replaces the stored score, otherwise the new
///   score (capped at `1.0`) is added to it;
/// * match lines not already present (same line and content) are appended.
///
/// The merged list is sorted by descending score, ties broken by file path,
/// start line and end line, and truncated to `top_k`. NaN scores sort after
/// every real score so the comparator stays a total order.
fn fuse_exact_results(
    query: &str,
    exact_index: &ExactIndex,
    chunks: &[Chunk],
    top_k: usize,
    selector: Option<&[usize]>,
    backend_results: Vec<SearchResult>,
) -> Vec<SearchResult> {
    // Over-fetch exact hits; `saturating_mul` keeps huge `top_k` values from overflowing.
    let exact_results = exact_index.search(query, chunks, top_k.saturating_mul(2), selector);
    if exact_results.is_empty() {
        return backend_results;
    }

    let mut merged: Vec<SearchResult> = Vec::new();
    // Chunk identity -> position of its entry in `merged`.
    let mut positions: HashMap<(String, usize, usize), usize> = HashMap::new();

    for result in exact_results.into_iter().chain(backend_results) {
        let key = (
            result.chunk.file_path.clone(),
            result.chunk.start_line,
            result.chunk.end_line,
        );
        if let Some(&pos) = positions.get(&key) {
            let existing = &mut merged[pos];
            if result.score > existing.score {
                existing.score = result.score;
            } else {
                existing.score += result.score.min(1.0);
            }
            for match_line in result.match_lines {
                let already_present = existing
                    .match_lines
                    .iter()
                    .any(|line| line.line == match_line.line && line.content == match_line.content);
                if !already_present {
                    existing.match_lines.push(match_line);
                }
            }
        } else {
            positions.insert(key, merged.len());
            merged.push(result);
        }
    }

    merged.sort_by(|a, b| {
        b.score
            .partial_cmp(&a.score)
            // `partial_cmp` is `None` only when a NaN is involved. Treating that
            // as `Equal` would break transitivity, which `sort_by` requires, so
            // NaN scores are ordered after all real scores instead.
            .unwrap_or_else(|| a.score.is_nan().cmp(&b.score.is_nan()))
            .then_with(|| a.chunk.file_path.cmp(&b.chunk.file_path))
            .then_with(|| a.chunk.start_line.cmp(&b.chunk.start_line))
            .then_with(|| a.chunk.end_line.cmp(&b.chunk.end_line))
    });
    merged.truncate(top_k);
    merged
}

/// Groups chunk positions by file path and by language.
///
/// Returns `(file_mapping, language_mapping)`. Each value lists positions into
/// `chunks` in ascending order; chunks without a language appear only in the
/// file mapping.
fn build_mappings(chunks: &[Chunk]) -> (HashMap<String, Vec<usize>>, HashMap<String, Vec<usize>>) {
    let empty: (HashMap<String, Vec<usize>>, HashMap<String, Vec<usize>>) =
        (HashMap::new(), HashMap::new());

    chunks
        .iter()
        .enumerate()
        .fold(empty, |(mut by_file, mut by_language), (position, chunk)| {
            by_file
                .entry(chunk.file_path.clone())
                .or_insert_with(Vec::new)
                .push(position);
            if let Some(language) = chunk.language.as_ref() {
                by_language
                    .entry(language.clone())
                    .or_insert_with(Vec::new)
                    .push(position);
            }
            (by_file, by_language)
        })
}

#[cfg(test)]
mod tests {
    use super::*;
    use safetensors::tensor::{serialize, Dtype, TensorView};
    use std::fs;
    use std::path::PathBuf;
    use std::time::{SystemTime, UNIX_EPOCH};
    use tokenizers::models::wordlevel::WordLevel;
    use tokenizers::pre_tokenizers::whitespace::Whitespace;
    use tokenizers::Tokenizer;

    /// Returns a path under the system temp directory that is unique to this call.
    ///
    /// The directory is not created. Uniqueness does not rest on the clock
    /// alone: the process id separates concurrent test binaries and a
    /// process-wide counter separates calls within one binary, even when two
    /// calls observe the same timestamp.
    fn unique_temp_dir(name: &str) -> PathBuf {
        static NEXT_ID: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(0);

        let nanos = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .expect("system time should be after unix epoch")
            .as_nanos();
        let pid = std::process::id();
        let sequence = NEXT_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
        std::env::temp_dir().join(format!("asr-index-test-{name}-{pid}-{nanos}-{sequence}"))
    }

    /// Writes a tiny tokenizer and embedding model into `root` and loads them
    /// as a `StaticEncoder`.
    ///
    /// The fixture has a four-word vocabulary (`<unk>`, `search`, `target`,
    /// `function`) and a 4x4 `f32` embedding table stored under the tensor
    /// name `embeddings`. Panics if any step fails.
    fn write_test_encoder(root: &Path) -> StaticEncoder {
        fs::create_dir_all(root).expect("test encoder directory should be created");

        // Word-level vocabulary; the ids double as row numbers of the embedding table below.
        let vocab = [
            ("<unk>".to_string(), 0),
            ("search".to_string(), 1),
            ("target".to_string(), 2),
            ("function".to_string(), 3),
        ]
        .into_iter()
        .collect();
        let wordlevel = WordLevel::builder()
            .vocab(vocab)
            .unk_token("<unk>".to_string())
            .build()
            .expect("test wordlevel tokenizer should build");
        let mut tokenizer = Tokenizer::new(wordlevel);
        tokenizer.with_pre_tokenizer(Some(Whitespace));

        let tokenizer_path = root.join("tokenizer.json");
        tokenizer
            .save(&tokenizer_path, false)
            .expect("test tokenizer should be written");

        // One row per vocabulary entry: `<unk>` is the zero vector, every other
        // word gets its own axis.
        let embedding_values: [f32; 16] = [
            0.0, 0.0, 0.0, 0.0, // <unk>
            1.0, 0.0, 0.0, 0.0, // search
            0.0, 1.0, 0.0, 0.0, // target
            0.0, 0.0, 1.0, 0.0, // function
        ];
        // Values are serialised as little-endian bytes for the tensor view.
        let embedding_bytes: Vec<u8> = embedding_values
            .iter()
            .flat_map(|value| value.to_le_bytes())
            .collect();
        let view = TensorView::new(Dtype::F32, vec![4, 4], &embedding_bytes)
            .expect("test tensor view should match embedding shape");
        let model_bytes = serialize([("embeddings", view)], &None)
            .expect("test safetensors model should serialize");
        let model_path = root.join("model.safetensors");
        fs::write(&model_path, model_bytes).expect("test model should be written");

        StaticEncoder::from_files(&tokenizer_path, &model_path)
            .expect("test static encoder should load")
    }

    /// With an injected encoder the index must be hybrid (not the BM25-only
    /// fallback) and hybrid search must find the indexed fixture file.
    #[test]
    fn search_uses_semantic_index_when_encoder_is_available() {
        let root = unique_temp_dir("semantic-source");
        let encoder_root = unique_temp_dir("encoder");

        fs::create_dir_all(root.join("src")).expect("source directory should be created");
        fs::write(
            root.join("src/lib.rs"),
            "pub fn search_target_function() -> &'static str { \"ok\" }\n",
        )
        .expect("source fixture should be written");

        let encoder = write_test_encoder(&encoder_root);
        let source = SourceTree::from_path(&root).expect("source tree should load");
        let index = SourceIndex::from_source_tree(source, Some(encoder), None, None, false)
            .expect("index should build with injected semantic encoder");

        // Gather everything the assertions need first, so the fixtures can be
        // removed even when an assertion below fails.
        let supports_related = index.supports_find_related();
        let results = index.search("search target", 3, None, None, None);

        // Best-effort cleanup: release the index (and its encoder) before
        // deleting the directories it was built from; errors are ignored so
        // cleanup can never mask the real test outcome.
        drop(index);
        let _ = fs::remove_dir_all(&root);
        let _ = fs::remove_dir_all(&encoder_root);

        assert!(
            supports_related,
            "semantic index should be built instead of BM25-only fallback"
        );
        assert!(
            results
                .iter()
                .any(|result| result.chunk.file_path == "src/lib.rs"),
            "hybrid search should return the indexed Rust source: {results:?}"
        );
    }
}