Skip to main content

asr/index/
mod.rs

1pub(crate) mod build;
2mod cache;
3
4use std::collections::{HashMap, HashSet};
5use std::fmt;
6use std::path::Path;
7
8use anyhow::{bail, Context, Result};
9
10use crate::bm25::Bm25Index;
11use crate::encoder::{SemanticIndex, StaticEncoder};
12use crate::exact::ExactIndex;
13use crate::graph::DependencyGraph;
14use crate::model::{Chunk, IndexStats, SearchResult};
15use crate::search::{search_bm25, search_hybrid, HybridSearchContext};
16use crate::source_tree::SourceTree;
17use crate::tokens::tokenize;
18use build::{build_bm25_index_from_path, build_index_from_path};
19
20struct HybridSearchBackend {
21    encoder: StaticEncoder,
22    semantic_index: SemanticIndex,
23}
24
25enum SearchBackend {
26    Hybrid(Box<HybridSearchBackend>),
27    Bm25Only,
28}
29
30#[derive(Debug)]
31pub enum SemanticIndexBuildError {
32    SemanticUnavailable(anyhow::Error),
33    Index(anyhow::Error),
34}
35
36impl fmt::Display for SemanticIndexBuildError {
37    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
38        match self {
39            Self::SemanticUnavailable(err) => write!(f, "semantic model unavailable: {err:#}"),
40            Self::Index(err) => write!(f, "index build failed: {err:#}"),
41        }
42    }
43}
44
45impl std::error::Error for SemanticIndexBuildError {}
46
47pub struct SourceIndex {
48    bm25_index: Bm25Index,
49    exact_index: ExactIndex,
50    backend: SearchBackend,
51    chunks: Vec<Chunk>,
52    file_mapping: HashMap<String, Vec<usize>>,
53    language_mapping: HashMap<String, Vec<usize>>,
54    graph: DependencyGraph,
55}
56
57impl SourceIndex {
58    pub fn from_source(source_path_or_git_url: &str, include_text_files: bool) -> Result<Self> {
59        let source = SourceTree::from_source(source_path_or_git_url, None)?;
60        Self::from_source_tree(source, None, None, None, include_text_files)
61    }
62
63    pub fn from_source_bm25(
64        source_path_or_git_url: &str,
65        include_text_files: bool,
66    ) -> Result<Self> {
67        let source = SourceTree::from_source(source_path_or_git_url, None)?;
68        Self::from_source_tree_bm25(source, None, None, include_text_files, true)
69    }
70
71    pub fn from_source_semantic(
72        source_path_or_git_url: &str,
73        include_text_files: bool,
74    ) -> std::result::Result<Self, SemanticIndexBuildError> {
75        let encoder =
76            StaticEncoder::load(None).map_err(SemanticIndexBuildError::SemanticUnavailable)?;
77        let source = SourceTree::from_source(source_path_or_git_url, None)
78            .map_err(SemanticIndexBuildError::Index)?;
79        Self::from_source_tree(source, Some(encoder), None, None, include_text_files)
80            .map_err(SemanticIndexBuildError::Index)
81    }
82
83    pub fn from_path(path: impl AsRef<Path>, include_text_files: bool) -> Result<Self> {
84        let source = SourceTree::from_path(path)?;
85        Self::from_source_tree(source, None, None, None, include_text_files)
86    }
87
88    pub fn from_path_bm25(path: impl AsRef<Path>, include_text_files: bool) -> Result<Self> {
89        let source = SourceTree::from_path(path)?;
90        Self::from_source_tree_bm25(source, None, None, include_text_files, true)
91    }
92
93    /// Build a BM25/exact index without using the user-home source index cache.
94    ///
95    /// ASR uses this path so agent-facing commands do not write outside ASR_HOME
96    /// and do not mutate or depend on repository-local generated state.
97    pub fn from_path_bm25_uncached(
98        path: impl AsRef<Path>,
99        include_text_files: bool,
100    ) -> Result<Self> {
101        let source = SourceTree::from_path(path)?;
102        Self::from_source_tree_bm25(source, None, None, include_text_files, false)
103    }
104
105    /// Build a BM25/exact index from a persisted ASR chunk snapshot.
106    ///
107    /// This constructor is lexical-only by design. It lets `asr search` query the
108    /// chunks produced by `asr repo index` without reading an arbitrary path from
109    /// the CLI and without requiring semantic model files.
110    pub fn from_chunks_bm25(chunks: Vec<Chunk>) -> Result<Self> {
111        if chunks.is_empty() {
112            bail!("No indexed chunks available");
113        }
114
115        let bm25_docs: Vec<Vec<String>> = chunks
116            .iter()
117            .map(|chunk| tokenize(&build::enrich_for_bm25(chunk)))
118            .collect();
119        let bm25_index = Bm25Index::new(&bm25_docs);
120        let (file_mapping, language_mapping) = build_mappings(&chunks);
121
122        Ok(Self {
123            bm25_index,
124            exact_index: ExactIndex::new(&chunks),
125            backend: SearchBackend::Bm25Only,
126            chunks,
127            file_mapping,
128            language_mapping,
129            graph: DependencyGraph::new(),
130        })
131    }
132
133    fn from_source_tree(
134        source: SourceTree,
135        encoder: Option<StaticEncoder>,
136        extensions: Option<&HashSet<String>>,
137        ignore: Option<&HashSet<String>>,
138        include_text_files: bool,
139    ) -> Result<Self> {
140        let path = source.root().to_path_buf();
141        let (backend, bm25_index, chunks, graph) = match encoder {
142            Some(encoder) => {
143                let (bm25_index, semantic_index, chunks, graph) = build_index_from_path(
144                    &path,
145                    &encoder,
146                    extensions,
147                    ignore,
148                    include_text_files,
149                    &path,
150                )?;
151                (
152                    SearchBackend::Hybrid(Box::new(HybridSearchBackend {
153                        encoder,
154                        semantic_index,
155                    })),
156                    bm25_index,
157                    chunks,
158                    graph,
159                )
160            }
161            None => {
162                let try_semantic = || -> Result<_> {
163                    let encoder = StaticEncoder::load(None)?;
164                    let (bm25_index, semantic_index, chunks, graph) = build_index_from_path(
165                        &path,
166                        &encoder,
167                        extensions,
168                        ignore,
169                        include_text_files,
170                        &path,
171                    )?;
172                    Ok((
173                        SearchBackend::Hybrid(Box::new(HybridSearchBackend {
174                            encoder,
175                            semantic_index,
176                        })),
177                        bm25_index,
178                        chunks,
179                        graph,
180                    ))
181                };
182                match try_semantic() {
183                    Ok(result) => result,
184                    Err(err) => {
185                        log::warn!(
186                            "semantic model unavailable; falling back to BM25-only search: {err:#}"
187                        );
188                        return Self::from_source_tree_bm25(
189                            source,
190                            extensions,
191                            ignore,
192                            include_text_files,
193                            true,
194                        );
195                    }
196                }
197            }
198        };
199
200        let (file_mapping, language_mapping) = build_mappings(&chunks);
201        Ok(Self {
202            bm25_index,
203            exact_index: ExactIndex::new(&chunks),
204            backend,
205            chunks,
206            file_mapping,
207            language_mapping,
208            graph,
209        })
210    }
211
212    fn from_source_tree_bm25(
213        source: SourceTree,
214        extensions: Option<&HashSet<String>>,
215        ignore: Option<&HashSet<String>>,
216        include_text_files: bool,
217        use_cache: bool,
218    ) -> Result<Self> {
219        let path = source.root();
220        let cacheable =
221            use_cache && !source.is_temporary() && extensions.is_none() && ignore.is_none();
222        if cacheable {
223            if let Some(cached) = cache::load_bm25(path, include_text_files) {
224                let (file_mapping, language_mapping) = build_mappings(&cached.chunks);
225                return Ok(Self {
226                    bm25_index: cached.bm25_index,
227                    exact_index: ExactIndex::new(&cached.chunks),
228                    backend: SearchBackend::Bm25Only,
229                    chunks: cached.chunks,
230                    file_mapping,
231                    language_mapping,
232                    graph: cached.graph,
233                });
234            }
235        }
236
237        let (bm25_index, chunks, graph) =
238            build_bm25_index_from_path(path, extensions, ignore, include_text_files, path)?;
239
240        let (file_mapping, language_mapping) = build_mappings(&chunks);
241        if cacheable {
242            let manifest = cache::build_manifest(path, include_text_files);
243            let _ = cache::store_bm25(
244                path,
245                include_text_files,
246                manifest,
247                &bm25_index,
248                &chunks,
249                &graph,
250            );
251        }
252
253        Ok(Self {
254            bm25_index,
255            exact_index: ExactIndex::new(&chunks),
256            backend: SearchBackend::Bm25Only,
257            chunks,
258            file_mapping,
259            language_mapping,
260            graph,
261        })
262    }
263
264    pub fn from_git(url: &str, ref_: Option<&str>, include_text_files: bool) -> Result<Self> {
265        let source = SourceTree::from_git(url, ref_)?;
266        Self::from_source_tree(source, None, None, None, include_text_files)
267    }
268
269    pub fn search(
270        &self,
271        query: &str,
272        top_k: usize,
273        alpha: Option<f64>,
274        filter_languages: Option<&[String]>,
275        filter_paths: Option<&[String]>,
276    ) -> Vec<SearchResult> {
277        if self.chunks.is_empty() || query.trim().is_empty() {
278            return Vec::new();
279        }
280
281        let selector = self.get_selector(filter_languages, filter_paths);
282        let selector_ref = selector.as_deref();
283
284        let backend_results = match &self.backend {
285            SearchBackend::Hybrid(backend) => search_hybrid(
286                query,
287                HybridSearchContext {
288                    encoder: &backend.encoder,
289                    semantic_index: &backend.semantic_index,
290                    bm25_index: &self.bm25_index,
291                    chunks: &self.chunks,
292                    graph: Some(&self.graph),
293                    file_mapping: &self.file_mapping,
294                },
295                top_k,
296                alpha,
297                selector_ref,
298            ),
299            SearchBackend::Bm25Only => {
300                search_bm25(query, &self.bm25_index, &self.chunks, top_k, selector_ref)
301            }
302        };
303
304        fuse_exact_results(
305            query,
306            &self.exact_index,
307            &self.chunks,
308            top_k,
309            selector_ref,
310            backend_results,
311        )
312    }
313
314    pub fn find_related(&self, source: &Chunk, top_k: usize) -> Result<Vec<SearchResult>> {
315        let selector = source
316            .language
317            .as_ref()
318            .and_then(|lang| self.language_mapping.get(lang))
319            .map(|indices| indices.as_slice());
320
321        let backend = match &self.backend {
322            SearchBackend::Hybrid(backend) => backend,
323            SearchBackend::Bm25Only => {
324                bail!("find-related requires a semantic index, but this index is BM25-only")
325            }
326        };
327
328        let query_embedding = backend
329            .encoder
330            .encode_single(&source.content)
331            .context("failed to encode source chunk for related search")?;
332
333        let results = backend
334            .semantic_index
335            .query(&query_embedding, top_k + 1, selector);
336        let results: Vec<SearchResult> = results
337            .into_iter()
338            .filter(|&(idx, _)| self.chunks[idx] != *source)
339            .take(top_k)
340            .map(|(idx, dist)| SearchResult {
341                chunk: self.chunks[idx].clone(),
342                score: (1.0 - dist) as f64,
343                match_lines: vec![],
344            })
345            .collect();
346
347        Ok(results)
348    }
349
350    pub fn supports_find_related(&self) -> bool {
351        matches!(self.backend, SearchBackend::Hybrid(_))
352    }
353
354    pub fn stats(&self) -> IndexStats {
355        let mut language_counts: HashMap<String, usize> = HashMap::new();
356        for chunk in &self.chunks {
357            if let Some(lang) = &chunk.language {
358                *language_counts.entry(lang.clone()).or_default() += 1;
359            }
360        }
361        IndexStats {
362            indexed_files: self.file_mapping.len(),
363            total_chunks: self.chunks.len(),
364            languages: language_counts,
365        }
366    }
367
368    pub fn chunks(&self) -> &[Chunk] {
369        &self.chunks
370    }
371    pub fn graph(&self) -> &DependencyGraph {
372        &self.graph
373    }
374    pub fn chunk_at(&self, file_path: &str, line: usize) -> Option<&Chunk> {
375        crate::chunk_lookup::resolve_chunk(&self.chunks, file_path, line)
376    }
377
378    fn get_selector(
379        &self,
380        filter_languages: Option<&[String]>,
381        filter_paths: Option<&[String]>,
382    ) -> Option<Vec<usize>> {
383        let mut indices = Vec::new();
384        if let Some(langs) = filter_languages {
385            for lang in langs {
386                if let Some(ids) = self.language_mapping.get(lang) {
387                    indices.extend(ids);
388                }
389            }
390        }
391        if let Some(paths) = filter_paths {
392            for path in paths {
393                if let Some(ids) = self.file_mapping.get(path) {
394                    indices.extend(ids);
395                }
396            }
397        }
398        if indices.is_empty() {
399            None
400        } else {
401            indices.sort();
402            indices.dedup();
403            Some(indices)
404        }
405    }
406}
407
408fn fuse_exact_results(
409    query: &str,
410    exact_index: &ExactIndex,
411    chunks: &[Chunk],
412    top_k: usize,
413    selector: Option<&[usize]>,
414    backend_results: Vec<SearchResult>,
415) -> Vec<SearchResult> {
416    let exact_results =
417        exact_index.search(query, chunks, top_k.saturating_mul(2).max(top_k), selector);
418    if exact_results.is_empty() {
419        return backend_results;
420    }
421
422    let mut merged: Vec<SearchResult> = Vec::new();
423    let mut positions: HashMap<(String, usize, usize), usize> = HashMap::new();
424
425    for mut result in exact_results.into_iter().chain(backend_results.into_iter()) {
426        let key = (
427            result.chunk.file_path.clone(),
428            result.chunk.start_line,
429            result.chunk.end_line,
430        );
431        if let Some(&pos) = positions.get(&key) {
432            let existing = &mut merged[pos];
433            if result.score > existing.score {
434                existing.score = result.score;
435            } else {
436                existing.score += result.score.min(1.0);
437            }
438            for match_line in result.match_lines.drain(..) {
439                if !existing
440                    .match_lines
441                    .iter()
442                    .any(|line| line.line == match_line.line && line.content == match_line.content)
443                {
444                    existing.match_lines.push(match_line);
445                }
446            }
447        } else {
448            positions.insert(key, merged.len());
449            merged.push(result);
450        }
451    }
452
453    merged.sort_by(|a, b| {
454        b.score
455            .partial_cmp(&a.score)
456            .unwrap_or(std::cmp::Ordering::Equal)
457            .then_with(|| a.chunk.file_path.cmp(&b.chunk.file_path))
458            .then_with(|| a.chunk.start_line.cmp(&b.chunk.start_line))
459            .then_with(|| a.chunk.end_line.cmp(&b.chunk.end_line))
460    });
461    merged.truncate(top_k);
462    merged
463}
464
465fn build_mappings(chunks: &[Chunk]) -> (HashMap<String, Vec<usize>>, HashMap<String, Vec<usize>>) {
466    let mut file_mapping: HashMap<String, Vec<usize>> = HashMap::new();
467    let mut language_mapping: HashMap<String, Vec<usize>> = HashMap::new();
468    for (i, chunk) in chunks.iter().enumerate() {
469        file_mapping
470            .entry(chunk.file_path.clone())
471            .or_default()
472            .push(i);
473        if let Some(lang) = &chunk.language {
474            language_mapping.entry(lang.clone()).or_default().push(i);
475        }
476    }
477    (file_mapping, language_mapping)
478}
479
#[cfg(test)]
mod tests {
    use super::*;
    use safetensors::tensor::{serialize, Dtype, TensorView};
    use std::fs;
    use std::path::PathBuf;
    use std::time::{SystemTime, UNIX_EPOCH};
    use tokenizers::models::wordlevel::WordLevel;
    use tokenizers::pre_tokenizers::whitespace::Whitespace;
    use tokenizers::Tokenizer;

    /// Returns a not-yet-created path under the system temp directory.
    ///
    /// The name combines the process id and the current time in nanoseconds so
    /// that concurrent test processes do not share a directory.
    fn unique_temp_dir(name: &str) -> PathBuf {
        let unique = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .expect("system time should be after unix epoch")
            .as_nanos();
        let pid = std::process::id();
        std::env::temp_dir().join(format!("asr-index-test-{name}-{pid}-{unique}"))
    }

    /// Scratch directory that is removed when the guard is dropped, so
    /// fixtures are cleaned up even when an assertion panics.
    struct ScratchDir(PathBuf);

    impl ScratchDir {
        fn new(name: &str) -> Self {
            Self(unique_temp_dir(name))
        }

        fn path(&self) -> &Path {
            &self.0
        }
    }

    impl Drop for ScratchDir {
        fn drop(&mut self) {
            // Best-effort cleanup: the directory may never have been created,
            // and a cleanup failure must not mask the test's own outcome.
            let _ = fs::remove_dir_all(&self.0);
        }
    }

    /// Writes a tiny word-level tokenizer and a 4x4 embedding table under
    /// `root`, then loads them as a `StaticEncoder`.
    fn write_test_encoder(root: &Path) -> StaticEncoder {
        fs::create_dir_all(root).expect("test encoder directory should be created");

        let vocab = [
            ("<unk>".to_string(), 0),
            ("search".to_string(), 1),
            ("target".to_string(), 2),
            ("function".to_string(), 3),
        ]
        .into_iter()
        .collect();
        let wordlevel = WordLevel::builder()
            .vocab(vocab)
            .unk_token("<unk>".to_string())
            .build()
            .expect("test wordlevel tokenizer should build");
        let mut tokenizer = Tokenizer::new(wordlevel);
        tokenizer.with_pre_tokenizer(Some(Whitespace));

        let tokenizer_path = root.join("tokenizer.json");
        tokenizer
            .save(&tokenizer_path, false)
            .expect("test tokenizer should be written");

        // One row per vocabulary entry, in vocabulary-id order.
        let embedding_values: [f32; 16] = [
            0.0, 0.0, 0.0, 0.0, // <unk>
            1.0, 0.0, 0.0, 0.0, // search
            0.0, 1.0, 0.0, 0.0, // target
            0.0, 0.0, 1.0, 0.0, // function
        ];
        let embedding_bytes: Vec<u8> = embedding_values
            .iter()
            .flat_map(|value| value.to_le_bytes())
            .collect();
        let view = TensorView::new(Dtype::F32, vec![4, 4], &embedding_bytes)
            .expect("test tensor view should match embedding shape");
        let model_bytes = serialize([("embeddings", view)], &None)
            .expect("test safetensors model should serialize");
        let model_path = root.join("model.safetensors");
        fs::write(&model_path, model_bytes).expect("test model should be written");

        StaticEncoder::from_files(&tokenizer_path, &model_path)
            .expect("test static encoder should load")
    }

    #[test]
    fn search_uses_semantic_index_when_encoder_is_available() {
        // Guards are declared first so they are dropped last, after the
        // encoder and index that read from these directories.
        let source_dir = ScratchDir::new("semantic-source");
        let encoder_dir = ScratchDir::new("encoder");

        let root = source_dir.path();
        fs::create_dir_all(root.join("src")).expect("source directory should be created");
        fs::write(
            root.join("src/lib.rs"),
            "pub fn search_target_function() -> &'static str { \"ok\" }\n",
        )
        .expect("source fixture should be written");

        let encoder = write_test_encoder(encoder_dir.path());
        let source = SourceTree::from_path(root).expect("source tree should load");
        let index = SourceIndex::from_source_tree(source, Some(encoder), None, None, false)
            .expect("index should build with injected semantic encoder");

        assert!(
            index.supports_find_related(),
            "semantic index should be built instead of BM25-only fallback"
        );

        let results = index.search("search target", 3, None, None, None);
        assert!(
            results
                .iter()
                .any(|result| result.chunk.file_path == "src/lib.rs"),
            "hybrid search should return the indexed Rust source: {results:?}"
        );
    }
}