pub(crate) mod build;
mod cache;
use std::collections::{HashMap, HashSet};
use std::fmt;
use std::path::Path;
use anyhow::{bail, Context, Result};
use crate::bm25::Bm25Index;
use crate::encoder::{SemanticIndex, StaticEncoder};
use crate::exact::ExactIndex;
use crate::graph::DependencyGraph;
use crate::model::{Chunk, IndexStats, SearchResult};
use crate::search::{search_bm25, search_hybrid, HybridSearchContext};
use crate::source_tree::SourceTree;
use crate::tokens::tokenize;
use build::{build_bm25_index_from_path, build_index_from_path};
/// State needed to serve hybrid (semantic + BM25) queries.
struct HybridSearchBackend {
/// Encoder used to embed queries and source chunks at search time.
encoder: StaticEncoder,
/// Semantic index queried with the embeddings produced by `encoder`.
semantic_index: SemanticIndex,
}
/// Which retrieval strategy a `SourceIndex` was built with.
enum SearchBackend {
/// Semantic index plus BM25; required by `SourceIndex::find_related`.
Hybrid(Box<HybridSearchBackend>),
/// BM25 only; no encoder or semantic index is held.
Bm25Only,
}
/// Error returned by `SourceIndex::from_source_semantic`, separating
/// "the encoder could not be loaded" from every other build failure.
#[derive(Debug)]
pub enum SemanticIndexBuildError {
/// `StaticEncoder::load` failed.
SemanticUnavailable(anyhow::Error),
/// Resolving the source tree or building the index failed.
Index(anyhow::Error),
}
impl fmt::Display for SemanticIndexBuildError {
    /// Writes a short label for the failure kind followed by the wrapped
    /// error rendered in alternate (`{:#}`) form.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let (label, cause) = match self {
            Self::SemanticUnavailable(cause) => ("semantic model unavailable", cause),
            Self::Index(cause) => ("index build failed", cause),
        };
        write!(f, "{label}: {cause:#}")
    }
}
// No `source()` override: the `Display` impl already renders the wrapped
// error in alternate form (`{err:#}`).
impl std::error::Error for SemanticIndexBuildError {}
/// Searchable index over the chunks of a source tree.
pub struct SourceIndex {
/// BM25 index; passed to both the hybrid and the BM25-only search paths.
bm25_index: Bm25Index,
/// Exact-match index whose hits are fused with the backend results.
exact_index: ExactIndex,
/// Retrieval strategy this index was built with.
backend: SearchBackend,
/// Every indexed chunk; the mappings below store positions into this vector.
chunks: Vec<Chunk>,
/// File path -> positions in `chunks` of the chunks from that file.
file_mapping: HashMap<String, Vec<usize>>,
/// Language -> positions in `chunks` of the chunks tagged with that language.
language_mapping: HashMap<String, Vec<usize>>,
/// Dependency graph handed to hybrid search and exposed through `graph()`.
graph: DependencyGraph,
}
impl SourceIndex {
/// Builds an index from whatever `SourceTree::from_source` resolves
/// `source_path_or_git_url` to.
///
/// No encoder is supplied, so a hybrid build is attempted first and the
/// index falls back to BM25-only when that attempt fails.
///
/// # Errors
/// Fails when the source tree cannot be resolved or the index cannot be built.
pub fn from_source(source_path_or_git_url: &str, include_text_files: bool) -> Result<Self> {
    SourceTree::from_source(source_path_or_git_url, None)
        .and_then(|tree| Self::from_source_tree(tree, None, None, None, include_text_files))
}
/// Builds a BM25-only index from whatever `SourceTree::from_source` resolves
/// `source_path_or_git_url` to, with the BM25 cache enabled.
///
/// # Errors
/// Fails when the source tree cannot be resolved or the index cannot be built.
pub fn from_source_bm25(
    source_path_or_git_url: &str,
    include_text_files: bool,
) -> Result<Self> {
    let use_cache = true;
    SourceTree::from_source(source_path_or_git_url, None).and_then(|tree| {
        Self::from_source_tree_bm25(tree, None, None, include_text_files, use_cache)
    })
}
/// Builds a hybrid index and never falls back to BM25-only.
///
/// The encoder is loaded before the source is resolved, so an unavailable
/// model is reported without touching the source at all.
///
/// # Errors
/// * `SemanticIndexBuildError::SemanticUnavailable` when the encoder cannot be loaded.
/// * `SemanticIndexBuildError::Index` when the source tree or the index build fails.
pub fn from_source_semantic(
    source_path_or_git_url: &str,
    include_text_files: bool,
) -> std::result::Result<Self, SemanticIndexBuildError> {
    let encoder = match StaticEncoder::load(None) {
        Ok(encoder) => encoder,
        Err(cause) => return Err(SemanticIndexBuildError::SemanticUnavailable(cause)),
    };
    SourceTree::from_source(source_path_or_git_url, None)
        .and_then(|tree| {
            Self::from_source_tree(tree, Some(encoder), None, None, include_text_files)
        })
        .map_err(SemanticIndexBuildError::Index)
}
/// Builds an index from a local path.
///
/// No encoder is supplied, so a hybrid build is attempted first and the
/// index falls back to BM25-only when that attempt fails.
///
/// # Errors
/// Fails when the source tree cannot be opened or the index cannot be built.
pub fn from_path(path: impl AsRef<Path>, include_text_files: bool) -> Result<Self> {
let source = SourceTree::from_path(path)?;
Self::from_source_tree(source, None, None, None, include_text_files)
}
/// Builds a BM25-only index from a local path, with the BM25 cache enabled.
///
/// # Errors
/// Fails when the source tree cannot be opened or the index cannot be built.
pub fn from_path_bm25(path: impl AsRef<Path>, include_text_files: bool) -> Result<Self> {
let source = SourceTree::from_path(path)?;
Self::from_source_tree_bm25(source, None, None, include_text_files, true)
}
/// Builds a BM25-only index from a local path without reading or writing
/// the BM25 cache.
///
/// # Errors
/// Fails when the source tree cannot be opened or the index cannot be built.
pub fn from_path_bm25_uncached(
path: impl AsRef<Path>,
include_text_files: bool,
) -> Result<Self> {
let source = SourceTree::from_path(path)?;
// Final `false` is `use_cache`.
Self::from_source_tree_bm25(source, None, None, include_text_files, false)
}
/// Builds a BM25-only index directly from already-extracted chunks.
///
/// The resulting index carries an empty dependency graph.
///
/// # Errors
/// Fails when `chunks` is empty.
pub fn from_chunks_bm25(chunks: Vec<Chunk>) -> Result<Self> {
    if chunks.is_empty() {
        bail!("No indexed chunks available");
    }

    // One token list per chunk, in chunk order.
    let mut tokenized_docs: Vec<Vec<String>> = Vec::with_capacity(chunks.len());
    for chunk in &chunks {
        let enriched = build::enrich_for_bm25(chunk);
        tokenized_docs.push(tokenize(&enriched));
    }

    let bm25_index = Bm25Index::new(&tokenized_docs);
    let (file_mapping, language_mapping) = build_mappings(&chunks);
    let exact_index = ExactIndex::new(&chunks);

    Ok(Self {
        bm25_index,
        exact_index,
        backend: SearchBackend::Bm25Only,
        chunks,
        file_mapping,
        language_mapping,
        graph: DependencyGraph::new(),
    })
}
/// Builds an index for `source`, preferring the hybrid backend.
///
/// * With an injected `encoder`, the hybrid build must succeed; any failure
///   is returned to the caller.
/// * Without one, the default encoder is loaded. If loading it or building
///   the hybrid index fails, a warning is logged and a BM25-only index
///   (cache enabled) is built instead.
fn from_source_tree(
    source: SourceTree,
    encoder: Option<StaticEncoder>,
    extensions: Option<&HashSet<String>>,
    ignore: Option<&HashSet<String>>,
    include_text_files: bool,
) -> Result<Self> {
    let root = source.root().to_path_buf();
    // Only a caller-supplied encoder makes a failed hybrid build fatal.
    let encoder_was_injected = encoder.is_some();

    let build_hybrid = || -> Result<_> {
        let encoder = match encoder {
            Some(injected) => injected,
            None => StaticEncoder::load(None)?,
        };
        let (bm25_index, semantic_index, chunks, graph) = build_index_from_path(
            &root,
            &encoder,
            extensions,
            ignore,
            include_text_files,
            &root,
        )?;
        let backend = SearchBackend::Hybrid(Box::new(HybridSearchBackend {
            encoder,
            semantic_index,
        }));
        Ok((backend, bm25_index, chunks, graph))
    };

    let (backend, bm25_index, chunks, graph) = match build_hybrid() {
        Ok(parts) => parts,
        Err(err) if encoder_was_injected => return Err(err),
        Err(err) => {
            log::warn!("semantic model unavailable; falling back to BM25-only search: {err:#}");
            return Self::from_source_tree_bm25(
                source,
                extensions,
                ignore,
                include_text_files,
                true,
            );
        }
    };

    let (file_mapping, language_mapping) = build_mappings(&chunks);
    let exact_index = ExactIndex::new(&chunks);
    Ok(Self {
        bm25_index,
        exact_index,
        backend,
        chunks,
        file_mapping,
        language_mapping,
        graph,
    })
}
/// Builds a BM25-only index for `source`, optionally going through the cache.
///
/// The cache is consulted and refreshed only when `use_cache` is set, the
/// source tree is not temporary, and neither `extensions` nor `ignore` is
/// given. A failure to store the freshly built index is ignored.
fn from_source_tree_bm25(
    source: SourceTree,
    extensions: Option<&HashSet<String>>,
    ignore: Option<&HashSet<String>>,
    include_text_files: bool,
    use_cache: bool,
) -> Result<Self> {
    let root = source.root();
    let has_custom_filters = extensions.is_some() || ignore.is_some();
    let cacheable = use_cache && !source.is_temporary() && !has_custom_filters;

    let cache_hit = if cacheable {
        cache::load_bm25(root, include_text_files)
    } else {
        None
    };

    let (bm25_index, chunks, graph) = match cache_hit {
        Some(cached) => (cached.bm25_index, cached.chunks, cached.graph),
        None => {
            let (bm25_index, chunks, graph) = build_bm25_index_from_path(
                root,
                extensions,
                ignore,
                include_text_files,
                root,
            )?;
            if cacheable {
                let manifest = cache::build_manifest(root, include_text_files);
                // Best effort: the index is still usable if it cannot be cached.
                let _ = cache::store_bm25(
                    root,
                    include_text_files,
                    manifest,
                    &bm25_index,
                    &chunks,
                    &graph,
                );
            }
            (bm25_index, chunks, graph)
        }
    };

    let (file_mapping, language_mapping) = build_mappings(&chunks);
    let exact_index = ExactIndex::new(&chunks);
    Ok(Self {
        bm25_index,
        exact_index,
        backend: SearchBackend::Bm25Only,
        chunks,
        file_mapping,
        language_mapping,
        graph,
    })
}
/// Builds an index from a git repository.
///
/// `ref_` is forwarded unchanged to `SourceTree::from_git`
/// (presumably a branch, tag, or commit; confirm against `SourceTree`).
/// No encoder is supplied, so a hybrid build is attempted first and the
/// index falls back to BM25-only when that attempt fails.
///
/// # Errors
/// Fails when the repository cannot be obtained or the index cannot be built.
pub fn from_git(url: &str, ref_: Option<&str>, include_text_files: bool) -> Result<Self> {
let source = SourceTree::from_git(url, ref_)?;
Self::from_source_tree(source, None, None, None, include_text_files)
}
/// Searches the index and returns at most `top_k` results.
///
/// The backend (hybrid or BM25-only) produces candidates, which are then
/// fused with exact-match hits.
///
/// * `alpha` is forwarded to the hybrid search and is unused by a BM25-only
///   index (presumably the semantic/lexical blend weight; confirm in
///   `search_hybrid`).
/// * `filter_languages` / `filter_paths` restrict the search to chunks whose
///   language or file path is listed; a chunk matching either filter is kept.
///   A missing or empty list applies no restriction.
///
/// Returns an empty vector when the index has no chunks, when `query` is
/// blank, or when filters were supplied but no indexed chunk matches them.
pub fn search(
    &self,
    query: &str,
    top_k: usize,
    alpha: Option<f64>,
    filter_languages: Option<&[String]>,
    filter_paths: Option<&[String]>,
) -> Vec<SearchResult> {
    if self.chunks.is_empty() || query.trim().is_empty() {
        return Vec::new();
    }

    let selector = self.get_selector(filter_languages, filter_paths);

    // `get_selector` yields `None` both for "no filter" and for "filter matched
    // nothing". The latter must not turn into an unfiltered search, so it is
    // detected here and answered with no results.
    let filter_requested = matches!(filter_languages, Some(langs) if !langs.is_empty())
        || matches!(filter_paths, Some(paths) if !paths.is_empty());
    if filter_requested && selector.is_none() {
        return Vec::new();
    }

    let selector_ref = selector.as_deref();
    let backend_results = match &self.backend {
        SearchBackend::Hybrid(backend) => search_hybrid(
            query,
            HybridSearchContext {
                encoder: &backend.encoder,
                semantic_index: &backend.semantic_index,
                bm25_index: &self.bm25_index,
                chunks: &self.chunks,
                graph: Some(&self.graph),
                file_mapping: &self.file_mapping,
            },
            top_k,
            alpha,
            selector_ref,
        ),
        SearchBackend::Bm25Only => {
            search_bm25(query, &self.bm25_index, &self.chunks, top_k, selector_ref)
        }
    };

    fuse_exact_results(
        query,
        &self.exact_index,
        &self.chunks,
        top_k,
        selector_ref,
        backend_results,
    )
}
/// Returns up to `top_k` chunks semantically closest to `source`.
///
/// Candidates are limited to chunks sharing `source`'s language when that
/// language is present in the index; otherwise no restriction is applied.
/// Chunks equal to `source` are dropped from the results. Each score is
/// `1.0 - dist`, where `dist` is the value returned by the semantic index.
///
/// # Errors
/// Fails when the index is BM25-only or when `source.content` cannot be encoded.
pub fn find_related(&self, source: &Chunk, top_k: usize) -> Result<Vec<SearchResult>> {
    let backend = match &self.backend {
        SearchBackend::Hybrid(backend) => backend,
        SearchBackend::Bm25Only => {
            bail!("find-related requires a semantic index, but this index is BM25-only")
        }
    };

    let selector = source
        .language
        .as_ref()
        .and_then(|lang| self.language_mapping.get(lang))
        .map(|indices| indices.as_slice());

    let query_embedding = backend
        .encoder
        .encode_single(&source.content)
        .context("failed to encode source chunk for related search")?;

    // Ask for one extra hit because `source` itself is usually among the
    // nearest neighbours and is filtered out below. Saturating so that
    // `top_k == usize::MAX` cannot overflow.
    let candidates =
        backend
            .semantic_index
            .query(&query_embedding, top_k.saturating_add(1), selector);

    let results: Vec<SearchResult> = candidates
        .into_iter()
        .filter(|&(idx, _)| self.chunks[idx] != *source)
        .take(top_k)
        .map(|(idx, dist)| SearchResult {
            chunk: self.chunks[idx].clone(),
            score: (1.0 - dist) as f64,
            match_lines: vec![],
        })
        .collect();
    Ok(results)
}
/// Reports whether `find_related` can be served, i.e. whether this index
/// was built with the hybrid backend.
pub fn supports_find_related(&self) -> bool {
    match &self.backend {
        SearchBackend::Hybrid(_) => true,
        SearchBackend::Bm25Only => false,
    }
}
/// Summarises the index: number of distinct files, number of chunks, and
/// the chunk count per language (chunks without a language are not counted).
pub fn stats(&self) -> IndexStats {
    let languages = self
        .chunks
        .iter()
        .filter_map(|chunk| chunk.language.as_ref())
        .fold(
            HashMap::new(),
            |mut counts: HashMap<String, usize>, language| {
                *counts.entry(language.clone()).or_insert(0) += 1;
                counts
            },
        );

    IndexStats {
        indexed_files: self.file_mapping.len(),
        total_chunks: self.chunks.len(),
        languages,
    }
}
/// Returns every indexed chunk, in index order.
pub fn chunks(&self) -> &[Chunk] {
&self.chunks
}
/// Returns the dependency graph stored with this index.
pub fn graph(&self) -> &DependencyGraph {
&self.graph
}
/// Looks up a chunk for `file_path` and `line` by delegating to
/// `crate::chunk_lookup::resolve_chunk`; `None` when nothing is resolved.
pub fn chunk_at(&self, file_path: &str, line: usize) -> Option<&Chunk> {
crate::chunk_lookup::resolve_chunk(&self.chunks, file_path, line)
}
/// Resolves language and path filters to a sorted, de-duplicated list of
/// chunk positions.
///
/// The result is the union of the chunks matching any listed language and
/// the chunks matching any listed path. Returns `None` when that union is
/// empty, which covers both "no filters given" and "filters matched nothing".
fn get_selector(
    &self,
    filter_languages: Option<&[String]>,
    filter_paths: Option<&[String]>,
) -> Option<Vec<usize>> {
    let by_language = filter_languages
        .into_iter()
        .flatten()
        .filter_map(|language| self.language_mapping.get(language));
    let by_path = filter_paths
        .into_iter()
        .flatten()
        .filter_map(|path| self.file_mapping.get(path));

    let mut selected: Vec<usize> = by_language.chain(by_path).flatten().copied().collect();
    if selected.is_empty() {
        return None;
    }
    selected.sort_unstable();
    selected.dedup();
    Some(selected)
}
}
/// Merges exact-match hits with the backend's results and keeps the best `top_k`.
///
/// Up to `2 * top_k` exact hits are requested. When there are none, the
/// backend results are returned untouched. Otherwise results are merged per
/// `(file_path, start_line, end_line)`, exact hits first:
/// * a later duplicate with a higher score replaces the stored score;
///   otherwise its score, capped at 1.0, is added to the stored score;
/// * match lines of the duplicate are appended unless an identical
///   (line, content) pair is already present.
///
/// The merged list is ordered by descending score, then by file path, start
/// line and end line, and truncated to `top_k`.
fn fuse_exact_results(
    query: &str,
    exact_index: &ExactIndex,
    chunks: &[Chunk],
    top_k: usize,
    selector: Option<&[usize]>,
    backend_results: Vec<SearchResult>,
) -> Vec<SearchResult> {
    let exact_limit = top_k.saturating_mul(2);
    let exact_hits = exact_index.search(query, chunks, exact_limit, selector);
    if exact_hits.is_empty() {
        return backend_results;
    }

    let mut fused: Vec<SearchResult> = Vec::new();
    // Chunk span -> position of its entry in `fused`.
    let mut slot_by_span: HashMap<(String, usize, usize), usize> = HashMap::new();

    for mut candidate in exact_hits.into_iter().chain(backend_results) {
        let span = (
            candidate.chunk.file_path.clone(),
            candidate.chunk.start_line,
            candidate.chunk.end_line,
        );
        match slot_by_span.get(&span).copied() {
            None => {
                slot_by_span.insert(span, fused.len());
                fused.push(candidate);
            }
            Some(slot) => {
                let kept = &mut fused[slot];
                if candidate.score > kept.score {
                    kept.score = candidate.score;
                } else {
                    kept.score += candidate.score.min(1.0);
                }
                for extra in candidate.match_lines.drain(..) {
                    let already_listed = kept
                        .match_lines
                        .iter()
                        .any(|seen| seen.line == extra.line && seen.content == extra.content);
                    if !already_listed {
                        kept.match_lines.push(extra);
                    }
                }
            }
        }
    }

    fused.sort_by(|left, right| {
        let by_score_desc = right
            .score
            .partial_cmp(&left.score)
            .unwrap_or(std::cmp::Ordering::Equal);
        by_score_desc.then_with(|| {
            let (l, r) = (&left.chunk, &right.chunk);
            l.file_path
                .cmp(&r.file_path)
                .then(l.start_line.cmp(&r.start_line))
                .then(l.end_line.cmp(&r.end_line))
        })
    });
    fused.truncate(top_k);
    fused
}
/// Builds the two lookup tables over `chunks`, returned as
/// `(file path -> chunk positions, language -> chunk positions)`.
///
/// Positions are appended in chunk order; chunks without a language appear
/// only in the file table.
fn build_mappings(chunks: &[Chunk]) -> (HashMap<String, Vec<usize>>, HashMap<String, Vec<usize>>) {
    let mut by_file: HashMap<String, Vec<usize>> = HashMap::new();
    let mut by_language: HashMap<String, Vec<usize>> = HashMap::new();

    for (position, chunk) in chunks.iter().enumerate() {
        by_file
            .entry(chunk.file_path.clone())
            .or_insert_with(Vec::new)
            .push(position);

        if let Some(language) = chunk.language.as_ref() {
            by_language
                .entry(language.clone())
                .or_insert_with(Vec::new)
                .push(position);
        }
    }

    (by_file, by_language)
}
#[cfg(test)]
mod tests {
use super::*;
use safetensors::tensor::{serialize, Dtype, TensorView};
use std::fs;
use std::path::PathBuf;
use std::time::{SystemTime, UNIX_EPOCH};
use tokenizers::models::wordlevel::WordLevel;
use tokenizers::pre_tokenizers::whitespace::Whitespace;
use tokenizers::Tokenizer;
/// Returns a path under the system temp directory named
/// `asr-index-test-{name}-{nanoseconds since the Unix epoch}`.
/// The directory itself is not created.
fn unique_temp_dir(name: &str) -> PathBuf {
    let since_epoch = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .expect("system time should be after unix epoch");
    let nanos = since_epoch.as_nanos();

    let mut dir = std::env::temp_dir();
    dir.push(format!("asr-index-test-{name}-{nanos}"));
    dir
}
/// Writes a minimal tokenizer and embedding model into `root` and loads them
/// as a `StaticEncoder`.
///
/// The tokenizer is a word-level model over four entries split on whitespace;
/// the model is a single 4x4 `f32` tensor named `embeddings`.
fn write_test_encoder(root: &Path) -> StaticEncoder {
    fs::create_dir_all(root).expect("test encoder directory should be created");

    // --- tokenizer.json ---
    let vocab = [
        ("<unk>".to_string(), 0),
        ("search".to_string(), 1),
        ("target".to_string(), 2),
        ("function".to_string(), 3),
    ]
    .into_iter()
    .collect();
    let word_model = WordLevel::builder()
        .vocab(vocab)
        .unk_token("<unk>".to_string())
        .build()
        .expect("test wordlevel tokenizer should build");
    let mut tokenizer = Tokenizer::new(word_model);
    tokenizer.with_pre_tokenizer(Some(Whitespace));
    let tokenizer_path = root.join("tokenizer.json");
    tokenizer
        .save(&tokenizer_path, false)
        .expect("test tokenizer should be written");

    // --- model.safetensors ---
    // One embedding row per vocabulary id: the unknown token is all zeros and
    // each known token gets its own axis.
    let embedding_rows: [[f32; 4]; 4] = [
        [0.0, 0.0, 0.0, 0.0],
        [1.0, 0.0, 0.0, 0.0],
        [0.0, 1.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, 0.0],
    ];
    let embedding_bytes: Vec<u8> = embedding_rows
        .iter()
        .flatten()
        .flat_map(|value| value.to_le_bytes())
        .collect();
    let view = TensorView::new(Dtype::F32, vec![4, 4], &embedding_bytes)
        .expect("test tensor view should match embedding shape");
    let model_bytes = serialize([("embeddings", view)], &None)
        .expect("test safetensors model should serialize");
    let model_path = root.join("model.safetensors");
    fs::write(&model_path, model_bytes).expect("test model should be written");

    StaticEncoder::from_files(&tokenizer_path, &model_path)
        .expect("test static encoder should load")
}
/// With an injected encoder the index must use the hybrid backend (no
/// BM25-only fallback) and hybrid search must find the indexed source file.
#[test]
fn search_uses_semantic_index_when_encoder_is_available() {
    let root = unique_temp_dir("semantic-source");
    fs::create_dir_all(root.join("src")).expect("source directory should be created");
    fs::write(
        root.join("src/lib.rs"),
        "pub fn search_target_function() -> &'static str { \"ok\" }\n",
    )
    .expect("source fixture should be written");

    let encoder_dir = unique_temp_dir("encoder");
    let encoder = write_test_encoder(&encoder_dir);
    let source = SourceTree::from_path(&root).expect("source tree should load");
    let index = SourceIndex::from_source_tree(source, Some(encoder), None, None, false)
        .expect("index should build with injected semantic encoder");

    let supports_related = index.supports_find_related();
    let results = index.search("search target", 3, None, None, None);

    // Remove the fixtures before asserting so a failing assertion does not
    // leave them behind. Best effort: a failed removal must not fail the test.
    let _ = fs::remove_dir_all(&root);
    let _ = fs::remove_dir_all(&encoder_dir);

    assert!(
        supports_related,
        "semantic index should be built instead of BM25-only fallback"
    );
    assert!(
        results
            .iter()
            .any(|result| result.chunk.file_path == "src/lib.rs"),
        "hybrid search should return the indexed Rust source: {results:?}"
    );
}
}