1pub(crate) mod build;
2mod cache;
3
4use std::collections::{HashMap, HashSet};
5use std::fmt;
6use std::path::Path;
7
8use anyhow::{bail, Context, Result};
9
10use crate::bm25::Bm25Index;
11use crate::encoder::{SemanticIndex, StaticEncoder};
12use crate::exact::ExactIndex;
13use crate::graph::DependencyGraph;
14use crate::model::{Chunk, IndexStats, SearchResult};
15use crate::search::{search_bm25, search_hybrid, HybridSearchContext};
16use crate::source_tree::SourceTree;
17use crate::tokens::tokenize;
18use build::{build_bm25_index_from_path, build_index_from_path};
19
20struct HybridSearchBackend {
21 encoder: StaticEncoder,
22 semantic_index: SemanticIndex,
23}
24
25enum SearchBackend {
26 Hybrid(Box<HybridSearchBackend>),
27 Bm25Only,
28}
29
30#[derive(Debug)]
31pub enum SemanticIndexBuildError {
32 SemanticUnavailable(anyhow::Error),
33 Index(anyhow::Error),
34}
35
36impl fmt::Display for SemanticIndexBuildError {
37 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
38 match self {
39 Self::SemanticUnavailable(err) => write!(f, "semantic model unavailable: {err:#}"),
40 Self::Index(err) => write!(f, "index build failed: {err:#}"),
41 }
42 }
43}
44
45impl std::error::Error for SemanticIndexBuildError {}
46
47pub struct SourceIndex {
48 bm25_index: Bm25Index,
49 exact_index: ExactIndex,
50 backend: SearchBackend,
51 chunks: Vec<Chunk>,
52 file_mapping: HashMap<String, Vec<usize>>,
53 language_mapping: HashMap<String, Vec<usize>>,
54 graph: DependencyGraph,
55}
56
57impl SourceIndex {
58 pub fn from_source(source_path_or_git_url: &str, include_text_files: bool) -> Result<Self> {
59 let source = SourceTree::from_source(source_path_or_git_url, None)?;
60 Self::from_source_tree(source, None, None, None, include_text_files)
61 }
62
63 pub fn from_source_bm25(
64 source_path_or_git_url: &str,
65 include_text_files: bool,
66 ) -> Result<Self> {
67 let source = SourceTree::from_source(source_path_or_git_url, None)?;
68 Self::from_source_tree_bm25(source, None, None, include_text_files, true)
69 }
70
71 pub fn from_source_semantic(
72 source_path_or_git_url: &str,
73 include_text_files: bool,
74 ) -> std::result::Result<Self, SemanticIndexBuildError> {
75 let encoder =
76 StaticEncoder::load(None).map_err(SemanticIndexBuildError::SemanticUnavailable)?;
77 let source = SourceTree::from_source(source_path_or_git_url, None)
78 .map_err(SemanticIndexBuildError::Index)?;
79 Self::from_source_tree(source, Some(encoder), None, None, include_text_files)
80 .map_err(SemanticIndexBuildError::Index)
81 }
82
83 pub fn from_path(path: impl AsRef<Path>, include_text_files: bool) -> Result<Self> {
84 let source = SourceTree::from_path(path)?;
85 Self::from_source_tree(source, None, None, None, include_text_files)
86 }
87
88 pub fn from_path_bm25(path: impl AsRef<Path>, include_text_files: bool) -> Result<Self> {
89 let source = SourceTree::from_path(path)?;
90 Self::from_source_tree_bm25(source, None, None, include_text_files, true)
91 }
92
93 pub fn from_path_bm25_uncached(
98 path: impl AsRef<Path>,
99 include_text_files: bool,
100 ) -> Result<Self> {
101 let source = SourceTree::from_path(path)?;
102 Self::from_source_tree_bm25(source, None, None, include_text_files, false)
103 }
104
105 pub fn from_chunks_bm25(chunks: Vec<Chunk>) -> Result<Self> {
111 if chunks.is_empty() {
112 bail!("No indexed chunks available");
113 }
114
115 let bm25_docs: Vec<Vec<String>> = chunks
116 .iter()
117 .map(|chunk| tokenize(&build::enrich_for_bm25(chunk)))
118 .collect();
119 let bm25_index = Bm25Index::new(&bm25_docs);
120 let (file_mapping, language_mapping) = build_mappings(&chunks);
121
122 Ok(Self {
123 bm25_index,
124 exact_index: ExactIndex::new(&chunks),
125 backend: SearchBackend::Bm25Only,
126 chunks,
127 file_mapping,
128 language_mapping,
129 graph: DependencyGraph::new(),
130 })
131 }
132
133 fn from_source_tree(
134 source: SourceTree,
135 encoder: Option<StaticEncoder>,
136 extensions: Option<&HashSet<String>>,
137 ignore: Option<&HashSet<String>>,
138 include_text_files: bool,
139 ) -> Result<Self> {
140 let path = source.root().to_path_buf();
141 let (backend, bm25_index, chunks, graph) = match encoder {
142 Some(encoder) => {
143 let (bm25_index, semantic_index, chunks, graph) = build_index_from_path(
144 &path,
145 &encoder,
146 extensions,
147 ignore,
148 include_text_files,
149 &path,
150 )?;
151 (
152 SearchBackend::Hybrid(Box::new(HybridSearchBackend {
153 encoder,
154 semantic_index,
155 })),
156 bm25_index,
157 chunks,
158 graph,
159 )
160 }
161 None => {
162 let try_semantic = || -> Result<_> {
163 let encoder = StaticEncoder::load(None)?;
164 let (bm25_index, semantic_index, chunks, graph) = build_index_from_path(
165 &path,
166 &encoder,
167 extensions,
168 ignore,
169 include_text_files,
170 &path,
171 )?;
172 Ok((
173 SearchBackend::Hybrid(Box::new(HybridSearchBackend {
174 encoder,
175 semantic_index,
176 })),
177 bm25_index,
178 chunks,
179 graph,
180 ))
181 };
182 match try_semantic() {
183 Ok(result) => result,
184 Err(err) => {
185 log::warn!(
186 "semantic model unavailable; falling back to BM25-only search: {err:#}"
187 );
188 return Self::from_source_tree_bm25(
189 source,
190 extensions,
191 ignore,
192 include_text_files,
193 true,
194 );
195 }
196 }
197 }
198 };
199
200 let (file_mapping, language_mapping) = build_mappings(&chunks);
201 Ok(Self {
202 bm25_index,
203 exact_index: ExactIndex::new(&chunks),
204 backend,
205 chunks,
206 file_mapping,
207 language_mapping,
208 graph,
209 })
210 }
211
212 fn from_source_tree_bm25(
213 source: SourceTree,
214 extensions: Option<&HashSet<String>>,
215 ignore: Option<&HashSet<String>>,
216 include_text_files: bool,
217 use_cache: bool,
218 ) -> Result<Self> {
219 let path = source.root();
220 let cacheable =
221 use_cache && !source.is_temporary() && extensions.is_none() && ignore.is_none();
222 if cacheable {
223 if let Some(cached) = cache::load_bm25(path, include_text_files) {
224 let (file_mapping, language_mapping) = build_mappings(&cached.chunks);
225 return Ok(Self {
226 bm25_index: cached.bm25_index,
227 exact_index: ExactIndex::new(&cached.chunks),
228 backend: SearchBackend::Bm25Only,
229 chunks: cached.chunks,
230 file_mapping,
231 language_mapping,
232 graph: cached.graph,
233 });
234 }
235 }
236
237 let (bm25_index, chunks, graph) =
238 build_bm25_index_from_path(path, extensions, ignore, include_text_files, path)?;
239
240 let (file_mapping, language_mapping) = build_mappings(&chunks);
241 if cacheable {
242 let manifest = cache::build_manifest(path, include_text_files);
243 let _ = cache::store_bm25(
244 path,
245 include_text_files,
246 manifest,
247 &bm25_index,
248 &chunks,
249 &graph,
250 );
251 }
252
253 Ok(Self {
254 bm25_index,
255 exact_index: ExactIndex::new(&chunks),
256 backend: SearchBackend::Bm25Only,
257 chunks,
258 file_mapping,
259 language_mapping,
260 graph,
261 })
262 }
263
264 pub fn from_git(url: &str, ref_: Option<&str>, include_text_files: bool) -> Result<Self> {
265 let source = SourceTree::from_git(url, ref_)?;
266 Self::from_source_tree(source, None, None, None, include_text_files)
267 }
268
269 pub fn search(
270 &self,
271 query: &str,
272 top_k: usize,
273 alpha: Option<f64>,
274 filter_languages: Option<&[String]>,
275 filter_paths: Option<&[String]>,
276 ) -> Vec<SearchResult> {
277 if self.chunks.is_empty() || query.trim().is_empty() {
278 return Vec::new();
279 }
280
281 let selector = self.get_selector(filter_languages, filter_paths);
282 let selector_ref = selector.as_deref();
283
284 let backend_results = match &self.backend {
285 SearchBackend::Hybrid(backend) => search_hybrid(
286 query,
287 HybridSearchContext {
288 encoder: &backend.encoder,
289 semantic_index: &backend.semantic_index,
290 bm25_index: &self.bm25_index,
291 chunks: &self.chunks,
292 graph: Some(&self.graph),
293 file_mapping: &self.file_mapping,
294 },
295 top_k,
296 alpha,
297 selector_ref,
298 ),
299 SearchBackend::Bm25Only => {
300 search_bm25(query, &self.bm25_index, &self.chunks, top_k, selector_ref)
301 }
302 };
303
304 fuse_exact_results(
305 query,
306 &self.exact_index,
307 &self.chunks,
308 top_k,
309 selector_ref,
310 backend_results,
311 )
312 }
313
314 pub fn find_related(&self, source: &Chunk, top_k: usize) -> Result<Vec<SearchResult>> {
315 let selector = source
316 .language
317 .as_ref()
318 .and_then(|lang| self.language_mapping.get(lang))
319 .map(|indices| indices.as_slice());
320
321 let backend = match &self.backend {
322 SearchBackend::Hybrid(backend) => backend,
323 SearchBackend::Bm25Only => {
324 bail!("find-related requires a semantic index, but this index is BM25-only")
325 }
326 };
327
328 let query_embedding = backend
329 .encoder
330 .encode_single(&source.content)
331 .context("failed to encode source chunk for related search")?;
332
333 let results = backend
334 .semantic_index
335 .query(&query_embedding, top_k + 1, selector);
336 let results: Vec<SearchResult> = results
337 .into_iter()
338 .filter(|&(idx, _)| self.chunks[idx] != *source)
339 .take(top_k)
340 .map(|(idx, dist)| SearchResult {
341 chunk: self.chunks[idx].clone(),
342 score: (1.0 - dist) as f64,
343 match_lines: vec![],
344 })
345 .collect();
346
347 Ok(results)
348 }
349
350 pub fn supports_find_related(&self) -> bool {
351 matches!(self.backend, SearchBackend::Hybrid(_))
352 }
353
354 pub fn stats(&self) -> IndexStats {
355 let mut language_counts: HashMap<String, usize> = HashMap::new();
356 for chunk in &self.chunks {
357 if let Some(lang) = &chunk.language {
358 *language_counts.entry(lang.clone()).or_default() += 1;
359 }
360 }
361 IndexStats {
362 indexed_files: self.file_mapping.len(),
363 total_chunks: self.chunks.len(),
364 languages: language_counts,
365 }
366 }
367
368 pub fn chunks(&self) -> &[Chunk] {
369 &self.chunks
370 }
371 pub fn graph(&self) -> &DependencyGraph {
372 &self.graph
373 }
374 pub fn chunk_at(&self, file_path: &str, line: usize) -> Option<&Chunk> {
375 crate::chunk_lookup::resolve_chunk(&self.chunks, file_path, line)
376 }
377
378 fn get_selector(
379 &self,
380 filter_languages: Option<&[String]>,
381 filter_paths: Option<&[String]>,
382 ) -> Option<Vec<usize>> {
383 let mut indices = Vec::new();
384 if let Some(langs) = filter_languages {
385 for lang in langs {
386 if let Some(ids) = self.language_mapping.get(lang) {
387 indices.extend(ids);
388 }
389 }
390 }
391 if let Some(paths) = filter_paths {
392 for path in paths {
393 if let Some(ids) = self.file_mapping.get(path) {
394 indices.extend(ids);
395 }
396 }
397 }
398 if indices.is_empty() {
399 None
400 } else {
401 indices.sort();
402 indices.dedup();
403 Some(indices)
404 }
405 }
406}
407
408fn fuse_exact_results(
409 query: &str,
410 exact_index: &ExactIndex,
411 chunks: &[Chunk],
412 top_k: usize,
413 selector: Option<&[usize]>,
414 backend_results: Vec<SearchResult>,
415) -> Vec<SearchResult> {
416 let exact_results =
417 exact_index.search(query, chunks, top_k.saturating_mul(2).max(top_k), selector);
418 if exact_results.is_empty() {
419 return backend_results;
420 }
421
422 let mut merged: Vec<SearchResult> = Vec::new();
423 let mut positions: HashMap<(String, usize, usize), usize> = HashMap::new();
424
425 for mut result in exact_results.into_iter().chain(backend_results.into_iter()) {
426 let key = (
427 result.chunk.file_path.clone(),
428 result.chunk.start_line,
429 result.chunk.end_line,
430 );
431 if let Some(&pos) = positions.get(&key) {
432 let existing = &mut merged[pos];
433 if result.score > existing.score {
434 existing.score = result.score;
435 } else {
436 existing.score += result.score.min(1.0);
437 }
438 for match_line in result.match_lines.drain(..) {
439 if !existing
440 .match_lines
441 .iter()
442 .any(|line| line.line == match_line.line && line.content == match_line.content)
443 {
444 existing.match_lines.push(match_line);
445 }
446 }
447 } else {
448 positions.insert(key, merged.len());
449 merged.push(result);
450 }
451 }
452
453 merged.sort_by(|a, b| {
454 b.score
455 .partial_cmp(&a.score)
456 .unwrap_or(std::cmp::Ordering::Equal)
457 .then_with(|| a.chunk.file_path.cmp(&b.chunk.file_path))
458 .then_with(|| a.chunk.start_line.cmp(&b.chunk.start_line))
459 .then_with(|| a.chunk.end_line.cmp(&b.chunk.end_line))
460 });
461 merged.truncate(top_k);
462 merged
463}
464
465fn build_mappings(chunks: &[Chunk]) -> (HashMap<String, Vec<usize>>, HashMap<String, Vec<usize>>) {
466 let mut file_mapping: HashMap<String, Vec<usize>> = HashMap::new();
467 let mut language_mapping: HashMap<String, Vec<usize>> = HashMap::new();
468 for (i, chunk) in chunks.iter().enumerate() {
469 file_mapping
470 .entry(chunk.file_path.clone())
471 .or_default()
472 .push(i);
473 if let Some(lang) = &chunk.language {
474 language_mapping.entry(lang.clone()).or_default().push(i);
475 }
476 }
477 (file_mapping, language_mapping)
478}
479
480#[cfg(test)]
481mod tests {
482 use super::*;
483 use safetensors::tensor::{serialize, Dtype, TensorView};
484 use std::fs;
485 use std::path::PathBuf;
486 use std::time::{SystemTime, UNIX_EPOCH};
487 use tokenizers::models::wordlevel::WordLevel;
488 use tokenizers::pre_tokenizers::whitespace::Whitespace;
489 use tokenizers::Tokenizer;
490
/// Returns a path under the system temp directory named
/// `asr-index-test-<name>-<nanoseconds since the Unix epoch>`.
///
/// Only the path is computed; nothing is created on disk.
fn unique_temp_dir(name: &str) -> PathBuf {
    let since_epoch = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .expect("system time should be after unix epoch");
    let unique = since_epoch.as_nanos();

    let mut dir = std::env::temp_dir();
    dir.push(format!("asr-index-test-{name}-{unique}"));
    dir
}
498
499 fn write_test_encoder(root: &Path) -> StaticEncoder {
500 fs::create_dir_all(root).expect("test encoder directory should be created");
501
502 let vocab = [
503 ("<unk>".to_string(), 0),
504 ("search".to_string(), 1),
505 ("target".to_string(), 2),
506 ("function".to_string(), 3),
507 ]
508 .into_iter()
509 .collect();
510 let wordlevel = WordLevel::builder()
511 .vocab(vocab)
512 .unk_token("<unk>".to_string())
513 .build()
514 .expect("test wordlevel tokenizer should build");
515 let mut tokenizer = Tokenizer::new(wordlevel);
516 tokenizer.with_pre_tokenizer(Some(Whitespace));
517
518 let tokenizer_path = root.join("tokenizer.json");
519 tokenizer
520 .save(&tokenizer_path, false)
521 .expect("test tokenizer should be written");
522
523 let embedding_values: [f32; 16] = [
524 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ];
529 let embedding_bytes: Vec<u8> = embedding_values
530 .iter()
531 .flat_map(|value| value.to_le_bytes())
532 .collect();
533 let view = TensorView::new(Dtype::F32, vec![4, 4], &embedding_bytes)
534 .expect("test tensor view should match embedding shape");
535 let model_bytes = serialize([("embeddings", view)], &None)
536 .expect("test safetensors model should serialize");
537 let model_path = root.join("model.safetensors");
538 fs::write(&model_path, model_bytes).expect("test model should be written");
539
540 StaticEncoder::from_files(&tokenizer_path, &model_path)
541 .expect("test static encoder should load")
542 }
543
/// With an injected encoder, `from_source_tree` must build a hybrid index (no
/// BM25-only fallback) and hybrid search must find the fixture file.
#[test]
fn search_uses_semantic_index_when_encoder_is_available() {
    /// Removes the wrapped directory when dropped, so fixtures are cleaned up
    /// even if an assertion below panics. Removal is best effort.
    struct TempDirGuard(PathBuf);

    impl Drop for TempDirGuard {
        fn drop(&mut self) {
            let _ = fs::remove_dir_all(&self.0);
        }
    }

    // Declared first so they are dropped last, after the index and encoder.
    let source_dir = TempDirGuard(unique_temp_dir("semantic-source"));
    let encoder_dir = TempDirGuard(unique_temp_dir("encoder"));
    let root = &source_dir.0;

    fs::create_dir_all(root.join("src")).expect("source directory should be created");
    fs::write(
        root.join("src/lib.rs"),
        "pub fn search_target_function() -> &'static str { \"ok\" }\n",
    )
    .expect("source fixture should be written");

    let encoder = write_test_encoder(&encoder_dir.0);
    let source = SourceTree::from_path(root).expect("source tree should load");
    let index = SourceIndex::from_source_tree(source, Some(encoder), None, None, false)
        .expect("index should build with injected semantic encoder");

    assert!(
        index.supports_find_related(),
        "semantic index should be built instead of BM25-only fallback"
    );

    let results = index.search("search target", 3, None, None, None);
    assert!(
        results
            .iter()
            .any(|result| result.chunk.file_path == "src/lib.rs"),
        "hybrid search should return the indexed Rust source: {results:?}"
    );
}
572}