use super::{MetadataKey, StorageError, StorageResult};
use crate::relationship::RelationshipMetadata;
use crate::vector::{ClusterId, EmbeddingGenerator, SegmentOrdinal, VectorId, VectorSearchEngine};
use crate::{FileId, RelationKind, Relationship, SymbolId, SymbolKind};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::path::{Path, PathBuf};
use std::sync::Mutex;
use std::sync::{Arc, RwLock};
use tantivy::DocId;
use tantivy::{
Index, IndexReader, IndexSettings, IndexWriter, ReloadPolicy, TantivyDocument as Document,
Term,
collector::TopDocs,
directory::MmapDirectory,
query::{BooleanQuery, FuzzyTermQuery, Occur, Query, QueryParser, TermQuery},
schema::{
FAST, Field, IndexRecordOption, NumericOptions, STORED, STRING, Schema, SchemaBuilder,
TextFieldIndexing, TextOptions, Value,
},
tokenizer::{NgramTokenizer, TextAnalyzer},
};
#[derive(Debug)]
pub struct IndexSchema {
pub doc_type: Field,
pub symbol_id: Field,
pub name: Field, pub name_text: Field, pub doc_comment: Field,
pub signature: Field,
pub module_path: Field,
pub kind: Field,
pub file_path: Field,
pub line_number: Field,
pub column: Field,
pub end_line: Field,
pub end_column: Field,
pub context: Field,
pub visibility: Field,
pub scope_context: Field,
pub language: Field,
pub from_symbol_id: Field,
pub to_symbol_id: Field,
pub relation_kind: Field,
pub relation_weight: Field,
pub relation_line: Field,
pub relation_column: Field,
pub relation_context: Field,
pub file_id: Field,
pub file_hash: Field,
pub file_timestamp: Field,
pub file_mtime: Field,
pub meta_key: Field,
pub meta_value: Field,
pub cluster_id: Field,
pub vector_id: Field,
pub has_vector: Field,
pub import_file_id: Field, pub import_path: Field, pub import_alias: Field, pub import_is_glob: Field, pub import_is_type_only: Field, }
impl IndexSchema {
pub fn build() -> (Schema, IndexSchema) {
let mut builder = SchemaBuilder::default();
let doc_type = builder.add_text_field("doc_type", STRING | STORED | FAST);
let indexed_u64_options = NumericOptions::default()
.set_indexed()
.set_stored()
.set_fast();
let symbol_id = builder.add_u64_field("symbol_id", indexed_u64_options.clone());
let file_path = builder.add_text_field("file_path", STRING | STORED);
let line_number = builder.add_u64_field("line_number", indexed_u64_options.clone());
let column = builder.add_u64_field("column", STORED);
let end_line = builder.add_u64_field("end_line", STORED | FAST);
let end_column = builder.add_u64_field("end_column", STORED);
let text_options = TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("default")
.set_index_option(IndexRecordOption::WithFreqsAndPositions),
)
.set_stored();
let name = builder.add_text_field("name", STRING | STORED);
let ngram_text_options = TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("ngram")
.set_index_option(IndexRecordOption::WithFreqsAndPositions),
)
.set_stored();
let name_text = builder.add_text_field("name_text", ngram_text_options);
let doc_comment = builder.add_text_field("doc_comment", text_options.clone());
let signature = builder.add_text_field("signature", text_options.clone());
let context = builder.add_text_field("context", text_options.clone());
let module_path = builder.add_text_field("module_path", STRING | STORED);
let kind = builder.add_text_field("kind", STRING | STORED);
let visibility = builder.add_u64_field("visibility", STORED);
let scope_context = builder.add_text_field("scope_context", STRING | STORED);
let language = builder.add_text_field("language", STRING | STORED | FAST);
let from_symbol_id = builder.add_u64_field("from_symbol_id", indexed_u64_options.clone());
let to_symbol_id = builder.add_u64_field("to_symbol_id", indexed_u64_options.clone());
let relation_kind = builder.add_text_field("relation_kind", STRING | STORED | FAST);
let relation_weight = builder.add_f64_field("relation_weight", STORED);
let relation_line = builder.add_u64_field("relation_line", STORED);
let relation_column = builder.add_u64_field("relation_column", STORED);
let relation_context = builder.add_text_field("relation_context", text_options.clone());
let file_id = builder.add_u64_field("file_id", indexed_u64_options.clone());
let file_hash = builder.add_text_field("file_hash", STRING | STORED);
let file_timestamp = builder.add_u64_field("file_timestamp", STORED | FAST);
let file_mtime = builder.add_u64_field("file_mtime", STORED | FAST);
let meta_key = builder.add_text_field("meta_key", STRING | STORED | FAST);
let meta_value = builder.add_u64_field("meta_value", STORED | FAST);
let cluster_id = builder.add_u64_field("cluster_id", FAST | STORED);
let vector_id = builder.add_u64_field("vector_id", FAST | STORED);
let has_vector = builder.add_u64_field("has_vector", FAST | STORED);
let import_file_id = builder.add_u64_field("import_file_id", indexed_u64_options.clone());
let import_path = builder.add_text_field("import_path", STRING | STORED);
let import_alias = builder.add_text_field("import_alias", STRING | STORED);
let import_is_glob = builder.add_u64_field("import_is_glob", STORED);
let import_is_type_only = builder.add_u64_field("import_is_type_only", STORED);
let schema = builder.build();
let index_schema = IndexSchema {
doc_type,
symbol_id,
name,
name_text,
doc_comment,
signature,
module_path,
kind,
file_path,
line_number,
column,
end_line,
end_column,
context,
visibility,
scope_context,
language,
from_symbol_id,
to_symbol_id,
relation_kind,
relation_weight,
relation_line,
relation_column,
relation_context,
file_id,
file_hash,
file_timestamp,
file_mtime,
meta_key,
meta_value,
cluster_id,
vector_id,
has_vector,
import_file_id,
import_path,
import_alias,
import_is_glob,
import_is_type_only,
};
(schema, index_schema)
}
}
#[derive(Debug, Clone, PartialEq)]
pub struct VectorMetadata {
pub vector_id: Option<VectorId>,
pub cluster_id: Option<ClusterId>,
pub embedding_version: u32,
}
#[derive(Serialize, Deserialize)]
struct VectorMetadataJson {
vector_id: Option<u32>,
cluster_id: Option<u32>,
embedding_version: u32,
}
impl VectorMetadata {
pub fn new(embedding_version: u32) -> Self {
Self {
vector_id: None,
cluster_id: None,
embedding_version,
}
}
pub fn with_vector(vector_id: VectorId, cluster_id: ClusterId, embedding_version: u32) -> Self {
Self {
vector_id: Some(vector_id),
cluster_id: Some(cluster_id),
embedding_version,
}
}
pub fn has_vector(&self) -> bool {
self.vector_id.is_some()
}
pub fn to_json(&self) -> StorageResult<String> {
let json_repr = VectorMetadataJson {
vector_id: self.vector_id.map(|id| id.get()),
cluster_id: self.cluster_id.map(|id| id.get()),
embedding_version: self.embedding_version,
};
serde_json::to_string(&json_repr).map_err(|e| {
StorageError::Serialization(format!("Failed to serialize VectorMetadata: {e}"))
})
}
pub fn from_json(json: &str) -> StorageResult<Self> {
let json_repr: VectorMetadataJson = serde_json::from_str(json).map_err(|e| {
StorageError::Serialization(format!("Failed to deserialize VectorMetadata: {e}"))
})?;
Ok(Self {
vector_id: json_repr.vector_id.and_then(VectorId::new),
cluster_id: json_repr.cluster_id.and_then(ClusterId::new),
embedding_version: json_repr.embedding_version,
})
}
}
#[derive(Debug, Clone)]
struct ClusterCache {
generation: u64,
segment_mappings: HashMap<SegmentOrdinal, HashMap<ClusterId, Vec<DocId>>>,
}
impl ClusterCache {
fn new(generation: u64) -> Self {
Self {
generation,
segment_mappings: HashMap::new(),
}
}
fn is_valid_for_generation(&self, generation: u64) -> bool {
self.generation == generation
}
fn add_document(&mut self, segment_ord: SegmentOrdinal, cluster_id: ClusterId, doc_id: DocId) {
self.segment_mappings
.entry(segment_ord)
.or_default()
.entry(cluster_id)
.or_default()
.push(doc_id);
}
fn get_documents(
&self,
segment_ord: SegmentOrdinal,
cluster_id: ClusterId,
) -> Option<&[DocId]> {
self.segment_mappings
.get(&segment_ord)
.and_then(|clusters| clusters.get(&cluster_id))
.map(|docs| docs.as_slice())
}
fn sort_all(&mut self) {
for segment_map in self.segment_mappings.values_mut() {
for doc_list in segment_map.values_mut() {
doc_list.sort_unstable();
}
}
}
#[cfg(test)]
fn total_documents(&self) -> usize {
self.segment_mappings
.values()
.flat_map(|clusters| clusters.values())
.map(|docs| docs.len())
.sum()
}
fn all_cluster_ids(&self) -> Vec<ClusterId> {
let mut cluster_ids: Vec<ClusterId> = self
.segment_mappings
.values()
.flat_map(|clusters| clusters.keys())
.copied()
.collect();
cluster_ids.sort_unstable_by_key(|id| id.get());
cluster_ids.dedup();
cluster_ids
}
}
#[derive(Debug, Clone, Serialize)]
pub struct SearchResult {
pub symbol_id: SymbolId,
pub name: String,
pub kind: SymbolKind,
pub file_path: String,
pub line: u32,
pub column: u16,
pub doc_comment: Option<String>,
pub signature: Option<String>,
pub module_path: String,
pub score: f32,
pub highlights: Vec<TextHighlight>,
pub context: Option<String>,
}
#[derive(Debug, Clone, Serialize)]
pub struct TextHighlight {
pub field: String,
pub start: usize,
pub end: usize,
}
pub struct DocumentIndex {
index: Index,
reader: IndexReader,
schema: IndexSchema,
index_path: PathBuf,
pub(crate) writer: RwLock<Option<IndexWriter<Document>>>,
heap_size: usize,
max_retry_attempts: u32,
vector_storage_path: Option<PathBuf>,
vector_engine: Option<Arc<Mutex<VectorSearchEngine>>>,
cluster_cache: Arc<RwLock<Option<ClusterCache>>>,
embedding_generator: Option<Arc<dyn EmbeddingGenerator>>,
pub(crate) pending_embeddings: Mutex<Vec<(SymbolId, String)>>,
pending_symbol_counter: Mutex<Option<u32>>,
pending_file_counter: Mutex<Option<u32>>,
}
impl std::fmt::Debug for DocumentIndex {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("DocumentIndex")
.field("index_path", &self.index_path)
.field("schema", &self.schema)
.field("vector_storage_path", &self.vector_storage_path)
.field("has_vector_engine", &self.vector_engine.is_some())
.field(
"has_embedding_generator",
&self.embedding_generator.is_some(),
)
.field(
"has_cluster_cache",
&self.cluster_cache.read().unwrap().is_some(),
)
.field(
"pending_embeddings_count",
&self.pending_embeddings.lock().unwrap().len(),
)
.finish()
}
}
impl DocumentIndex {
pub fn new(
index_path: impl AsRef<Path>,
settings: &crate::config::Settings,
) -> StorageResult<Self> {
let index_path = index_path.as_ref().to_path_buf();
std::fs::create_dir_all(&index_path)?;
let heap_size = settings.indexing.tantivy_heap_mb * 1_000_000;
let heap_size = heap_size.clamp(10_000_000, 1_000_000_000);
let max_retry_attempts = settings.indexing.max_retry_attempts;
let (schema, index_schema) = IndexSchema::build();
let index = if index_path.join("meta.json").exists() {
Index::open_in_dir(&index_path)?
} else {
let dir = MmapDirectory::open(&index_path)?;
Index::create(dir, schema, IndexSettings::default())?
};
let ngram_tokenizer =
TextAnalyzer::builder(NgramTokenizer::new(3, 10, false).unwrap()).build();
index.tokenizers().register("ngram", ngram_tokenizer);
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()?;
if index_path.join("meta.json").exists() {
reader.reload()?;
}
Ok(Self {
index,
reader,
schema: index_schema,
index_path,
writer: RwLock::new(None),
heap_size,
max_retry_attempts,
vector_storage_path: None,
vector_engine: None,
cluster_cache: Arc::new(RwLock::new(None)),
embedding_generator: None,
pending_embeddings: Mutex::new(Vec::new()),
pending_symbol_counter: Mutex::new(None),
pending_file_counter: Mutex::new(None),
})
}
fn create_writer_with_retry(&self) -> Result<IndexWriter<Document>, tantivy::TantivyError> {
for attempt in 0..self.max_retry_attempts {
match self.index.writer::<Document>(self.heap_size) {
Ok(writer) => return Ok(writer),
Err(e) => {
let is_transient = std::error::Error::source(&e)
.and_then(|s| s.downcast_ref::<std::io::Error>())
.map(|io_err| {
matches!(
io_err.kind(),
std::io::ErrorKind::PermissionDenied
| std::io::ErrorKind::TimedOut
| std::io::ErrorKind::WouldBlock
)
})
.unwrap_or(false);
if is_transient && attempt < self.max_retry_attempts - 1 {
let delay = 100 * (1 << attempt);
eprintln!(
"Attempt {}/{}: Transient permission error, retrying after {}ms",
attempt + 1,
self.max_retry_attempts,
delay
);
std::thread::sleep(std::time::Duration::from_millis(delay));
} else {
return Err(e);
}
}
}
}
unreachable!()
}
pub fn with_vector_support(
mut self,
vector_engine: Arc<Mutex<VectorSearchEngine>>,
vector_storage_path: impl AsRef<Path>,
) -> Self {
self.vector_storage_path = Some(vector_storage_path.as_ref().to_path_buf());
self.vector_engine = Some(vector_engine);
self
}
pub fn with_embedding_generator(mut self, generator: Arc<dyn EmbeddingGenerator>) -> Self {
self.embedding_generator = Some(generator);
self
}
pub fn has_vector_support(&self) -> bool {
self.vector_engine.is_some()
}
pub fn vector_storage_path(&self) -> Option<&Path> {
self.vector_storage_path.as_deref()
}
pub fn vector_engine(&self) -> Option<&Arc<Mutex<VectorSearchEngine>>> {
self.vector_engine.as_ref()
}
fn post_commit_vector_processing(&self) -> StorageResult<()> {
let pending_embeddings = {
let mut pending = self
.pending_embeddings
.lock()
.map_err(|_| StorageError::LockPoisoned)?;
std::mem::take(&mut *pending)
};
if pending_embeddings.is_empty() {
return Ok(());
}
let vector_engine = self
.vector_engine
.as_ref()
.ok_or_else(|| StorageError::General("Vector engine not configured".to_string()))?;
let embedding_generator = self.embedding_generator.as_ref().ok_or_else(|| {
StorageError::General("Embedding generator not configured".to_string())
})?;
let texts: Vec<&str> = pending_embeddings
.iter()
.map(|(_, text)| text.as_str())
.collect();
let embeddings = embedding_generator
.generate_embeddings(&texts)
.map_err(|e| StorageError::General(format!("Embedding generation failed: {e}")))?;
if embeddings.len() != pending_embeddings.len() {
return Err(StorageError::General(format!(
"Embedding count mismatch: expected {}, got {}",
pending_embeddings.len(),
embeddings.len()
)));
}
let mut vectors = Vec::with_capacity(pending_embeddings.len());
for (i, (symbol_id, _)) in pending_embeddings.iter().enumerate() {
if let Some(vector_id) = crate::vector::VectorId::new(symbol_id.value()) {
vectors.push((vector_id, embeddings[i].clone()));
}
}
let mut engine = vector_engine
.lock()
.map_err(|_| StorageError::LockPoisoned)?;
engine
.index_vectors(&vectors)
.map_err(|e| StorageError::General(format!("Vector indexing failed: {e}")))?;
drop(engine);
Ok(())
}
fn build_cluster_cache(&self) -> StorageResult<()> {
if !self.has_vector_support() {
return Ok(());
}
let searcher = self.reader.searcher();
let generation = searcher.segment_readers().len() as u64;
{
let cache = self
.cluster_cache
.read()
.map_err(|_| StorageError::LockPoisoned)?;
if let Some(ref existing_cache) = *cache {
if existing_cache.is_valid_for_generation(generation) {
return Ok(());
}
}
}
let mut new_cache = ClusterCache::new(generation);
for (segment_ord, segment_reader) in searcher.segment_readers().iter().enumerate() {
let segment_ord = SegmentOrdinal::new(segment_ord as u32);
let fast_fields = segment_reader.fast_fields();
let cluster_id_reader = fast_fields.u64("cluster_id")?.first_or_default_col(0);
let has_vector_reader = fast_fields.u64("has_vector")?.first_or_default_col(0);
for doc_id in 0..segment_reader.num_docs() {
let has_vector_val = has_vector_reader.get_val(doc_id);
if has_vector_val == 1 {
let cluster_val = cluster_id_reader.get_val(doc_id);
if let Some(cluster_id) = ClusterId::new(cluster_val as u32) {
new_cache.add_document(segment_ord, cluster_id, doc_id);
}
}
}
}
new_cache.sort_all();
let mut cache = self
.cluster_cache
.write()
.map_err(|_| StorageError::LockPoisoned)?;
*cache = Some(new_cache);
Ok(())
}
pub fn get_cluster_documents(
&self,
segment_ord: SegmentOrdinal,
cluster_id: ClusterId,
) -> StorageResult<Vec<DocId>> {
let cache = self
.cluster_cache
.read()
.map_err(|_| StorageError::LockPoisoned)?;
if let Some(ref cluster_cache) = *cache {
Ok(cluster_cache
.get_documents(segment_ord, cluster_id)
.map(|docs| docs.to_vec())
.unwrap_or_default())
} else {
Ok(Vec::new())
}
}
pub fn get_all_cluster_ids(&self) -> StorageResult<Vec<ClusterId>> {
let cache = self
.cluster_cache
.read()
.map_err(|_| StorageError::LockPoisoned)?;
if let Some(ref cluster_cache) = *cache {
Ok(cluster_cache.all_cluster_ids())
} else {
Ok(Vec::new())
}
}
pub fn warm_cluster_cache(&self) -> StorageResult<()> {
if !self.has_vector_support() {
return Ok(());
}
{
let mut cache = self
.cluster_cache
.write()
.map_err(|_| StorageError::LockPoisoned)?;
*cache = None; }
self.build_cluster_cache()
}
pub fn get_cache_generation(&self) -> StorageResult<Option<u64>> {
let cache = self
.cluster_cache
.read()
.map_err(|_| StorageError::LockPoisoned)?;
Ok(cache.as_ref().map(|c| c.generation))
}
pub fn reload_and_warm(&self) -> StorageResult<()> {
self.reader.reload()?;
if self.has_vector_support() {
self.warm_cluster_cache()?;
}
Ok(())
}
pub fn update_cluster_assignments(&self) -> StorageResult<()> {
if !self.has_vector_support() {
return Ok(());
}
let vector_engine = self
.vector_engine
.as_ref()
.ok_or_else(|| StorageError::General("Vector engine not configured".to_string()))?;
let engine = vector_engine
.lock()
.map_err(|_| StorageError::LockPoisoned)?;
let cluster_assignments = engine.get_all_cluster_assignments();
drop(engine);
let mut updates_needed = Vec::new();
for (vector_id, cluster_id) in cluster_assignments {
if let Some(symbol_id) = SymbolId::new(vector_id.get()) {
updates_needed.push((symbol_id, cluster_id));
}
}
if updates_needed.is_empty() {
return Ok(());
}
let searcher = self.reader.searcher();
self.start_batch()?;
let writer_lock = self.writer.read().map_err(|_| StorageError::LockPoisoned)?;
let writer = writer_lock.as_ref().ok_or(StorageError::NoActiveBatch)?;
for (symbol_id, cluster_id) in updates_needed {
let symbol_id_term =
Term::from_field_u64(self.schema.symbol_id, symbol_id.value() as u64);
let query = TermQuery::new(symbol_id_term.clone(), IndexRecordOption::Basic);
let top_docs = searcher.search(&query, &TopDocs::with_limit(1))?;
if let Some((_score, doc_address)) = top_docs.first() {
if let Ok(old_doc) = searcher.doc::<Document>(*doc_address) {
writer.delete_term(symbol_id_term);
let mut new_doc = Document::new();
for (field, value) in old_doc.field_values() {
if field != self.schema.cluster_id && field != self.schema.has_vector {
new_doc.add_field_value(field, value);
}
}
new_doc.add_u64(self.schema.cluster_id, cluster_id.get() as u64);
new_doc.add_u64(self.schema.has_vector, 1);
writer.add_document(new_doc)?;
}
}
}
drop(writer_lock);
self.commit_batch()?;
Ok(())
}
pub fn start_batch(&self) -> StorageResult<()> {
let mut writer_lock = self
.writer
.write()
.map_err(|_| StorageError::LockPoisoned)?;
if writer_lock.is_none() {
let writer = self.create_writer_with_retry()?;
*writer_lock = Some(writer);
let current = self
.query_metadata(MetadataKey::SymbolCounter)?
.unwrap_or(0) as u32;
if let Ok(mut pending_guard) = self.pending_symbol_counter.lock() {
*pending_guard = Some(current + 1);
}
let file_current = self.query_metadata(MetadataKey::FileCounter)?.unwrap_or(0) as u32;
if let Ok(mut pending_guard) = self.pending_file_counter.lock() {
*pending_guard = Some(file_current + 1);
}
}
Ok(())
}
#[allow(clippy::too_many_arguments)]
pub fn add_document(
&self,
symbol_id: SymbolId,
name: &str,
kind: SymbolKind,
file_id: FileId,
file_path: &str,
line: u32,
column: u16,
end_line: u32,
end_column: u16,
doc_comment: Option<&str>,
signature: Option<&str>,
module_path: &str,
context: Option<&str>,
visibility: crate::Visibility,
scope_context: Option<crate::ScopeContext>,
language_id: Option<&str>, ) -> StorageResult<()> {
let writer_lock = self.writer.read().map_err(|_| StorageError::LockPoisoned)?;
let writer = writer_lock.as_ref().ok_or(StorageError::NoActiveBatch)?;
let mut doc = Document::new();
doc.add_text(self.schema.doc_type, "symbol");
doc.add_u64(self.schema.symbol_id, symbol_id.value() as u64);
doc.add_u64(self.schema.file_id, file_id.value() as u64);
doc.add_text(self.schema.name, name);
doc.add_text(self.schema.name_text, name); doc.add_text(self.schema.file_path, file_path);
doc.add_u64(self.schema.line_number, line as u64);
doc.add_u64(self.schema.column, column as u64);
doc.add_u64(self.schema.end_line, end_line as u64);
doc.add_u64(self.schema.end_column, end_column as u64);
if let Some(comment) = doc_comment {
doc.add_text(self.schema.doc_comment, comment);
}
if let Some(sig) = signature {
doc.add_text(self.schema.signature, sig);
}
if let Some(ctx) = context {
doc.add_text(self.schema.context, ctx);
}
doc.add_text(self.schema.module_path, module_path);
doc.add_text(self.schema.kind, format!("{kind:?}"));
doc.add_u64(self.schema.visibility, visibility as u64);
if let Some(scope) = scope_context {
doc.add_text(self.schema.scope_context, format!("{scope:?}"));
} else {
doc.add_text(self.schema.scope_context, "None");
}
if let Some(lang) = language_id {
doc.add_text(self.schema.language, lang);
} else {
doc.add_text(self.schema.language, "");
}
if self.has_vector_support() {
doc.add_u64(self.schema.cluster_id, 0); doc.add_u64(self.schema.vector_id, symbol_id.value() as u64);
doc.add_u64(self.schema.has_vector, 0); }
writer.add_document(doc)?;
if self.has_vector_support() && self.embedding_generator.is_some() {
let symbol_text = format!("{} {:?} {}", name, kind, signature.unwrap_or(""));
let mut pending = self
.pending_embeddings
.lock()
.map_err(|_| StorageError::LockPoisoned)?;
pending.push((symbol_id, symbol_text));
}
Ok(())
}
pub fn commit_batch(&self) -> StorageResult<()> {
let mut writer_lock = match self.writer.write() {
Ok(lock) => lock,
Err(poisoned) => {
eprintln!("Warning: Recovering from poisoned writer rwlock in commit_batch");
poisoned.into_inner()
}
};
if let Some(mut writer) = writer_lock.take() {
match writer.commit() {
Ok(_opstamp) => {
}
Err(e) => {
let is_permission_error = std::error::Error::source(&e)
.and_then(|s| s.downcast_ref::<std::io::Error>())
.map(|io_err| matches!(io_err.kind(), std::io::ErrorKind::PermissionDenied))
.unwrap_or(false);
if is_permission_error {
return Err(StorageError::General(format!(
"Failed to commit index due to permission error.\n\
This can happen when:\n\
1. Security software is scanning the index directory\n\
2. Another process has locked the files\n\
3. Insufficient file system permissions\n\
\nOriginal error: {e}\n\
\nTry:\n\
- Reducing tantivy_heap_mb in settings (15-25MB)\n\
- Adding .codanna to security software exclusions\n\
- Ensuring no other codanna processes are running"
)));
}
return Err(e.into());
}
}
self.reader.reload()?;
if let Ok(mut pending_guard) = self.pending_symbol_counter.lock() {
*pending_guard = None;
}
if let Ok(mut pending_guard) = self.pending_file_counter.lock() {
*pending_guard = None;
}
if self.has_vector_support() && self.embedding_generator.is_some() {
self.post_commit_vector_processing()?;
}
self.build_cluster_cache()?;
}
Ok(())
}
pub fn remove_file_documents(&self, file_path: &str) -> StorageResult<()> {
let writer_lock = self.writer.read().map_err(|_| StorageError::LockPoisoned)?;
let term = Term::from_field_text(self.schema.file_path, file_path);
if let Some(writer) = writer_lock.as_ref() {
writer.delete_term(term);
} else {
drop(writer_lock); let mut writer = self.index.writer::<Document>(50_000_000)?;
writer.delete_term(term);
writer.commit()?;
self.reader.reload()?;
}
Ok(())
}
pub fn search(
&self,
query_str: &str,
limit: usize,
kind_filter: Option<SymbolKind>,
module_filter: Option<&str>,
language_filter: Option<&str>,
) -> StorageResult<Vec<SearchResult>> {
let searcher = self.reader.searcher();
let query_parser = QueryParser::for_index(
&self.index,
vec![
self.schema.name_text, self.schema.doc_comment,
self.schema.signature,
self.schema.context,
],
);
let main_query = match query_parser.parse_query(query_str) {
Ok(query) => query,
Err(_parse_error) => {
let name_term = Term::from_field_text(self.schema.name_text, query_str);
let doc_term = Term::from_field_text(self.schema.doc_comment, query_str);
let sig_term = Term::from_field_text(self.schema.signature, query_str);
let ctx_term = Term::from_field_text(self.schema.context, query_str);
Box::new(BooleanQuery::new(vec![
(
Occur::Should,
Box::new(TermQuery::new(name_term, IndexRecordOption::Basic))
as Box<dyn Query>,
),
(
Occur::Should,
Box::new(TermQuery::new(doc_term, IndexRecordOption::Basic))
as Box<dyn Query>,
),
(
Occur::Should,
Box::new(TermQuery::new(sig_term, IndexRecordOption::Basic))
as Box<dyn Query>,
),
(
Occur::Should,
Box::new(TermQuery::new(ctx_term, IndexRecordOption::Basic))
as Box<dyn Query>,
),
])) as Box<dyn Query>
}
};
let name_text_term = Term::from_field_text(self.schema.name_text, query_str);
let fuzzy_ngram_query = FuzzyTermQuery::new(name_text_term, 1, true);
let name_term = Term::from_field_text(self.schema.name, query_str);
let fuzzy_whole_word_query = FuzzyTermQuery::new(name_term, 1, true);
let mut all_clauses: Vec<(Occur, Box<dyn Query>)> = Vec::new();
all_clauses.push((
Occur::Must,
Box::new(BooleanQuery::new(vec![
(Occur::Should, main_query),
(Occur::Should, Box::new(fuzzy_ngram_query)),
(Occur::Should, Box::new(fuzzy_whole_word_query)),
])),
));
all_clauses.push((
Occur::Must,
Box::new(TermQuery::new(
Term::from_field_text(self.schema.doc_type, "symbol"),
IndexRecordOption::Basic,
)),
));
if let Some(kind) = kind_filter {
let term = Term::from_field_text(self.schema.kind, &format!("{kind:?}"));
all_clauses.push((
Occur::Must,
Box::new(TermQuery::new(term, IndexRecordOption::Basic)),
));
}
if let Some(module) = module_filter {
let term = Term::from_field_text(self.schema.module_path, module);
all_clauses.push((
Occur::Must,
Box::new(TermQuery::new(term, IndexRecordOption::Basic)),
));
}
if let Some(lang) = language_filter {
let term = Term::from_field_text(self.schema.language, lang);
all_clauses.push((
Occur::Must,
Box::new(TermQuery::new(term, IndexRecordOption::Basic)),
));
}
let final_query = BooleanQuery::new(all_clauses);
let top_docs = searcher.search(&final_query, &TopDocs::with_limit(limit))?;
let mut results = Vec::new();
for (score, doc_address) in top_docs {
let doc: Document = searcher.doc(doc_address)?;
let symbol_id = doc
.get_first(self.schema.symbol_id)
.and_then(|v| v.as_u64())
.and_then(|id| SymbolId::new(id as u32))
.ok_or(StorageError::InvalidFieldValue {
field: "symbol_id".to_string(),
reason: "not a valid u32".to_string(),
})?;
let name = doc
.get_first(self.schema.name)
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
let file_path = doc
.get_first(self.schema.file_path)
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
let line = doc
.get_first(self.schema.line_number)
.and_then(|v| v.as_u64())
.unwrap_or(0) as u32;
let column = doc
.get_first(self.schema.column)
.and_then(|v| v.as_u64())
.unwrap_or(0) as u16;
let doc_comment = doc
.get_first(self.schema.doc_comment)
.and_then(|v| v.as_str())
.map(|s| s.to_string());
let signature = doc
.get_first(self.schema.signature)
.and_then(|v| v.as_str())
.map(|s| s.to_string());
let context = doc
.get_first(self.schema.context)
.and_then(|v| v.as_str())
.map(|s| s.to_string());
let kind_str = doc
.get_first(self.schema.kind)
.and_then(|v| v.as_str())
.unwrap_or("Unknown");
let kind = match kind_str {
"Function" => SymbolKind::Function,
"Struct" => SymbolKind::Struct,
"Trait" => SymbolKind::Trait,
"Method" => SymbolKind::Method,
"Field" => SymbolKind::Field,
"Module" => SymbolKind::Module,
"Constant" => SymbolKind::Constant,
_ => SymbolKind::Function, };
let module_path = doc
.get_first(self.schema.module_path)
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
results.push(SearchResult {
symbol_id,
name,
kind,
file_path,
line,
column,
doc_comment,
signature,
module_path,
score,
highlights: Vec::new(), context,
});
}
Ok(results)
}
pub fn document_count(&self) -> StorageResult<u64> {
let searcher = self.reader.searcher();
Ok(searcher.num_docs())
}
pub fn clear(&self) -> StorageResult<()> {
let meta_path = self.index_path.join("meta.json");
if !meta_path.exists() {
return Ok(());
}
let mut writer = self.index.writer::<Document>(50_000_000)?;
writer.delete_all_documents()?;
writer.commit()?;
self.reader.reload()?;
Ok(())
}
pub fn find_symbol_by_id(&self, id: SymbolId) -> StorageResult<Option<crate::Symbol>> {
let searcher = self.reader.searcher();
let query = TermQuery::new(
Term::from_field_u64(self.schema.symbol_id, id.0 as u64),
IndexRecordOption::Basic,
);
let top_docs = searcher.search(&query, &TopDocs::with_limit(1))?;
if let Some((_score, doc_address)) = top_docs.first() {
let doc = searcher.doc::<Document>(*doc_address)?;
Ok(Some(self.document_to_symbol(&doc)?))
} else {
Ok(None)
}
}
pub fn find_symbol_by_id_with_language(
&self,
id: SymbolId,
language: &str,
) -> StorageResult<Option<crate::Symbol>> {
let searcher = self.reader.searcher();
let query = BooleanQuery::from(vec![
(
Occur::Must,
Box::new(TermQuery::new(
Term::from_field_u64(self.schema.symbol_id, id.0 as u64),
IndexRecordOption::Basic,
)) as Box<dyn Query>,
),
(
Occur::Must,
Box::new(TermQuery::new(
Term::from_field_text(self.schema.language, language),
IndexRecordOption::Basic,
)) as Box<dyn Query>,
),
]);
let top_docs = searcher.search(&query, &TopDocs::with_limit(1))?;
if let Some((_score, doc_address)) = top_docs.first() {
let doc = searcher.doc::<Document>(*doc_address)?;
Ok(Some(self.document_to_symbol(&doc)?))
} else {
Ok(None)
}
}
pub fn find_symbols_by_name(
&self,
name: &str,
language_filter: Option<&str>,
) -> StorageResult<Vec<crate::Symbol>> {
let searcher = self.reader.searcher();
let name_query = Box::new(TermQuery::new(
Term::from_field_text(self.schema.name, name),
IndexRecordOption::Basic,
)) as Box<dyn Query>;
let mut query_clauses = vec![
(Occur::Must, name_query),
(
Occur::Must,
Box::new(TermQuery::new(
Term::from_field_text(self.schema.doc_type, "symbol"),
IndexRecordOption::Basic,
)) as Box<dyn Query>,
),
];
if let Some(lang) = language_filter {
query_clauses.push((
Occur::Must,
Box::new(TermQuery::new(
Term::from_field_text(self.schema.language, lang),
IndexRecordOption::Basic,
)),
));
}
let final_query = BooleanQuery::new(query_clauses);
let top_docs = searcher.search(&final_query, &TopDocs::with_limit(100))?;
let mut symbols = Vec::new();
for (_score, doc_address) in top_docs {
let doc = searcher.doc::<Document>(doc_address)?;
symbols.push(self.document_to_symbol(&doc)?);
}
Ok(symbols)
}
pub fn find_symbol_by_name_and_range(
&self,
name: &str,
file_id: FileId,
range: &crate::Range,
) -> StorageResult<Option<crate::Symbol>> {
let searcher = self.reader.searcher();
let query = BooleanQuery::from(vec![
(
Occur::Must,
Box::new(TermQuery::new(
Term::from_field_text(self.schema.doc_type, "symbol"),
IndexRecordOption::Basic,
)) as Box<dyn Query>,
),
(
Occur::Must,
Box::new(TermQuery::new(
Term::from_field_text(self.schema.name, name),
IndexRecordOption::Basic,
)) as Box<dyn Query>,
),
(
Occur::Must,
Box::new(TermQuery::new(
Term::from_field_u64(self.schema.file_id, file_id.0 as u64),
IndexRecordOption::Basic,
)) as Box<dyn Query>,
),
(
Occur::Must,
Box::new(TermQuery::new(
Term::from_field_u64(self.schema.line_number, range.start_line as u64),
IndexRecordOption::Basic,
)) as Box<dyn Query>,
),
]);
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
for (_score, doc_address) in top_docs {
let doc = searcher.doc::<Document>(doc_address)?;
let symbol = self.document_to_symbol(&doc)?;
if symbol.range.end_line == range.end_line {
return Ok(Some(symbol));
}
}
Ok(None)
}
pub fn find_symbols_by_file(&self, file_id: FileId) -> StorageResult<Vec<crate::Symbol>> {
let searcher = self.reader.searcher();
let query = BooleanQuery::from(vec![
(
Occur::Must,
Box::new(TermQuery::new(
Term::from_field_text(self.schema.doc_type, "symbol"),
IndexRecordOption::Basic,
)) as Box<dyn Query>,
),
(
Occur::Must,
Box::new(TermQuery::new(
Term::from_field_u64(self.schema.file_id, file_id.0 as u64),
IndexRecordOption::Basic,
)) as Box<dyn Query>,
),
]);
let top_docs = searcher.search(&query, &TopDocs::with_limit(1000))?;
let mut symbols = Vec::new();
for (_score, doc_address) in top_docs {
let doc = searcher.doc::<Document>(doc_address)?;
symbols.push(self.document_to_symbol(&doc)?);
}
Ok(symbols)
}
pub fn find_symbols_by_module(&self, module_path: &str) -> StorageResult<Vec<crate::Symbol>> {
let searcher = self.reader.searcher();
let query = BooleanQuery::from(vec![
(
Occur::Must,
Box::new(TermQuery::new(
Term::from_field_text(self.schema.doc_type, "symbol"),
IndexRecordOption::Basic,
)) as Box<dyn Query>,
),
(
Occur::Must,
Box::new(TermQuery::new(
Term::from_field_text(self.schema.module_path, module_path),
IndexRecordOption::Basic,
)) as Box<dyn Query>,
),
]);
let top_docs = searcher.search(&query, &TopDocs::with_limit(1000))?;
let mut symbols = Vec::new();
for (_score, doc_address) in top_docs {
let doc = searcher.doc::<Document>(doc_address)?;
symbols.push(self.document_to_symbol(&doc)?);
}
Ok(symbols)
}
pub fn get_all_symbols(&self, limit: usize) -> StorageResult<Vec<crate::Symbol>> {
let searcher = self.reader.searcher();
let query = BooleanQuery::from(vec![(
Occur::Must,
Box::new(TermQuery::new(
Term::from_field_text(self.schema.doc_type, "symbol"),
IndexRecordOption::Basic,
)) as Box<dyn Query>,
)]);
let top_docs = searcher.search(&query, &TopDocs::with_limit(limit))?;
let mut symbols = Vec::new();
for (_score, doc_address) in top_docs {
let doc = searcher.doc::<Document>(doc_address)?;
symbols.push(self.document_to_symbol(&doc)?);
}
Ok(symbols)
}
fn document_to_symbol(&self, doc: &Document) -> StorageResult<crate::Symbol> {
use crate::{Range, Symbol, SymbolKind, Visibility};
let symbol_id = doc
.get_first(self.schema.symbol_id)
.and_then(|v| v.as_u64())
.ok_or(StorageError::InvalidFieldValue {
field: "symbol_id".to_string(),
reason: "missing from document".to_string(),
})?;
let name = doc
.get_first(self.schema.name)
.and_then(|v| v.as_str())
.ok_or(StorageError::InvalidFieldValue {
field: "name".to_string(),
reason: "missing from document".to_string(),
})?
.to_string();
let kind_str = doc
.get_first(self.schema.kind)
.and_then(|v| v.as_str())
.ok_or(StorageError::InvalidFieldValue {
field: "kind".to_string(),
reason: "missing from document".to_string(),
})?;
let kind = SymbolKind::from_str_with_default(kind_str);
let file_id = doc
.get_first(self.schema.file_id)
.and_then(|v| v.as_u64())
.ok_or(StorageError::InvalidFieldValue {
field: "file_id".to_string(),
reason: "missing from document".to_string(),
})?;
let start_line = doc
.get_first(self.schema.line_number)
.and_then(|v| v.as_u64())
.unwrap_or(0) as u32;
let start_col = doc
.get_first(self.schema.column)
.and_then(|v| v.as_u64())
.unwrap_or(0) as u16;
let end_line = doc
.get_first(self.schema.end_line)
.and_then(|v| v.as_u64())
.unwrap_or(start_line as u64) as u32;
let end_col = doc
.get_first(self.schema.end_column)
.and_then(|v| v.as_u64())
.unwrap_or(0) as u16;
let signature = doc
.get_first(self.schema.signature)
.and_then(|v| v.as_str())
.map(|s| s.to_string());
let doc_comment = doc
.get_first(self.schema.doc_comment)
.and_then(|v| v.as_str())
.map(|s| s.to_string());
let module_path = doc
.get_first(self.schema.module_path)
.and_then(|v| v.as_str())
.map(|s| s.to_string());
let visibility = doc
.get_first(self.schema.visibility)
.and_then(|v| v.as_u64())
.map(|v| match v {
0 => Visibility::Public,
1 => Visibility::Crate,
2 => Visibility::Module,
3 => Visibility::Private,
_ => Visibility::Private,
})
.unwrap_or(Visibility::Private);
let scope_context = doc
.get_first(self.schema.scope_context)
.and_then(|v| v.as_str())
.and_then(|s| {
match s {
"None" => None,
"Module" => Some(crate::ScopeContext::Module),
"Global" => Some(crate::ScopeContext::Global),
"Package" => Some(crate::ScopeContext::Package),
"Parameter" => Some(crate::ScopeContext::Parameter),
s if s.starts_with("ClassMember") => {
let class_name = if s.contains("class_name: Some(") {
s.split("class_name: Some(\"")
.nth(1)
.and_then(|rest| rest.split("\")").next())
.map(|name| name.to_string().into())
} else {
None
};
Some(crate::ScopeContext::ClassMember { class_name })
}
s if s.starts_with("Local") => {
let hoisted = s.contains("hoisted: true") || s.contains("hoisted:true");
let parent_name = if s.contains("parent_name: Some(") {
let start = s.find("parent_name: Some(\"").map(|i| i + 19)?;
let end = s[start..].find('"').map(|i| start + i)?;
Some(s[start..end].to_string().into())
} else {
None
};
let parent_kind = if s.contains("parent_kind: Some(") {
let start = s.find("parent_kind: Some(").map(|i| i + 18)?;
let end = s[start..].find(')').map(|i| start + i)?;
let kind_str = &s[start..end];
match kind_str {
"Function" => Some(crate::SymbolKind::Function),
"Class" => Some(crate::SymbolKind::Class),
"Method" => Some(crate::SymbolKind::Method),
_ => None,
}
} else {
None
};
Some(crate::ScopeContext::Local {
hoisted,
parent_name,
parent_kind,
})
}
_ => None,
}
});
Ok(Symbol {
id: SymbolId(symbol_id as u32),
name: name.into(),
kind,
file_id: FileId(file_id as u32),
range: Range {
start_line,
start_column: start_col,
end_line,
end_column: end_col,
},
file_path: doc
.get_first(self.schema.file_path)
.and_then(|v| v.as_str())
.unwrap_or("<unknown>")
.into(),
signature: signature.map(|s| s.into()),
doc_comment: doc_comment.map(|s| s.into()),
module_path: module_path.map(|s| s.into()),
visibility,
scope_context,
language_id: {
doc.get_first(self.schema.language)
.and_then(|v| v.as_str())
.filter(|s| !s.is_empty())
.and_then(|lang_str| {
crate::parsing::get_registry()
.lock()
.ok()
.and_then(|registry| registry.find_language_id(lang_str))
})
},
})
}
pub fn get_file_info(&self, path: &str) -> StorageResult<Option<(FileId, String, u64)>> {
let searcher = self.reader.searcher();
let query = BooleanQuery::from(vec![
(
Occur::Must,
Box::new(TermQuery::new(
Term::from_field_text(self.schema.doc_type, "file_info"),
IndexRecordOption::Basic,
)) as Box<dyn Query>,
),
(
Occur::Must,
Box::new(TermQuery::new(
Term::from_field_text(self.schema.file_path, path),
IndexRecordOption::Basic,
)) as Box<dyn Query>,
),
]);
let top_docs = searcher.search(&query, &TopDocs::with_limit(1))?;
if let Some((_score, doc_address)) = top_docs.first() {
let doc = searcher.doc::<Document>(*doc_address)?;
let file_id = doc
.get_first(self.schema.file_id)
.and_then(|v| v.as_u64())
.and_then(|id| FileId::new(id as u32))
.ok_or(StorageError::InvalidFieldValue {
field: "file_id".to_string(),
reason: "not a valid u32".to_string(),
})?;
let hash = doc
.get_first(self.schema.file_hash)
.and_then(|v| v.as_str())
.ok_or(StorageError::InvalidFieldValue {
field: "file_hash".to_string(),
reason: "missing from document".to_string(),
})?
.to_string();
let mtime = doc
.get_first(self.schema.file_mtime)
.and_then(|v| v.as_u64())
.unwrap_or(0);
Ok(Some((file_id, hash, mtime)))
} else {
Ok(None)
}
}
pub fn get_next_file_id(&self) -> StorageResult<u32> {
if let Ok(mut pending_guard) = self.pending_file_counter.lock() {
if let Some(ref mut counter) = *pending_guard {
let next_id = *counter;
*counter += 1;
return Ok(next_id);
}
}
let current = self.query_metadata(MetadataKey::FileCounter)?.unwrap_or(0) as u32;
Ok(current + 1)
}
pub fn get_next_symbol_id(&self) -> StorageResult<u32> {
if let Ok(mut pending_guard) = self.pending_symbol_counter.lock() {
if let Some(ref mut counter) = *pending_guard {
let next_id = *counter;
*counter += 1;
return Ok(next_id);
}
}
let current = self
.query_metadata(MetadataKey::SymbolCounter)?
.unwrap_or(0) as u32;
Ok(current + 1)
}
pub fn update_pending_symbol_counter(&self, new_value: u32) -> StorageResult<()> {
if let Ok(mut pending_guard) = self.pending_symbol_counter.lock() {
if let Some(ref mut counter) = *pending_guard {
*counter = new_value;
}
}
Ok(())
}
pub fn delete_symbol(&self, id: SymbolId) -> StorageResult<()> {
let writer_lock = match self.writer.read() {
Ok(lock) => lock,
Err(poisoned) => {
eprintln!("Warning: Recovering from poisoned writer rwlock in delete_symbol");
poisoned.into_inner()
}
};
let writer = writer_lock.as_ref().ok_or(StorageError::NoActiveBatch)?;
let term = Term::from_field_u64(self.schema.symbol_id, id.0 as u64);
writer.delete_term(term);
Ok(())
}
pub fn delete_relationships_for_symbol(&self, id: SymbolId) -> StorageResult<()> {
let writer_lock = match self.writer.read() {
Ok(lock) => lock,
Err(poisoned) => {
eprintln!(
"Warning: Recovering from poisoned writer rwlock in delete_relationships"
);
poisoned.into_inner()
}
};
let writer = writer_lock.as_ref().ok_or(StorageError::NoActiveBatch)?;
let from_term = Term::from_field_u64(self.schema.from_symbol_id, id.0 as u64);
writer.delete_term(from_term);
let to_term = Term::from_field_u64(self.schema.to_symbol_id, id.0 as u64);
writer.delete_term(to_term);
Ok(())
}
pub fn count_symbols(&self) -> StorageResult<usize> {
let searcher = self.reader.searcher();
let query = TermQuery::new(
Term::from_field_text(self.schema.doc_type, "symbol"),
IndexRecordOption::Basic,
);
let count = searcher.search(&query, &tantivy::collector::Count)?;
Ok(count)
}
pub fn count_relationships(&self) -> StorageResult<usize> {
let searcher = self.reader.searcher();
let query = TermQuery::new(
Term::from_field_text(self.schema.doc_type, "relationship"),
IndexRecordOption::Basic,
);
let count = searcher.search(&query, &tantivy::collector::Count)?;
Ok(count)
}
pub fn count_files(&self) -> StorageResult<usize> {
let searcher = self.reader.searcher();
let query = TermQuery::new(
Term::from_field_text(self.schema.doc_type, "file_info"),
IndexRecordOption::Basic,
);
let count = searcher.search(&query, &tantivy::collector::Count)?;
Ok(count)
}
pub fn get_all_indexed_paths(&self) -> StorageResult<Vec<PathBuf>> {
let searcher = self.reader.searcher();
let query = TermQuery::new(
Term::from_field_text(self.schema.doc_type, "file_info"),
IndexRecordOption::Basic,
);
let collector = TopDocs::with_limit(100_000);
let top_docs = searcher.search(&query, &collector)?;
let mut paths = Vec::new();
for (_score, doc_address) in top_docs {
let doc: Document = searcher.doc(doc_address)?;
if let Some(path_value) = doc.get_first(self.schema.file_path) {
if let Some(path_str) = path_value.as_str() {
paths.push(PathBuf::from(path_str));
}
}
}
Ok(paths)
}
pub fn get_relationships_from(
&self,
from_id: SymbolId,
kind: RelationKind,
) -> StorageResult<Vec<(SymbolId, SymbolId, Relationship)>> {
let searcher = self.reader.searcher();
let query = BooleanQuery::from(vec![
(
Occur::Must,
Box::new(TermQuery::new(
Term::from_field_text(self.schema.doc_type, "relationship"),
IndexRecordOption::Basic,
)) as Box<dyn Query>,
),
(
Occur::Must,
Box::new(TermQuery::new(
Term::from_field_u64(self.schema.from_symbol_id, from_id.0 as u64),
IndexRecordOption::Basic,
)) as Box<dyn Query>,
),
(
Occur::Must,
Box::new(TermQuery::new(
Term::from_field_text(self.schema.relation_kind, &format!("{kind:?}")),
IndexRecordOption::Basic,
)) as Box<dyn Query>,
),
]);
let top_docs = searcher.search(&query, &TopDocs::with_limit(1000))?;
let mut relationships = Vec::new();
for (_score, doc_address) in top_docs {
let doc = searcher.doc::<Document>(doc_address)?;
let to_id = doc
.get_first(self.schema.to_symbol_id)
.and_then(|v| v.as_u64())
.and_then(|id| SymbolId::new(id as u32))
.ok_or(StorageError::InvalidFieldValue {
field: "to_symbol_id".to_string(),
reason: "not a valid u32".to_string(),
})?;
let mut relationship = Relationship::new(kind);
if let Some(line) = doc
.get_first(self.schema.relation_line)
.and_then(|v| v.as_u64())
{
if let Some(column) = doc
.get_first(self.schema.relation_column)
.and_then(|v| v.as_u64())
{
let mut metadata =
RelationshipMetadata::new().at_position(line as u32, column as u16);
if let Some(context) = doc
.get_first(self.schema.relation_context)
.and_then(|v| v.as_str())
{
metadata = metadata.with_context(context);
}
relationship = relationship.with_metadata(metadata);
}
}
relationships.push((from_id, to_id, relationship));
}
Ok(relationships)
}
pub fn get_relationships_to(
&self,
to_id: SymbolId,
kind: RelationKind,
) -> StorageResult<Vec<(SymbolId, SymbolId, Relationship)>> {
let searcher = self.reader.searcher();
let query = BooleanQuery::from(vec![
(
Occur::Must,
Box::new(TermQuery::new(
Term::from_field_text(self.schema.doc_type, "relationship"),
IndexRecordOption::Basic,
)) as Box<dyn Query>,
),
(
Occur::Must,
Box::new(TermQuery::new(
Term::from_field_u64(self.schema.to_symbol_id, to_id.0 as u64),
IndexRecordOption::Basic,
)) as Box<dyn Query>,
),
(
Occur::Must,
Box::new(TermQuery::new(
Term::from_field_text(self.schema.relation_kind, &format!("{kind:?}")),
IndexRecordOption::Basic,
)) as Box<dyn Query>,
),
]);
let top_docs = searcher.search(&query, &TopDocs::with_limit(1000))?;
let mut relationships = Vec::new();
for (_score, doc_address) in top_docs {
let doc = searcher.doc::<Document>(doc_address)?;
let from_id = doc
.get_first(self.schema.from_symbol_id)
.and_then(|v| v.as_u64())
.and_then(|id| SymbolId::new(id as u32))
.ok_or(StorageError::InvalidFieldValue {
field: "from_symbol_id".to_string(),
reason: "not a valid u32".to_string(),
})?;
let mut relationship = Relationship::new(kind);
if let Some(line) = doc
.get_first(self.schema.relation_line)
.and_then(|v| v.as_u64())
{
if let Some(column) = doc
.get_first(self.schema.relation_column)
.and_then(|v| v.as_u64())
{
let mut metadata =
RelationshipMetadata::new().at_position(line as u32, column as u16);
if let Some(context) = doc
.get_first(self.schema.relation_context)
.and_then(|v| v.as_str())
{
metadata = metadata.with_context(context);
}
relationship = relationship.with_metadata(metadata);
}
}
relationships.push((from_id, to_id, relationship));
}
Ok(relationships)
}
pub fn get_all_relationships_by_kind(
&self,
kind: RelationKind,
) -> StorageResult<Vec<(SymbolId, SymbolId, Relationship)>> {
let searcher = self.reader.searcher();
let query = BooleanQuery::from(vec![
(
Occur::Must,
Box::new(TermQuery::new(
Term::from_field_text(self.schema.doc_type, "relationship"),
IndexRecordOption::Basic,
)) as Box<dyn Query>,
),
(
Occur::Must,
Box::new(TermQuery::new(
Term::from_field_text(self.schema.relation_kind, &format!("{kind:?}")),
IndexRecordOption::Basic,
)) as Box<dyn Query>,
),
]);
let top_docs = searcher.search(&query, &TopDocs::with_limit(10000))?;
let mut relationships = Vec::new();
for (_score, doc_address) in top_docs {
let doc = searcher.doc::<Document>(doc_address)?;
let from_id = doc
.get_first(self.schema.from_symbol_id)
.and_then(|v| v.as_u64())
.and_then(|id| SymbolId::new(id as u32))
.ok_or(StorageError::InvalidFieldValue {
field: "from_symbol_id".to_string(),
reason: "not a valid u32".to_string(),
})?;
let to_id = doc
.get_first(self.schema.to_symbol_id)
.and_then(|v| v.as_u64())
.and_then(|id| SymbolId::new(id as u32))
.ok_or(StorageError::InvalidFieldValue {
field: "to_symbol_id".to_string(),
reason: "not a valid u32".to_string(),
})?;
relationships.push((from_id, to_id, Relationship::new(kind)));
}
Ok(relationships)
}
pub fn get_file_path(&self, file_id: FileId) -> StorageResult<Option<String>> {
let searcher = self.reader.searcher();
let query = BooleanQuery::from(vec![
(
Occur::Must,
Box::new(TermQuery::new(
Term::from_field_text(self.schema.doc_type, "file_info"),
IndexRecordOption::Basic,
)) as Box<dyn Query>,
),
(
Occur::Must,
Box::new(TermQuery::new(
Term::from_field_u64(self.schema.file_id, file_id.0 as u64),
IndexRecordOption::Basic,
)) as Box<dyn Query>,
),
]);
let top_docs = searcher.search(&query, &TopDocs::with_limit(1))?;
if let Some((_score, doc_address)) = top_docs.first() {
let doc = searcher.doc::<Document>(*doc_address)?;
let path = doc
.get_first(self.schema.file_path)
.and_then(|v| v.as_str())
.map(|s| s.to_string());
Ok(path)
} else {
Ok(None)
}
}
pub fn path(&self) -> &Path {
&self.index_path
}
pub(crate) fn store_relationship(
&self,
from: SymbolId,
to: SymbolId,
rel: &Relationship,
) -> StorageResult<()> {
let writer_lock = match self.writer.read() {
Ok(lock) => lock,
Err(poisoned) => {
eprintln!("Warning: Recovering from poisoned writer rwlock in store_relationship");
poisoned.into_inner()
}
};
let writer = writer_lock.as_ref().ok_or(StorageError::NoActiveBatch)?;
let mut doc = Document::new();
doc.add_text(self.schema.doc_type, "relationship");
doc.add_u64(self.schema.from_symbol_id, from.value() as u64);
doc.add_u64(self.schema.to_symbol_id, to.value() as u64);
doc.add_text(self.schema.relation_kind, format!("{:?}", rel.kind));
doc.add_f64(self.schema.relation_weight, rel.weight as f64);
if let Some(ref metadata) = rel.metadata {
if let Some(line) = metadata.line {
doc.add_u64(self.schema.relation_line, line as u64);
}
if let Some(column) = metadata.column {
doc.add_u64(self.schema.relation_column, column as u64);
}
if let Some(ref context) = metadata.context {
doc.add_text(self.schema.relation_context, context.as_ref());
}
}
writer.add_document(doc)?;
Ok(())
}
pub fn index_symbol(&self, symbol: &crate::Symbol, file_path: &str) -> StorageResult<()> {
self.add_document(
symbol.id,
&symbol.name,
symbol.kind,
symbol.file_id,
file_path,
symbol.range.start_line,
symbol.range.start_column,
symbol.range.end_line,
symbol.range.end_column,
symbol.doc_comment.as_ref().map(|s| s.as_ref()),
symbol.signature.as_ref().map(|s| s.as_ref()),
symbol
.module_path
.as_ref()
.map(|s| s.as_ref())
.unwrap_or(""),
None, symbol.visibility,
symbol.scope_context.clone(),
symbol.language_id.as_ref().map(|id| id.as_str()),
)
}
pub fn store_file_registration(
&self,
registration: &crate::indexing::pipeline::FileRegistration,
) -> StorageResult<()> {
let writer_lock = match self.writer.read() {
Ok(lock) => lock,
Err(poisoned) => {
eprintln!(
"Warning: Recovering from poisoned writer rwlock in store_file_registration"
);
poisoned.into_inner()
}
};
let writer = writer_lock.as_ref().ok_or(StorageError::NoActiveBatch)?;
let mut doc = Document::new();
doc.add_text(self.schema.doc_type, "file_info");
doc.add_u64(self.schema.file_id, registration.file_id.value() as u64);
doc.add_text(
self.schema.file_path,
registration.path.to_string_lossy().as_ref(),
);
doc.add_text(self.schema.file_hash, ®istration.content_hash);
doc.add_u64(self.schema.file_timestamp, registration.timestamp);
doc.add_u64(self.schema.file_mtime, registration.mtime);
doc.add_text(self.schema.language, registration.language_id.as_str());
writer.add_document(doc)?;
Ok(())
}
pub fn store_import(&self, import: &crate::parsing::Import) -> StorageResult<()> {
let writer_lock = match self.writer.read() {
Ok(lock) => lock,
Err(poisoned) => {
eprintln!("Warning: Recovering from poisoned writer rwlock in store_import");
poisoned.into_inner()
}
};
let writer = writer_lock.as_ref().ok_or(StorageError::NoActiveBatch)?;
let mut doc = Document::new();
doc.add_text(self.schema.doc_type, "import");
doc.add_u64(self.schema.import_file_id, import.file_id.value() as u64);
doc.add_text(self.schema.import_path, &import.path);
if let Some(alias) = &import.alias {
doc.add_text(self.schema.import_alias, alias);
}
doc.add_u64(
self.schema.import_is_glob,
if import.is_glob { 1 } else { 0 },
);
doc.add_u64(
self.schema.import_is_type_only,
if import.is_type_only { 1 } else { 0 },
);
writer.add_document(doc)?;
Ok(())
}
pub fn get_imports_for_file(
&self,
file_id: FileId,
) -> StorageResult<Vec<crate::parsing::Import>> {
let query = BooleanQuery::new(vec![
(
Occur::Must,
Box::new(TermQuery::new(
Term::from_field_text(self.schema.doc_type, "import"),
IndexRecordOption::Basic,
)),
),
(
Occur::Must,
Box::new(TermQuery::new(
Term::from_field_u64(self.schema.import_file_id, file_id.value() as u64),
IndexRecordOption::Basic,
)),
),
]);
let searcher = self.reader.searcher();
let top_docs = searcher
.search(&query, &TopDocs::with_limit(1000))
.map_err(|e| StorageError::General(format!("Import search failed: {e}")))?;
let mut imports = Vec::new();
for (_score, doc_address) in top_docs {
let doc: Document = searcher.doc(doc_address).map_err(|e| {
StorageError::General(format!("Failed to retrieve import document: {e}"))
})?;
let import_path = doc
.get_first(self.schema.import_path)
.and_then(|v| v.as_str())
.ok_or_else(|| StorageError::General("Missing import_path".to_string()))?
.to_string();
let alias = doc
.get_first(self.schema.import_alias)
.and_then(|v| v.as_str())
.map(|s| s.to_string());
let is_glob = doc
.get_first(self.schema.import_is_glob)
.and_then(|v| v.as_u64())
.map(|v| v == 1)
.unwrap_or(false);
let is_type_only = doc
.get_first(self.schema.import_is_type_only)
.and_then(|v| v.as_u64())
.map(|v| v == 1)
.unwrap_or(false);
imports.push(crate::parsing::Import {
path: import_path,
alias,
file_id,
is_glob,
is_type_only,
});
}
Ok(imports)
}
pub fn delete_imports_for_file(&self, file_id: FileId) -> StorageResult<()> {
let writer_lock = match self.writer.read() {
Ok(lock) => lock,
Err(poisoned) => {
eprintln!(
"Warning: Recovering from poisoned writer rwlock in delete_imports_for_file"
);
poisoned.into_inner()
}
};
let writer = writer_lock.as_ref().ok_or(StorageError::NoActiveBatch)?;
let query = BooleanQuery::new(vec![
(
Occur::Must,
Box::new(TermQuery::new(
Term::from_field_text(self.schema.doc_type, "import"),
IndexRecordOption::Basic,
)),
),
(
Occur::Must,
Box::new(TermQuery::new(
Term::from_field_u64(self.schema.import_file_id, file_id.value() as u64),
IndexRecordOption::Basic,
)),
),
]);
writer.delete_query(Box::new(query))?;
Ok(())
}
pub(crate) fn store_metadata(&self, key: MetadataKey, value: u64) -> StorageResult<()> {
let writer_lock = match self.writer.read() {
Ok(lock) => lock,
Err(poisoned) => {
eprintln!("Warning: Recovering from poisoned writer rwlock in store_metadata");
poisoned.into_inner()
}
};
let writer = writer_lock.as_ref().ok_or(StorageError::NoActiveBatch)?;
let key_str = key.as_str();
let term = Term::from_field_text(self.schema.meta_key, key_str);
writer.delete_term(term);
let mut doc = Document::new();
doc.add_text(self.schema.doc_type, "metadata");
doc.add_text(self.schema.meta_key, key_str);
doc.add_u64(self.schema.meta_value, value);
writer.add_document(doc)?;
Ok(())
}
#[allow(dead_code)]
pub(crate) fn query_relationships(
&self,
) -> StorageResult<Vec<(SymbolId, SymbolId, crate::Relationship)>> {
let searcher = self.reader.searcher();
let query = TermQuery::new(
Term::from_field_text(self.schema.doc_type, "relationship"),
IndexRecordOption::Basic,
);
let collector = TopDocs::with_limit(1_000_000); let top_docs = searcher.search(&query, &collector)?;
let mut relationships = Vec::new();
for (_score, doc_address) in top_docs {
let doc: Document = searcher.doc(doc_address)?;
let from_id = doc
.get_first(self.schema.from_symbol_id)
.and_then(|v| v.as_u64())
.and_then(|id| SymbolId::new(id as u32))
.ok_or(StorageError::InvalidFieldValue {
field: "from_symbol_id".to_string(),
reason: "not a valid u32".to_string(),
})?;
let to_id = doc
.get_first(self.schema.to_symbol_id)
.and_then(|v| v.as_u64())
.and_then(|id| SymbolId::new(id as u32))
.ok_or(StorageError::InvalidFieldValue {
field: "to_symbol_id".to_string(),
reason: "not a valid u32".to_string(),
})?;
let kind_str = doc
.get_first(self.schema.relation_kind)
.and_then(|v| v.as_str())
.ok_or(StorageError::InvalidFieldValue {
field: "relation_kind".to_string(),
reason: "missing from document".to_string(),
})?;
let weight = doc
.get_first(self.schema.relation_weight)
.and_then(|v| v.as_f64())
.unwrap_or(1.0) as f32;
let kind = match kind_str {
"Calls" => RelationKind::Calls,
"CalledBy" => RelationKind::CalledBy,
"Extends" => RelationKind::Extends,
"ExtendedBy" => RelationKind::ExtendedBy,
"Implements" => RelationKind::Implements,
"ImplementedBy" => RelationKind::ImplementedBy,
"Uses" => RelationKind::Uses,
"UsedBy" => RelationKind::UsedBy,
"Defines" => RelationKind::Defines,
"DefinedIn" => RelationKind::DefinedIn,
"References" => RelationKind::References,
"ReferencedBy" => RelationKind::ReferencedBy,
_ => continue, };
let mut relationship = Relationship::new(kind).with_weight(weight);
let has_metadata = doc.get_first(self.schema.relation_line).is_some()
|| doc.get_first(self.schema.relation_column).is_some()
|| doc.get_first(self.schema.relation_context).is_some();
if has_metadata {
let mut metadata = RelationshipMetadata::new();
if let Some(line) = doc
.get_first(self.schema.relation_line)
.and_then(|v| v.as_u64())
{
metadata.line = Some(line as u32);
}
if let Some(column) = doc
.get_first(self.schema.relation_column)
.and_then(|v| v.as_u64())
{
metadata.column = Some(column as u16);
}
if let Some(context) = doc
.get_first(self.schema.relation_context)
.and_then(|v| v.as_str())
{
metadata.context = Some(context.into());
}
relationship = relationship.with_metadata(metadata);
}
relationships.push((from_id, to_id, relationship));
}
Ok(relationships)
}
#[allow(dead_code)]
pub(crate) fn query_file_info(&self) -> StorageResult<Vec<(FileId, String, String, u64)>> {
let searcher = self.reader.searcher();
let query = TermQuery::new(
Term::from_field_text(self.schema.doc_type, "file_info"),
IndexRecordOption::Basic,
);
let collector = TopDocs::with_limit(100_000); let top_docs = searcher.search(&query, &collector)?;
let mut files = Vec::new();
for (_score, doc_address) in top_docs {
let doc: Document = searcher.doc(doc_address)?;
let file_id = doc
.get_first(self.schema.file_id)
.and_then(|v| v.as_u64())
.and_then(|id| FileId::new(id as u32))
.ok_or(StorageError::InvalidFieldValue {
field: "file_id".to_string(),
reason: "not a valid u32".to_string(),
})?;
let path = doc
.get_first(self.schema.file_path)
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
let hash = doc
.get_first(self.schema.file_hash)
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
let timestamp = doc
.get_first(self.schema.file_timestamp)
.and_then(|v| v.as_u64())
.unwrap_or(0);
files.push((file_id, path, hash, timestamp));
}
Ok(files)
}
#[allow(dead_code)]
pub(crate) fn count_symbol_documents(&self) -> StorageResult<u64> {
let searcher = self.reader.searcher();
let query = TermQuery::new(
Term::from_field_text(self.schema.doc_type, "symbol"),
IndexRecordOption::Basic,
);
let count = searcher.search(&query, &tantivy::collector::Count)?;
Ok(count as u64)
}
#[deprecated(note = "Use Tantivy queries directly instead of rebuilding IndexData")]
#[allow(dead_code)]
pub(crate) fn rebuild_index_data(&self) -> StorageResult<()> {
Ok(())
}
pub(crate) fn query_metadata(&self, key: MetadataKey) -> StorageResult<Option<u64>> {
let searcher = self.reader.searcher();
let doc_type_query = TermQuery::new(
Term::from_field_text(self.schema.doc_type, "metadata"),
IndexRecordOption::Basic,
);
let key_query = TermQuery::new(
Term::from_field_text(self.schema.meta_key, key.as_str()),
IndexRecordOption::Basic,
);
let query = BooleanQuery::new(vec![
(Occur::Must, Box::new(doc_type_query) as Box<dyn Query>),
(Occur::Must, Box::new(key_query) as Box<dyn Query>),
]);
let top_docs = searcher.search(&query, &TopDocs::with_limit(1))?;
if let Some((_score, doc_address)) = top_docs.into_iter().next() {
let doc: Document = searcher.doc(doc_address)?;
let value = doc
.get_first(self.schema.meta_value)
.and_then(|v| v.as_u64());
Ok(value)
} else {
Ok(None)
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::indexing::pipeline::FileRegistration;
use crate::parsing::registry::LanguageId;
use std::path::PathBuf;
use tempfile::TempDir;
#[test]
fn test_document_index_creation() {
let temp_dir = TempDir::new().unwrap();
let settings = crate::config::Settings::default();
let index = DocumentIndex::new(temp_dir.path(), &settings).unwrap();
assert_eq!(index.document_count().unwrap(), 0);
assert!(!index.has_vector_support());
assert!(index.vector_storage_path().is_none());
assert!(index.vector_engine().is_none());
}
#[test]
fn test_schema_has_language_field() {
let (schema, _) = IndexSchema::build();
let language_field = schema.get_field("language");
assert!(
language_field.is_ok(),
"Schema should have 'language' field"
);
let field = language_field.unwrap();
let field_entry = schema.get_field_entry(field);
assert!(field_entry.is_indexed(), "Language field should be indexed");
assert!(field_entry.is_stored(), "Language field should be stored");
}
#[test]
fn test_add_and_search_document() {
let temp_dir = TempDir::new().unwrap();
let settings = crate::config::Settings::default();
let index = DocumentIndex::new(temp_dir.path(), &settings).unwrap();
index.start_batch().unwrap();
let symbol_id = SymbolId::new(1).unwrap();
let file_id = FileId::new(1).unwrap();
index
.add_document(
symbol_id,
"parse_json",
SymbolKind::Function,
file_id,
"src/parser.rs",
42,
5,
50, 0, Some("Parse JSON string into a Value"),
Some("fn parse_json(input: &str) -> StorageResult<Value, Error>"),
"crate::parser",
None,
crate::Visibility::Public,
Some(crate::ScopeContext::Module),
None, )
.unwrap();
index.commit_batch().unwrap();
let results = index.search("json", 10, None, None, None).unwrap();
assert_eq!(results.len(), 1);
let result = &results[0];
assert_eq!(result.name, "parse_json");
assert_eq!(result.line, 42);
assert_eq!(result.file_path, "src/parser.rs");
}
#[test]
fn test_store_and_retrieve_symbol_with_language() {
use crate::parsing::registry::LanguageId;
let temp_dir = TempDir::new().unwrap();
let settings = crate::config::Settings::default();
let index = DocumentIndex::new(temp_dir.path(), &settings).unwrap();
index.start_batch().unwrap();
let symbol = crate::Symbol::new(
SymbolId::new(1).unwrap(),
"test_func",
SymbolKind::Function,
FileId::new(1).unwrap(),
crate::Range::new(10, 0, 15, 0),
)
.with_language_id(LanguageId::new("rust"))
.with_signature("fn test_func() -> Result<()>")
.with_doc("Test function");
index.index_symbol(&symbol, "src/test.rs").unwrap();
index.commit_batch().unwrap();
let retrieved = index.find_symbol_by_id(symbol.id).unwrap();
assert!(retrieved.is_some());
let retrieved_symbol = retrieved.unwrap();
assert_eq!(retrieved_symbol.language_id, Some(LanguageId::new("rust")));
assert_eq!(retrieved_symbol.name.as_ref(), "test_func");
assert_eq!(
retrieved_symbol.signature.as_deref(),
Some("fn test_func() -> Result<()>")
);
}
#[test]
fn test_fuzzy_search() {
let temp_dir = TempDir::new().unwrap();
let settings = crate::config::Settings::default();
let index = DocumentIndex::new(temp_dir.path(), &settings).unwrap();
index.start_batch().unwrap();
let symbol_id = SymbolId::new(1).unwrap();
let file_id = FileId::new(1).unwrap();
index
.add_document(
symbol_id,
"handle_request",
SymbolKind::Function,
file_id,
"src/server.rs",
100,
0,
120, 0, Some("Handle incoming HTTP request"),
None,
"crate::server",
None,
crate::Visibility::Private,
Some(crate::ScopeContext::Module),
None, )
.unwrap();
index.commit_batch().unwrap();
let results = index.search("handle", 10, None, None, None).unwrap();
assert!(!results.is_empty(), "Should find exact match");
let results = index.search("handl", 10, None, None, None).unwrap();
assert!(!results.is_empty(), "Should find with fuzzy search");
}
#[test]
fn test_relationship_storage() {
let temp_dir = TempDir::new().unwrap();
let settings = crate::config::Settings::default();
let index = DocumentIndex::new(temp_dir.path(), &settings).unwrap();
index.start_batch().unwrap();
let from_id = SymbolId::new(1).unwrap();
let to_id = SymbolId::new(2).unwrap();
let rel = crate::Relationship::new(crate::RelationKind::Calls).with_weight(0.8);
index.store_relationship(from_id, to_id, &rel).unwrap();
index.commit_batch().unwrap();
let relationships = index.query_relationships().unwrap();
assert_eq!(relationships.len(), 1);
let (f, t, r) = &relationships[0];
assert_eq!(*f, from_id);
assert_eq!(*t, to_id);
assert_eq!(r.kind, crate::RelationKind::Calls);
assert_eq!(r.weight, 0.8);
}
#[test]
fn test_file_info_storage() {
let temp_dir = TempDir::new().unwrap();
let settings = crate::config::Settings::default();
let index = DocumentIndex::new(temp_dir.path(), &settings).unwrap();
index.start_batch().unwrap();
let file_id = crate::FileId::new(1).unwrap();
let registration = FileRegistration {
path: PathBuf::from("src/main.rs"),
file_id,
content_hash: "abc123".to_string(),
language_id: LanguageId::new("rust"),
timestamp: 1234567890,
mtime: 0,
};
index.store_file_registration(®istration).unwrap();
index.commit_batch().unwrap();
let files = index.query_file_info().unwrap();
assert_eq!(files.len(), 1);
let (id, path, hash, timestamp) = &files[0];
assert_eq!(*id, file_id);
assert_eq!(path, "src/main.rs");
assert_eq!(hash, "abc123");
assert_eq!(*timestamp, 1234567890);
}
#[test]
fn test_get_all_indexed_paths() {
println!("=== TEST: get_all_indexed_paths() ===");
let temp_dir = TempDir::new().unwrap();
let settings = crate::config::Settings::default();
let index = DocumentIndex::new(temp_dir.path(), &settings).unwrap();
println!("Step 1: Testing empty index...");
let paths = index.get_all_indexed_paths().unwrap();
assert_eq!(paths.len(), 0);
println!(" ✓ Empty index returns 0 paths");
println!("\nStep 2: Adding file info documents...");
index.start_batch().unwrap();
let test_files = vec![
(1, "src/main.rs", "hash1"),
(2, "src/lib.rs", "hash2"),
(3, "tests/integration.rs", "hash3"),
(4, "src/module/helper.rs", "hash4"),
(5, "benches/benchmark.rs", "hash5"),
];
for (id, path, hash) in &test_files {
let file_id = crate::FileId::new(*id).unwrap();
let registration = FileRegistration {
path: PathBuf::from(*path),
file_id,
content_hash: hash.to_string(),
language_id: LanguageId::new("rust"),
timestamp: 1234567890,
mtime: 0,
};
index.store_file_registration(®istration).unwrap();
println!(" - Added: {path}");
}
index.commit_batch().unwrap();
println!(" ✓ Added {} file info documents", test_files.len());
println!("\nStep 3: Retrieving all indexed paths...");
let paths = index.get_all_indexed_paths().unwrap();
println!(" Retrieved {} paths:", paths.len());
for (i, path) in paths.iter().enumerate() {
println!(" [{}] {}", i + 1, path.display());
}
assert_eq!(paths.len(), test_files.len());
println!(" ✓ Correct number of paths returned");
let path_strings: Vec<String> = paths
.iter()
.map(|p| p.to_string_lossy().to_string())
.collect();
for (_, expected_path, _) in &test_files {
assert!(
path_strings.contains(&expected_path.to_string()),
"Missing path: {expected_path}"
);
}
println!(" ✓ All expected paths are present");
println!("\nStep 4: Adding a symbol document (should be ignored)...");
index.start_batch().unwrap();
let symbol_id = SymbolId::new(100).unwrap();
let file_id = FileId::new(1).unwrap();
index
.add_document(
symbol_id,
"test_function",
SymbolKind::Function,
file_id,
"src/main.rs",
42,
5,
50, 0, Some("Test function"),
Some("fn test_function()"),
"crate",
None,
crate::Visibility::Public,
Some(crate::ScopeContext::Module),
None, )
.unwrap();
index.commit_batch().unwrap();
println!(" - Added symbol document");
let paths_after_symbol = index.get_all_indexed_paths().unwrap();
assert_eq!(paths_after_symbol.len(), test_files.len());
println!(" ✓ Symbol documents correctly ignored");
println!("\n=== TEST PASSED: get_all_indexed_paths() works correctly ===");
}
#[test]
fn test_metadata_storage() {
let temp_dir = TempDir::new().unwrap();
let settings = crate::config::Settings::default();
let index = DocumentIndex::new(temp_dir.path(), &settings).unwrap();
index.start_batch().unwrap();
index.store_metadata(MetadataKey::FileCounter, 42).unwrap();
index
.store_metadata(MetadataKey::SymbolCounter, 100)
.unwrap();
index.commit_batch().unwrap();
assert_eq!(
index.query_metadata(MetadataKey::FileCounter).unwrap(),
Some(42)
);
assert_eq!(
index.query_metadata(MetadataKey::SymbolCounter).unwrap(),
Some(100)
);
}
#[test]
fn test_vector_metadata_creation() {
let metadata = VectorMetadata::new(1);
assert_eq!(metadata.vector_id, None);
assert_eq!(metadata.cluster_id, None);
assert_eq!(metadata.embedding_version, 1);
assert!(!metadata.has_vector());
let vector_id = VectorId::new(42).unwrap();
let cluster_id = ClusterId::new(5).unwrap();
let metadata_with_vector = VectorMetadata::with_vector(vector_id, cluster_id, 2);
assert_eq!(metadata_with_vector.vector_id, Some(vector_id));
assert_eq!(metadata_with_vector.cluster_id, Some(cluster_id));
assert_eq!(metadata_with_vector.embedding_version, 2);
assert!(metadata_with_vector.has_vector());
}
#[test]
fn test_vector_metadata_serialization() {
let metadata = VectorMetadata::new(1);
let json = metadata.to_json().unwrap();
let deserialized = VectorMetadata::from_json(&json).unwrap();
assert_eq!(metadata, deserialized);
let vector_id = VectorId::new(123).unwrap();
let cluster_id = ClusterId::new(7).unwrap();
let metadata_with_vector = VectorMetadata::with_vector(vector_id, cluster_id, 3);
let json = metadata_with_vector.to_json().unwrap();
let deserialized = VectorMetadata::from_json(&json).unwrap();
assert_eq!(metadata_with_vector, deserialized);
assert!(json.contains("\"vector_id\""));
assert!(json.contains("\"cluster_id\""));
assert!(json.contains("\"embedding_version\":3"));
}
#[test]
fn test_vector_metadata_deserialization_error() {
let invalid_json = "{ invalid json }";
let result = VectorMetadata::from_json(invalid_json);
assert!(result.is_err());
match result {
Err(StorageError::Serialization(msg)) => {
assert!(msg.contains("Failed to deserialize VectorMetadata"));
}
_ => panic!("Expected Serialization error"),
}
}
#[test]
fn test_vector_metadata_tantivy_roundtrip() {
use tantivy::schema::{STORED, SchemaBuilder, TEXT};
use tantivy::{Index, TantivyDocument, doc};
let mut schema_builder = SchemaBuilder::default();
let metadata_field = schema_builder.add_text_field("vector_metadata", TEXT | STORED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer(50_000_000).unwrap();
let vector_id = VectorId::new(999).unwrap();
let cluster_id = ClusterId::new(42).unwrap();
let metadata = VectorMetadata::with_vector(vector_id, cluster_id, 5);
let json = metadata.to_json().unwrap();
let doc = doc!(metadata_field => json.clone());
index_writer.add_document(doc).unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let doc: TantivyDocument = searcher.doc(tantivy::DocAddress::new(0, 0)).unwrap();
let stored_json = doc
.get_first(metadata_field)
.and_then(|v| v.as_str())
.unwrap();
let retrieved_metadata = VectorMetadata::from_json(stored_json).unwrap();
assert_eq!(metadata, retrieved_metadata);
assert_eq!(json, stored_json);
}
#[test]
fn test_document_index_with_vector_support() {
use crate::vector::{VectorDimension, VectorSearchEngine};
let temp_dir = TempDir::new().unwrap();
let vector_dir = temp_dir.path().join("vectors");
let vector_engine =
VectorSearchEngine::new(&vector_dir, VectorDimension::new(384).unwrap()).unwrap();
let vector_engine_arc = Arc::new(Mutex::new(vector_engine));
let settings = crate::config::Settings::default();
let index = DocumentIndex::new(temp_dir.path(), &settings)
.unwrap()
.with_vector_support(vector_engine_arc.clone(), &vector_dir);
assert!(index.has_vector_support());
assert_eq!(index.vector_storage_path(), Some(vector_dir.as_ref()));
assert!(index.vector_engine().is_some());
let engine_ref = index.vector_engine().unwrap();
assert!(Arc::ptr_eq(engine_ref, &vector_engine_arc));
}
#[test]
fn test_document_index_operations_with_and_without_vectors() {
let temp_dir = TempDir::new().unwrap();
let settings = crate::config::Settings::default();
let index_no_vectors =
DocumentIndex::new(temp_dir.path().join("no_vectors"), &settings).unwrap();
index_no_vectors.start_batch().unwrap();
index_no_vectors
.add_document(
SymbolId::new(1).unwrap(),
"test_func",
SymbolKind::Function,
FileId::new(1).unwrap(),
"test.rs",
1,
1,
10, 0, None,
None,
"test",
None,
crate::Visibility::Public,
Some(crate::ScopeContext::Module),
None, )
.unwrap();
index_no_vectors.commit_batch().unwrap();
let results = index_no_vectors
.search("test_func", 10, None, None, None)
.unwrap();
assert_eq!(results.len(), 1);
assert!(!index_no_vectors.has_vector_support());
use crate::vector::{VectorDimension, VectorSearchEngine};
let vector_dir = temp_dir.path().join("vectors");
let vector_engine =
VectorSearchEngine::new(&vector_dir, VectorDimension::new(384).unwrap()).unwrap();
let vector_engine_arc = Arc::new(Mutex::new(vector_engine));
let settings = crate::config::Settings::default();
let index_with_vectors =
DocumentIndex::new(temp_dir.path().join("with_vectors"), &settings)
.unwrap()
.with_vector_support(vector_engine_arc, &vector_dir);
index_with_vectors.start_batch().unwrap();
index_with_vectors
.add_document(
SymbolId::new(2).unwrap(),
"vector_func",
SymbolKind::Function,
FileId::new(1).unwrap(),
"test.rs",
10,
1,
20, 0, None,
None,
"test",
None,
crate::Visibility::Public,
Some(crate::ScopeContext::Module),
None, )
.unwrap();
index_with_vectors.commit_batch().unwrap();
let results = index_with_vectors
.search("vector_func", 10, None, None, None)
.unwrap();
assert_eq!(results.len(), 1);
assert!(index_with_vectors.has_vector_support());
}
#[test]
fn test_document_index_debug_impl() {
use crate::vector::{VectorDimension, VectorSearchEngine};
let temp_dir = TempDir::new().unwrap();
let settings = crate::config::Settings::default();
let index = DocumentIndex::new(temp_dir.path(), &settings).unwrap();
let debug_str = format!("{index:?}");
assert!(debug_str.contains("DocumentIndex"));
assert!(debug_str.contains("index_path"));
assert!(debug_str.contains("vector_storage_path: None"));
assert!(debug_str.contains("has_vector_engine: false"));
let vector_dir = temp_dir.path().join("vectors");
let vector_engine =
VectorSearchEngine::new(&vector_dir, VectorDimension::new(384).unwrap()).unwrap();
let vector_engine_arc = Arc::new(Mutex::new(vector_engine));
let settings = crate::config::Settings::default();
let index_with_vectors = DocumentIndex::new(temp_dir.path(), &settings)
.unwrap()
.with_vector_support(vector_engine_arc, &vector_dir);
let debug_str = format!("{index_with_vectors:?}");
assert!(debug_str.contains("DocumentIndex"));
assert!(debug_str.contains("has_vector_engine: true"));
assert!(debug_str.contains("vector_storage_path: Some("));
assert!(debug_str.contains("vectors"));
}
#[test]
fn test_cluster_cache_operations() {
use crate::vector::{VectorDimension, VectorSearchEngine};
use std::time::Instant;
let temp_dir = TempDir::new().unwrap();
let vector_dir = temp_dir.path().join("vectors");
let vector_engine =
VectorSearchEngine::new(&vector_dir, VectorDimension::new(384).unwrap()).unwrap();
let vector_engine_arc = Arc::new(Mutex::new(vector_engine));
let settings = crate::config::Settings::default();
let index = DocumentIndex::new(temp_dir.path(), &settings)
.unwrap()
.with_vector_support(vector_engine_arc, &vector_dir);
index.start_batch().unwrap();
let test_docs = vec![
(1, "parse_string", 0), (2, "parse_number", 0), (3, "parse_json", 0), (4, "Parser", 1), (5, "JsonParser", 1), (6, "handle_error", 2), (7, "ParseError", 2), ];
for (id, name, cluster) in &test_docs {
let mut doc = Document::new();
doc.add_text(index.schema.doc_type, "symbol");
doc.add_u64(index.schema.symbol_id, *id);
doc.add_text(index.schema.name, name);
doc.add_text(index.schema.kind, "function");
doc.add_u64(index.schema.file_id, 1);
doc.add_text(index.schema.file_path, "test.rs");
doc.add_u64(index.schema.line_number, *id * 10);
doc.add_u64(index.schema.column, 1);
doc.add_text(index.schema.module_path, "test");
doc.add_u64(index.schema.cluster_id, *cluster + 1); doc.add_u64(index.schema.vector_id, *id);
doc.add_u64(index.schema.has_vector, 1);
index
.writer
.read()
.unwrap()
.as_ref()
.unwrap()
.add_document(doc)
.unwrap();
}
index.commit_batch().unwrap();
{
let cache = index.cluster_cache.read().unwrap();
assert!(cache.is_some(), "Cache should be built after commit");
let cluster_cache = cache.as_ref().unwrap();
println!(
"Total documents in cache: {}",
cluster_cache.total_documents()
);
let cluster_ids = cluster_cache.all_cluster_ids();
println!(
"Found cluster IDs: {:?}",
cluster_ids.iter().map(|id| id.get()).collect::<Vec<_>>()
);
for (_seg_ord, clusters) in &cluster_cache.segment_mappings {
println!(
"Segment {}: clusters={:?}",
_seg_ord.get(),
clusters.keys().map(|c| c.get()).collect::<Vec<_>>()
);
for (cluster_id, docs) in clusters {
println!(" Cluster {}: {} docs", cluster_id.get(), docs.len());
}
}
}
let all_cluster_ids = index.get_all_cluster_ids().unwrap();
assert_eq!(all_cluster_ids.len(), 3);
let mut cluster1_total = 0;
let mut cluster2_total = 0;
let mut cluster3_total = 0;
{
let cache = index.cluster_cache.read().unwrap();
let cluster_cache = cache.as_ref().unwrap();
for clusters in cluster_cache.segment_mappings.values() {
if let Some(docs) = clusters.get(&ClusterId::new(1).unwrap()) {
cluster1_total += docs.len();
}
if let Some(docs) = clusters.get(&ClusterId::new(2).unwrap()) {
cluster2_total += docs.len();
}
if let Some(docs) = clusters.get(&ClusterId::new(3).unwrap()) {
cluster3_total += docs.len();
}
}
}
assert_eq!(cluster1_total, 3);
assert_eq!(cluster2_total, 2);
assert_eq!(cluster3_total, 2);
let cluster_id = ClusterId::new(1).unwrap();
let segment_ord = SegmentOrdinal::new(0);
let start = Instant::now();
for _ in 0..1000 {
let _ = index
.get_cluster_documents(segment_ord, cluster_id)
.unwrap();
}
let elapsed = start.elapsed();
let per_lookup = elapsed / 1000;
println!("Cluster lookup performance: {per_lookup:?} per lookup");
assert!(
per_lookup.as_micros() < 10,
"Lookup should be <10μs, was {per_lookup:?}"
);
index.start_batch().unwrap();
let mut doc = Document::new();
doc.add_text(index.schema.doc_type, "symbol");
doc.add_u64(index.schema.symbol_id, 8);
doc.add_text(index.schema.name, "new_function");
doc.add_text(index.schema.kind, "function");
doc.add_u64(index.schema.file_id, 1);
doc.add_text(index.schema.file_path, "test.rs");
doc.add_u64(index.schema.line_number, 80);
doc.add_u64(index.schema.column, 1);
doc.add_text(index.schema.module_path, "test");
doc.add_u64(index.schema.cluster_id, 1); doc.add_u64(index.schema.vector_id, 8);
doc.add_u64(index.schema.has_vector, 1);
index
.writer
.read()
.unwrap()
.as_ref()
.unwrap()
.add_document(doc)
.unwrap();
index.commit_batch().unwrap();
{
let cache = index.cluster_cache.read().unwrap();
let cluster_cache = cache.as_ref().unwrap();
assert_eq!(cluster_cache.total_documents(), 8);
}
{
let cache = index.cluster_cache.read().unwrap();
let cluster_cache = cache.as_ref().unwrap();
let mut cluster1_updated = 0;
for clusters in cluster_cache.segment_mappings.values() {
if let Some(docs) = clusters.get(&ClusterId::new(1).unwrap()) {
cluster1_updated += docs.len();
}
}
assert_eq!(cluster1_updated, 4); }
}
#[test]
fn test_cluster_cache_unit_operations() {
let mut cache = ClusterCache::new(1);
let seg0 = SegmentOrdinal::new(0);
let seg1 = SegmentOrdinal::new(1);
let cluster0 = ClusterId::new(1).unwrap();
let cluster1 = ClusterId::new(2).unwrap();
cache.add_document(seg0, cluster0, 0);
cache.add_document(seg0, cluster0, 2);
cache.add_document(seg0, cluster0, 1);
cache.add_document(seg0, cluster1, 3);
cache.add_document(seg1, cluster0, 0);
cache.sort_all();
let docs = cache.get_documents(seg0, cluster0).unwrap();
assert_eq!(docs, &[0, 1, 2]);
let docs = cache.get_documents(seg0, cluster1).unwrap();
assert_eq!(docs, &[3]);
let docs = cache.get_documents(seg1, cluster0).unwrap();
assert_eq!(docs, &[0]);
assert!(cache.get_documents(seg1, cluster1).is_none());
assert_eq!(cache.total_documents(), 5);
assert_eq!(cache.all_cluster_ids().len(), 2);
assert!(cache.is_valid_for_generation(1));
assert!(!cache.is_valid_for_generation(2));
}
#[test]
fn test_find_symbols_by_name_with_language_filter() {
let temp_dir = TempDir::new().unwrap();
let settings = crate::config::Settings::default();
let index = DocumentIndex::new(temp_dir.path(), &settings).unwrap();
index.start_batch().unwrap();
index
.add_document(
SymbolId::new(1).unwrap(),
"main",
SymbolKind::Function,
FileId::new(1).unwrap(),
"src/main.rs",
0, 0, 5, 0, Some("Entry point"), Some("fn main() {}"), "crate", None, crate::Visibility::Public, None, Some("rust"), )
.unwrap();
index
.add_document(
SymbolId::new(2).unwrap(),
"main",
SymbolKind::Function,
FileId::new(2).unwrap(),
"src/main.py",
0, 0, 5, 0, Some("Python entry point"), Some("def main():"), "__main__", None, crate::Visibility::Public, None, Some("python"), )
.unwrap();
index
.add_document(
SymbolId::new(3).unwrap(),
"main",
SymbolKind::Function,
FileId::new(3).unwrap(),
"src/main.ts",
0, 0, 5, 0, Some("TypeScript entry"), Some("function main(): void"), "app", None, crate::Visibility::Public, None, Some("typescript"), )
.unwrap();
index.commit_batch().unwrap();
println!("\n=== Testing find_symbols_by_name with language filtering ===");
let all_symbols = index.find_symbols_by_name("main", None).unwrap();
println!("Test 1 - No filter: Found {} symbols", all_symbols.len());
for symbol in &all_symbols {
println!(" - Symbol ID: {:?}, File: {}", symbol.id, symbol.file_id.0);
}
assert_eq!(
all_symbols.len(),
3,
"Should find 3 'main' functions across all languages"
);
let rust_symbols = index.find_symbols_by_name("main", Some("rust")).unwrap();
println!("Test 2 - Rust filter: Found {} symbols", rust_symbols.len());
for symbol in &rust_symbols {
println!(
" - Symbol ID: {:?}, Module: {:?}",
symbol.id, symbol.module_path
);
}
assert_eq!(rust_symbols.len(), 1, "Should find 1 Rust 'main' function");
assert_eq!(rust_symbols[0].id, SymbolId::new(1).unwrap());
let python_symbols = index.find_symbols_by_name("main", Some("python")).unwrap();
println!(
"Test 3 - Python filter: Found {} symbols",
python_symbols.len()
);
for symbol in &python_symbols {
println!(
" - Symbol ID: {:?}, Module: {:?}",
symbol.id, symbol.module_path
);
}
assert_eq!(
python_symbols.len(),
1,
"Should find 1 Python 'main' function"
);
assert_eq!(python_symbols[0].id, SymbolId::new(2).unwrap());
let ts_symbols = index
.find_symbols_by_name("main", Some("typescript"))
.unwrap();
println!(
"Test 4 - TypeScript filter: Found {} symbols",
ts_symbols.len()
);
for symbol in &ts_symbols {
println!(
" - Symbol ID: {:?}, Module: {:?}",
symbol.id, symbol.module_path
);
}
assert_eq!(
ts_symbols.len(),
1,
"Should find 1 TypeScript 'main' function"
);
assert_eq!(ts_symbols[0].id, SymbolId::new(3).unwrap());
let java_symbols = index.find_symbols_by_name("main", Some("java")).unwrap();
println!(
"Test 5 - Java filter (non-existent): Found {} symbols",
java_symbols.len()
);
assert_eq!(java_symbols.len(), 0, "Should find no Java symbols");
println!("=== All find_symbols_by_name tests completed ===\n");
}
#[test]
fn test_search_with_language_filter() {
let temp_dir = TempDir::new().unwrap();
let settings = crate::config::Settings::default();
let index = DocumentIndex::new(temp_dir.path(), &settings).unwrap();
index.start_batch().unwrap();
index
.add_document(
SymbolId::new(10).unwrap(),
"parse_config",
SymbolKind::Function,
FileId::new(1).unwrap(),
"src/config.rs",
10, 0, 20, 0, Some("Parse configuration from file"), Some("fn parse_config(path: &str) -> Config"), "crate::config", None, crate::Visibility::Public, None, Some("rust"), )
.unwrap();
index
.add_document(
SymbolId::new(11).unwrap(),
"parse_json",
SymbolKind::Function,
FileId::new(2).unwrap(),
"src/parser.py",
5, 0, 10, 0, Some("Parse JSON data"), Some("def parse_json(data: str) -> dict"), "parser", None, crate::Visibility::Public, None, Some("python"), )
.unwrap();
index
.add_document(
SymbolId::new(12).unwrap(),
"parseXML",
SymbolKind::Function,
FileId::new(3).unwrap(),
"src/parser.ts",
1, 0, 8, 0, Some("Parse XML string"), Some("function parseXML(xml: string): Document"), "utils.parser", None, crate::Visibility::Public, None, Some("typescript"), )
.unwrap();
index.commit_batch().unwrap();
println!("\n=== Testing search with language filtering ===");
let all_results = index.search("parse", 10, None, None, None).unwrap();
println!(
"Test 1 - Search 'parse' no filter: Found {} results",
all_results.len()
);
for result in &all_results {
println!(
" - Symbol ID: {:?}, Name: {}",
result.symbol_id, result.name
);
}
assert_eq!(
all_results.len(),
3,
"Should find 3 parse functions across all languages"
);
let rust_results = index.search("parse", 10, None, None, Some("rust")).unwrap();
println!(
"Test 2 - Search 'parse' Rust filter: Found {} results",
rust_results.len()
);
for result in &rust_results {
println!(
" - Symbol ID: {:?}, Name: {}",
result.symbol_id, result.name
);
}
assert_eq!(rust_results.len(), 1, "Should find 1 Rust parse function");
assert_eq!(rust_results[0].symbol_id, SymbolId::new(10).unwrap());
let python_results = index
.search("parse", 10, None, None, Some("python"))
.unwrap();
println!(
"Test 3 - Search 'parse' Python filter: Found {} results",
python_results.len()
);
for result in &python_results {
println!(
" - Symbol ID: {:?}, Name: {}",
result.symbol_id, result.name
);
}
assert_eq!(
python_results.len(),
1,
"Should find 1 Python parse function"
);
assert_eq!(python_results[0].symbol_id, SymbolId::new(11).unwrap());
let rust_functions = index
.search("parse", 10, Some(SymbolKind::Function), None, Some("rust"))
.unwrap();
println!(
"Test 4 - Search 'parse' Rust+Function filter: Found {} results",
rust_functions.len()
);
assert_eq!(
rust_functions.len(),
1,
"Should find 1 Rust function with 'parse'"
);
let java_results = index.search("parse", 10, None, None, Some("java")).unwrap();
println!(
"Test 5 - Search 'parse' Java filter (non-existent): Found {} results",
java_results.len()
);
assert_eq!(java_results.len(), 0, "Should find no Java parse functions");
println!("=== All search tests completed ===\n");
}
#[test]
fn test_language_filter_with_module_filter() {
let temp_dir = TempDir::new().unwrap();
let settings = crate::config::Settings::default();
let index = DocumentIndex::new(temp_dir.path(), &settings).unwrap();
index.start_batch().unwrap();
index
.add_document(
SymbolId::new(20).unwrap(),
"Handler",
SymbolKind::Struct,
FileId::new(1).unwrap(),
"src/server.rs",
1, 0, 10, 0, Some("Request handler"), Some("struct Handler"), "server", None, crate::Visibility::Public, None, Some("rust"), )
.unwrap();
index
.add_document(
SymbolId::new(21).unwrap(),
"Handler",
SymbolKind::Class,
FileId::new(2).unwrap(),
"src/server.py",
1, 0, 12, 0, Some("Request handler class"), Some("class Handler"), "server", None, crate::Visibility::Public, None, Some("python"), )
.unwrap();
index.commit_batch().unwrap();
println!("\n=== Testing combined module and language filters ===");
let rust_server = index
.search("Handler", 10, None, Some("server"), Some("rust"))
.unwrap();
println!(
"Test 1 - Search 'Handler' in server module + Rust: Found {} results",
rust_server.len()
);
for result in &rust_server {
println!(
" - Symbol ID: {:?}, Kind: {:?}",
result.symbol_id, result.kind
);
}
assert_eq!(
rust_server.len(),
1,
"Should find 1 Rust Handler in server module"
);
assert_eq!(rust_server[0].symbol_id, SymbolId::new(20).unwrap());
let python_server = index
.search("Handler", 10, None, Some("server"), Some("python"))
.unwrap();
println!(
"Test 2 - Search 'Handler' in server module + Python: Found {} results",
python_server.len()
);
for result in &python_server {
println!(
" - Symbol ID: {:?}, Kind: {:?}",
result.symbol_id, result.kind
);
}
assert_eq!(
python_server.len(),
1,
"Should find 1 Python Handler in server module"
);
assert_eq!(python_server[0].symbol_id, SymbolId::new(21).unwrap());
println!("=== All combined filter tests completed ===\n");
}
#[test]
fn test_ngram_partial_matching() {
println!("\n=== NGRAM TOKENIZER TEST ===\n");
let temp_dir = TempDir::new().unwrap();
let settings = crate::config::Settings::default();
let index = DocumentIndex::new(temp_dir.path(), &settings).unwrap();
index.start_batch().unwrap();
let file_id = crate::FileId::new(1).unwrap();
println!("Step 1: Indexing symbols...");
let sym1 = crate::Symbol::new(
SymbolId::new(1).unwrap(),
"ArchiveAppService",
SymbolKind::Class,
file_id,
crate::Range::new(10, 5, 50, 10),
)
.with_module_path("Services")
.with_doc("Application service for archiving")
.with_signature("class ArchiveAppService");
println!(" - Indexed: ArchiveAppService");
index
.index_symbol(&sym1, "src/Services/ArchiveAppService.cs")
.unwrap();
let sym2 = crate::Symbol::new(
SymbolId::new(2).unwrap(),
"DocumentArchiver",
SymbolKind::Class,
file_id,
crate::Range::new(20, 5, 60, 10),
)
.with_module_path("Utils")
.with_doc("Archives documents")
.with_signature("class DocumentArchiver");
println!(" - Indexed: DocumentArchiver");
index
.index_symbol(&sym2, "src/Utils/DocumentArchiver.cs")
.unwrap();
let sym3 = crate::Symbol::new(
SymbolId::new(3).unwrap(),
"UserService",
SymbolKind::Class,
file_id,
crate::Range::new(30, 5, 70, 10),
)
.with_module_path("Services")
.with_doc("User management service")
.with_signature("class UserService");
println!(" - Indexed: UserService");
index
.index_symbol(&sym3, "src/Services/UserService.cs")
.unwrap();
index.commit_batch().unwrap();
println!("\nStep 2: Testing partial search with 'Archive'...");
let results = index.search("Archive", 10, None, None, None).unwrap();
println!("\nResults from search('Archive'):");
for (i, result) in results.iter().enumerate() {
let kind = format!("{:?}", result.kind);
println!(" {}. {} ({})", i + 1, result.name, kind);
}
let names: Vec<&str> = results.iter().map(|r| r.name.as_str()).collect();
println!(
"\nExpectation: Should find 'ArchiveAppService' and 'DocumentArchiver', NOT 'UserService'"
);
println!("Actual matches: {names:?}\n");
assert!(
names.contains(&"ArchiveAppService"),
"Ngram tokenizer should find ArchiveAppService with partial query 'Archive'. Found: {names:?}"
);
assert!(
names.contains(&"DocumentArchiver"),
"Ngram tokenizer should find DocumentArchiver with partial query 'Archive'. Found: {names:?}"
);
assert!(
!names.contains(&"UserService"),
"Should not match unrelated symbols. Found: {names:?}"
);
println!("Step 3: Testing exact lookup with 'ArchiveAppService'...");
let exact_results = index
.find_symbols_by_name("ArchiveAppService", None)
.unwrap();
println!("Exact lookup results: {} match(es)", exact_results.len());
for result in &exact_results {
println!(" - {}", result.name);
}
assert_eq!(exact_results.len(), 1);
assert_eq!(exact_results[0].name.as_ref(), "ArchiveAppService");
println!(
"\nStep 4: Testing exact lookup with partial name 'Archive' (should find nothing)..."
);
let no_match = index.find_symbols_by_name("Archive", None).unwrap();
println!("Exact lookup for 'Archive': {} match(es)", no_match.len());
assert_eq!(
no_match.len(),
0,
"Exact lookup should not return partial matches. Found: {:?}",
no_match.iter().map(|s| &s.name).collect::<Vec<_>>()
);
println!("\n=== NGRAM TEST PASSED ===\n");
}
#[test]
fn test_fuzzy_search_typo_tolerance() {
println!("\n=== FUZZY SEARCH TEST (Typo Tolerance) ===\n");
let temp_dir = TempDir::new().unwrap();
let settings = crate::config::Settings::default();
let index = DocumentIndex::new(temp_dir.path(), &settings).unwrap();
index.start_batch().unwrap();
let file_id = crate::FileId::new(1).unwrap();
println!("Step 1: Indexing symbol 'ArchiveService'...");
let sym = crate::Symbol::new(
SymbolId::new(1).unwrap(),
"ArchiveService",
SymbolKind::Class,
file_id,
crate::Range::new(10, 5, 50, 10),
)
.with_doc("Archive service");
index.index_symbol(&sym, "src/ArchiveService.cs").unwrap();
index.commit_batch().unwrap();
println!("\nStep 2: Testing fuzzy search with typos...\n");
println!("Query: 'ArchiveService' (correct spelling)");
let correct = index
.search("ArchiveService", 10, None, None, None)
.unwrap();
println!(" Found: {} result(s)", correct.len());
assert_eq!(correct.len(), 1);
println!("\nQuery: 'ArchivService' (missing 'e', edit distance = 1)");
let typo1 = index.search("ArchivService", 10, None, None, None).unwrap();
println!(" Found: {} result(s)", typo1.len());
if !typo1.is_empty() {
println!(" Match: {}", typo1[0].name);
}
println!("\nQuery: 'ArchaveService' (i→a, edit distance = 1)");
let typo2 = index
.search("ArchaveService", 10, None, None, None)
.unwrap();
println!(" Found: {} result(s)", typo2.len());
if !typo2.is_empty() {
println!(" Match: {}", typo2[0].name);
}
println!("\nQuery: 'Archivee' (partial with extra 'e', edit distance = 1)");
let typo3 = index.search("Archivee", 10, None, None, None).unwrap();
println!(" Found: {} result(s)", typo3.len());
if !typo3.is_empty() {
println!(" Match: {}", typo3[0].name);
}
println!("\nQuery: 'Archhive' (2 errors: extra 'h' and wrong 'h', edit distance = 2)");
let too_many = index.search("Archhive", 10, None, None, None).unwrap();
println!(" Found: {} result(s)", too_many.len());
println!(" Expectation: May find via ngram partial match, but not via fuzzy (distance=2)");
println!("\n=== FUZZY SEARCH EXPLANATION ===");
println!("Fuzzy search (edit distance=1) handles typos like:");
println!(" - Missing character: 'Archiv' finds 'Archive'");
println!(" - Wrong character: 'Archave' finds 'Archive'");
println!(" - Extra character: 'Archivee' finds 'Archive'");
println!("\nNgram tokenizer handles partial matching:");
println!(" - 'Archive' finds 'ArchiveService', 'DocumentArchiver'");
println!("\nBoth work together in the same search query!");
println!("\n=== FUZZY SEARCH TEST COMPLETE ===\n");
}
#[test]
fn test_ngram_vs_fuzzy_interaction() {
println!("\n=== UNDERSTANDING NGRAM + FUZZY INTERACTION ===\n");
let temp_dir = TempDir::new().unwrap();
let settings = crate::config::Settings::default();
let index = DocumentIndex::new(temp_dir.path(), &settings).unwrap();
index.start_batch().unwrap();
let file_id = crate::FileId::new(1).unwrap();
println!("Indexed: 'ArchiveService'\n");
let sym = crate::Symbol::new(
SymbolId::new(1).unwrap(),
"ArchiveService",
SymbolKind::Class,
file_id,
crate::Range::new(10, 5, 50, 10),
);
index.index_symbol(&sym, "src/ArchiveService.cs").unwrap();
index.commit_batch().unwrap();
println!("HOW NGRAM TOKENIZATION WORKS:");
println!("'ArchiveService' gets broken into ngrams (min=3, max=10):");
println!(" 3-grams: Arc, rch, chi, hiv, ive, veS, eSe, Ser, erv, rvi, vic, ice");
println!(" 4-grams: Arch, rchi, chiv, hive, iveS, veSe, eSer, Serv, ervi, rvic, vice");
println!(" ... up to 10-grams\n");
println!("TEST CASES:\n");
println!("1. Query: 'Arch' (4 chars, exact ngram match)");
let short_match = index.search("Arch", 10, None, None, None).unwrap();
println!(" Result: {} match(es) ✓", short_match.len());
println!(" Why: 'Arch' is an exact 4-gram token in 'ArchiveService'\n");
println!("2. Query: 'Arsh' (1 typo: c→s, edit distance = 1)");
let short_typo = index.search("Arsh", 10, None, None, None).unwrap();
println!(" Result: {} match(es)", short_typo.len());
if short_typo.is_empty() {
println!(" Why: Fuzzy matches 'Arsh' against ngrams like 'Arch' (distance=1)");
println!(" But may not find it depending on Tantivy's fuzzy implementation\n");
} else {
println!(" Why: Fuzzy matched 'Arsh' to ngram 'Arch' (distance=1) ✓\n");
}
println!("3. Query: 'ArchivService' (missing 'e', 13 chars)");
let long_typo = index.search("ArchivService", 10, None, None, None).unwrap();
println!(" Result: {} match(es) ✓", long_typo.len());
println!(" Why: FIXED by adding fuzzy search on non-tokenized 'name' field!");
println!(" Fuzzy matches 'ArchivService' → 'ArchiveService' (edit distance=1)");
println!(" This works BEFORE ngram tokenization, avoiding misalignment\n");
assert!(
!long_typo.is_empty(),
"Should find ArchiveService with typo"
);
println!("4. Query: 'Archive' (7 chars, prefix of indexed word)");
let partial = index.search("Archive", 10, None, None, None).unwrap();
println!(" Result: {} match(es) ✓", partial.len());
println!(" Why: 'Archive' ngrams (Arc, rch, chi, hiv, ive, etc.)");
println!(" overlap with 'ArchiveService' ngrams\n");
println!("CONCLUSION:");
println!("- Ngram tokenizer: Great for partial matching (prefix/substring) ✓");
println!("- Fuzzy on ngrams: Works for typos in SHORT queries ✓");
println!("- Fuzzy on whole word: FIXED - Now handles typos in LONG words ✓");
println!("\nSOLUTION IMPLEMENTED:");
println!(" Added fuzzy search on non-tokenized 'name' field (STRING type)");
println!(" Now search queries try BOTH:");
println!(" 1. Fuzzy on ngram tokens (for short queries)");
println!(" 2. Fuzzy on whole words (for full symbol names)");
println!(" Result: 'ArchivService' correctly finds 'ArchiveService' ✓");
println!("\n=== TEST COMPLETE ===\n");
}
#[test]
fn test_import_persistence_across_reload() {
let temp_dir = TempDir::new().unwrap();
let settings = crate::config::Settings::default();
{
let index = DocumentIndex::new(temp_dir.path(), &settings).unwrap();
index.start_batch().unwrap();
let file_id = FileId::new(1).unwrap();
let registration = FileRegistration {
path: PathBuf::from("src/main.rs"),
file_id,
content_hash: "hash123".to_string(),
language_id: LanguageId::new("rust"),
timestamp: 1234567890,
mtime: 0,
};
index.store_file_registration(®istration).unwrap();
let import1 = crate::parsing::Import {
path: "indicatif::ProgressBar".to_string(),
alias: None,
file_id,
is_glob: false,
is_type_only: false,
};
let import2 = crate::parsing::Import {
path: "serde::Serialize".to_string(),
alias: Some("SerTrait".to_string()),
file_id,
is_glob: false,
is_type_only: false,
};
index.store_import(&import1).unwrap();
index.store_import(&import2).unwrap();
index.commit_batch().unwrap();
}
{
let index = DocumentIndex::new(temp_dir.path(), &settings).unwrap();
let loaded_imports = index.get_imports_for_file(FileId::new(1).unwrap()).unwrap();
assert_eq!(
loaded_imports.len(),
2,
"Should load 2 imports after reload"
);
let import1 = loaded_imports
.iter()
.find(|i| i.path == "indicatif::ProgressBar")
.unwrap();
assert_eq!(import1.alias, None);
assert!(!import1.is_glob);
assert!(!import1.is_type_only);
let import2 = loaded_imports
.iter()
.find(|i| i.path == "serde::Serialize")
.unwrap();
assert_eq!(import2.alias.as_deref(), Some("SerTrait"));
assert!(!import2.is_glob);
assert!(!import2.is_type_only);
}
}
#[test]
fn test_import_deletion_on_file_removal() {
let temp_dir = TempDir::new().unwrap();
let settings = crate::config::Settings::default();
let index = DocumentIndex::new(temp_dir.path(), &settings).unwrap();
index.start_batch().unwrap();
let file_id = FileId::new(1).unwrap();
let registration = FileRegistration {
path: PathBuf::from("src/main.rs"),
file_id,
content_hash: "hash123".to_string(),
language_id: LanguageId::new("rust"),
timestamp: 1234567890,
mtime: 0,
};
index.store_file_registration(®istration).unwrap();
let import = crate::parsing::Import {
path: "std::collections::HashMap".to_string(),
alias: None,
file_id,
is_glob: false,
is_type_only: false,
};
index.store_import(&import).unwrap();
index.commit_batch().unwrap();
let imports = index.get_imports_for_file(file_id).unwrap();
assert_eq!(imports.len(), 1);
index.start_batch().unwrap();
index.delete_imports_for_file(file_id).unwrap();
index.commit_batch().unwrap();
let imports_after = index.get_imports_for_file(file_id).unwrap();
assert_eq!(imports_after.len(), 0, "Imports should be deleted");
}
#[test]
#[ignore] fn test_java_owner_extends_person() {
let index_base = Path::new(".codanna/index");
let tantivy_path = index_base.join("tantivy");
if !tantivy_path.exists() {
eprintln!(
"Skipping test: .codanna/index/tantivy not found. Run: ./target/release/codanna index test_monorepos/spring-petclinic"
);
return;
}
let settings = crate::config::Settings::default();
let index =
DocumentIndex::new(&tantivy_path, &settings).expect("Failed to open production index");
let owner_candidates = index
.find_symbols_by_name("Owner", None)
.expect("Failed to find Owner");
println!("Found {} Owner symbols:", owner_candidates.len());
for candidate in &owner_candidates {
println!(
" - Owner at {:?}, module: {:?}",
candidate.file_id, candidate.module_path
);
}
let owner = owner_candidates
.iter()
.find(|s| {
s.module_path
.as_ref()
.map(|m| m.as_ref() == "org.springframework.samples.petclinic.owner")
.unwrap_or(false)
})
.expect("Owner class in petclinic.owner package should exist");
println!(
"Found Owner: symbol_id={}, module={:?}",
owner.id.0, owner.module_path
);
let person_candidates = index
.find_symbols_by_name("Person", None)
.expect("Failed to find Person");
let person = person_candidates
.iter()
.find(|s| {
s.module_path
.as_ref()
.map(|m| m.as_ref() == "org.springframework.samples.petclinic.model")
.unwrap_or(false)
})
.expect("Person class in petclinic.model package should exist");
println!(
"Found Person: symbol_id={}, module={:?}",
person.id.0, person.module_path
);
let all_rel_kinds = [
RelationKind::Calls,
RelationKind::Extends,
RelationKind::Implements,
RelationKind::Uses,
RelationKind::Defines,
RelationKind::References,
];
for kind in &all_rel_kinds {
let rels = index
.get_relationships_from(owner.id, *kind)
.expect("Failed to query relationships");
if !rels.is_empty() {
println!("Owner has {} {:?} relationships", rels.len(), kind);
for (from_id, to_id, rel) in rels.iter().take(5) {
println!(
" {:?}: {} -> {} (weight: {})",
kind, from_id.0, to_id.0, rel.weight
);
if let Ok(Some(target)) = index.find_symbol_by_id(*to_id) {
println!(" -> {}", target.name);
}
}
}
}
let extends_rels = index
.get_relationships_from(owner.id, RelationKind::Extends)
.expect("Failed to query Extends relationships");
println!(
"\nFinal check: Owner has {} Extends relationships",
extends_rels.len()
);
let person_extends = index
.get_relationships_to(person.id, RelationKind::Extends)
.expect("Failed to query who extends Person");
println!("Person is extended by {} classes", person_extends.len());
let total_rel_count = index
.count_relationships()
.expect("Failed to count relationships");
println!("Total relationships in index: {total_rel_count}");
let all_extends = index
.get_all_relationships_by_kind(RelationKind::Extends)
.expect("Failed to query all Extends relationships");
println!(
"Total Extends relationships in entire index: {}",
all_extends.len()
);
if !all_extends.is_empty() {
println!("Sample Extends relationships:");
for (from_id, to_id, _) in all_extends.iter().take(5) {
if let (Ok(Some(from_sym)), Ok(Some(to_sym))) = (
index.find_symbol_by_id(*from_id),
index.find_symbol_by_id(*to_id),
) {
println!(" {} extends {}", from_sym.name, to_sym.name);
}
}
}
let extends_person = extends_rels
.iter()
.any(|(_from, to_id, _rel)| *to_id == person.id);
assert!(
extends_person,
"Owner (id={}) should extend Person (id={}), but Extends relationship not found. Found {} Extends relationships.",
owner.id.0,
person.id.0,
extends_rels.len()
);
println!("✅ SUCCESS: Owner extends Person relationship verified!");
}
}