Skip to main content

dk_engine/graph/
index.rs

1use std::path::Path;
2
3use dk_core::{Error, RepoId, Symbol, SymbolId};
4use tantivy::collector::TopDocs;
5use tantivy::query::{BooleanQuery, Occur, QueryParser, TermQuery};
6use tantivy::schema::*;
7use tantivy::{Directory, Index, IndexReader, IndexWriter, ReloadPolicy, TantivyDocument};
8use uuid::Uuid;
9
/// Full-text search index for symbols, backed by Tantivy.
///
/// Indexes symbol metadata across multiple text fields and supports
/// filtering by repository. The index is stored on disk at the path
/// provided to [`SearchIndex::open`].
///
/// Additions and deletions are staged on the writer; they become visible
/// to searches once [`SearchIndex::commit`] has been called.
pub struct SearchIndex {
    // Handle to the underlying Tantivy index; used to build query parsers.
    index: Index,
    // Reader from which searchers are obtained; reloaded after each commit.
    reader: IndexReader,
    // Writer through which documents are added, deleted and committed.
    writer: IndexWriter,
    // Field handles kept for building queries and documents.
    // Exact-match key, also stored so search hits can be mapped back to an id.
    f_symbol_id: Field,
    // Exact-match key used to restrict searches and deletions to one repository.
    f_repo_id: Field,
    // Tokenized fields covered by the full-text query in `search`.
    f_name: Field,
    f_qualified_name: Field,
    f_signature: Field,
    f_doc_comment: Field,
    f_file_path: Field,
    // Exact-match symbol kind; written on every document but not queried in `search`.
    f_kind: Field,
}
29
30impl SearchIndex {
31    /// Open or create a Tantivy index at the given directory path.
32    ///
33    /// Defines the schema with the following fields:
34    /// - `symbol_id` — stored string (UUID)
35    /// - `repo_id` — indexed string (not tokenized) for filtering
36    /// - `name` — tokenized text field
37    /// - `qualified_name` — tokenized text field
38    /// - `signature` — tokenized text field
39    /// - `doc_comment` — tokenized text field
40    /// - `file_path` — tokenized text field
41    /// - `kind` — indexed string (not tokenized)
42    pub fn open(path: &Path) -> dk_core::Result<Self> {
43        let mut schema_builder = Schema::builder();
44
45        let f_symbol_id = schema_builder.add_text_field("symbol_id", STRING | STORED);
46        let f_repo_id = schema_builder.add_text_field("repo_id", STRING);
47        let f_name = schema_builder.add_text_field("name", TEXT);
48        let f_qualified_name = schema_builder.add_text_field("qualified_name", TEXT);
49        let f_signature = schema_builder.add_text_field("signature", TEXT);
50        let f_doc_comment = schema_builder.add_text_field("doc_comment", TEXT);
51        let f_file_path = schema_builder.add_text_field("file_path", TEXT);
52        let f_kind = schema_builder.add_text_field("kind", STRING);
53
54        let schema = schema_builder.build();
55
56        let dir: Box<dyn Directory> = if path.exists() && path.join("meta.json").exists() {
57            Box::new(
58                tantivy::directory::MmapDirectory::open(path)
59                    .map_err(|e| Error::Internal(format!("Failed to open index directory: {e}")))?,
60            )
61        } else {
62            std::fs::create_dir_all(path)?;
63            Box::new(
64                tantivy::directory::MmapDirectory::open(path)
65                    .map_err(|e| Error::Internal(format!("Failed to open index directory: {e}")))?,
66            )
67        };
68
69        let index = Index::open_or_create(dir, schema.clone())
70            .map_err(|e| Error::Internal(format!("Failed to open or create index: {e}")))?;
71
72        let reader = index
73            .reader_builder()
74            .reload_policy(ReloadPolicy::OnCommitWithDelay)
75            .try_into()
76            .map_err(|e| Error::Internal(format!("Failed to create index reader: {e}")))?;
77
78        let writer = index
79            .writer(50_000_000) // 50 MB memory budget
80            .map_err(|e| Error::Internal(format!("Failed to create index writer: {e}")))?;
81
82        Ok(Self {
83            index,
84            reader,
85            writer,
86            f_symbol_id,
87            f_repo_id,
88            f_name,
89            f_qualified_name,
90            f_signature,
91            f_doc_comment,
92            f_file_path,
93            f_kind,
94        })
95    }
96
97    /// Add a symbol document to the index.
98    pub fn index_symbol(&mut self, repo_id: RepoId, sym: &Symbol) -> dk_core::Result<()> {
99        let mut doc = TantivyDocument::new();
100        doc.add_text(self.f_symbol_id, sym.id.to_string());
101        doc.add_text(self.f_repo_id, repo_id.to_string());
102        doc.add_text(self.f_name, &sym.name);
103        doc.add_text(self.f_qualified_name, &sym.qualified_name);
104        if let Some(ref sig) = sym.signature {
105            doc.add_text(self.f_signature, sig);
106        }
107        if let Some(ref doc_comment) = sym.doc_comment {
108            doc.add_text(self.f_doc_comment, doc_comment);
109        }
110        doc.add_text(self.f_file_path, sym.file_path.to_string_lossy().as_ref());
111        doc.add_text(self.f_kind, sym.kind.to_string());
112
113        self.writer
114            .add_document(doc)
115            .map_err(|e| Error::Internal(format!("Failed to add document: {e}")))?;
116
117        Ok(())
118    }
119
120    /// Delete a document by `symbol_id`.
121    pub fn remove_symbol(&mut self, symbol_id: SymbolId) -> dk_core::Result<()> {
122        let term = tantivy::Term::from_field_text(self.f_symbol_id, &symbol_id.to_string());
123        self.writer.delete_term(term);
124        Ok(())
125    }
126
127    /// Delete all documents belonging to a repository.
128    ///
129    /// **Note:** This only stages the deletion. You must call [`commit`] afterwards
130    /// for the deletion to be persisted and visible to readers.
131    pub fn delete_by_repo(&mut self, repo_id: RepoId) -> dk_core::Result<()> {
132        let term = tantivy::Term::from_field_text(self.f_repo_id, &repo_id.to_string());
133        self.writer.delete_term(term);
134        Ok(())
135    }
136
137    /// Commit the index writer, making all pending additions and deletions
138    /// visible to subsequent searches.
139    pub fn commit(&mut self) -> dk_core::Result<()> {
140        self.writer
141            .commit()
142            .map_err(|e| Error::Internal(format!("Failed to commit index: {e}")))?;
143
144        // Reload the reader so subsequent searches see the latest commit.
145        self.reader
146            .reload()
147            .map_err(|e| Error::Internal(format!("Failed to reload reader: {e}")))?;
148
149        Ok(())
150    }
151
152    /// Search across all text fields, filtered by `repo_id`.
153    ///
154    /// Returns up to `limit` matching [`SymbolId`]s, ranked by relevance.
155    pub fn search(
156        &self,
157        repo_id: RepoId,
158        query: &str,
159        limit: usize,
160    ) -> dk_core::Result<Vec<SymbolId>> {
161        let searcher = self.reader.searcher();
162
163        // Build a repo_id filter as a TermQuery.
164        let repo_term =
165            tantivy::Term::from_field_text(self.f_repo_id, &repo_id.to_string());
166        let repo_query = TermQuery::new(repo_term, IndexRecordOption::Basic);
167
168        // Build a full-text query across the text fields using QueryParser.
169        let text_fields = vec![
170            self.f_name,
171            self.f_qualified_name,
172            self.f_signature,
173            self.f_doc_comment,
174            self.f_file_path,
175        ];
176        let query_parser = QueryParser::for_index(&self.index, text_fields);
177        let text_query = query_parser
178            .parse_query(query)
179            .map_err(|e| Error::Internal(format!("Failed to parse query: {e}")))?;
180
181        // Combine: MUST match repo_id AND MUST match text query.
182        let combined = BooleanQuery::new(vec![
183            (Occur::Must, Box::new(repo_query)),
184            (Occur::Must, text_query),
185        ]);
186
187        let top_docs = searcher
188            .search(&combined, &TopDocs::with_limit(limit))
189            .map_err(|e| Error::Internal(format!("Search failed: {e}")))?;
190
191        let mut results = Vec::with_capacity(top_docs.len());
192        for (_score, doc_address) in top_docs {
193            let doc: TantivyDocument = searcher
194                .doc(doc_address)
195                .map_err(|e| Error::Internal(format!("Failed to retrieve doc: {e}")))?;
196
197            if let Some(id_value) = doc.get_first(self.f_symbol_id) {
198                if let Some(id_str) = id_value.as_str() {
199                    let uuid = Uuid::parse_str(id_str).map_err(|e| {
200                        Error::Internal(format!("Invalid UUID in index: {e}"))
201                    })?;
202                    results.push(uuid);
203                }
204            }
205        }
206
207        Ok(results)
208    }
209}