Skip to main content

mnemo_core/search/
tantivy_index.rs

1use std::path::Path;
2use std::sync::Mutex;
3
4use tantivy::collector::TopDocs;
5use tantivy::query::QueryParser;
6use tantivy::schema::Value;
7use tantivy::schema::{STORED, STRING, Schema, TEXT};
8use tantivy::{Index, IndexReader, IndexWriter, ReloadPolicy, TantivyDocument};
9
10use crate::error::{Error, Result};
11use crate::search::FullTextIndex;
12use uuid::Uuid;
13
14pub struct TantivyFullTextIndex {
15    index: Index,
16    writer: Mutex<IndexWriter>,
17    reader: IndexReader,
18    id_field: tantivy::schema::Field,
19    content_field: tantivy::schema::Field,
20}
21
22fn build_schema() -> (Schema, tantivy::schema::Field, tantivy::schema::Field) {
23    let mut schema_builder = Schema::builder();
24    let id_field = schema_builder.add_text_field("id", STRING | STORED);
25    let content_field = schema_builder.add_text_field("content", TEXT);
26    (schema_builder.build(), id_field, content_field)
27}
28
29impl TantivyFullTextIndex {
30    pub fn new(path: &Path) -> Result<Self> {
31        let (schema, id_field, content_field) = build_schema();
32
33        std::fs::create_dir_all(path).map_err(|e| Error::Index(e.to_string()))?;
34
35        let dir = tantivy::directory::MmapDirectory::open(path)
36            .map_err(|e| Error::Index(e.to_string()))?;
37
38        let index = if Index::exists(&dir).map_err(|e| Error::Index(e.to_string()))? {
39            Index::open(dir).map_err(|e| Error::Index(e.to_string()))?
40        } else {
41            Index::create(dir, schema, tantivy::IndexSettings::default())
42                .map_err(|e| Error::Index(e.to_string()))?
43        };
44
45        let writer = index
46            .writer(50_000_000) // 50MB heap
47            .map_err(|e| Error::Index(e.to_string()))?;
48
49        let reader = index
50            .reader_builder()
51            .reload_policy(ReloadPolicy::OnCommitWithDelay)
52            .try_into()
53            .map_err(|e| Error::Index(e.to_string()))?;
54
55        Ok(Self {
56            index,
57            writer: Mutex::new(writer),
58            reader,
59            id_field,
60            content_field,
61        })
62    }
63
64    pub fn open_in_memory() -> Result<Self> {
65        let (schema, id_field, content_field) = build_schema();
66
67        let index = Index::create_in_ram(schema);
68
69        let writer = index
70            .writer(50_000_000)
71            .map_err(|e| Error::Index(e.to_string()))?;
72
73        let reader = index
74            .reader_builder()
75            .reload_policy(ReloadPolicy::OnCommitWithDelay)
76            .try_into()
77            .map_err(|e| Error::Index(e.to_string()))?;
78
79        Ok(Self {
80            index,
81            writer: Mutex::new(writer),
82            reader,
83            id_field,
84            content_field,
85        })
86    }
87}
88
89impl FullTextIndex for TantivyFullTextIndex {
90    fn add(&self, id: Uuid, content: &str) -> Result<()> {
91        let writer = self
92            .writer
93            .lock()
94            .map_err(|e| Error::Index(e.to_string()))?;
95
96        // Remove existing doc with this ID first
97        let id_term = tantivy::Term::from_field_text(self.id_field, &id.to_string());
98        writer.delete_term(id_term);
99
100        let mut doc = TantivyDocument::default();
101        doc.add_text(self.id_field, id.to_string());
102        doc.add_text(self.content_field, content);
103        writer
104            .add_document(doc)
105            .map_err(|e| Error::Index(e.to_string()))?;
106        Ok(())
107    }
108
109    fn remove(&self, id: Uuid) -> Result<()> {
110        let writer = self
111            .writer
112            .lock()
113            .map_err(|e| Error::Index(e.to_string()))?;
114        let id_term = tantivy::Term::from_field_text(self.id_field, &id.to_string());
115        writer.delete_term(id_term);
116        Ok(())
117    }
118
119    fn search(&self, query: &str, limit: usize) -> Result<Vec<(Uuid, f32)>> {
120        let searcher = self.reader.searcher();
121        let query_parser = QueryParser::for_index(&self.index, vec![self.content_field]);
122        let parsed_query = query_parser
123            .parse_query(query)
124            .map_err(|e| Error::Index(e.to_string()))?;
125
126        // tantivy 0.26 made TopDocs ordering explicit. 0.25's
127        // `TopDocs::with_limit(limit)` implicitly ordered by BM25
128        // score; 0.26 requires `.order_by_score()`.
129        let top_docs = searcher
130            .search(&parsed_query, &TopDocs::with_limit(limit).order_by_score())
131            .map_err(|e| Error::Index(e.to_string()))?;
132
133        let mut results = Vec::new();
134        for (score, doc_address) in top_docs {
135            let doc: TantivyDocument = searcher
136                .doc(doc_address)
137                .map_err(|e| Error::Index(e.to_string()))?;
138            if let Some(id_value) = doc.get_first(self.id_field)
139                && let Some(id_str) = id_value.as_str()
140                && let Ok(uuid) = Uuid::parse_str(id_str)
141            {
142                results.push((uuid, score));
143            }
144        }
145        Ok(results)
146    }
147
148    fn commit(&self) -> Result<()> {
149        let mut writer = self
150            .writer
151            .lock()
152            .map_err(|e| Error::Index(e.to_string()))?;
153        writer.commit().map_err(|e| Error::Index(e.to_string()))?;
154        self.reader
155            .reload()
156            .map_err(|e| Error::Index(e.to_string()))?;
157        Ok(())
158    }
159
160    fn save(&self) -> Result<()> {
161        self.commit()
162    }
163
164    fn len(&self) -> usize {
165        let searcher = self.reader.searcher();
166        searcher.num_docs() as usize
167    }
168}
169
#[cfg(test)]
mod tests {
    use super::*;

    /// Three documents are added to an in-memory index; each query must rank
    /// the document containing its terms first.
    #[test]
    fn test_tantivy_add_and_search() {
        let fts = TantivyFullTextIndex::open_in_memory().unwrap();

        let dark_mode_id = Uuid::now_v7();
        let rust_id = Uuid::now_v7();
        let python_id = Uuid::now_v7();

        let documents = [
            (
                dark_mode_id,
                "The user prefers dark mode for all applications",
            ),
            (rust_id, "Rust programming language is fast and safe"),
            (python_id, "Python is great for data science"),
        ];
        for (doc_id, text) in documents {
            fts.add(doc_id, text).unwrap();
        }
        fts.commit().unwrap();

        assert_eq!(fts.len(), 3);

        let hits = fts.search("dark mode", 10).unwrap();
        assert!(!hits.is_empty());
        assert_eq!(hits[0].0, dark_mode_id);

        let hits = fts.search("Rust programming", 10).unwrap();
        assert!(!hits.is_empty());
        assert_eq!(hits[0].0, rust_id);
    }

    /// A committed document disappears from both the count and the search
    /// results once it has been removed and the removal committed.
    #[test]
    fn test_tantivy_remove() {
        let fts = TantivyFullTextIndex::open_in_memory().unwrap();

        let doc_id = Uuid::now_v7();
        fts.add(doc_id, "test content to remove").unwrap();
        fts.commit().unwrap();
        assert_eq!(fts.len(), 1);

        fts.remove(doc_id).unwrap();
        fts.commit().unwrap();
        assert_eq!(fts.len(), 0);

        let hits = fts.search("test content", 10).unwrap();
        assert!(hits.is_empty());
    }

    /// A document committed to an on-disk index is still found after the
    /// index has been dropped and opened again from the same directory.
    #[test]
    fn test_tantivy_save_and_load() {
        let index_dir = std::env::temp_dir().join(format!("tantivy_test_{}", Uuid::now_v7()));

        let doc_id = Uuid::now_v7();

        // First session: write, commit and save, then drop the index at the
        // end of the scope before the directory is opened again.
        {
            let fts = TantivyFullTextIndex::new(&index_dir).unwrap();
            fts.add(doc_id, "persistent test content").unwrap();
            fts.commit().unwrap();
            fts.save().unwrap();
        }

        // Reopen
        {
            let fts = TantivyFullTextIndex::new(&index_dir).unwrap();
            assert_eq!(fts.len(), 1);
            let hits = fts.search("persistent", 10).unwrap();
            assert!(!hits.is_empty());
            assert_eq!(hits[0].0, doc_id);
        }

        // Best-effort cleanup; a failure to delete is ignored.
        std::fs::remove_dir_all(&index_dir).ok();
    }
}