text_search/
indexer.rs

1use std::{collections::HashMap, fs, marker::PhantomData, path::Path};
2
3use tantivy::{
4    DocAddress, Index, IndexWriter, ReloadPolicy, Searcher, TantivyDocument, Term,
5    collector::TopDocs,
6    directory::MmapDirectory,
7    query::{
8        BooleanQuery, FuzzyTermQuery, Occur, PhrasePrefixQuery, Query, QueryParser, RegexQuery,
9    },
10    schema::Schema,
11};
12use text_search_core::Indexable;
13
14pub struct Indexer<T: Indexable> {
15    index: Index,
16    schema: Schema,
17    index_writer: Option<IndexWriter>,
18    _marker: PhantomData<T>,
19}
20
21impl<T: Indexable> Indexer<T> {
22    pub fn new(path: &Path) -> Self {
23        if !path.exists() {
24            let _ = fs::create_dir(path);
25        }
26
27        let dir = MmapDirectory::open(&path).expect("Error while opening directory");
28        let schema = T::get_struct_info().generate_schema();
29        let index = Index::open_or_create(dir, schema.clone())
30            .expect("Error while opening or creating index. If schema has been updated, remove the old data.");
31
32        Self {
33            index,
34            schema,
35            index_writer: None,
36            _marker: PhantomData,
37        }
38    }
39
40    fn create_index_writer(&mut self) {
41        if self.index_writer.is_none() {
42            self.index_writer = Some(
43                self.index
44                    .writer(50_000_000)
45                    .expect("Error while creating index writer."),
46            );
47        }
48    }
49
50    pub fn index(&mut self, data: T) {
51        self.create_index_writer();
52
53        let doc = data.as_document();
54        self.index_writer
55            .as_ref()
56            .unwrap()
57            .add_document(doc)
58            .expect("Error while adding document.");
59    }
60
61    pub fn delete(&mut self, data: T) {
62        self.create_index_writer();
63        self.index_writer
64            .as_ref()
65            .unwrap()
66            .delete_term(data.get_id_term());
67    }
68
69    pub fn delete_using_term(&mut self, term: tantivy::Term) {
70        self.create_index_writer();
71        self.index_writer.as_ref().unwrap().delete_term(term);
72    }
73
74    pub fn delete_using_filters(&mut self, filters: HashMap<&str, &str>) {
75        self.create_index_writer();
76        let query = BooleanQuery::from(self.new_boolean_query_filters(filters));
77        let _ = self
78            .index_writer
79            .as_ref()
80            .unwrap()
81            .delete_query(Box::new(query));
82    }
83
84    pub fn update(&mut self, data: T) {
85        self.delete(data.clone());
86        self.index(data);
87    }
88
89    pub fn commit(&mut self) {
90        if self.index_writer.is_some() {
91            self.index_writer
92                .as_mut()
93                .unwrap()
94                .commit()
95                .expect("Error while commiting index data.");
96        }
97
98        self.index_writer = None;
99    }
100
101    pub fn search(
102        &self,
103        filter: HashMap<&str, &str>,
104        field_name: &str,
105        query: &str,
106        result_count: usize,
107    ) -> Vec<T> {
108        let field = self
109            .schema
110            .get_field(field_name)
111            .expect("Field with provided field name does not exsit in schema.");
112
113        let search_query = QueryParser::for_index(&self.index, vec![field])
114            .parse_query(query)
115            .expect("Error while parsing query.");
116
117        self._search(filter, search_query, result_count)
118    }
119
120    pub fn fuzzy_search(
121        &self,
122        filter: HashMap<&str, &str>,
123        field_name: &str,
124        query: &str,
125        result_count: usize,
126    ) -> Vec<T> {
127        let field = self
128            .schema
129            .get_field(field_name)
130            .expect("Field with provided field name does not exsit in schema.");
131
132        let term: Term = Term::from_field_text(field, query);
133        let query = FuzzyTermQuery::new(term, 2, true);
134
135        self._search(filter, Box::new(query), result_count)
136    }
137
138    pub fn regex_search(
139        &self,
140        filter: HashMap<&str, &str>,
141        field_name: &str,
142        query: &str,
143        result_count: usize,
144    ) -> Vec<T> {
145        let field = self
146            .schema
147            .get_field(field_name)
148            .expect("Field with provided field name does not exsit in schema.");
149
150        let query =
151            RegexQuery::from_pattern(query, field).expect("Error while building regex query.");
152
153        self._search(filter, Box::new(query), result_count)
154    }
155
156    ///Uses regex pattern matching query along with fuzzy search.
157    ///Maybe slow.
158    pub fn hybrid_search(
159        &self,
160        filter: HashMap<&str, &str>,
161        field_name: &str,
162        query: &str,
163        result_count: usize,
164    ) -> Vec<T> {
165        let field = self
166            .schema
167            .get_field(field_name)
168            .expect("Field with provided field name does not exsit in schema.");
169
170        let terms: Vec<Term> = query
171            .to_lowercase()
172            .split(" ")
173            .map(|term| Term::from_field_text(field, term))
174            .collect();
175
176        let fuzzy_queries: Vec<(Occur, Box<dyn Query>)> = terms
177            .iter()
178            .map(|term| {
179                (
180                    Occur::Should,
181                    Box::new(FuzzyTermQuery::new(term.clone(), 2, true)) as Box<dyn Query>,
182                )
183            })
184            .collect();
185
186        let phrase_prefix_query: (Occur, Box<dyn Query>) = (
187            Occur::Should,
188            Box::new(PhrasePrefixQuery::new(terms)) as Box<dyn Query>,
189        );
190
191        let mut boolean_quries: Vec<(Occur, Box<dyn Query>)> = vec![phrase_prefix_query];
192        boolean_quries.extend(fuzzy_queries);
193
194        let query = BooleanQuery::new(boolean_quries);
195        self._search(filter, Box::new(query), result_count)
196    }
197
198    fn filter_query(&self, filters: HashMap<&str, &str>, query: Box<dyn Query>) -> Box<dyn Query> {
199        let filter_query = if filters.is_empty() {
200            None
201        } else {
202            Some(self.new_boolean_query_filters(filters))
203        };
204
205        match filter_query {
206            Some(mut x) => {
207                x.push((Occur::Must, Box::new(query)));
208                Box::new(BooleanQuery::from(x))
209            }
210            None => query,
211        }
212    }
213
214    fn new_boolean_query_filters(
215        &self,
216        filters: HashMap<&str, &str>,
217    ) -> Vec<(Occur, Box<dyn Query>)> {
218        filters
219            .iter()
220            .map(|x| {
221                let field = self.schema.get_field(x.0).expect(&format!(
222                    "Field with provided field name `{}` does not exists in schema.",
223                    x.0
224                ));
225                let phrase = format!("\"{}\"", x.1);
226
227                let filter_query = QueryParser::for_index(&self.index, vec![field])
228                    .parse_query(&phrase)
229                    .expect("Error while parsing query.");
230                (Occur::Must, filter_query)
231            })
232            .collect()
233    }
234
235    fn _search(
236        &self,
237        filter: HashMap<&str, &str>,
238        query: Box<dyn Query>,
239        result_count: usize,
240    ) -> Vec<T> {
241        let reader = self
242            .index
243            .reader_builder()
244            .reload_policy(ReloadPolicy::OnCommitWithDelay)
245            .try_into()
246            .expect("Error while constructing reader for search operation.");
247        let searcher = reader.searcher();
248
249        let query = self.filter_query(filter, query);
250
251        let top_docs = searcher
252            .search(&query, &TopDocs::with_limit(result_count))
253            .expect("Error while performing search operation.");
254
255        Self::docs_to_t(top_docs, &searcher)
256    }
257
258    fn docs_to_t(top_docs: Vec<(f32, DocAddress)>, searcher: &Searcher) -> Vec<T> {
259        let mut result: Vec<T> = vec![];
260        for (_score, doc_address) in top_docs {
261            let doc: TantivyDocument = searcher
262                .doc(doc_address)
263                .expect("Error while trying to find search document.");
264            result.push(T::from_doc(doc));
265        }
266        result
267    }
268}