text_search/
indexer.rs

1use std::{collections::HashMap, fs, marker::PhantomData, path::Path};
2
3use tantivy::{
4    DocAddress, Index, IndexWriter, ReloadPolicy, Searcher, TantivyDocument, Term,
5    collector::TopDocs,
6    directory::MmapDirectory,
7    query::{
8        BooleanQuery, FuzzyTermQuery, Occur, PhrasePrefixQuery, Query, QueryParser, RegexQuery,
9    },
10    schema::Schema,
11};
12use text_search_core::Indexable;
13
14pub struct Indexer<T: Indexable> {
15    index: Index,
16    schema: Schema,
17    index_writer: Option<IndexWriter>,
18    _marker: PhantomData<T>,
19}
20
21impl<T: Indexable> Indexer<T> {
22    pub fn new(path: &Path) -> Self {
23        if !path.exists() {
24            let _ = fs::create_dir(path);
25        }
26
27        let dir = MmapDirectory::open(&path).expect("Error while opening directory");
28        let schema = T::get_struct_info().generate_schema();
29        let index = Index::open_or_create(dir, schema.clone())
30            .expect("Error while opening or creating index. If schema has been updated, remove the old data.");
31
32        Self {
33            index,
34            schema,
35            index_writer: None,
36            _marker: PhantomData,
37        }
38    }
39
40    fn create_index_writer(&mut self) {
41        if self.index_writer.is_none() {
42            self.index_writer = Some(
43                self.index
44                    .writer(50_000_000)
45                    .expect("Error while creating index writer."),
46            );
47        }
48    }
49
50    pub fn index(&mut self, data: T) {
51        self.create_index_writer();
52
53        let doc = data.as_document();
54        self.index_writer
55            .as_ref()
56            .unwrap()
57            .add_document(doc)
58            .expect("Error while adding document.");
59    }
60
61    pub fn delete(&mut self, data: T) {
62        self.create_index_writer();
63        self.index_writer
64            .as_ref()
65            .unwrap()
66            .delete_term(data.get_id_term());
67    }
68
69    pub fn delete_using_term(&mut self, term: tantivy::Term) {
70        self.create_index_writer();
71        self.index_writer.as_ref().unwrap().delete_term(term);
72    }
73
74    pub fn delete_using_filters(&mut self, filters: HashMap<&str, &str>) {
75        self.create_index_writer();
76        let query = BooleanQuery::from(self.new_boolean_query_filters(filters));
77        println!("query : {:?}", query);
78        let _ = self
79            .index_writer
80            .as_ref()
81            .unwrap()
82            .delete_query(Box::new(query));
83    }
84
85    pub fn update(&mut self, data: T) {
86        self.delete(data.clone());
87        self.index(data);
88    }
89
90    pub fn commit(&mut self) {
91        if self.index_writer.is_some() {
92            self.index_writer
93                .as_mut()
94                .unwrap()
95                .commit()
96                .expect("Error while commiting index data.");
97        }
98
99        self.index_writer = None;
100    }
101
102    pub fn search(
103        &self,
104        filter: HashMap<&str, &str>,
105        field_name: &str,
106        query: &str,
107        result_count: usize,
108    ) -> Vec<T> {
109        let field = self
110            .schema
111            .get_field(field_name)
112            .expect("Field with provided field name does not exsit in schema.");
113
114        let search_query = QueryParser::for_index(&self.index, vec![field])
115            .parse_query(query)
116            .expect("Error while parsing query.");
117
118        self._search(filter, search_query, result_count)
119    }
120
121    pub fn fuzzy_search(
122        &self,
123        filter: HashMap<&str, &str>,
124        field_name: &str,
125        query: &str,
126        result_count: usize,
127    ) -> Vec<T> {
128        let field = self
129            .schema
130            .get_field(field_name)
131            .expect("Field with provided field name does not exsit in schema.");
132
133        let term: Term = Term::from_field_text(field, query);
134        let query = FuzzyTermQuery::new(term, 2, true);
135
136        self._search(filter, Box::new(query), result_count)
137    }
138
139    pub fn regex_search(
140        &self,
141        filter: HashMap<&str, &str>,
142        field_name: &str,
143        query: &str,
144        result_count: usize,
145    ) -> Vec<T> {
146        let field = self
147            .schema
148            .get_field(field_name)
149            .expect("Field with provided field name does not exsit in schema.");
150
151        let query =
152            RegexQuery::from_pattern(query, field).expect("Error while building regex query.");
153
154        self._search(filter, Box::new(query), result_count)
155    }
156
157    ///Uses regex pattern matching query along with fuzzy search.
158    ///Maybe slow.
159    pub fn hybrid_search(
160        &self,
161        filter: HashMap<&str, &str>,
162        field_name: &str,
163        query: &str,
164        result_count: usize,
165    ) -> Vec<T> {
166        let field = self
167            .schema
168            .get_field(field_name)
169            .expect("Field with provided field name does not exsit in schema.");
170
171        let terms: Vec<Term> = query
172            .to_lowercase()
173            .split(" ")
174            .map(|term| Term::from_field_text(field, term))
175            .collect();
176
177        let fuzzy_queries: Vec<(Occur, Box<dyn Query>)> = terms
178            .iter()
179            .map(|term| {
180                (
181                    Occur::Should,
182                    Box::new(FuzzyTermQuery::new(term.clone(), 2, true)) as Box<dyn Query>,
183                )
184            })
185            .collect();
186
187        let phrase_prefix_query: (Occur, Box<dyn Query>) = (
188            Occur::Should,
189            Box::new(PhrasePrefixQuery::new(terms)) as Box<dyn Query>,
190        );
191
192        let mut boolean_quries: Vec<(Occur, Box<dyn Query>)> = vec![phrase_prefix_query];
193        boolean_quries.extend(fuzzy_queries);
194
195        let query = BooleanQuery::new(boolean_quries);
196        self._search(filter, Box::new(query), result_count)
197    }
198
199    fn filter_query(&self, filters: HashMap<&str, &str>, query: Box<dyn Query>) -> Box<dyn Query> {
200        let filter_query = if filters.is_empty() {
201            None
202        } else {
203            Some(self.new_boolean_query_filters(filters))
204        };
205
206        match filter_query {
207            Some(mut x) => {
208                x.push((Occur::Should, Box::new(query)));
209                Box::new(BooleanQuery::from(x))
210            }
211            None => query,
212        }
213    }
214
215    fn new_boolean_query_filters(
216        &self,
217        filters: HashMap<&str, &str>,
218    ) -> Vec<(Occur, Box<dyn Query>)> {
219        filters
220            .iter()
221            .map(|x| {
222                let field = self.schema.get_field(x.0).expect(&format!(
223                    "Field with provided field name `{}` does not exists in schema.",
224                    x.0
225                ));
226                let phrase = format!("\"{}\"", x.1);
227
228                let filter_query = QueryParser::for_index(&self.index, vec![field])
229                    .parse_query(&phrase)
230                    .expect("Error while parsing query.");
231                (Occur::Must, filter_query)
232            })
233            .collect()
234    }
235
236    fn _search(
237        &self,
238        filter: HashMap<&str, &str>,
239        query: Box<dyn Query>,
240        result_count: usize,
241    ) -> Vec<T> {
242        let reader = self
243            .index
244            .reader_builder()
245            .reload_policy(ReloadPolicy::OnCommitWithDelay)
246            .try_into()
247            .expect("Error while constructing reader for search operation.");
248        let searcher = reader.searcher();
249
250        let query = self.filter_query(filter, query);
251
252        let top_docs = searcher
253            .search(&query, &TopDocs::with_limit(result_count))
254            .expect("Error while performing search operation.");
255
256        Self::docs_to_t(top_docs, &searcher)
257    }
258
259    fn docs_to_t(top_docs: Vec<(f32, DocAddress)>, searcher: &Searcher) -> Vec<T> {
260        let mut result: Vec<T> = vec![];
261        for (_score, doc_address) in top_docs {
262            let doc: TantivyDocument = searcher
263                .doc(doc_address)
264                .expect("Error while trying to find search document.");
265            result.push(T::from_doc(doc));
266        }
267        result
268    }
269}