summa_core/components/default_tokenizers.rs

use std::collections::HashSet;

use tantivy::tokenizer::{LowerCaser, RawTokenizer, RemoveLongFilter, SimpleTokenizer, StopWordFilter, TextAnalyzer, WhitespaceTokenizer};

use super::tokenizers::{DictTokenizer, HtmlTokenizer, Tokenizer};
use crate::components::tokenizers::MmdTokenizer;

/// Mixed list of stop words for multiple languages (English, German, Russian and Spanish), plus the HTML entity names `lt`, `gt` and `amp`
pub const STOP_WORDS: [&str; 321] = [
10    "a",
11    "an",
12    "and",
13    "are",
14    "as",
15    "at",
16    "be",
17    "by",
18    "for",
19    "from",
20    "if",
21    "in",
22    "is",
23    "it",
24    "of",
25    "on",
26    "or",
27    "s",
28    "that",
29    "the",
30    "these",
31    "this",
32    "to",
33    "was",
34    "were",
35    "which",
36    "with",
37    "aber",
38    "alle",
39    "allem",
40    "allen",
41    "aller",
42    "alles",
43    "als",
44    "also",
45    "am",
46    "an",
47    "ander",
48    "andere",
49    "anderem",
50    "anderen",
51    "anderer",
52    "anderes",
53    "anderm",
54    "andern",
55    "anderr",
56    "anders",
57    "auch",
58    "auf",
59    "aus",
60    "bei",
61    "bin",
62    "bis",
63    "bist",
64    "da",
65    "dann",
66    "der",
67    "den",
68    "des",
69    "dem",
70    "das",
71    "dass",
72    "daß",
73    "derselbe",
74    "derselben",
75    "denselben",
76    "desselben",
77    "demselben",
78    "dieselbe",
79    "dieselben",
80    "dasselbe",
81    "dazu",
82    "dein",
83    "deine",
84    "deinem",
85    "deinen",
86    "deiner",
87    "deines",
88    "denn",
89    "derer",
90    "dessen",
91    "dich",
92    "dir",
93    "du",
94    "dies",
95    "diese",
96    "diesem",
97    "diesen",
98    "dieser",
99    "dieses",
100    "doch",
101    "dort",
102    "durch",
103    "ein",
104    "eine",
105    "einem",
106    "einen",
107    "einer",
108    "eines",
109    "einig",
110    "einige",
111    "einigem",
112    "einigen",
113    "einiger",
114    "einiges",
115    "einmal",
116    "er",
117    "ihn",
118    "ihm",
119    "es",
120    "etwas",
121    "euer",
122    "eure",
123    "eurem",
124    "euren",
125    "eurer",
126    "eures",
127    "für",
128    "gegen",
129    "gewesen",
130    "hab",
131    "habe",
132    "haben",
133    "hat",
134    "hatte",
135    "hatten",
136    "hier",
137    "hin",
138    "hinter",
139    "ich",
140    "mich",
141    "mir",
142    "ihr",
143    "ihre",
144    "ihrem",
145    "ihren",
146    "ihrer",
147    "ihres",
148    "euch",
149    "im",
150    "in",
151    "indem",
152    "ins",
153    "ist",
154    "jede",
155    "jedem",
156    "jeden",
157    "jeder",
158    "jedes",
159    "jene",
160    "jenem",
161    "jenen",
162    "jener",
163    "jenes",
164    "jetzt",
165    "kann",
166    "kein",
167    "keine",
168    "keinem",
169    "keinen",
170    "keiner",
171    "keines",
172    "können",
173    "könnte",
174    "machen",
175    "man",
176    "manche",
177    "manchem",
178    "manchen",
179    "mancher",
180    "manches",
181    "mein",
182    "meine",
183    "meinem",
184    "meinen",
185    "meiner",
186    "meines",
187    "mit",
188    "muss",
189    "musste",
190    "nach",
191    "nicht",
192    "nichts",
193    "noch",
194    "nun",
195    "nur",
196    "ob",
197    "oder",
198    "ohne",
199    "sehr",
200    "sein",
201    "seine",
202    "seinem",
203    "seinen",
204    "seiner",
205    "seines",
206    "selbst",
207    "sich",
208    "sie",
209    "ihnen",
210    "sind",
211    "so",
212    "solche",
213    "solchem",
214    "solchen",
215    "solcher",
216    "solches",
217    "soll",
218    "sollte",
219    "sondern",
220    "sonst",
221    "um",
222    "und",
223    "uns",
224    "unsere",
225    "unserem",
226    "unseren",
227    "unser",
228    "unseres",
229    "unter",
230    "viel",
231    "vom",
232    "von",
233    "vor",
234    "während",
235    "waren",
236    "warst",
237    "weg",
238    "weil",
239    "weiter",
240    "welche",
241    "welchem",
242    "welchen",
243    "welcher",
244    "welches",
245    "wenn",
246    "werde",
247    "werden",
248    "wie",
249    "wieder",
250    "wir",
251    "wird",
252    "wirst",
253    "wo",
254    "wollen",
255    "wollte",
256    "würde",
257    "würden",
258    "zu",
259    "zum",
260    "zur",
261    "zwar",
262    "zwischen",
263    "и",
264    "в",
265    "во",
266    "не",
267    "что",
268    "он",
269    "на",
270    "я",
271    "с",
272    "со",
273    "как",
274    "а",
275    "то",
276    "все",
277    "она",
278    "так",
279    "его",
280    "но",
281    "да",
282    "ты",
283    "к",
284    "у",
285    "же",
286    "вы",
287    "за",
288    "бы",
289    "по",
290    "ее",
291    "мне",
292    "было",
293    "вот",
294    "от",
295    "о",
296    "из",
297    "ему",
298    "ей",
299    "им",
300    "de",
301    "la",
302    "que",
303    "el",
304    "en",
305    "y",
306    "a",
307    "los",
308    "del",
309    "se",
310    "las",
311    "por",
312    "un",
313    "para",
314    "con",
315    "una",
316    "su",
317    "al",
318    "lo",
319    "como",
320    "más",
321    "pero",
322    "sus",
323    "le",
324    "ya",
325    "o",
326    "este",
327    "sí",
328    "lt",
329    "gt",
330    "amp",
331];

/// Instantiates the default named tokenizers shipped with Summa, returned as `(name, analyzer)` pairs
pub fn default_tokenizers() -> [(String, TextAnalyzer); 8] {
    // Summa's own tokenizer with the standard filter chain: drop overlong tokens, lower-case, remove stop words
    let summa_tokenizer = TextAnalyzer::builder(Tokenizer)
        .filter(RemoveLongFilter::limit(100))
        .filter(LowerCaser)
        .filter(StopWordFilter::remove(STOP_WORDS.map(String::from).to_vec()))
        .build();
    // Dictionary-based tokenizer, used without any additional filters
    let summa_dict_tokenizer = TextAnalyzer::builder(DictTokenizer::new()).build();
    // HTML-aware tokenizer configured with a set of block-level content tags and a set of inline formatting tags
    let summa_html_tokenizer = TextAnalyzer::builder(HtmlTokenizer::new(
        HashSet::from_iter(vec![
            "formula".to_string(),
            "figure".to_string(),
            "math".to_string(),
            "ref".to_string(),
            "table".to_string(),
        ]),
        HashSet::from_iter(vec![
            "sup".to_string(),
            "sub".to_string(),
            "i".to_string(),
            "b".to_string(),
            "u".to_string(),
            "scp".to_string(),
            "tt".to_string(),
        ]),
    ))
    .filter(RemoveLongFilter::limit(100))
    .filter(LowerCaser)
    .filter(StopWordFilter::remove(STOP_WORDS.map(String::from).to_vec()))
    .build();
    // MMD tokenizer with the standard filter chain
    let summa_mmd_tokenizer = TextAnalyzer::builder(MmdTokenizer::default())
        .filter(RemoveLongFilter::limit(100))
        .filter(LowerCaser)
        .filter(StopWordFilter::remove(STOP_WORDS.map(String::from).to_vec()))
        .build();
    // Same pipeline as `summa`, but stop words are kept
    let summa_without_stop_words_tokenizer = TextAnalyzer::builder(Tokenizer).filter(RemoveLongFilter::limit(100)).filter(LowerCaser).build();
    // Tantivy's `SimpleTokenizer` with the standard filter chain
    let default_tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(RemoveLongFilter::limit(100))
        .filter(LowerCaser)
        .filter(StopWordFilter::remove(STOP_WORDS.map(String::from).to_vec()))
        .build();
    // Lower-cased whitespace-splitting and raw (whole-input-as-one-token) analyzers
    let whitespace_tokenizer = TextAnalyzer::builder(WhitespaceTokenizer::default()).filter(LowerCaser).build();
    let raw_tokenizer = TextAnalyzer::builder(RawTokenizer::default()).filter(LowerCaser).build();
    [
        ("summa".to_owned(), summa_tokenizer),
        ("summa_dict".to_owned(), summa_dict_tokenizer),
        ("summa_html".to_owned(), summa_html_tokenizer),
        ("summa_mmd".to_owned(), summa_mmd_tokenizer),
        ("summa_without_stop_words".to_owned(), summa_without_stop_words_tokenizer),
        ("default".to_owned(), default_tokenizer),
        ("raw".to_owned(), raw_tokenizer),
        ("whitespace".to_owned(), whitespace_tokenizer),
    ]
}
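
#[cfg(test)]
mod tests {
    use super::*;

    // A minimal usage sketch, assuming tantivy's `TokenizerManager` API
    // (`register`/`get`): the analyzers returned by `default_tokenizers` are
    // registered under their names, then the `default` one is exercised once.
    // The expected token list relies on `SimpleTokenizer` splitting on
    // non-alphanumeric characters; the custom Summa tokenizers are not
    // asserted on here, since their exact output depends on their own logic.
    #[test]
    fn register_and_tokenize() {
        let manager = tantivy::tokenizer::TokenizerManager::default();
        for (name, tokenizer) in default_tokenizers() {
            manager.register(&name, tokenizer);
        }
        let mut analyzer = manager.get("default").expect("`default` should be registered");
        let mut stream = analyzer.token_stream("The Quick Brown Fox");
        let mut tokens = Vec::new();
        while let Some(token) = stream.next() {
            tokens.push(token.text.clone());
        }
        // "The" is lower-cased to "the" and then removed by the stop word filter
        assert_eq!(tokens, vec!["quick", "brown", "fox"]);
    }
}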