1use std::collections::HashSet;
2
3use tantivy::tokenizer::{LowerCaser, RawTokenizer, RemoveLongFilter, SimpleTokenizer, StopWordFilter, TextAnalyzer, WhitespaceTokenizer};
4
5use super::tokenizers::{DictTokenizer, HtmlTokenizer, Tokenizer};
6use crate::components::tokenizers::MmdTokenizer;
7
/// Stop words removed by the `summa`, `summa_html`, `summa_mmd` and `default`
/// analyzers built in `default_tokenizers`.
///
/// The list mixes several languages (grouped below) plus a few HTML-entity
/// remnants (`lt`/`gt`/`amp`) that survive entity-stripped markup. A handful of
/// words appear twice because they exist in more than one language (e.g. "a",
/// "an", "in"); `StopWordFilter` tolerates duplicates, but the duplicates are
/// kept to preserve the declared array length `[&str; 321]`.
pub const STOP_WORDS: [&str; 321] = [
    // English
    "a",
    "an",
    "and",
    "are",
    "as",
    "at",
    "be",
    "by",
    "for",
    "from",
    "if",
    "in",
    "is",
    "it",
    "of",
    "on",
    "or",
    "s", // possessive remnant after apostrophe stripping
    "that",
    "the",
    "these",
    "this",
    "to",
    "was",
    "were",
    "which",
    "with",
    // German
    "aber",
    "alle",
    "allem",
    "allen",
    "aller",
    "alles",
    "als",
    "also",
    "am",
    "an", // duplicate of English "an"
    "ander",
    "andere",
    "anderem",
    "anderen",
    "anderer",
    "anderes",
    "anderm",
    "andern",
    "anderr",
    "anders",
    "auch",
    "auf",
    "aus",
    "bei",
    "bin",
    "bis",
    "bist",
    "da",
    "dann",
    "der",
    "den",
    "des",
    "dem",
    "das",
    "dass",
    "daß",
    "derselbe",
    "derselben",
    "denselben",
    "desselben",
    "demselben",
    "dieselbe",
    "dieselben",
    "dasselbe",
    "dazu",
    "dein",
    "deine",
    "deinem",
    "deinen",
    "deiner",
    "deines",
    "denn",
    "derer",
    "dessen",
    "dich",
    "dir",
    "du",
    "dies",
    "diese",
    "diesem",
    "diesen",
    "dieser",
    "dieses",
    "doch",
    "dort",
    "durch",
    "ein",
    "eine",
    "einem",
    "einen",
    "einer",
    "eines",
    "einig",
    "einige",
    "einigem",
    "einigen",
    "einiger",
    "einiges",
    "einmal",
    "er",
    "ihn",
    "ihm",
    "es",
    "etwas",
    "euer",
    "eure",
    "eurem",
    "euren",
    "eurer",
    "eures",
    "für",
    "gegen",
    "gewesen",
    "hab",
    "habe",
    "haben",
    "hat",
    "hatte",
    "hatten",
    "hier",
    "hin",
    "hinter",
    "ich",
    "mich",
    "mir",
    "ihr",
    "ihre",
    "ihrem",
    "ihren",
    "ihrer",
    "ihres",
    "euch",
    "im",
    "in", // duplicate of English "in"
    "indem",
    "ins",
    "ist",
    "jede",
    "jedem",
    "jeden",
    "jeder",
    "jedes",
    "jene",
    "jenem",
    "jenen",
    "jener",
    "jenes",
    "jetzt",
    "kann",
    "kein",
    "keine",
    "keinem",
    "keinen",
    "keiner",
    "keines",
    "können",
    "könnte",
    "machen",
    "man",
    "manche",
    "manchem",
    "manchen",
    "mancher",
    "manches",
    "mein",
    "meine",
    "meinem",
    "meinen",
    "meiner",
    "meines",
    "mit",
    "muss",
    "musste",
    "nach",
    "nicht",
    "nichts",
    "noch",
    "nun",
    "nur",
    "ob",
    "oder",
    "ohne",
    "sehr",
    "sein",
    "seine",
    "seinem",
    "seinen",
    "seiner",
    "seines",
    "selbst",
    "sich",
    "sie",
    "ihnen",
    "sind",
    "so",
    "solche",
    "solchem",
    "solchen",
    "solcher",
    "solches",
    "soll",
    "sollte",
    "sondern",
    "sonst",
    "um",
    "und",
    "uns",
    "unsere",
    "unserem",
    "unseren",
    "unser",
    "unseres",
    "unter",
    "viel",
    "vom",
    "von",
    "vor",
    "während",
    "waren",
    "warst",
    "weg",
    "weil",
    "weiter",
    "welche",
    "welchem",
    "welchen",
    "welcher",
    "welches",
    "wenn",
    "werde",
    "werden",
    "wie",
    "wieder",
    "wir",
    "wird",
    "wirst",
    "wo",
    "wollen",
    "wollte",
    "würde",
    "würden",
    "zu",
    "zum",
    "zur",
    "zwar",
    "zwischen",
    // Russian
    "и",
    "в",
    "во",
    "не",
    "что",
    "он",
    "на",
    "я",
    "с",
    "со",
    "как",
    "а",
    "то",
    "все",
    "она",
    "так",
    "его",
    "но",
    "да",
    "ты",
    "к",
    "у",
    "же",
    "вы",
    "за",
    "бы",
    "по",
    "ее",
    "мне",
    "было",
    "вот",
    "от",
    "о",
    "из",
    "ему",
    "ей",
    "им",
    // Spanish
    "de",
    "la",
    "que",
    "el",
    "en",
    "y",
    "a", // duplicate of English "a"
    "los",
    "del",
    "se",
    "las",
    "por",
    "un",
    "para",
    "con",
    "una",
    "su",
    "al",
    "lo",
    "como",
    "más",
    "pero",
    "sus",
    "le",
    "ya",
    "o",
    "este",
    "sí",
    // HTML entity remnants (&lt; &gt; &amp; with the ampersand/semicolon stripped)
    "lt",
    "gt",
    "amp",
];
332
333pub fn default_tokenizers() -> [(String, TextAnalyzer); 8] {
335 let summa_tokenizer = TextAnalyzer::builder(Tokenizer)
336 .filter(RemoveLongFilter::limit(100))
337 .filter(LowerCaser)
338 .filter(StopWordFilter::remove(STOP_WORDS.map(String::from).to_vec()))
339 .build();
340 let summa_dict_tokenizer = TextAnalyzer::builder(DictTokenizer::new()).build();
341 let summa_html_tokenizer = TextAnalyzer::builder(HtmlTokenizer::new(
342 HashSet::from_iter(vec![
343 "formula".to_string(),
344 "figure".to_string(),
345 "math".to_string(),
346 "ref".to_string(),
347 "table".to_string(),
348 ]),
349 HashSet::from_iter(vec![
350 "sup".to_string(),
351 "sub".to_string(),
352 "i".to_string(),
353 "b".to_string(),
354 "u".to_string(),
355 "scp".to_string(),
356 "tt".to_string(),
357 ]),
358 ))
359 .filter(RemoveLongFilter::limit(100))
360 .filter(LowerCaser)
361 .filter(StopWordFilter::remove(STOP_WORDS.map(String::from).to_vec()))
362 .build();
363 let summa_mmd_tokenizer = TextAnalyzer::builder(MmdTokenizer::default())
364 .filter(RemoveLongFilter::limit(100))
365 .filter(LowerCaser)
366 .filter(StopWordFilter::remove(STOP_WORDS.map(String::from).to_vec()))
367 .build();
368 let summa_without_stop_words_tokenizer = TextAnalyzer::builder(Tokenizer).filter(RemoveLongFilter::limit(100)).filter(LowerCaser).build();
369 let default_tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
370 .filter(RemoveLongFilter::limit(100))
371 .filter(LowerCaser)
372 .filter(StopWordFilter::remove(STOP_WORDS.map(String::from).to_vec()))
373 .build();
374 let whitespace_tokenizer = TextAnalyzer::builder(WhitespaceTokenizer::default()).filter(LowerCaser).build();
375 let raw_tokenizer = TextAnalyzer::builder(RawTokenizer::default()).filter(LowerCaser).build();
376 [
377 ("summa".to_owned(), summa_tokenizer),
378 ("summa_dict".to_owned(), summa_dict_tokenizer),
379 ("summa_html".to_owned(), summa_html_tokenizer),
380 ("summa_mmd".to_owned(), summa_mmd_tokenizer),
381 ("summa_without_stop_words".to_owned(), summa_without_stop_words_tokenizer),
382 ("default".to_owned(), default_tokenizer),
383 ("raw".to_owned(), raw_tokenizer),
384 ("whitespace".to_owned(), whitespace_tokenizer),
385 ]
386}