summa_core/components/tokenizers/mmd_tokenizer.rs

use std::collections::{HashMap, HashSet};
use std::str::CharIndices;

/// Tokenizes text by splitting on whitespace and punctuation, while skipping MMD/LaTeX markup
/// (skipped environments, inline and display math, image link targets) and emitting each CJK
/// character as its own token.
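///
/// A minimal usage sketch, mirroring the tests below; the registration name `"mmd"` is
/// illustrative, not something the crate mandates:
///
/// ```ignore
/// use tantivy::tokenizer::{LowerCaser, RemoveLongFilter, TextAnalyzer, TokenizerManager};
///
/// let tokenizer_manager = TokenizerManager::default();
/// tokenizer_manager.register(
///     "mmd",
///     TextAnalyzer::builder(MmdTokenizer::default())
///         .filter(RemoveLongFilter::limit(40))
///         .filter(LowerCaser)
///         .build(),
/// );
/// let mut analyzer = tokenizer_manager.get("mmd").unwrap();
/// let mut stream = analyzer.token_stream("# Header1");
/// ```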
#[derive(Clone)]
pub struct MmdTokenizer {
    /// Paired begin/end markers whose entire content is skipped (tables, figures, math).
    skip_commands: HashMap<&'static str, &'static str>,
    /// Standalone tokens dropped from the output (heading markers, environment delimiters).
    skip_tokens: HashSet<&'static str>,
    /// Commands whose name is dropped while their braced argument is still tokenized.
    drop_commands: HashSet<&'static str>,
    /// Commands that keep their braced argument attached, e.g. `\begin{table}` stays one token.
    known_commands: HashSet<&'static str>,
}

impl Default for MmdTokenizer {
    fn default() -> Self {
        static SKIP_COMMANDS: [(&str, &str); 6] = [
            ("\\begin{table}", "\\end{table}"),
            ("\\(", "\\)"),
            ("\\[", "\\]"),
            ("\\begin{tabular}", "\\end{tabular}"),
            ("\\begin{figure}", "\\end{figure}"),
            ("$$", "$$"),
        ];
        static SKIP_TOKENS: [&str; 33] = [
            "#",
            "##",
            "###",
            "####",
            "#####",
            "######",
            "\\",
            "\\begin{theorem}",
            "\\end{theorem}",
            "\\begin{lemma}",
            "\\end{lemma}",
            "\\begin{itemize}",
            "\\end{itemize}",
            "\\begin{equation}",
            "\\end{equation}",
            "\\begin{equation*}",
            "\\end{equation*}",
            "\\begin{align}",
            "\\end{align}",
            "\\begin{align*}",
            "\\end{align*}",
            "\\begin{split}",
            "\\end{split}",
            "\\begin{split*}",
            "\\end{split*}",
            "\\begin{gather}",
            "\\end{gather}",
            "\\begin{gather*}",
            "\\end{gather*}",
            "\\end{table}",
            "\\end{tabular}",
            "\\end{figure}",
            "\\pagebreak",
        ];
        static DROP_COMMANDS: [&str; 17] = [
            "\\footnote",
            "\\footnotemark",
            "\\underline",
            "\\uline",
            "\\uwave",
            "\\dashuline",
            "\\dotuline",
            "\\sout",
            "\\xout",
            "\\title",
            "\\author",
            "\\section",
            "\\subsection",
            "\\subsubsection",
            "\\textit",
            "\\textbf",
            "\\url",
        ];
        static KNOWN_COMMANDS: [&str; 3] = ["\\pagebreak", "\\begin", "\\end"];
        MmdTokenizer {
            skip_commands: HashMap::from_iter(SKIP_COMMANDS),
            skip_tokens: HashSet::from_iter(SKIP_TOKENS),
            drop_commands: HashSet::from_iter(DROP_COMMANDS),
            known_commands: HashSet::from_iter(KNOWN_COMMANDS),
        }
    }
}

pub struct MmdTokenStream<'a> {
    /// Byte ranges (absolute offsets, i.e. including `base_offset`) whose characters are skipped.
    skip_list: Option<Vec<(usize, usize)>>,
    /// Index of the current entry in `skip_list`.
    skip_iter: usize,
    chars: CharIndices<'a>,
    token: tantivy::tokenizer::Token,
    /// A character that terminated the previous token and should start the next one.
    stacked_char: Option<(char, usize)>,
    skip_commands: &'a HashMap<&'static str, &'static str>,
    skip_tokens: &'a HashSet<&'static str>,
    drop_commands: &'a HashSet<&'static str>,
    known_commands: &'a HashSet<&'static str>,
    /// Offset added to local char offsets when the stream starts mid-document.
    base_offset: usize,
    /// Set after `[`, so a following `](...)` is treated as a Markdown link.
    maybe_link: bool,
}

/// Appends `c` to the current token, recording `offset_from` on the first accepted character.
#[inline]
pub fn accept_char(token: &mut tantivy::tokenizer::Token, c: char, offset: usize) {
    if token.offset_from == usize::MAX {
        token.offset_from = offset;
    }
    token.offset_to = offset + c.len_utf8();
    token.text.push(c);
}

impl<'a> MmdTokenStream<'a> {
    pub fn new(
        text: &'a str,
        skip_commands: &'a HashMap<&'static str, &'static str>,
        skip_tokens: &'a HashSet<&'static str>,
        drop_commands: &'a HashSet<&'static str>,
        known_commands: &'a HashSet<&'static str>,
    ) -> MmdTokenStream<'a> {
        MmdTokenStream {
            skip_list: None,
            skip_iter: 0,
            chars: text.char_indices(),
            token: tantivy::tokenizer::Token::default(),
            stacked_char: None,
            skip_commands,
            skip_tokens,
            drop_commands,
            known_commands,
            base_offset: 0,
            maybe_link: false,
        }
    }

    pub fn new_with_offset_and_position(
        text: &'a str,
        offset: usize,
        position: usize,
        skip_list: Option<Vec<(usize, usize)>>,
        skip_commands: &'a HashMap<&'static str, &'static str>,
        skip_tokens: &'a HashSet<&'static str>,
        drop_commands: &'a HashSet<&'static str>,
        known_commands: &'a HashSet<&'static str>,
    ) -> MmdTokenStream<'a> {
        let token = tantivy::tokenizer::Token {
            position,
            ..Default::default()
        };
        MmdTokenStream {
            skip_list,
            skip_iter: 0,
            chars: text.char_indices(),
            token,
            stacked_char: None,
            skip_commands,
            skip_tokens,
            drop_commands,
            known_commands,
            base_offset: offset,
            maybe_link: false,
        }
    }

    pub fn token(&self) -> &tantivy::tokenizer::Token {
        &self.token
    }

    pub fn token_mut(&mut self) -> &mut tantivy::tokenizer::Token {
        &mut self.token
    }

    fn advance_token(&mut self, update_position: bool) -> bool {
        self.token.text.clear();
        if update_position {
            self.token.position = self.token.position.wrapping_add(1);
        }
        // `usize::MAX` marks "no character accepted yet"; see `accept_char`.
        self.token.offset_from = usize::MAX;
        let mut is_command = false;
        // Number of consecutive `*`/`_` emphasis markers seen before the next character.
        let mut spec_counter = 0;
        let mut start_skipping_round_bracket = false;
        let mut skipped_round_bracket = 0;
        let mut start_skipping_figure_bracket = false;
        let mut skipped_figure_bracket = 0;

        // A character left over from the previous call starts this token.
        if let Some((stacked_char, stacked_offset)) = self.stacked_char.take() {
            accept_char(&mut self.token, stacked_char, self.base_offset + stacked_offset);
            if is_cjk(&stacked_char) {
                return true;
            }
            if stacked_char == '\\' {
                is_command = true;
            }
            if stacked_char == '[' {
                self.maybe_link = true;
            }
        }

        for (offset, c) in &mut self.chars {
            let real_offset = self.base_offset + offset;

            // Skip characters that fall into one of the pre-computed skip ranges.
            if let Some(skip_list) = &self.skip_list {
                while self.skip_iter < skip_list.len() && skip_list[self.skip_iter].1 <= real_offset {
                    self.skip_iter += 1;
                }
                if self.skip_iter < skip_list.len() && skip_list[self.skip_iter].0 <= real_offset && real_offset < skip_list[self.skip_iter].1 {
                    continue;
                }
            }

            // After `](` of a Markdown link/image: swallow the parenthesized target...
            if start_skipping_round_bracket || skipped_round_bracket > 0 {
                start_skipping_round_bracket = false;
                if c == '(' {
                    skipped_round_bracket += 1;
                    continue;
                } else if c == ')' {
                    skipped_round_bracket -= 1;
                    if skipped_round_bracket == 0 {
                        start_skipping_figure_bracket = true;
                    }
                    continue;
                } else if skipped_round_bracket > 0 {
                    continue;
                }
            }

            // ...and an optional trailing `{...}` attribute block such as `{width=1}`.
            if start_skipping_figure_bracket || skipped_figure_bracket > 0 {
                start_skipping_figure_bracket = false;
                if c == '{' {
                    skipped_figure_bracket += 1;
                    continue;
                } else if c == '}' {
                    skipped_figure_bracket -= 1;
                    continue;
                } else if skipped_figure_bracket > 0 {
                    continue;
                }
            }

            // Every CJK character becomes its own token.
            if is_cjk(&c) {
                if !self.token.text.is_empty() {
                    self.stacked_char = Some((c, offset));
                    return true;
                }
                accept_char(&mut self.token, c, real_offset);
                return true;
            }

            if c == '\\' {
                // A backslash starts a command token; finish the current token first.
                if !self.token.text.is_empty() {
                    self.stacked_char = Some((c, offset));
                    return true;
                }
                is_command = true;
                accept_char(&mut self.token, c, real_offset);
            } else if c == '[' && !is_command {
                // Possible start of a Markdown link or image label.
                if !self.token.text.is_empty() {
                    self.stacked_char = Some((c, offset));
                    return true;
                }
                self.maybe_link = true;
            } else if c == ']' && self.maybe_link && !is_command {
                // End of the link label; the target in `(...)` will be skipped.
                self.maybe_link = false;
                start_skipping_round_bracket = true;
            } else if c == '^' || c == '~' {
                // Superscript/subscript markers are dropped but still covered by the offsets.
                self.token.offset_to += 1;
            } else if c == '*' || c == '_' {
                spec_counter += 1;
            } else if c.is_alphanumeric() || c == '#' || c == '+' {
                // A single emphasis marker (`*foo*`) splits words; two or more (`**foo**`) are swallowed.
                if spec_counter == 1 {
                    self.stacked_char = Some((c, offset));
                    return true;
                } else if spec_counter > 1 {
                    self.token.offset_to += spec_counter;
                    spec_counter = 0;
                };
                accept_char(&mut self.token, c, real_offset);
            } else if is_command && (c == '(' || c == ')' || c == '[' || c == ']') && self.token.text.len() == 1 {
                // Math delimiters `\(`, `\)`, `\[`, `\]` are emitted as their own tokens.
                accept_char(&mut self.token, c, real_offset);
                break;
            } else if is_command && (c == '{' || c == '}') {
                if self.drop_commands.contains(&self.token.text.as_str()) {
                    // Drop the command name but keep tokenizing its argument.
                    is_command = false;
                    self.token.text.clear();
                    self.token.offset_from = usize::MAX;
                    continue;
                } else if c == '{' && !self.known_commands.contains(&self.token.text.as_str()) {
                    // Unknown command: emit its name and leave the brace for the next token.
                    break;
                }
                // Known commands keep their braced argument, e.g. `\begin{table}`.
                accept_char(&mut self.token, c, real_offset);
                if c == '}' {
                    break;
                }
            } else if !self.token.text.is_empty() {
                // Any other separator ends the current token.
                break;
            }
        }
        !self.token.text.is_empty()
    }
}

impl tantivy::tokenizer::Tokenizer for MmdTokenizer {
    type TokenStream<'a> = MmdTokenStream<'a>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> MmdTokenStream<'a> {
        MmdTokenStream::new(text, &self.skip_commands, &self.skip_tokens, &self.drop_commands, &self.known_commands)
    }
}

/// Returns true for characters in the CJK Unified Ideographs block and Extensions A–D.
#[inline]
fn is_cjk(c: &char) -> bool {
    (0x4e00 <= *c as u32 && *c as u32 <= 0x9FFF)
        || (0x3400 <= *c as u32 && *c as u32 <= 0x4DBF)
        || (0x20000 <= *c as u32 && *c as u32 <= 0x2A6DF)
        || (0x2A700 <= *c as u32 && *c as u32 <= 0x2B73F)
        || (0x2B740 <= *c as u32 && *c as u32 <= 0x2B81F)
}

impl<'a> tantivy::tokenizer::TokenStream for MmdTokenStream<'a> {
    fn advance(&mut self) -> bool {
        let mut result = self.advance_token(true);
        while result {
            if self.skip_tokens.contains(&self.token.text.as_str()) {
                // Heading markers, environment delimiters and similar tokens are dropped.
                result = self.advance_token(false);
            } else if let Some(end_command) = self.skip_commands.get(self.token.text.as_str()) {
                // Skip everything between a begin marker and its matching end marker.
                while result && self.token.text != *end_command {
                    result = self.advance_token(false);
                }
                result = self.advance_token(false);
            } else {
                // Strip leading backslashes so `\command` is indexed as `command`.
                while self.token.text.starts_with('\\') {
                    self.token.offset_from += 1;
                    self.token.text = self.token.text[1..].to_string();
                }
                // Bare closing delimiters are never emitted as tokens.
                if self.token.text == "]" || self.token.text == "}" || self.token.text == ")" {
                    result = self.advance_token(false);
                } else {
                    break;
                }
            }
        }
        result
    }

    fn token(&self) -> &tantivy::tokenizer::Token {
        &self.token
    }

    fn token_mut(&mut self) -> &mut tantivy::tokenizer::Token {
        &mut self.token
    }
}

#[cfg(test)]
pub mod tests {
    use tantivy::tokenizer::{LowerCaser, RemoveLongFilter, TextAnalyzer, Token, TokenizerManager};

    use super::MmdTokenizer;

    pub fn assert_tokenization(tokenizer: &mut TextAnalyzer, text: &str, response: &[Token]) {
        let mut tokens: Vec<Token> = vec![];
        {
            let mut add_token = |token: &Token| {
                tokens.push(token.clone());
            };
            tokenizer.token_stream(text).process(&mut add_token);
        }
        assert_eq!(tokens, response);
    }

    #[test]
    fn test_en_tokenizer() {
        let tokenizer_manager = TokenizerManager::default();
        tokenizer_manager.register(
            "tokenizer",
            TextAnalyzer::builder(MmdTokenizer::default())
                .filter(RemoveLongFilter::limit(40))
                .filter(LowerCaser)
                .build(),
        );
        let mut tokenizer = tokenizer_manager.get("tokenizer").unwrap();
        assert_tokenization(&mut tokenizer, "#", &[]);
        assert_tokenization(
            &mut tokenizer,
            "# Header1",
            &[Token {
                offset_from: 2,
                offset_to: 9,
                position: 0,
                text: "header1".to_string(),
                position_length: 1,
            }],
        );
        assert_tokenization(&mut tokenizer, "\\begin{table}\\end{table}", &[]);
        assert_tokenization(
            &mut tokenizer,
            "\\begin{table}\\end{table}a",
            &[Token {
                offset_from: 24,
                offset_to: 25,
                position: 0,
                text: "a".to_string(),
                position_length: 1,
            }],
        );
        assert_tokenization(&mut tokenizer, "\\begin{table}# Header 1\\end{table}", &[]);
        assert_tokenization(&mut tokenizer, "\\end{table}", &[]);
        assert_tokenization(
            &mut tokenizer,
            "# Header1\nHello, 1 \\ 2 world! \\begin{table}table content\\end{table}\n\\begin{theorem}\ntheorem content\\end{theorem}",
            &[
                Token {
                    offset_from: 2,
                    offset_to: 9,
                    position: 0,
                    text: "header1".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 10,
                    offset_to: 15,
                    position: 1,
                    text: "hello".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 17,
                    offset_to: 18,
                    position: 2,
                    text: "1".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 21,
                    offset_to: 22,
                    position: 3,
                    text: "2".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 23,
                    offset_to: 28,
                    position: 4,
                    text: "world".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 84,
                    offset_to: 91,
                    position: 5,
                    text: "theorem".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 92,
                    offset_to: 99,
                    position: 6,
                    text: "content".to_string(),
                    position_length: 1,
                },
            ],
        );
        assert_tokenization(
            &mut tokenizer,
            "# Header1 \\footnote{footnote text}# Header2 \\uline{\\uline{double line}}",
            &[
                Token {
                    offset_from: 2,
                    offset_to: 9,
                    position: 0,
                    text: "header1".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 20,
                    offset_to: 28,
                    position: 1,
                    text: "footnote".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 29,
                    offset_to: 33,
                    position: 2,
                    text: "text".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 36,
                    offset_to: 43,
                    position: 3,
                    text: "header2".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 58,
                    offset_to: 64,
                    position: 4,
                    text: "double".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 65,
                    offset_to: 69,
                    position: 5,
                    text: "line".to_string(),
                    position_length: 1,
                },
            ],
        );
        assert_tokenization(
            &mut tokenizer,
            "\\noncommand \\noncommand2 \\",
            &[
                Token {
                    offset_from: 1,
                    offset_to: 11,
                    position: 0,
                    text: "noncommand".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 13,
                    offset_to: 24,
                    position: 1,
                    text: "noncommand2".to_string(),
                    position_length: 1,
                },
            ],
        );
        assert_tokenization(
            &mut tokenizer,
            "\\command{weird}",
            &[
                Token {
                    offset_from: 1,
                    offset_to: 8,
                    position: 0,
                    text: "command".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 9,
                    offset_to: 14,
                    position: 1,
                    text: "weird".to_string(),
                    position_length: 1,
                },
            ],
        );
        assert_tokenization(
            &mut tokenizer,
            "word1 \\(x_1 + x_2\\) \\word2",
            &[
                Token {
                    offset_from: 0,
                    offset_to: 5,
                    position: 0,
                    text: "word1".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 21,
                    offset_to: 26,
                    position: 1,
                    text: "word2".to_string(),
                    position_length: 1,
                },
            ],
        );
        assert_tokenization(
            &mut tokenizer,
            "Love**is**bold",
            &[Token {
                offset_from: 0,
                offset_to: 14,
                position: 0,
                text: "loveisbold".to_string(),
                position_length: 1,
            }],
        );
        assert_tokenization(
            &mut tokenizer,
            "Love*is*bold",
            &[
                Token {
                    offset_from: 0,
                    offset_to: 4,
                    position: 0,
                    text: "love".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 5,
                    offset_to: 7,
                    position: 1,
                    text: "is".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 8,
                    offset_to: 12,
                    position: 2,
                    text: "bold".to_string(),
                    position_length: 1,
                },
            ],
        );
        assert_tokenization(
            &mut tokenizer,
            "Love **is*bold",
            &[
                Token {
                    offset_from: 0,
                    offset_to: 4,
                    position: 0,
                    text: "love".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 7,
                    offset_to: 9,
                    position: 1,
                    text: "is".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 10,
                    offset_to: 14,
                    position: 2,
                    text: "bold".to_string(),
                    position_length: 1,
                },
            ],
        );
        assert_tokenization(
            &mut tokenizer,
            "How to do x^2",
            &[
                Token {
                    offset_from: 0,
                    offset_to: 3,
                    position: 0,
                    text: "how".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 4,
                    offset_to: 6,
                    position: 1,
                    text: "to".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 7,
                    offset_to: 9,
                    position: 2,
                    text: "do".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 10,
                    offset_to: 13,
                    position: 3,
                    text: "x2".to_string(),
                    position_length: 1,
                },
            ],
        );
        assert_tokenization(&mut tokenizer, "![]()", &[]);
        assert_tokenization(
            &mut tokenizer,
            "![image text](https://example.com/image.jpg){width=1}",
            &[
                Token {
                    offset_from: 2,
                    offset_to: 7,
                    position: 0,
                    text: "image".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 8,
                    offset_to: 12,
                    position: 1,
                    text: "text".to_string(),
                    position_length: 1,
                },
            ],
        );
        assert_tokenization(
            &mut tokenizer,
            "[ref] (author)",
            &[
                Token {
                    offset_from: 1,
                    offset_to: 4,
                    position: 0,
                    text: "ref".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 7,
                    offset_to: 13,
                    position: 1,
                    text: "author".to_string(),
                    position_length: 1,
                },
            ],
        );
        assert_tokenization(
            &mut tokenizer,
            "[ref]test [ref](l)test",
            &[
                Token {
                    offset_from: 1,
                    offset_to: 9,
                    position: 0,
                    text: "reftest".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 11,
                    offset_to: 22,
                    position: 1,
                    text: "reftest".to_string(),
                    position_length: 1,
                },
            ],
        );
        assert_tokenization(
            &mut tokenizer,
            "\\title{This is title}\n\\author{Author}\n\\section{Section 1}\n\\subsection{Section 1.1}\n\\subsubsection{Section 1.1.1}",
            &[
                Token {
                    offset_from: 7,
                    offset_to: 11,
                    position: 0,
                    text: "this".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 12,
                    offset_to: 14,
                    position: 1,
                    text: "is".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 15,
                    offset_to: 20,
                    position: 2,
                    text: "title".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 30,
                    offset_to: 36,
                    position: 3,
                    text: "author".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 47,
                    offset_to: 54,
                    position: 4,
                    text: "section".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 55,
                    offset_to: 56,
                    position: 5,
                    text: "1".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 70,
                    offset_to: 77,
                    position: 6,
                    text: "section".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 78,
                    offset_to: 79,
                    position: 7,
                    text: "1".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 80,
                    offset_to: 81,
                    position: 8,
                    text: "1".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 98,
                    offset_to: 105,
                    position: 9,
                    text: "section".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 106,
                    offset_to: 107,
                    position: 10,
                    text: "1".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 108,
                    offset_to: 109,
                    position: 11,
                    text: "1".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 110,
                    offset_to: 111,
                    position: 12,
                    text: "1".to_string(),
                    position_length: 1,
                },
            ],
        );
        assert_tokenization(
            &mut tokenizer,
            "![ref](hehe)-abc{} \\[34\\] \\] \\) \\} 1 ### abc \\(",
            &[
                Token {
                    offset_from: 2,
                    offset_to: 5,
                    position: 0,
                    text: "ref".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 13,
                    offset_to: 16,
                    position: 1,
                    text: "abc".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 35,
                    offset_to: 36,
                    position: 2,
                    text: "1".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 41,
                    offset_to: 44,
                    position: 3,
                    text: "abc".to_string(),
                    position_length: 1,
                },
            ],
        );
    }

    #[test]
    fn test_zh_tokenizer() {
        let tokenizer_manager = TokenizerManager::default();
        tokenizer_manager.register(
            "tokenizer",
            TextAnalyzer::builder(MmdTokenizer::default())
                .filter(RemoveLongFilter::limit(40))
                .filter(LowerCaser)
                .build(),
        );
        let mut tokenizer = tokenizer_manager.get("tokenizer").unwrap();
        assert_tokenization(
            &mut tokenizer,
            "在查hello, worl土d动!",
            &[
                Token {
                    offset_from: 0,
                    offset_to: 3,
                    position: 0,
                    text: "在".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 3,
                    offset_to: 6,
                    position: 1,
                    text: "查".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 6,
                    offset_to: 11,
                    position: 2,
                    text: "hello".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 13,
                    offset_to: 17,
                    position: 3,
                    text: "worl".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 17,
                    offset_to: 20,
                    position: 4,
                    text: "土".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 20,
                    offset_to: 21,
                    position: 5,
                    text: "d".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 21,
                    offset_to: 24,
                    position: 6,
                    text: "动".to_string(),
                    position_length: 1,
                },
            ],
        );
        assert_tokenization(
            &mut tokenizer,
            "在查土d动",
            &[
                Token {
                    offset_from: 0,
                    offset_to: 3,
                    position: 0,
                    text: "在".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 3,
                    offset_to: 6,
                    position: 1,
                    text: "查".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 6,
                    offset_to: 9,
                    position: 2,
                    text: "土".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 9,
                    offset_to: 10,
                    position: 3,
                    text: "d".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 10,
                    offset_to: 13,
                    position: 4,
                    text: "动".to_string(),
                    position_length: 1,
                },
            ],
        );
        assert_tokenization(
            &mut tokenizer,
            "Veri 在查hello, c查m p查 查lex  worl土d动!",
            &[
                Token {
                    offset_from: 0,
                    offset_to: 4,
                    position: 0,
                    text: "veri".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 5,
                    offset_to: 8,
                    position: 1,
                    text: "在".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 8,
                    offset_to: 11,
                    position: 2,
                    text: "查".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 11,
                    offset_to: 16,
                    position: 3,
                    text: "hello".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 18,
                    offset_to: 19,
                    position: 4,
                    text: "c".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 19,
                    offset_to: 22,
                    position: 5,
                    text: "查".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 22,
                    offset_to: 23,
                    position: 6,
                    text: "m".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 24,
                    offset_to: 25,
                    position: 7,
                    text: "p".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 25,
                    offset_to: 28,
                    position: 8,
                    text: "查".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 29,
                    offset_to: 32,
                    position: 9,
                    text: "查".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 32,
                    offset_to: 35,
                    position: 10,
                    text: "lex".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 37,
                    offset_to: 41,
                    position: 11,
                    text: "worl".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 41,
                    offset_to: 44,
                    position: 12,
                    text: "土".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 44,
                    offset_to: 45,
                    position: 13,
                    text: "d".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 45,
                    offset_to: 48,
                    position: 14,
                    text: "动".to_string(),
                    position_length: 1,
                },
            ],
        );
        assert_tokenization(&mut tokenizer, "。", &[]);
    }
}