use std::collections::{HashMap, HashSet};
use std::str::CharIndices;

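/// Tokenizer for MMD documents (markdown with embedded LaTeX).
///
/// On top of plain word splitting it skips math and float environments,
/// drops LaTeX formatting commands while keeping their arguments, ignores
/// markdown link targets, and emits CJK ideographs as single-character
/// tokens. For example, `"# Header \\(x_1 + x_2\\) body"` produces only the
/// tokens `Header` and `body`.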
#[derive(Clone)]
pub struct MmdTokenizer {
    /// Paired start/end delimiters whose contents are excluded from indexing.
    skip_commands: HashMap<&'static str, &'static str>,
    /// Tokens filtered out of the stream entirely.
    skip_tokens: HashSet<&'static str>,
    /// Commands whose name is dropped while their braced argument is kept.
    drop_commands: HashSet<&'static str>,
    /// Commands that keep their `{...}` attached, e.g. `\begin{table}`.
    known_commands: HashSet<&'static str>,
}

impl Default for MmdTokenizer {
    fn default() -> Self {
        // Paired delimiters: everything between start and end is left unindexed.
        static SKIP_COMMANDS: [(&str, &str); 6] = [
            ("\\begin{table}", "\\end{table}"),
            ("\\(", "\\)"),
            ("\\[", "\\]"),
            ("\\begin{tabular}", "\\end{tabular}"),
            ("\\begin{figure}", "\\end{figure}"),
            ("$$", "$$"),
        ];
        // Structural tokens that carry no searchable text of their own.
        static SKIP_TOKENS: [&str; 33] = [
            "#",
            "##",
            "###",
            "####",
            "#####",
            "######",
            "\\",
            "\\begin{theorem}",
            "\\end{theorem}",
            "\\begin{lemma}",
            "\\end{lemma}",
            "\\begin{itemize}",
            "\\end{itemize}",
            "\\begin{equation}",
            "\\end{equation}",
            "\\begin{equation*}",
            "\\end{equation*}",
            "\\begin{align}",
            "\\end{align}",
            "\\begin{align*}",
            "\\end{align*}",
            "\\begin{split}",
            "\\end{split}",
            "\\begin{split*}",
            "\\end{split*}",
            "\\begin{gather}",
            "\\end{gather}",
            "\\begin{gather*}",
            "\\end{gather*}",
            "\\end{table}",
            "\\end{tabular}",
            "\\end{figure}",
            "\\pagebreak",
        ];
        // Formatting commands whose name is dropped while the braced
        // argument is still tokenized, e.g. `\textbf{word}` yields `word`.
        static DROP_COMMANDS: [&str; 17] = [
            "\\footnote",
            "\\footnotemark",
            "\\underline",
            "\\uline",
            "\\uwave",
            "\\dashuline",
            "\\dotuline",
            "\\sout",
            "\\xout",
            "\\title",
            "\\author",
            "\\section",
            "\\subsection",
            "\\subsubsection",
            "\\textit",
            "\\textbf",
            "\\url",
        ];
        // Commands that keep `{...}` attached, so `\begin{table}` stays one
        // token and can match the SKIP_COMMANDS pairs above.
        static KNOWN_COMMANDS: [&str; 3] = ["\\pagebreak", "\\begin", "\\end"];
        MmdTokenizer {
            skip_commands: HashMap::from_iter(SKIP_COMMANDS),
            skip_tokens: HashSet::from_iter(SKIP_TOKENS),
            drop_commands: HashSet::from_iter(DROP_COMMANDS),
            known_commands: HashSet::from_iter(KNOWN_COMMANDS),
        }
    }
}

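/// Token stream over a single MMD text fragment.
///
/// `base_offset` lets the stream start partway through a larger document
/// while still reporting absolute byte offsets, and `skip_list` holds byte
/// ranges (sorted, half-open) that never produce tokens. `stacked_char`
/// carries a character that terminated the previous token and must begin the
/// next one; `maybe_link` is set after `[` so a following `](...)` link
/// target can be skipped.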
pub struct MmdTokenStream<'a> {
    skip_list: Option<Vec<(usize, usize)>>,
    skip_iter: usize,
    chars: CharIndices<'a>,
    token: tantivy::tokenizer::Token,
    stacked_char: Option<(char, usize)>,
    skip_commands: &'a HashMap<&'static str, &'static str>,
    skip_tokens: &'a HashSet<&'static str>,
    drop_commands: &'a HashSet<&'static str>,
    known_commands: &'a HashSet<&'static str>,
    base_offset: usize,
    maybe_link: bool,
}

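/// Appends `c` to `token`, setting `offset_from` on the first character
/// (`usize::MAX` serves as the "not started" sentinel) and moving
/// `offset_to` past `c`.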
#[inline]
pub fn accept_char(token: &mut tantivy::tokenizer::Token, c: char, offset: usize) {
    if token.offset_from == usize::MAX {
        token.offset_from = offset;
    }
    token.offset_to = offset + c.len_utf8();
    token.text.push(c);
}

impl<'a> MmdTokenStream<'a> {
    pub fn new(
        text: &'a str,
        skip_commands: &'a HashMap<&'static str, &'static str>,
        skip_tokens: &'a HashSet<&'static str>,
        drop_commands: &'a HashSet<&'static str>,
        known_commands: &'a HashSet<&'static str>,
    ) -> MmdTokenStream<'a> {
        MmdTokenStream {
            skip_list: None,
            skip_iter: 0,
            chars: text.char_indices(),
            token: tantivy::tokenizer::Token::default(),
            stacked_char: None,
            skip_commands,
            skip_tokens,
            drop_commands,
            known_commands,
            base_offset: 0,
            maybe_link: false,
        }
    }

    pub fn new_with_offset_and_position(
        text: &'a str,
        offset: usize,
        position: usize,
        skip_list: Option<Vec<(usize, usize)>>,
        skip_commands: &'a HashMap<&'static str, &'static str>,
        skip_tokens: &'a HashSet<&'static str>,
        drop_commands: &'a HashSet<&'static str>,
        known_commands: &'a HashSet<&'static str>,
    ) -> MmdTokenStream<'a> {
        let token = tantivy::tokenizer::Token {
            position,
            ..Default::default()
        };
        MmdTokenStream {
            skip_list,
            skip_iter: 0,
            chars: text.char_indices(),
            token,
            stacked_char: None,
            skip_commands,
            skip_tokens,
            drop_commands,
            known_commands,
            base_offset: offset,
            maybe_link: false,
        }
    }

    pub fn token(&self) -> &tantivy::tokenizer::Token {
        &self.token
    }

    pub fn token_mut(&mut self) -> &mut tantivy::tokenizer::Token {
        &mut self.token
    }

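    /// Scans the next raw token, returning `false` once the input is
    /// exhausted. Raw tokens may still be LaTeX commands or delimiters; the
    /// `TokenStream::advance` impl below filters those. With
    /// `update_position == false` the token position is left untouched,
    /// which is used when re-scanning after a filtered token.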
    fn advance_token(&mut self, update_position: bool) -> bool {
        self.token.text.clear();
        if update_position {
            // tantivy's Token::default() sets position to usize::MAX, so the
            // first increment wraps around to position 0.
            self.token.position = self.token.position.wrapping_add(1);
        }
        self.token.offset_from = usize::MAX;
        let mut is_command = false;
        let mut spec_counter = 0;
        let mut start_skipping_round_bracket = false;
        let mut skipped_round_bracket = 0;
        let mut start_skipping_figure_bracket = false;
        let mut skipped_figure_bracket = 0;

        // A character that terminated the previous token starts this one.
        if let Some((stacked_char, stacked_offset)) = self.stacked_char.take() {
            accept_char(&mut self.token, stacked_char, self.base_offset + stacked_offset);
            if is_cjk(stacked_char) {
                return true;
            }
            if stacked_char == '\\' {
                is_command = true;
            }
            if stacked_char == '[' {
                self.maybe_link = true;
            }
        }

        for (offset, c) in &mut self.chars {
            let real_offset = self.base_offset + offset;

            // Skip characters that fall inside a pre-computed skip range.
            if let Some(skip_list) = &self.skip_list {
                while self.skip_iter < skip_list.len() && skip_list[self.skip_iter].1 <= real_offset {
                    self.skip_iter += 1;
                }
                if self.skip_iter < skip_list.len()
                    && skip_list[self.skip_iter].0 <= real_offset
                    && real_offset < skip_list[self.skip_iter].1
                {
                    continue;
                }
            }

            // Skip a parenthesized link target: the `(...)` after `[text]`.
            if start_skipping_round_bracket || skipped_round_bracket > 0 {
                start_skipping_round_bracket = false;
                if c == '(' {
                    skipped_round_bracket += 1;
                    continue;
                } else if c == ')' {
                    skipped_round_bracket -= 1;
                    if skipped_round_bracket == 0 {
                        // An attribute block such as `{width=1}` may follow.
                        start_skipping_figure_bracket = true;
                    }
                    continue;
                } else if skipped_round_bracket > 0 {
                    continue;
                }
            }

            // Skip a braced attribute block following a link target.
            if start_skipping_figure_bracket || skipped_figure_bracket > 0 {
                start_skipping_figure_bracket = false;
                if c == '{' {
                    skipped_figure_bracket += 1;
                    continue;
                } else if c == '}' {
                    skipped_figure_bracket -= 1;
                    continue;
                } else if skipped_figure_bracket > 0 {
                    continue;
                }
            }

            // Every CJK ideograph becomes its own single-character token.
            if is_cjk(c) {
                if !self.token.text.is_empty() {
                    self.stacked_char = Some((c, offset));
                    return true;
                }
                accept_char(&mut self.token, c, real_offset);
                return true;
            }

            if c == '\\' {
                if !self.token.text.is_empty() {
                    self.stacked_char = Some((c, offset));
                    return true;
                }
                is_command = true;
                accept_char(&mut self.token, c, real_offset);
            } else if c == '[' && !is_command {
                if !self.token.text.is_empty() {
                    self.stacked_char = Some((c, offset));
                    return true;
                }
                self.maybe_link = true;
            } else if c == ']' && self.maybe_link && !is_command {
                self.maybe_link = false;
                start_skipping_round_bracket = true;
            } else if c == '^' || c == '~' {
                // Sub/superscript markers are absorbed into the current token.
                self.token.offset_to += 1;
            } else if c == '*' || c == '_' {
                // Emphasis markers; how many appear in a row decides their meaning.
                spec_counter += 1;
            } else if c.is_alphanumeric() || c == '#' || c == '+' {
                if spec_counter == 1 && !self.token.text.is_empty() {
                    // A single `*` or `_` inside a word splits it in two.
                    self.stacked_char = Some((c, offset));
                    return true;
                } else if spec_counter > 0 {
                    // Doubled markers (or markers at token start) are absorbed.
                    self.token.offset_to += spec_counter;
                    spec_counter = 0;
                }
                accept_char(&mut self.token, c, real_offset);
            } else if is_command
                && (c == '(' || c == ')' || c == '[' || c == ']')
                && self.token.text.len() == 1
            {
                // Math delimiters `\(`, `\)`, `\[`, `\]` are two-character tokens.
                accept_char(&mut self.token, c, real_offset);
                break;
            } else if is_command && (c == '{' || c == '}') {
                if self.drop_commands.contains(&self.token.text.as_str()) {
                    // Formatting command: drop the name, keep tokenizing the argument.
                    is_command = false;
                    self.token.text.clear();
                    self.token.offset_from = usize::MAX;
                    continue;
                } else if c == '{' && !self.known_commands.contains(&self.token.text.as_str()) {
                    break;
                }
                accept_char(&mut self.token, c, real_offset);
                if c == '}' {
                    break;
                }
            } else if !self.token.text.is_empty() {
                break;
            }
        }
        !self.token.text.is_empty()
    }
}

impl tantivy::tokenizer::Tokenizer for MmdTokenizer {
    type TokenStream<'a> = MmdTokenStream<'a>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> MmdTokenStream<'a> {
        MmdTokenStream::new(
            text,
            &self.skip_commands,
            &self.skip_tokens,
            &self.drop_commands,
            &self.known_commands,
        )
    }
}

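/// True if `c` is a CJK ideograph: the CJK Unified Ideographs block or
/// Extensions A through D.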
#[inline]
fn is_cjk(c: char) -> bool {
    let c = c as u32;
    (0x4E00..=0x9FFF).contains(&c) // CJK Unified Ideographs
        || (0x3400..=0x4DBF).contains(&c) // Extension A
        || (0x20000..=0x2A6DF).contains(&c) // Extension B
        || (0x2A700..=0x2B73F).contains(&c) // Extension C
        || (0x2B740..=0x2B81F).contains(&c) // Extension D
}

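/// Drives `advance_token` and post-processes its raw tokens: filters
/// `skip_tokens`, swallows everything between a `skip_commands` pair, strips
/// leading backslashes so commands are indexed by bare name, and discards
/// stray closing delimiters (`]`, `}`, `)`).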
impl<'a> tantivy::tokenizer::TokenStream for MmdTokenStream<'a> {
    fn advance(&mut self) -> bool {
        let mut result = self.advance_token(true);
        while result {
            if self.skip_tokens.contains(&self.token.text.as_str()) {
                result = self.advance_token(false);
            } else if let Some(end_command) = self.skip_commands.get(self.token.text.as_str()) {
                // Consume tokens until the matching end delimiter, then move past it.
                while result && self.token.text != *end_command {
                    result = self.advance_token(false);
                }
                result = self.advance_token(false);
            } else {
                // Index commands by their bare name: `\alpha` becomes `alpha`.
                while self.token.text.starts_with('\\') {
                    self.token.offset_from += 1;
                    self.token.text = self.token.text[1..].to_string();
                }
                if self.token.text == "]" || self.token.text == "}" || self.token.text == ")" {
                    result = self.advance_token(false);
                } else {
                    break;
                }
            }
        }
        result
    }

    fn token(&self) -> &tantivy::tokenizer::Token {
        &self.token
    }

    fn token_mut(&mut self) -> &mut tantivy::tokenizer::Token {
        &mut self.token
    }
}

#[cfg(test)]
pub mod tests {
    use tantivy::tokenizer::{LowerCaser, RemoveLongFilter, TextAnalyzer, Token, TokenizerManager};

    use super::MmdTokenizer;

    /// Runs `text` through `tokenizer` and asserts the exact token sequence.
    pub fn assert_tokenization(tokenizer: &mut TextAnalyzer, text: &str, response: &[Token]) {
        let mut tokens: Vec<Token> = vec![];
        {
            let mut add_token = |token: &Token| {
                tokens.push(token.clone());
            };
            tokenizer.token_stream(text).process(&mut add_token);
        }
        assert_eq!(tokens, response);
    }

    #[test]
    fn test_en_tokenizer() {
        let tokenizer_manager = TokenizerManager::default();
        tokenizer_manager.register(
            "tokenizer",
            TextAnalyzer::builder(MmdTokenizer::default())
                .filter(RemoveLongFilter::limit(40))
                .filter(LowerCaser)
                .build(),
        );
        let mut tokenizer = tokenizer_manager.get("tokenizer").unwrap();
        assert_tokenization(&mut tokenizer, "#", &[]);
        assert_tokenization(
            &mut tokenizer,
            "# Header1",
            &[Token {
                offset_from: 2,
                offset_to: 9,
                position: 0,
                text: "header1".to_string(),
                position_length: 1,
            }],
        );
        assert_tokenization(&mut tokenizer, "\\begin{table}\\end{table}", &[]);
        assert_tokenization(
            &mut tokenizer,
            "\\begin{table}\\end{table}a",
            &[Token {
                offset_from: 24,
                offset_to: 25,
                position: 0,
                text: "a".to_string(),
                position_length: 1,
            }],
        );
        assert_tokenization(&mut tokenizer, "\\begin{table}# Header 1\\end{table}", &[]);
        assert_tokenization(&mut tokenizer, "\\end{table}", &[]);
        assert_tokenization(
            &mut tokenizer,
            "# Header1\nHello, 1 \\ 2 world! \\begin{table}table content\\end{table}\n\\begin{theorem}\ntheorem content\\end{theorem}",
            &[
                Token {
                    offset_from: 2,
                    offset_to: 9,
                    position: 0,
                    text: "header1".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 10,
                    offset_to: 15,
                    position: 1,
                    text: "hello".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 17,
                    offset_to: 18,
                    position: 2,
                    text: "1".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 21,
                    offset_to: 22,
                    position: 3,
                    text: "2".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 23,
                    offset_to: 28,
                    position: 4,
                    text: "world".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 84,
                    offset_to: 91,
                    position: 5,
                    text: "theorem".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 92,
                    offset_to: 99,
                    position: 6,
                    text: "content".to_string(),
                    position_length: 1,
                },
            ],
        );
        assert_tokenization(
            &mut tokenizer,
            "# Header1 \\footnote{footnote text}# Header2 \\uline{\\uline{double line}}",
            &[
                Token {
                    offset_from: 2,
                    offset_to: 9,
                    position: 0,
                    text: "header1".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 20,
                    offset_to: 28,
                    position: 1,
                    text: "footnote".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 29,
                    offset_to: 33,
                    position: 2,
                    text: "text".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 36,
                    offset_to: 43,
                    position: 3,
                    text: "header2".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 58,
                    offset_to: 64,
                    position: 4,
                    text: "double".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 65,
                    offset_to: 69,
                    position: 5,
                    text: "line".to_string(),
                    position_length: 1,
                },
            ],
        );
        assert_tokenization(
            &mut tokenizer,
            "\\noncommand \\noncommand2 \\",
            &[
                Token {
                    offset_from: 1,
                    offset_to: 11,
                    position: 0,
                    text: "noncommand".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 13,
                    offset_to: 24,
                    position: 1,
                    text: "noncommand2".to_string(),
                    position_length: 1,
                },
            ],
        );
        assert_tokenization(
            &mut tokenizer,
            "\\command{weird}",
            &[
                Token {
                    offset_from: 1,
                    offset_to: 8,
                    position: 0,
                    text: "command".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 9,
                    offset_to: 14,
                    position: 1,
                    text: "weird".to_string(),
                    position_length: 1,
                },
            ],
        );
        assert_tokenization(
            &mut tokenizer,
            "word1 \\(x_1 + x_2\\) \\word2",
            &[
                Token {
                    offset_from: 0,
                    offset_to: 5,
                    position: 0,
                    text: "word1".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 21,
                    offset_to: 26,
                    position: 1,
                    text: "word2".to_string(),
                    position_length: 1,
                },
            ],
        );
        assert_tokenization(
            &mut tokenizer,
            "Love**is**bold",
            &[Token {
                offset_from: 0,
                offset_to: 14,
                position: 0,
                text: "loveisbold".to_string(),
                position_length: 1,
            }],
        );
        assert_tokenization(
            &mut tokenizer,
            "Love*is*bold",
            &[
                Token {
                    offset_from: 0,
                    offset_to: 4,
                    position: 0,
                    text: "love".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 5,
                    offset_to: 7,
                    position: 1,
                    text: "is".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 8,
                    offset_to: 12,
                    position: 2,
                    text: "bold".to_string(),
                    position_length: 1,
                },
            ],
        );
        assert_tokenization(
            &mut tokenizer,
            "Love **is*bold",
            &[
                Token {
                    offset_from: 0,
                    offset_to: 4,
                    position: 0,
                    text: "love".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 7,
                    offset_to: 9,
                    position: 1,
                    text: "is".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 10,
                    offset_to: 14,
                    position: 2,
                    text: "bold".to_string(),
                    position_length: 1,
                },
            ],
        );
        assert_tokenization(
            &mut tokenizer,
            "How to do x^2",
            &[
                Token {
                    offset_from: 0,
                    offset_to: 3,
                    position: 0,
                    text: "how".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 4,
                    offset_to: 6,
                    position: 1,
                    text: "to".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 7,
                    offset_to: 9,
                    position: 2,
                    text: "do".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 10,
                    offset_to: 13,
                    position: 3,
                    text: "x2".to_string(),
                    position_length: 1,
                },
            ],
        );
        assert_tokenization(&mut tokenizer, "![]()", &[]);
        // A markdown image: the alt text is indexed, while the link target
        // and the trailing `{...}` attribute block are skipped.
        assert_tokenization(
            &mut tokenizer,
            "![image text](link){width=1}",
            &[
                Token {
                    offset_from: 2,
                    offset_to: 7,
                    position: 0,
                    text: "image".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 8,
                    offset_to: 12,
                    position: 1,
                    text: "text".to_string(),
                    position_length: 1,
                },
            ],
        );
        assert_tokenization(
            &mut tokenizer,
            "[ref] (author)",
            &[
                Token {
                    offset_from: 1,
                    offset_to: 4,
                    position: 0,
                    text: "ref".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 7,
                    offset_to: 13,
                    position: 1,
                    text: "author".to_string(),
                    position_length: 1,
                },
            ],
        );
        assert_tokenization(
            &mut tokenizer,
            "[ref]test [ref](l)test",
            &[
                Token {
                    offset_from: 1,
                    offset_to: 9,
                    position: 0,
                    text: "reftest".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 11,
                    offset_to: 22,
                    position: 1,
                    text: "reftest".to_string(),
                    position_length: 1,
                },
            ],
        );
        assert_tokenization(
            &mut tokenizer,
            "\\title{This is title}\n\\author{Author}\n\\section{Section 1}\n\\subsection{Section 1.1}\n\\subsubsection{Section 1.1.1}",
            &[
                Token {
                    offset_from: 7,
                    offset_to: 11,
                    position: 0,
                    text: "this".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 12,
                    offset_to: 14,
                    position: 1,
                    text: "is".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 15,
                    offset_to: 20,
                    position: 2,
                    text: "title".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 30,
                    offset_to: 36,
                    position: 3,
                    text: "author".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 47,
                    offset_to: 54,
                    position: 4,
                    text: "section".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 55,
                    offset_to: 56,
                    position: 5,
                    text: "1".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 70,
                    offset_to: 77,
                    position: 6,
                    text: "section".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 78,
                    offset_to: 79,
                    position: 7,
                    text: "1".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 80,
                    offset_to: 81,
                    position: 8,
                    text: "1".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 98,
                    offset_to: 105,
                    position: 9,
                    text: "section".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 106,
                    offset_to: 107,
                    position: 10,
                    text: "1".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 108,
                    offset_to: 109,
                    position: 11,
                    text: "1".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 110,
                    offset_to: 111,
                    position: 12,
                    text: "1".to_string(),
                    position_length: 1,
                },
            ],
        );
        assert_tokenization(
            &mut tokenizer,
            "![ref](link)-abc{} \\[34\\] \\] \\) \\} 1 ### abc \\(",
            &[
                Token {
                    offset_from: 2,
                    offset_to: 5,
                    position: 0,
                    text: "ref".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 13,
                    offset_to: 16,
                    position: 1,
                    text: "abc".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 35,
                    offset_to: 36,
                    position: 2,
                    text: "1".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 41,
                    offset_to: 44,
                    position: 3,
                    text: "abc".to_string(),
                    position_length: 1,
                },
            ],
        );
    }

    #[test]
    fn test_zh_tokenizer() {
        let tokenizer_manager = TokenizerManager::default();
        tokenizer_manager.register(
            "tokenizer",
            TextAnalyzer::builder(MmdTokenizer::default())
                .filter(RemoveLongFilter::limit(40))
                .filter(LowerCaser)
                .build(),
        );
        let mut tokenizer = tokenizer_manager.get("tokenizer").unwrap();
        assert_tokenization(
            &mut tokenizer,
            "在查hello, worl土d动!",
            &[
                Token {
                    offset_from: 0,
                    offset_to: 3,
                    position: 0,
                    text: "在".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 3,
                    offset_to: 6,
                    position: 1,
                    text: "查".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 6,
                    offset_to: 11,
                    position: 2,
                    text: "hello".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 13,
                    offset_to: 17,
                    position: 3,
                    text: "worl".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 17,
                    offset_to: 20,
                    position: 4,
                    text: "土".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 20,
                    offset_to: 21,
                    position: 5,
                    text: "d".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 21,
                    offset_to: 24,
                    position: 6,
                    text: "动".to_string(),
                    position_length: 1,
                },
            ],
        );
        assert_tokenization(
            &mut tokenizer,
            "在查土d动",
            &[
                Token {
                    offset_from: 0,
                    offset_to: 3,
                    position: 0,
                    text: "在".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 3,
                    offset_to: 6,
                    position: 1,
                    text: "查".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 6,
                    offset_to: 9,
                    position: 2,
                    text: "土".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 9,
                    offset_to: 10,
                    position: 3,
                    text: "d".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 10,
                    offset_to: 13,
                    position: 4,
                    text: "动".to_string(),
                    position_length: 1,
                },
            ],
        );
        assert_tokenization(
            &mut tokenizer,
            "Veri 在查hello, c查m p查 查lex worl土d动!",
            &[
                Token {
                    offset_from: 0,
                    offset_to: 4,
                    position: 0,
                    text: "veri".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 5,
                    offset_to: 8,
                    position: 1,
                    text: "在".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 8,
                    offset_to: 11,
                    position: 2,
                    text: "查".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 11,
                    offset_to: 16,
                    position: 3,
                    text: "hello".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 18,
                    offset_to: 19,
                    position: 4,
                    text: "c".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 19,
                    offset_to: 22,
                    position: 5,
                    text: "查".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 22,
                    offset_to: 23,
                    position: 6,
                    text: "m".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 24,
                    offset_to: 25,
                    position: 7,
                    text: "p".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 25,
                    offset_to: 28,
                    position: 8,
                    text: "查".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 29,
                    offset_to: 32,
                    position: 9,
                    text: "查".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 32,
                    offset_to: 35,
                    position: 10,
                    text: "lex".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 37,
                    offset_to: 41,
                    position: 11,
                    text: "worl".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 41,
                    offset_to: 44,
                    position: 12,
                    text: "土".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 44,
                    offset_to: 45,
                    position: 13,
                    text: "d".to_string(),
                    position_length: 1,
                },
                Token {
                    offset_from: 45,
                    offset_to: 48,
                    position: 14,
                    text: "动".to_string(),
                    position_length: 1,
                },
            ],
        );
        assert_tokenization(&mut tokenizer, "。", &[]);
    }
}