use std::fmt;

use serde::de::{self, MapAccess, SeqAccess, Visitor};
use serde::{Deserialize, Deserializer, Serialize};

use lindera_core::dictionary::{Dictionary, UserDictionary};
use lindera_core::mode::Mode;
use lindera_core::viterbi::Lattice;
use lindera_core::LinderaResult;
use lindera_dictionary::{DictionaryConfig, DictionaryLoader, UserDictionaryConfig};

use crate::token::Token;

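/// Tokenizer settings: which system dictionary to load, an optional user
/// dictionary, and the tokenization [`Mode`].
///
/// A minimal sketch of loading a config from JSON (assuming the `ipadic`
/// feature is enabled and `serde_json` is available, as in the tests below):
///
/// ```ignore
/// use lindera_dictionary::DictionaryKind;
///
/// let json = r#"
///     {
///         "dictionary": { "kind": "ipadic" },
///         "mode": "normal"
///     }
/// "#;
/// let config: TokenizerConfig = serde_json::from_str(json).unwrap();
/// assert_eq!(config.dictionary.kind, Some(DictionaryKind::IPADIC));
/// ```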
#[derive(Debug, Clone, Serialize, PartialEq, Eq)]
pub struct TokenizerConfig {
    pub dictionary: DictionaryConfig,

    pub user_dictionary: Option<UserDictionaryConfig>,

    pub mode: Mode,
}

impl Default for TokenizerConfig {
    fn default() -> Self {
        Self {
            dictionary: DictionaryConfig {
                kind: None,
                path: None,
            },
            user_dictionary: None,
            mode: Mode::Normal,
        }
    }
}

impl<'de> Deserialize<'de> for TokenizerConfig {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        enum Field {
            Dictionary,
            UserDictionary,
            Mode,
        }

        impl<'de> Deserialize<'de> for Field {
            fn deserialize<D>(deserializer: D) -> Result<Field, D::Error>
            where
                D: Deserializer<'de>,
            {
                struct FieldVisitor;

                impl<'de> Visitor<'de> for FieldVisitor {
                    type Value = Field;

                    fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
                        formatter.write_str("`dictionary`, `user_dictionary`, or `mode`")
                    }

                    fn visit_str<E>(self, value: &str) -> Result<Field, E>
                    where
                        E: de::Error,
                    {
                        match value {
                            "dictionary" => Ok(Field::Dictionary),
                            "user_dictionary" => Ok(Field::UserDictionary),
                            "mode" => Ok(Field::Mode),
                            _ => Err(de::Error::unknown_field(value, FIELDS)),
                        }
                    }
                }

                deserializer.deserialize_identifier(FieldVisitor)
            }
        }

        struct TokenizerConfigVisitor;

        impl<'de> Visitor<'de> for TokenizerConfigVisitor {
            type Value = TokenizerConfig;

            fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
                formatter.write_str("struct TokenizerConfig")
            }

            fn visit_seq<V>(self, mut seq: V) -> Result<TokenizerConfig, V::Error>
            where
                V: SeqAccess<'de>,
            {
                let dictionary = seq
                    .next_element()?
                    .ok_or_else(|| de::Error::invalid_length(0, &self))?;
                let user_dictionary = seq.next_element()?.unwrap_or(None);
                let mode = seq.next_element()?.unwrap_or(Mode::Normal);

                Ok(TokenizerConfig {
                    dictionary,
                    user_dictionary,
                    mode,
                })
            }

            fn visit_map<V>(self, mut map: V) -> Result<TokenizerConfig, V::Error>
            where
                V: MapAccess<'de>,
            {
                let mut dictionary = None;
                let mut user_dictionary = None;
                let mut mode = None;
                while let Some(key) = map.next_key()? {
                    match key {
                        Field::Dictionary => {
                            if dictionary.is_some() {
                                return Err(de::Error::duplicate_field("dictionary"));
                            }
                            dictionary = Some(map.next_value()?);
                        }
                        Field::UserDictionary => {
                            if user_dictionary.is_some() {
                                return Err(de::Error::duplicate_field("user_dictionary"));
                            }
                            user_dictionary = Some(map.next_value()?);
                        }
                        Field::Mode => {
                            if mode.is_some() {
                                return Err(de::Error::duplicate_field("mode"));
                            }
                            mode = Some(map.next_value()?);
                        }
                    }
                }
                let dictionary =
                    dictionary.ok_or_else(|| de::Error::missing_field("dictionary"))?;
                let mode = mode.unwrap_or(Mode::Normal);
                Ok(TokenizerConfig {
                    dictionary,
                    user_dictionary,
                    mode,
                })
            }
        }

        const FIELDS: &[&str] = &["dictionary", "user_dictionary", "mode"];
        deserializer.deserialize_struct("TokenizerConfig", FIELDS, TokenizerConfigVisitor)
    }
}

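/// A dictionary-based tokenizer that segments text with a Viterbi lattice.
///
/// It bundles the loaded system [`Dictionary`], an optional [`UserDictionary`],
/// and the tokenization [`Mode`]; instances are normally created through
/// [`Tokenizer::from_config`] or [`Tokenizer::new`].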
#[derive(Clone)]
pub struct Tokenizer {
    pub dictionary: Dictionary,

    pub user_dictionary: Option<UserDictionary>,

    pub mode: Mode,
}

impl Tokenizer {
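    /// Creates a tokenizer by loading the dictionaries described in `config`.
    ///
    /// A minimal sketch (assuming the `ipadic` feature is enabled so the
    /// bundled dictionary can be loaded without an explicit path):
    ///
    /// ```ignore
    /// use lindera_dictionary::{DictionaryConfig, DictionaryKind};
    ///
    /// let config = TokenizerConfig {
    ///     dictionary: DictionaryConfig {
    ///         kind: Some(DictionaryKind::IPADIC),
    ///         path: None,
    ///     },
    ///     user_dictionary: None,
    ///     mode: Mode::Normal,
    /// };
    /// let tokenizer = Tokenizer::from_config(config)?;
    /// ```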
    pub fn from_config(config: TokenizerConfig) -> LinderaResult<Self> {
        let dictionary = DictionaryLoader::load_dictionary_from_config(config.dictionary)?;

        let user_dictionary = match config.user_dictionary {
            Some(user_dict_conf) => Some(DictionaryLoader::load_user_dictionary_from_config(
                user_dict_conf,
            )?),
            None => None,
        };

        Ok(Self::new(dictionary, user_dictionary, config.mode))
    }

    pub fn new(
        dictionary: Dictionary,
        user_dictionary: Option<UserDictionary>,
        mode: Mode,
    ) -> Self {
        Self {
            dictionary,
            user_dictionary,
            mode,
        }
    }

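    /// Tokenizes `text` and returns the tokens in input order.
    ///
    /// The input is split into sentences on `。`, `、`, newline, and tab; each
    /// sentence is laid out on the Viterbi lattice and the lowest-cost path is
    /// taken. Byte offsets in the returned tokens are relative to the whole
    /// input, not to the individual sentence.
    ///
    /// A minimal sketch (assuming `tokenizer` was built as shown in
    /// [`Tokenizer::from_config`]):
    ///
    /// ```ignore
    /// let tokens = tokenizer.tokenize("日本語の形態素解析を行うことができます。")?;
    /// for token in &tokens {
    ///     println!("{} ({}..{})", token.text, token.byte_start, token.byte_end);
    /// }
    /// ```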
    pub fn tokenize<'a>(&'a self, text: &'a str) -> LinderaResult<Vec<Token<'a>>> {
        let mut tokens: Vec<Token> = Vec::new();
        let mut lattice = Lattice::default();

        let mut position = 0_usize;
        let mut byte_position = 0_usize;

        for sentence in text.split_inclusive(&['。', '、', '\n', '\t']) {
            if sentence.is_empty() {
                continue;
            }

            lattice.set_text(
                &self.dictionary.dict,
                &self.user_dictionary.as_ref().map(|d| &d.dict),
                &self.dictionary.char_definitions,
                &self.dictionary.unknown_dictionary,
                sentence,
                &self.mode,
            );
            lattice.calculate_path_costs(&self.dictionary.cost_matrix, &self.mode);

            let offsets = lattice.tokens_offset();

            for i in 0..offsets.len() {
                let (byte_start, word_id) = offsets[i];
                let byte_end = if i == offsets.len() - 1 {
                    sentence.len()
                } else {
                    let (next_start, _word_id) = offsets[i + 1];
                    next_start
                };

                let surface = &sentence[byte_start..byte_end];

                let token_start = byte_position;
                byte_position += surface.len();
                let token_end = byte_position;

                tokens.push(Token::new(
                    surface,
                    token_start,
                    token_end,
                    position,
                    word_id,
                    &self.dictionary,
                    self.user_dictionary.as_ref(),
                ));

                position += 1;
            }
        }

        Ok(tokens)
    }
}
282
283#[cfg(test)]
284mod tests {
285 #[allow(unused_imports)]
286 #[cfg(any(
287 feature = "ipadic",
288 feature = "ipadic-neologd",
289 feature = "unidic",
290 feature = "ko-dic",
291 feature = "cc-cedict"
292 ))]
293 use std::{
294 fs::File,
295 io::{BufReader, Read},
296 path::PathBuf,
297 };
298
299 #[allow(unused_imports)]
300 #[cfg(any(
301 feature = "ipadic",
302 feature = "ipadic-neologd",
303 feature = "unidic",
304 feature = "ko-dic",
305 feature = "cc-cedict"
306 ))]
307 use lindera_core::mode::{Mode, Penalty};
308
309 #[cfg(any(
310 feature = "ipadic",
311 feature = "ipadic-neologd",
312 feature = "unidic",
313 feature = "ko-dic",
314 feature = "cc-cedict"
315 ))]
316 use lindera_dictionary::{DictionaryConfig, DictionaryKind, UserDictionaryConfig};
317
318 #[cfg(any(
319 feature = "ipadic",
320 feature = "ipadic-neologd",
321 feature = "unidic",
322 feature = "ko-dic",
323 feature = "cc-cedict"
324 ))]
325 use crate::tokenizer::{Tokenizer, TokenizerConfig};
326
327 #[test]
328 #[cfg(feature = "ipadic")]
329 fn test_tokenize_config_ipadic_normal() {
330 let config_str = r#"
331 {
332 "dictionary": {
333 "kind": "ipadic"
334 },
335 "mode": "normal"
336 }
337 "#;
338
339 let config: TokenizerConfig = serde_json::from_str(config_str).unwrap();
340 assert_eq!(config.dictionary.kind, Some(DictionaryKind::IPADIC));
341 }
342
343 #[test]
344 #[cfg(feature = "ipadic")]
345 fn test_tokenize_config_ipadic_decompose() {
346 let config_str = r#"
347 {
348 "dictionary": {
349 "kind": "ipadic"
350 },
351 "mode": {
352 "decompose": {
353 "kanji_penalty_length_threshold": 2,
354 "kanji_penalty_length_penalty": 3000,
355 "other_penalty_length_threshold": 7,
356 "other_penalty_length_penalty": 1700
357 }
358 }
359 }
360 "#;
361
362 let config: TokenizerConfig = serde_json::from_str(config_str).unwrap();
363 assert_eq!(config.dictionary.kind, Some(DictionaryKind::IPADIC));
364 }
365
366 #[test]
367 #[cfg(feature = "ipadic")]
368 fn test_tokenize_ipadic() {
369 let dictionary = DictionaryConfig {
370 kind: Some(DictionaryKind::IPADIC),
371 path: None,
372 };
373
374 let config = TokenizerConfig {
375 dictionary,
376 user_dictionary: None,
377 mode: Mode::Normal,
378 };
379
380 let tokenizer = Tokenizer::from_config(config).unwrap();
381 let mut tokens = tokenizer
382 .tokenize("日本語の形態素解析を行うことができます。テスト。")
383 .unwrap();
384 let mut tokens_iter = tokens.iter_mut();
385 {
386 let token = tokens_iter.next().unwrap();
387 assert_eq!(token.text, "日本語");
388 assert_eq!(token.byte_start, 0);
389 assert_eq!(token.byte_end, 9);
390 assert_eq!(token.position, 0);
391 assert_eq!(token.position_length, 1);
392 assert_eq!(
393 token.get_details().unwrap(),
394 vec![
395 "名詞",
396 "一般",
397 "*",
398 "*",
399 "*",
400 "*",
401 "日本語",
402 "ニホンゴ",
403 "ニホンゴ"
404 ]
405 );
406 }
407 {
408 let token = tokens_iter.next().unwrap();
409 assert_eq!(token.text, "の");
410 assert_eq!(token.byte_start, 9);
411 assert_eq!(token.byte_end, 12);
412 assert_eq!(token.position, 1);
413 assert_eq!(token.position_length, 1);
414 assert_eq!(
415 token.get_details().unwrap(),
416 vec!["助詞", "連体化", "*", "*", "*", "*", "の", "ノ", "ノ"]
417 );
418 }
419 {
420 let token = tokens_iter.next().unwrap();
421 assert_eq!(token.text, "形態素");
422 assert_eq!(token.byte_start, 12);
423 assert_eq!(token.byte_end, 21);
424 assert_eq!(token.position, 2);
425 assert_eq!(token.position_length, 1);
426 assert_eq!(
427 token.get_details().unwrap(),
428 vec![
429 "名詞",
430 "一般",
431 "*",
432 "*",
433 "*",
434 "*",
435 "形態素",
436 "ケイタイソ",
437 "ケイタイソ"
438 ]
439 );
440 }
441 {
442 let token = tokens_iter.next().unwrap();
443 assert_eq!(token.text, "解析");
444 assert_eq!(token.byte_start, 21);
445 assert_eq!(token.byte_end, 27);
446 assert_eq!(token.position, 3);
447 assert_eq!(token.position_length, 1);
448 assert_eq!(
449 token.get_details().unwrap(),
450 vec![
451 "名詞",
452 "サ変接続",
453 "*",
454 "*",
455 "*",
456 "*",
457 "解析",
458 "カイセキ",
459 "カイセキ"
460 ]
461 );
462 }
463 {
464 let token = tokens_iter.next().unwrap();
465 assert_eq!(token.text, "を");
466 assert_eq!(token.byte_start, 27);
467 assert_eq!(token.byte_end, 30);
468 assert_eq!(token.position, 4);
469 assert_eq!(token.position_length, 1);
470 assert_eq!(
471 token.get_details().unwrap(),
472 vec!["助詞", "格助詞", "一般", "*", "*", "*", "を", "ヲ", "ヲ"]
473 );
474 }
475 {
476 let token = tokens_iter.next().unwrap();
477 assert_eq!(token.text, "行う");
478 assert_eq!(token.byte_start, 30);
479 assert_eq!(token.byte_end, 36);
480 assert_eq!(token.position, 5);
481 assert_eq!(token.position_length, 1);
482 assert_eq!(
483 token.get_details().unwrap(),
484 vec![
485 "動詞",
486 "自立",
487 "*",
488 "*",
489 "五段・ワ行促音便",
490 "基本形",
491 "行う",
492 "オコナウ",
493 "オコナウ"
494 ]
495 );
496 }
497 {
498 let token = tokens_iter.next().unwrap();
499 assert_eq!(token.text, "こと");
500 assert_eq!(token.byte_start, 36);
501 assert_eq!(token.byte_end, 42);
502 assert_eq!(token.position, 6);
503 assert_eq!(token.position_length, 1);
504 assert_eq!(
505 token.get_details().unwrap(),
506 vec![
507 "名詞",
508 "非自立",
509 "一般",
510 "*",
511 "*",
512 "*",
513 "こと",
514 "コト",
515 "コト"
516 ]
517 );
518 }
519 {
520 let token = tokens_iter.next().unwrap();
521 assert_eq!(token.text, "が");
522 assert_eq!(token.byte_start, 42);
523 assert_eq!(token.byte_end, 45);
524 assert_eq!(token.position, 7);
525 assert_eq!(token.position_length, 1);
526 assert_eq!(
527 token.get_details().unwrap(),
528 vec!["助詞", "格助詞", "一般", "*", "*", "*", "が", "ガ", "ガ"]
529 );
530 }
531 {
532 let token = tokens_iter.next().unwrap();
533 assert_eq!(token.text, "でき");
534 assert_eq!(token.byte_start, 45);
535 assert_eq!(token.byte_end, 51);
536 assert_eq!(token.position, 8);
537 assert_eq!(token.position_length, 1);
538 assert_eq!(
539 token.get_details().unwrap(),
540 vec![
541 "動詞",
542 "自立",
543 "*",
544 "*",
545 "一段",
546 "連用形",
547 "できる",
548 "デキ",
549 "デキ"
550 ]
551 );
552 }
553 {
554 let token = tokens_iter.next().unwrap();
555 assert_eq!(token.text, "ます");
556 assert_eq!(token.byte_start, 51);
557 assert_eq!(token.byte_end, 57);
558 assert_eq!(token.position, 9);
559 assert_eq!(token.position_length, 1);
560 assert_eq!(
561 token.get_details().unwrap(),
562 vec![
563 "助動詞",
564 "*",
565 "*",
566 "*",
567 "特殊・マス",
568 "基本形",
569 "ます",
570 "マス",
571 "マス"
572 ]
573 );
574 }
575 {
576 let token = tokens_iter.next().unwrap();
577 assert_eq!(token.text, "。");
578 assert_eq!(token.byte_start, 57);
579 assert_eq!(token.byte_end, 60);
580 assert_eq!(token.position, 10);
581 assert_eq!(token.position_length, 1);
582 assert_eq!(
583 token.get_details().unwrap(),
584 vec!["記号", "句点", "*", "*", "*", "*", "。", "。", "。"]
585 );
586 }
587 {
588 let token = tokens_iter.next().unwrap();
589 assert_eq!(token.text, "テスト");
590 assert_eq!(token.byte_start, 60);
591 assert_eq!(token.byte_end, 69);
592 assert_eq!(token.position, 11);
593 assert_eq!(token.position_length, 1);
594 assert_eq!(
595 token.get_details().unwrap(),
596 vec![
597 "名詞",
598 "サ変接続",
599 "*",
600 "*",
601 "*",
602 "*",
603 "テスト",
604 "テスト",
605 "テスト"
606 ]
607 );
608 }
609 {
610 let token = tokens_iter.next().unwrap();
611 assert_eq!(token.text, "。");
612 assert_eq!(token.byte_start, 69);
613 assert_eq!(token.byte_end, 72);
614 assert_eq!(token.position, 12);
615 assert_eq!(token.position_length, 1);
616 assert_eq!(
617 token.get_details().unwrap(),
618 vec!["記号", "句点", "*", "*", "*", "*", "。", "。", "。"]
619 );
620 }
621 }
622
623 #[test]
624 #[cfg(feature = "unidic")]
625 fn test_tokenize_unidic() {
626 let dictionary = DictionaryConfig {
627 kind: Some(DictionaryKind::UniDic),
628 path: None,
629 };
630
631 let config = TokenizerConfig {
632 dictionary,
633 user_dictionary: None,
634 mode: Mode::Normal,
635 };
636
637 let tokenizer = Tokenizer::from_config(config).unwrap();
638 let mut tokens = tokenizer
639 .tokenize("日本語の形態素解析を行うことができます。")
640 .unwrap();
641 let mut tokens_iter = tokens.iter_mut();
642 {
643 let token = tokens_iter.next().unwrap();
644 assert_eq!(token.text, "日本");
645 assert_eq!(token.byte_start, 0);
646 assert_eq!(token.byte_end, 6);
647 assert_eq!(token.position, 0);
648 assert_eq!(token.position_length, 1);
649 assert_eq!(
650 token.get_details().unwrap(),
651 vec![
652 "名詞",
653 "固有名詞",
654 "地名",
655 "国",
656 "*",
657 "*",
658 "ニッポン",
659 "日本",
660 "日本",
661 "ニッポン",
662 "日本",
663 "ニッポン",
664 "固",
665 "*",
666 "*",
667 "*",
668 "*"
669 ]
670 );
671 }
672 {
673 let token = tokens_iter.next().unwrap();
674 assert_eq!(token.text, "語");
675 assert_eq!(token.byte_start, 6);
676 assert_eq!(token.byte_end, 9);
677 assert_eq!(token.position, 1);
678 assert_eq!(token.position_length, 1);
679 assert_eq!(
680 token.get_details().unwrap(),
681 vec![
682 "名詞",
683 "普通名詞",
684 "一般",
685 "*",
686 "*",
687 "*",
688 "ゴ",
689 "語",
690 "語",
691 "ゴ",
692 "語",
693 "ゴ",
694 "漢",
695 "*",
696 "*",
697 "*",
698 "*"
699 ]
700 );
701 }
702 {
703 let token = tokens_iter.next().unwrap();
704 assert_eq!(token.text, "の");
705 assert_eq!(token.byte_start, 9);
706 assert_eq!(token.byte_end, 12);
707 assert_eq!(token.position, 2);
708 assert_eq!(token.position_length, 1);
709 assert_eq!(
710 token.get_details().unwrap(),
711 vec![
712 "助詞",
713 "格助詞",
714 "*",
715 "*",
716 "*",
717 "*",
718 "ノ",
719 "の",
720 "の",
721 "ノ",
722 "の",
723 "ノ",
724 "和",
725 "*",
726 "*",
727 "*",
728 "*"
729 ]
730 );
731 }
732 {
733 let token = tokens_iter.next().unwrap();
734 assert_eq!(token.text, "形態");
735 assert_eq!(token.byte_start, 12);
736 assert_eq!(token.byte_end, 18);
737 assert_eq!(token.position, 3);
738 assert_eq!(token.position_length, 1);
739 assert_eq!(
740 token.get_details().unwrap(),
741 vec![
742 "名詞",
743 "普通名詞",
744 "一般",
745 "*",
746 "*",
747 "*",
748 "ケイタイ",
749 "形態",
750 "形態",
751 "ケータイ",
752 "形態",
753 "ケータイ",
754 "漢",
755 "*",
756 "*",
757 "*",
758 "*"
759 ]
760 );
761 }
762 {
763 let token = tokens_iter.next().unwrap();
764 assert_eq!(token.text, "素");
765 assert_eq!(token.byte_start, 18);
766 assert_eq!(token.byte_end, 21);
767 assert_eq!(token.position, 4);
768 assert_eq!(token.position_length, 1);
769 assert_eq!(
770 token.get_details().unwrap(),
771 vec![
772 "接尾辞",
773 "名詞的",
774 "一般",
775 "*",
776 "*",
777 "*",
778 "ソ",
779 "素",
780 "素",
781 "ソ",
782 "素",
783 "ソ",
784 "漢",
785 "*",
786 "*",
787 "*",
788 "*"
789 ]
790 );
791 }
792 {
793 let token = tokens_iter.next().unwrap();
794 assert_eq!(token.text, "解析");
795 assert_eq!(token.byte_start, 21);
796 assert_eq!(token.byte_end, 27);
797 assert_eq!(token.position, 5);
798 assert_eq!(token.position_length, 1);
799 assert_eq!(
800 token.get_details().unwrap(),
801 vec![
802 "名詞",
803 "普通名詞",
804 "サ変可能",
805 "*",
806 "*",
807 "*",
808 "カイセキ",
809 "解析",
810 "解析",
811 "カイセキ",
812 "解析",
813 "カイセキ",
814 "漢",
815 "*",
816 "*",
817 "*",
818 "*"
819 ]
820 );
821 }
822 {
823 let token = tokens_iter.next().unwrap();
824 assert_eq!(token.text, "を");
825 assert_eq!(token.byte_start, 27);
826 assert_eq!(token.byte_end, 30);
827 assert_eq!(token.position, 6);
828 assert_eq!(token.position_length, 1);
829 assert_eq!(
830 token.get_details().unwrap(),
831 vec![
832 "助詞",
833 "格助詞",
834 "*",
835 "*",
836 "*",
837 "*",
838 "ヲ",
839 "を",
840 "を",
841 "オ",
842 "を",
843 "オ",
844 "和",
845 "*",
846 "*",
847 "*",
848 "*"
849 ]
850 );
851 }
852 {
853 let token = tokens_iter.next().unwrap();
854 assert_eq!(token.text, "行う");
855 assert_eq!(token.byte_start, 30);
856 assert_eq!(token.byte_end, 36);
857 assert_eq!(token.position, 7);
858 assert_eq!(token.position_length, 1);
859 assert_eq!(
860 token.get_details().unwrap(),
861 vec![
862 "動詞",
863 "一般",
864 "*",
865 "*",
866 "五段-ワア行",
867 "連体形-一般",
868 "オコナウ",
869 "行う",
870 "行う",
871 "オコナウ",
872 "行う",
873 "オコナウ",
874 "和",
875 "*",
876 "*",
877 "*",
878 "*"
879 ]
880 );
881 }
882 {
883 let token = tokens_iter.next().unwrap();
884 assert_eq!(token.text, "こと");
885 assert_eq!(token.byte_start, 36);
886 assert_eq!(token.byte_end, 42);
887 assert_eq!(token.position, 8);
888 assert_eq!(token.position_length, 1);
889 assert_eq!(
890 token.get_details().unwrap(),
891 vec![
892 "名詞",
893 "普通名詞",
894 "一般",
895 "*",
896 "*",
897 "*",
898 "コト",
899 "事",
900 "こと",
901 "コト",
902 "こと",
903 "コト",
904 "和",
905 "コ濁",
906 "基本形",
907 "*",
908 "*"
909 ]
910 );
911 }
912 {
913 let token = tokens_iter.next().unwrap();
914 assert_eq!(token.text, "が");
915 assert_eq!(token.byte_start, 42);
916 assert_eq!(token.byte_end, 45);
917 assert_eq!(token.position, 9);
918 assert_eq!(token.position_length, 1);
919 assert_eq!(
920 token.get_details().unwrap(),
921 vec![
922 "助詞",
923 "格助詞",
924 "*",
925 "*",
926 "*",
927 "*",
928 "ガ",
929 "が",
930 "が",
931 "ガ",
932 "が",
933 "ガ",
934 "和",
935 "*",
936 "*",
937 "*",
938 "*"
939 ]
940 );
941 }
942 {
943 let token = tokens_iter.next().unwrap();
944 assert_eq!(token.text, "でき");
945 assert_eq!(token.byte_start, 45);
946 assert_eq!(token.byte_end, 51);
947 assert_eq!(token.position, 10);
948 assert_eq!(token.position_length, 1);
949 assert_eq!(
950 token.get_details().unwrap(),
951 vec![
952 "動詞",
953 "非自立可能",
954 "*",
955 "*",
956 "上一段-カ行",
957 "連用形-一般",
958 "デキル",
959 "出来る",
960 "でき",
961 "デキ",
962 "できる",
963 "デキル",
964 "和",
965 "*",
966 "*",
967 "*",
968 "*"
969 ]
970 );
971 }
972 {
973 let token = tokens_iter.next().unwrap();
974 assert_eq!(token.text, "ます");
975 assert_eq!(token.byte_start, 51);
976 assert_eq!(token.byte_end, 57);
977 assert_eq!(token.position, 11);
978 assert_eq!(token.position_length, 1);
979 assert_eq!(
980 token.get_details().unwrap(),
981 vec![
982 "助動詞",
983 "*",
984 "*",
985 "*",
986 "助動詞-マス",
987 "終止形-一般",
988 "マス",
989 "ます",
990 "ます",
991 "マス",
992 "ます",
993 "マス",
994 "和",
995 "*",
996 "*",
997 "*",
998 "*"
999 ]
1000 );
1001 }
1002 {
1003 let token = tokens_iter.next().unwrap();
1004 assert_eq!(token.text, "。");
1005 assert_eq!(token.byte_start, 57);
1006 assert_eq!(token.byte_end, 60);
1007 assert_eq!(token.position, 12);
1008 assert_eq!(token.position_length, 1);
1009 assert_eq!(
1010 token.get_details().unwrap(),
1011 vec![
1012 "補助記号",
1013 "句点",
1014 "*",
1015 "*",
1016 "*",
1017 "*",
1018 "",
1019 "。",
1020 "。",
1021 "",
1022 "。",
1023 "",
1024 "記号",
1025 "*",
1026 "*",
1027 "*",
1028 "*"
1029 ]
1030 );
1031 }
1032 }
1033
1034 #[test]
1035 #[cfg(feature = "ko-dic")]
1036 fn test_tokenize_ko_dic() {
1037 let dictionary = DictionaryConfig {
1038 kind: Some(DictionaryKind::KoDic),
1039 path: None,
1040 };
1041
1042 let config = TokenizerConfig {
1043 dictionary,
1044 user_dictionary: None,
1045 mode: Mode::Normal,
1046 };
1047
1048 let tokenizer = Tokenizer::from_config(config).unwrap();
1049 let mut tokens = tokenizer
1050 .tokenize("한국어의형태해석을실시할수있습니다.")
1051 .unwrap();
1052 let mut tokens_iter = tokens.iter_mut();
1053 {
1054 let token = tokens_iter.next().unwrap();
1055 assert_eq!(token.text, "한국어");
1056 assert_eq!(token.byte_start, 0);
1057 assert_eq!(token.byte_end, 9);
1058 assert_eq!(token.position, 0);
1059 assert_eq!(token.position_length, 1);
1060 assert_eq!(
1061 token.get_details().unwrap(),
1062 vec![
1063 "NNG",
1064 "*",
1065 "F",
1066 "한국어",
1067 "Compound",
1068 "*",
1069 "*",
1070 "한국/NNG/*+어/NNG/*"
1071 ]
1072 );
1073 }
1074 {
1075 let token = tokens_iter.next().unwrap();
1076 assert_eq!(token.text, "의");
1077 assert_eq!(token.byte_start, 9);
1078 assert_eq!(token.byte_end, 12);
1079 assert_eq!(token.position, 1);
1080 assert_eq!(token.position_length, 1);
1081 assert_eq!(
1082 token.get_details().unwrap(),
1083 vec!["JKG", "*", "F", "의", "*", "*", "*", "*"]
1084 );
1085 }
1086 {
1087 let token = tokens_iter.next().unwrap();
1088 assert_eq!(token.text, "형태");
1089 assert_eq!(token.byte_start, 12);
1090 assert_eq!(token.byte_end, 18);
1091 assert_eq!(token.position, 2);
1092 assert_eq!(token.position_length, 1);
1093 assert_eq!(
1094 token.get_details().unwrap(),
1095 vec!["NNG", "*", "F", "형태", "*", "*", "*", "*"]
1096 );
1097 }
1098 {
1099 let token = tokens_iter.next().unwrap();
1100 assert_eq!(token.text, "해석");
1101 assert_eq!(token.byte_start, 18);
1102 assert_eq!(token.byte_end, 24);
1103 assert_eq!(token.position, 3);
1104 assert_eq!(token.position_length, 1);
1105 assert_eq!(
1106 token.get_details().unwrap(),
1107 vec!["NNG", "행위", "T", "해석", "*", "*", "*", "*"]
1108 );
1109 }
1110 {
1111 let token = tokens_iter.next().unwrap();
1112 assert_eq!(token.text, "을");
1113 assert_eq!(token.byte_start, 24);
1114 assert_eq!(token.byte_end, 27);
1115 assert_eq!(token.position, 4);
1116 assert_eq!(token.position_length, 1);
1117 assert_eq!(
1118 token.get_details().unwrap(),
1119 vec!["JKO", "*", "T", "을", "*", "*", "*", "*"]
1120 );
1121 }
1122 {
1123 let token = tokens_iter.next().unwrap();
1124 assert_eq!(token.text, "실시");
1125 assert_eq!(token.byte_start, 27);
1126 assert_eq!(token.byte_end, 33);
1127 assert_eq!(token.position, 5);
1128 assert_eq!(token.position_length, 1);
1129 assert_eq!(
1130 token.get_details().unwrap(),
1131 vec!["NNG", "행위", "F", "실시", "*", "*", "*", "*"]
1132 );
1133 }
1134 {
1135 let token = tokens_iter.next().unwrap();
1136 assert_eq!(token.text, "할");
1137 assert_eq!(token.byte_start, 33);
1138 assert_eq!(token.byte_end, 36);
1139 assert_eq!(token.position, 6);
1140 assert_eq!(token.position_length, 1);
1141 assert_eq!(
1142 token.get_details().unwrap(),
1143 vec![
1144 "XSV+ETM",
1145 "*",
1146 "T",
1147 "할",
1148 "Inflect",
1149 "XSV",
1150 "ETM",
1151 "하/XSV/*+ᆯ/ETM/*"
1152 ]
1153 );
1154 }
1155 {
1156 let token = tokens_iter.next().unwrap();
1157 assert_eq!(token.text, "수");
1158 assert_eq!(token.byte_start, 36);
1159 assert_eq!(token.byte_end, 39);
1160 assert_eq!(token.position, 7);
1161 assert_eq!(token.position_length, 1);
1162 assert_eq!(
1163 token.get_details().unwrap(),
1164 vec!["NNB", "*", "F", "수", "*", "*", "*", "*"]
1165 );
1166 }
1167 {
1168 let token = tokens_iter.next().unwrap();
1169 assert_eq!(token.text, "있");
1170 assert_eq!(token.byte_start, 39);
1171 assert_eq!(token.byte_end, 42);
1172 assert_eq!(token.position, 8);
1173 assert_eq!(token.position_length, 1);
1174 assert_eq!(
1175 token.get_details().unwrap(),
1176 vec!["VV", "*", "T", "있", "*", "*", "*", "*"]
1177 );
1178 }
1179 {
1180 let token = tokens_iter.next().unwrap();
1181 assert_eq!(token.text, "습니다");
1182 assert_eq!(token.byte_start, 42);
1183 assert_eq!(token.byte_end, 51);
1184 assert_eq!(token.position, 9);
1185 assert_eq!(token.position_length, 1);
1186 assert_eq!(
1187 token.get_details().unwrap(),
1188 vec!["EF", "*", "F", "습니다", "*", "*", "*", "*"]
1189 );
1190 }
1191 {
1192 let token = tokens_iter.next().unwrap();
1193 assert_eq!(token.text, ".");
1194 assert_eq!(token.byte_start, 51);
1195 assert_eq!(token.byte_end, 52);
1196 assert_eq!(token.position, 10);
1197 assert_eq!(token.position_length, 1);
1198 assert_eq!(
1199 token.get_details().unwrap(),
1200 vec!["SF", "*", "*", "*", "*", "*", "*", "*"]
1201 );
1202 }
1203 }
1204
1205 #[test]
1206 #[cfg(feature = "cc-cedict")]
1207 fn test_tokenize_cc_cedict() {
1208 let dictionary = DictionaryConfig {
1209 kind: Some(DictionaryKind::CcCedict),
1210 path: None,
1211 };
1212
1213 let config = TokenizerConfig {
1214 dictionary,
1215 user_dictionary: None,
1216 mode: Mode::Normal,
1217 };
1218
1219 let tokenizer = Tokenizer::from_config(config).unwrap();
1220 let mut tokens = tokenizer.tokenize("可以进行中文形态学分析。").unwrap();
1221 let mut tokens_iter = tokens.iter_mut();
1222 {
1223 let token = tokens_iter.next().unwrap();
1224 assert_eq!(token.text, "可以");
1225 assert_eq!(token.byte_start, 0);
1226 assert_eq!(token.byte_end, 6);
1227 assert_eq!(token.position, 0);
1228 assert_eq!(token.position_length, 1);
1229 assert_eq!(
1230 token.get_details().unwrap(),
1231 vec![
1232 "*",
1233 "*",
1234 "*",
1235 "*",
1236 "ke3 yi3",
1237 "可以",
1238 "可以",
1239 "can/may/possible/able to/not bad/pretty good/"
1240 ]
1241 );
1242 }
1243 {
1244 let token = tokens_iter.next().unwrap();
1245 assert_eq!(token.text, "进行");
1246 assert_eq!(token.byte_start, 6);
1247 assert_eq!(token.byte_end, 12);
1248 assert_eq!(token.position, 1);
1249 assert_eq!(token.position_length, 1);
1250 assert_eq!(
1251 token.get_details().unwrap(),
1252 vec![
1253 "*",
1254 "*",
1255 "*",
1256 "*",
1257 "jin4 xing2",
1258 "進行",
1259 "进行",
1260 "to advance/to conduct/underway/in progress/to do/to carry out/to carry on/to execute/"
1261 ]
1262 );
1263 }
1264 {
1265 let token = tokens_iter.next().unwrap();
1266 assert_eq!(token.text, "中文");
1267 assert_eq!(token.byte_start, 12);
1268 assert_eq!(token.byte_end, 18);
1269 assert_eq!(token.position, 2);
1270 assert_eq!(token.position_length, 1);
1271 assert_eq!(
1272 token.get_details().unwrap(),
1273 vec![
1274 "*",
1275 "*",
1276 "*",
1277 "*",
1278 "Zhong1 wen2",
1279 "中文",
1280 "中文",
1281 "Chinese language/"
1282 ]
1283 );
1284 }
1285 {
1286 let token = tokens_iter.next().unwrap();
1287 assert_eq!(token.text, "形态学");
1288 assert_eq!(token.byte_start, 18);
1289 assert_eq!(token.byte_end, 27);
1290 assert_eq!(token.position, 3);
1291 assert_eq!(token.position_length, 1);
1292 assert_eq!(
1293 token.get_details().unwrap(),
1294 vec![
1295 "*",
1296 "*",
1297 "*",
1298 "*",
1299 "xing2 tai4 xue2",
1300 "形態學",
1301 "形态学",
1302 "morphology (in biology or linguistics)/"
1303 ]
1304 );
1305 }
1306 {
1307 let token = tokens_iter.next().unwrap();
1308 assert_eq!(token.text, "分析");
1309 assert_eq!(token.byte_start, 27);
1310 assert_eq!(token.byte_end, 33);
1311 assert_eq!(token.position, 4);
1312 assert_eq!(token.position_length, 1);
1313 assert_eq!(
1314 token.get_details().unwrap(),
1315 vec![
1316 "*",
1317 "*",
1318 "*",
1319 "*",
1320 "fen1 xi1",
1321 "分析",
1322 "分析",
1323 "to analyze/analysis/CL:個|个[ge4]/"
1324 ]
1325 );
1326 }
1327 {
1328 let token = tokens_iter.next().unwrap();
1329 assert_eq!(token.text, "。");
1330 assert_eq!(token.byte_start, 33);
1331 assert_eq!(token.byte_end, 36);
1332 assert_eq!(token.position, 5);
1333 assert_eq!(token.position_length, 1);
1334 assert_eq!(token.get_details().unwrap(), vec!["UNK"]);
1335 }
1336 }
1337
1338 #[test]
1339 #[cfg(feature = "ipadic")]
1340 fn test_tokenize_with_simple_userdic_ipadic() {
1341 let dictionary = DictionaryConfig {
1342 kind: Some(DictionaryKind::IPADIC),
1343 path: None,
1344 };
1345
1346 let userdic_file = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
1347 .join("../resources")
1348 .join("ipadic_simple_userdic.csv");
1349
1350 let user_dictionary = Some(UserDictionaryConfig {
1351 kind: Some(DictionaryKind::IPADIC),
1352 path: userdic_file,
1353 });
1354
1355 let config = TokenizerConfig {
1356 dictionary,
1357 user_dictionary,
1358 mode: Mode::Normal,
1359 };
1360
1361 let tokenizer = Tokenizer::from_config(config).unwrap();
1362 let mut tokens = tokenizer
1363 .tokenize("東京スカイツリーの最寄り駅はとうきょうスカイツリー駅です。")
1364 .unwrap();
1365 let mut tokens_iter = tokens.iter_mut();
1366 {
1367 let token = tokens_iter.next().unwrap();
1368 assert_eq!(token.text, "東京スカイツリー");
1369 assert_eq!(token.byte_start, 0);
1370 assert_eq!(token.byte_end, 24);
1371 assert_eq!(token.position, 0);
1372 assert_eq!(token.position_length, 1);
1373 assert_eq!(
1374 token.get_details().unwrap(),
1375 vec![
1376 "カスタム名詞",
1377 "*",
1378 "*",
1379 "*",
1380 "*",
1381 "*",
1382 "東京スカイツリー",
1383 "トウキョウスカイツリー",
1384 "*"
1385 ]
1386 );
1387 }
1388 {
1389 let token = tokens_iter.next().unwrap();
1390 assert_eq!(token.text, "の");
1391 assert_eq!(token.byte_start, 24);
1392 assert_eq!(token.byte_end, 27);
1393 assert_eq!(token.position, 1);
1394 assert_eq!(token.position_length, 1);
1395 assert_eq!(
1396 token.get_details().unwrap(),
1397 vec!["助詞", "連体化", "*", "*", "*", "*", "の", "ノ", "ノ"]
1398 );
1399 }
1400 {
1401 let token = tokens_iter.next().unwrap();
1402 assert_eq!(token.text, "最寄り駅");
1403 assert_eq!(token.byte_start, 27);
1404 assert_eq!(token.byte_end, 39);
1405 assert_eq!(token.position, 2);
1406 assert_eq!(token.position_length, 1);
1407 assert_eq!(
1408 token.get_details().unwrap(),
1409 vec![
1410 "名詞",
1411 "一般",
1412 "*",
1413 "*",
1414 "*",
1415 "*",
1416 "最寄り駅",
1417 "モヨリエキ",
1418 "モヨリエキ"
1419 ]
1420 );
1421 }
1422 {
1423 let token = tokens_iter.next().unwrap();
1424 assert_eq!(token.text, "は");
1425 assert_eq!(token.byte_start, 39);
1426 assert_eq!(token.byte_end, 42);
1427 assert_eq!(token.position, 3);
1428 assert_eq!(token.position_length, 1);
1429 assert_eq!(
1430 token.get_details().unwrap(),
1431 vec!["助詞", "係助詞", "*", "*", "*", "*", "は", "ハ", "ワ"]
1432 );
1433 }
1434 {
1435 let token = tokens_iter.next().unwrap();
1436 assert_eq!(token.text, "とうきょうスカイツリー駅");
1437 assert_eq!(token.byte_start, 42);
1438 assert_eq!(token.byte_end, 78);
1439 assert_eq!(token.position, 4);
1440 assert_eq!(token.position_length, 1);
1441 assert_eq!(
1442 token.get_details().unwrap(),
1443 vec![
1444 "カスタム名詞",
1445 "*",
1446 "*",
1447 "*",
1448 "*",
1449 "*",
1450 "とうきょうスカイツリー駅",
1451 "トウキョウスカイツリーエキ",
1452 "*"
1453 ]
1454 );
1455 }
1456 {
1457 let token = tokens_iter.next().unwrap();
1458 assert_eq!(token.text, "です");
1459 assert_eq!(token.byte_start, 78);
1460 assert_eq!(token.byte_end, 84);
1461 assert_eq!(token.position, 5);
1462 assert_eq!(token.position_length, 1);
1463 assert_eq!(
1464 token.get_details().unwrap(),
1465 vec![
1466 "助動詞",
1467 "*",
1468 "*",
1469 "*",
1470 "特殊・デス",
1471 "基本形",
1472 "です",
1473 "デス",
1474 "デス"
1475 ]
1476 );
1477 }
1478 {
1479 let token = tokens_iter.next().unwrap();
1480 assert_eq!(token.text, "。");
1481 assert_eq!(token.byte_start, 84);
1482 assert_eq!(token.byte_end, 87);
1483 assert_eq!(token.position, 6);
1484 assert_eq!(token.position_length, 1);
1485 assert_eq!(
1486 token.get_details().unwrap(),
1487 vec!["記号", "句点", "*", "*", "*", "*", "。", "。", "。"]
1488 );
1489 }
1490 }
1491
1492 #[test]
1493 #[cfg(feature = "unidic")]
1494 fn test_tokenize_with_simple_userdic_unidic() {
1495 let dictionary = DictionaryConfig {
1496 kind: Some(DictionaryKind::UniDic),
1497 path: None,
1498 };
1499
1500 let userdic_file = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
1501 .join("../resources")
1502 .join("unidic_simple_userdic.csv");
1503
1504 let user_dictionary = Some(UserDictionaryConfig {
1505 kind: Some(DictionaryKind::UniDic),
1506 path: userdic_file,
1507 });
1508
1509 let config = TokenizerConfig {
1510 dictionary,
1511 user_dictionary,
1512 mode: Mode::Normal,
1513 };
1514
1515 let tokenizer = Tokenizer::from_config(config).unwrap();
1516 let mut tokens = tokenizer
1517 .tokenize("東京スカイツリーの最寄り駅はとうきょうスカイツリー駅です。")
1518 .unwrap();
1519 let mut tokens_iter = tokens.iter_mut();
1520 {
1521 let token = tokens_iter.next().unwrap();
1522 assert_eq!(token.text, "東京スカイツリー");
1523 assert_eq!(token.byte_start, 0);
1524 assert_eq!(token.byte_end, 24);
1525 assert_eq!(token.position, 0);
1526 assert_eq!(token.position_length, 1);
1527 assert_eq!(
1528 token.get_details().unwrap(),
1529 vec![
1530 "カスタム名詞",
1531 "*",
1532 "*",
1533 "*",
1534 "*",
1535 "*",
1536 "トウキョウスカイツリー",
1537 "*",
1538 "*",
1539 "*",
1540 "*",
1541 "*",
1542 "*",
1543 "*",
1544 "*",
1545 "*",
1546 "*"
1547 ]
1548 );
1549 }
1550 {
1551 let token = tokens_iter.next().unwrap();
1552 assert_eq!(token.text, "の");
1553 assert_eq!(token.byte_start, 24);
1554 assert_eq!(token.byte_end, 27);
1555 assert_eq!(token.position, 1);
1556 assert_eq!(token.position_length, 1);
1557 assert_eq!(
1558 token.get_details().unwrap(),
1559 vec![
1560 "助詞",
1561 "格助詞",
1562 "*",
1563 "*",
1564 "*",
1565 "*",
1566 "ノ",
1567 "の",
1568 "の",
1569 "ノ",
1570 "の",
1571 "ノ",
1572 "和",
1573 "*",
1574 "*",
1575 "*",
1576 "*"
1577 ]
1578 );
1579 }
1580 {
1581 let token = tokens_iter.next().unwrap();
1582 assert_eq!(token.text, "最寄り");
1583 assert_eq!(token.byte_start, 27);
1584 assert_eq!(token.byte_end, 36);
1585 assert_eq!(token.position, 2);
1586 assert_eq!(token.position_length, 1);
1587 assert_eq!(
1588 token.get_details().unwrap(),
1589 vec![
1590 "名詞",
1591 "普通名詞",
1592 "一般",
1593 "*",
1594 "*",
1595 "*",
1596 "モヨリ",
1597 "最寄り",
1598 "最寄り",
1599 "モヨリ",
1600 "最寄り",
1601 "モヨリ",
1602 "和",
1603 "*",
1604 "*",
1605 "*",
1606 "*"
1607 ]
1608 );
1609 }
1610 {
1611 let token = tokens_iter.next().unwrap();
1612 assert_eq!(token.text, "駅");
1613 assert_eq!(token.byte_start, 36);
1614 assert_eq!(token.byte_end, 39);
1615 assert_eq!(token.position, 3);
1616 assert_eq!(token.position_length, 1);
1617 assert_eq!(
1618 token.get_details().unwrap(),
1619 vec![
1620 "名詞",
1621 "普通名詞",
1622 "一般",
1623 "*",
1624 "*",
1625 "*",
1626 "エキ",
1627 "駅",
1628 "駅",
1629 "エキ",
1630 "駅",
1631 "エキ",
1632 "漢",
1633 "*",
1634 "*",
1635 "*",
1636 "*"
1637 ]
1638 );
1639 }
1640 {
1641 let token = tokens_iter.next().unwrap();
1642 assert_eq!(token.text, "は");
1643 assert_eq!(token.byte_start, 39);
1644 assert_eq!(token.byte_end, 42);
1645 assert_eq!(token.position, 4);
1646 assert_eq!(token.position_length, 1);
1647 assert_eq!(
1648 token.get_details().unwrap(),
1649 vec![
1650 "助詞",
1651 "係助詞",
1652 "*",
1653 "*",
1654 "*",
1655 "*",
1656 "ハ",
1657 "は",
1658 "は",
1659 "ワ",
1660 "は",
1661 "ワ",
1662 "和",
1663 "*",
1664 "*",
1665 "*",
1666 "*"
1667 ]
1668 );
1669 }
1670 {
1671 let token = tokens_iter.next().unwrap();
1672 assert_eq!(token.text, "とうきょうスカイツリー駅");
1673 assert_eq!(token.byte_start, 42);
1674 assert_eq!(token.byte_end, 78);
1675 assert_eq!(token.position, 5);
1676 assert_eq!(token.position_length, 1);
1677 assert_eq!(
1678 token.get_details().unwrap(),
1679 vec![
1680 "カスタム名詞",
1681 "*",
1682 "*",
1683 "*",
1684 "*",
1685 "*",
1686 "トウキョウスカイツリーエキ",
1687 "*",
1688 "*",
1689 "*",
1690 "*",
1691 "*",
1692 "*",
1693 "*",
1694 "*",
1695 "*",
1696 "*"
1697 ]
1698 );
1699 }
1700 {
1701 let token = tokens_iter.next().unwrap();
1702 assert_eq!(token.text, "です");
1703 assert_eq!(token.byte_start, 78);
1704 assert_eq!(token.byte_end, 84);
1705 assert_eq!(token.position, 6);
1706 assert_eq!(token.position_length, 1);
1707 assert_eq!(
1708 token.get_details().unwrap(),
1709 vec![
1710 "助動詞",
1711 "*",
1712 "*",
1713 "*",
1714 "助動詞-デス",
1715 "終止形-一般",
1716 "デス",
1717 "です",
1718 "です",
1719 "デス",
1720 "です",
1721 "デス",
1722 "和",
1723 "*",
1724 "*",
1725 "*",
1726 "*"
1727 ]
1728 );
1729 }
1730 {
1731 let token = tokens_iter.next().unwrap();
1732 assert_eq!(token.text, "。");
1733 assert_eq!(token.byte_start, 84);
1734 assert_eq!(token.byte_end, 87);
1735 assert_eq!(token.position, 7);
1736 assert_eq!(token.position_length, 1);
1737 assert_eq!(
1738 token.get_details().unwrap(),
1739 vec![
1740 "補助記号",
1741 "句点",
1742 "*",
1743 "*",
1744 "*",
1745 "*",
1746 "",
1747 "。",
1748 "。",
1749 "",
1750 "。",
1751 "",
1752 "記号",
1753 "*",
1754 "*",
1755 "*",
1756 "*"
1757 ]
1758 );
1759 }
1760 }
1761
1762 #[test]
1763 #[cfg(feature = "ko-dic")]
1764 fn test_tokenize_with_simple_userdic_ko_dic() {
1765 let dictionary = DictionaryConfig {
1766 kind: Some(DictionaryKind::KoDic),
1767 path: None,
1768 };
1769
1770 let userdic_file = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
1771 .join("../resources")
1772 .join("ko-dic_simple_userdic.csv");
1773
1774 let user_dictionary = Some(UserDictionaryConfig {
1775 kind: Some(DictionaryKind::KoDic),
1776 path: userdic_file,
1777 });
1778
1779 let config = TokenizerConfig {
1780 dictionary,
1781 user_dictionary,
1782 mode: Mode::Normal,
1783 };
1784
1785 let tokenizer = Tokenizer::from_config(config).unwrap();
1786 let mut tokens = tokenizer.tokenize("하네다공항한정토트백.").unwrap();
1787 let mut tokens_iter = tokens.iter_mut();
1788 {
1789 let token = tokens_iter.next().unwrap();
1790 assert_eq!(token.text, "하네다공항");
1791 assert_eq!(token.byte_start, 0);
1792 assert_eq!(token.byte_end, 15);
1793 assert_eq!(token.position, 0);
1794 assert_eq!(token.position_length, 1);
1795 assert_eq!(
1796 token.get_details().unwrap(),
1797 vec!["NNP", "*", "*", "하네다공항", "*", "*", "*", "*"]
1798 );
1799 }
1800 {
1801 let token = tokens_iter.next().unwrap();
1802 assert_eq!(token.text, "한정");
1803 assert_eq!(token.byte_start, 15);
1804 assert_eq!(token.byte_end, 21);
1805 assert_eq!(token.position, 1);
1806 assert_eq!(token.position_length, 1);
1807 assert_eq!(
1808 token.get_details().unwrap(),
1809 vec!["NNG", "*", "T", "한정", "*", "*", "*", "*"]
1810 );
1811 }
1812 {
1813 let token = tokens_iter.next().unwrap();
1814 assert_eq!(token.text, "토트백");
1815 assert_eq!(token.byte_start, 21);
1816 assert_eq!(token.byte_end, 30);
1817 assert_eq!(token.position, 2);
1818 assert_eq!(token.position_length, 1);
1819 assert_eq!(
1820 token.get_details().unwrap(),
1821 vec![
1822 "NNG",
1823 "*",
1824 "T",
1825 "토트백",
1826 "Compound",
1827 "*",
1828 "*",
1829 "토트/NNP/인명+백/NNG/*"
1830 ]
1831 );
1832 }
1833 {
1834 let token = tokens_iter.next().unwrap();
1835 assert_eq!(token.text, ".");
1836 assert_eq!(token.byte_start, 30);
1837 assert_eq!(token.byte_end, 31);
1838 assert_eq!(token.position, 3);
1839 assert_eq!(token.position_length, 1);
1840 assert_eq!(
1841 token.get_details().unwrap(),
1842 vec!["SF", "*", "*", "*", "*", "*", "*", "*"]
1843 );
1844 }
1845 }
1846
1847 #[test]
1848 #[cfg(feature = "cc-cedict")]
1849 fn test_tokenize_with_simple_userdic_cc_cedict() {
1850 let dictionary = DictionaryConfig {
1851 kind: Some(DictionaryKind::CcCedict),
1852 path: None,
1853 };
1854
1855 let userdic_file = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
1856 .join("../resources")
1857 .join("cc-cedict_simple_userdic.csv");
1858
1859 let user_dictionary = Some(UserDictionaryConfig {
1860 kind: Some(DictionaryKind::CcCedict),
1861 path: userdic_file,
1862 });
1863
1864 let config = TokenizerConfig {
1865 dictionary,
1866 user_dictionary,
1867 mode: Mode::Normal,
1868 };
1869
1870 let tokenizer = Tokenizer::from_config(config).unwrap();
1871 let mut tokens = tokenizer.tokenize("羽田机场限定托特包。").unwrap();
1872 let mut tokens_iter = tokens.iter_mut();
1873 {
1874 let token = tokens_iter.next().unwrap();
1875 assert_eq!(token.text, "羽田机场");
1876 assert_eq!(token.byte_start, 0);
1877 assert_eq!(token.byte_end, 12);
1878 assert_eq!(token.position, 0);
1879 assert_eq!(token.position_length, 1);
1880 assert_eq!(
1881 token.get_details().unwrap(),
1882 vec!["*", "*", "*", "*", "Yu3 tian2 ji1 chang3", "*", "*", "*"]
1883 );
1884 }
1885 {
1886 let token = tokens_iter.next().unwrap();
1887 assert_eq!(token.text, "限定");
1888 assert_eq!(token.byte_start, 12);
1889 assert_eq!(token.byte_end, 18);
1890 assert_eq!(token.position, 1);
1891 assert_eq!(token.position_length, 1);
1892 assert_eq!(
1893 token.get_details().unwrap(),
1894 vec![
1895 "*",
1896 "*",
1897 "*",
1898 "*",
1899 "xian4 ding4",
1900 "限定",
1901 "限定",
1902 "to restrict to/to limit/"
1903 ]
1904 );
1905 }
1906 {
1907 let token = tokens_iter.next().unwrap();
1908 assert_eq!(token.text, "托特");
1909 assert_eq!(token.byte_start, 18);
1910 assert_eq!(token.byte_end, 24);
1911 assert_eq!(token.position, 2);
1912 assert_eq!(token.position_length, 1);
1913 assert_eq!(
1914 token.get_details().unwrap(),
1915 vec![
1916 "*",
1917 "*",
1918 "*",
1919 "*",
1920 "tuo1 te4",
1921 "托特",
1922 "托特",
1923 "(loanword) tote (bag)/"
1924 ]
1925 );
1926 }
1927 {
1928 let token = tokens_iter.next().unwrap();
1929 assert_eq!(token.text, "包");
1930 assert_eq!(token.byte_start, 24);
1931 assert_eq!(token.byte_end, 27);
1932 assert_eq!(token.position, 3);
1933 assert_eq!(token.position_length, 1);
1934 assert_eq!(
1935 token.get_details().unwrap(),
1936 vec![
1937 "*",
1938 "*",
1939 "*",
1940 "*",
1941 "bao1",
1942 "包",
1943 "包",
1944 "to cover/to wrap/to hold/to include/to take charge of/to contract (to or for)/package/wrapper/container/bag/to hold or embrace/bundle/packet/CL:個|个[ge4]",
1945 "隻|只[zhi1]/"
1946 ]
1947 );
1948 }
1949 {
1950 let token = tokens_iter.next().unwrap();
1951 assert_eq!(token.text, "。");
1952 assert_eq!(token.byte_start, 27);
1953 assert_eq!(token.byte_end, 30);
1954 assert_eq!(token.position, 4);
1955 assert_eq!(token.position_length, 1);
1956 assert_eq!(token.get_details().unwrap(), vec!["UNK"]);
1957 }
1958 }
1959
1960 #[test]
1961 #[cfg(feature = "ipadic")]
1962 fn test_tokenize_with_simple_userdic_bin_ipadic() {
1963 let dictionary = DictionaryConfig {
1964 kind: Some(DictionaryKind::IPADIC),
1965 path: None,
1966 };
1967
1968 let userdic_file = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
1969 .join("../resources")
1970 .join("ipadic_simple_userdic.bin");
1971
1972 let user_dictionary = Some(UserDictionaryConfig {
1973 kind: Some(DictionaryKind::IPADIC),
1974 path: userdic_file,
1975 });
1976
1977 let config = TokenizerConfig {
1978 dictionary,
1979 user_dictionary,
1980 mode: Mode::Normal,
1981 };
1982
1983 let tokenizer = Tokenizer::from_config(config).unwrap();
1984 let mut tokens = tokenizer
1985 .tokenize("東京スカイツリーの最寄り駅はとうきょうスカイツリー駅です。")
1986 .unwrap();
1987 let mut tokens_iter = tokens.iter_mut();
1988 {
1989 let token = tokens_iter.next().unwrap();
1990 assert_eq!(token.text, "東京スカイツリー");
1991 assert_eq!(token.byte_start, 0);
1992 assert_eq!(token.byte_end, 24);
1993 assert_eq!(token.position, 0);
1994 assert_eq!(token.position_length, 1);
1995 assert_eq!(
1996 token.get_details().unwrap(),
1997 vec![
1998 "カスタム名詞",
1999 "*",
2000 "*",
2001 "*",
2002 "*",
2003 "*",
2004 "東京スカイツリー",
2005 "トウキョウスカイツリー",
2006 "*"
2007 ]
2008 );
2009 }
2010 {
2011 let token = tokens_iter.next().unwrap();
2012 assert_eq!(token.text, "の");
2013 assert_eq!(token.byte_start, 24);
2014 assert_eq!(token.byte_end, 27);
2015 assert_eq!(token.position, 1);
2016 assert_eq!(token.position_length, 1);
2017 assert_eq!(
2018 token.get_details().unwrap(),
2019 vec!["助詞", "連体化", "*", "*", "*", "*", "の", "ノ", "ノ"]
2020 );
2021 }
2022 {
2023 let token = tokens_iter.next().unwrap();
2024 assert_eq!(token.text, "最寄り駅");
2025 assert_eq!(token.byte_start, 27);
2026 assert_eq!(token.byte_end, 39);
2027 assert_eq!(token.position, 2);
2028 assert_eq!(token.position_length, 1);
2029 assert_eq!(
2030 token.get_details().unwrap(),
2031 vec![
2032 "名詞",
2033 "一般",
2034 "*",
2035 "*",
2036 "*",
2037 "*",
2038 "最寄り駅",
2039 "モヨリエキ",
2040 "モヨリエキ"
2041 ]
2042 );
2043 }
2044 {
2045 let token = tokens_iter.next().unwrap();
2046 assert_eq!(token.text, "は");
2047 assert_eq!(token.byte_start, 39);
2048 assert_eq!(token.byte_end, 42);
2049 assert_eq!(token.position, 3);
2050 assert_eq!(token.position_length, 1);
2051 assert_eq!(
2052 token.get_details().unwrap(),
2053 vec!["助詞", "係助詞", "*", "*", "*", "*", "は", "ハ", "ワ"]
2054 );
2055 }
2056 {
2057 let token = tokens_iter.next().unwrap();
2058 assert_eq!(token.text, "とうきょうスカイツリー駅");
2059 assert_eq!(token.byte_start, 42);
2060 assert_eq!(token.byte_end, 78);
2061 assert_eq!(token.position, 4);
2062 assert_eq!(token.position_length, 1);
2063 assert_eq!(
2064 token.get_details().unwrap(),
2065 vec![
2066 "カスタム名詞",
2067 "*",
2068 "*",
2069 "*",
2070 "*",
2071 "*",
2072 "とうきょうスカイツリー駅",
2073 "トウキョウスカイツリーエキ",
2074 "*"
2075 ]
2076 );
2077 }
2078 {
2079 let token = tokens_iter.next().unwrap();
2080 assert_eq!(token.text, "です");
2081 assert_eq!(token.byte_start, 78);
2082 assert_eq!(token.byte_end, 84);
2083 assert_eq!(token.position, 5);
2084 assert_eq!(token.position_length, 1);
2085 assert_eq!(
2086 token.get_details().unwrap(),
2087 vec![
2088 "助動詞",
2089 "*",
2090 "*",
2091 "*",
2092 "特殊・デス",
2093 "基本形",
2094 "です",
2095 "デス",
2096 "デス"
2097 ]
2098 );
2099 }
2100 {
2101 let token = tokens_iter.next().unwrap();
2102 assert_eq!(token.text, "。");
2103 assert_eq!(token.byte_start, 84);
2104 assert_eq!(token.byte_end, 87);
2105 assert_eq!(token.position, 6);
2106 assert_eq!(token.position_length, 1);
2107 assert_eq!(
2108 token.get_details().unwrap(),
2109 vec!["記号", "句点", "*", "*", "*", "*", "。", "。", "。"]
2110 );
2111 }
2112 }
2113
2114 #[test]
2115 #[cfg(feature = "unidic")]
2116 fn test_tokenize_with_simple_userdic_bin_unidic() {
2117 let dictionary = DictionaryConfig {
2118 kind: Some(DictionaryKind::UniDic),
2119 path: None,
2120 };
2121
2122 let userdic_file = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
2123 .join("../resources")
2124 .join("unidic_simple_userdic.bin");
2125
2126 let user_dictionary = Some(UserDictionaryConfig {
2127 kind: Some(DictionaryKind::UniDic),
2128 path: userdic_file,
2129 });
2130
2131 let config = TokenizerConfig {
2132 dictionary,
2133 user_dictionary,
2134 mode: Mode::Normal,
2135 };
2136
2137 let tokenizer = Tokenizer::from_config(config).unwrap();
2138 let mut tokens = tokenizer
2139 .tokenize("東京スカイツリーの最寄り駅はとうきょうスカイツリー駅です。")
2140 .unwrap();
2141 let mut tokens_iter = tokens.iter_mut();
2142 {
2143 let token = tokens_iter.next().unwrap();
2144 assert_eq!(token.text, "東京スカイツリー");
2145 assert_eq!(token.byte_start, 0);
2146 assert_eq!(token.byte_end, 24);
2147 assert_eq!(token.position, 0);
2148 assert_eq!(token.position_length, 1);
2149 assert_eq!(
2150 token.get_details().unwrap(),
2151 vec![
2152 "カスタム名詞",
2153 "*",
2154 "*",
2155 "*",
2156 "*",
2157 "*",
2158 "トウキョウスカイツリー",
2159 "*",
2160 "*",
2161 "*",
2162 "*",
2163 "*",
2164 "*",
2165 "*",
2166 "*",
2167 "*",
2168 "*"
2169 ]
2170 );
2171 }
2172 {
2173 let token = tokens_iter.next().unwrap();
2174 assert_eq!(token.text, "の");
2175 assert_eq!(token.byte_start, 24);
2176 assert_eq!(token.byte_end, 27);
2177 assert_eq!(token.position, 1);
2178 assert_eq!(token.position_length, 1);
2179 assert_eq!(
2180 token.get_details().unwrap(),
2181 vec![
2182 "助詞",
2183 "格助詞",
2184 "*",
2185 "*",
2186 "*",
2187 "*",
2188 "ノ",
2189 "の",
2190 "の",
2191 "ノ",
2192 "の",
2193 "ノ",
2194 "和",
2195 "*",
2196 "*",
2197 "*",
2198 "*"
2199 ]
2200 );
2201 }
2202 {
2203 let token = tokens_iter.next().unwrap();
2204 assert_eq!(token.text, "最寄り");
2205 assert_eq!(token.byte_start, 27);
2206 assert_eq!(token.byte_end, 36);
2207 assert_eq!(token.position, 2);
2208 assert_eq!(token.position_length, 1);
2209 assert_eq!(
2210 token.get_details().unwrap(),
2211 vec![
2212 "名詞",
2213 "普通名詞",
2214 "一般",
2215 "*",
2216 "*",
2217 "*",
2218 "モヨリ",
2219 "最寄り",
2220 "最寄り",
2221 "モヨリ",
2222 "最寄り",
2223 "モヨリ",
2224 "和",
2225 "*",
2226 "*",
2227 "*",
2228 "*"
2229 ]
2230 );
2231 }
2232 {
2233 let token = tokens_iter.next().unwrap();
2234 assert_eq!(token.text, "駅");
2235 assert_eq!(token.byte_start, 36);
2236 assert_eq!(token.byte_end, 39);
2237 assert_eq!(token.position, 3);
2238 assert_eq!(token.position_length, 1);
2239 assert_eq!(
2240 token.get_details().unwrap(),
2241 vec![
2242 "名詞",
2243 "普通名詞",
2244 "一般",
2245 "*",
2246 "*",
2247 "*",
2248 "エキ",
2249 "駅",
2250 "駅",
2251 "エキ",
2252 "駅",
2253 "エキ",
2254 "漢",
2255 "*",
2256 "*",
2257 "*",
2258 "*"
2259 ]
2260 );
2261 }
2262 {
2263 let token = tokens_iter.next().unwrap();
2264 assert_eq!(token.text, "は");
2265 assert_eq!(token.byte_start, 39);
2266 assert_eq!(token.byte_end, 42);
2267 assert_eq!(token.position, 4);
2268 assert_eq!(token.position_length, 1);
2269 assert_eq!(
2270 token.get_details().unwrap(),
2271 vec![
2272 "助詞",
2273 "係助詞",
2274 "*",
2275 "*",
2276 "*",
2277 "*",
2278 "ハ",
2279 "は",
2280 "は",
2281 "ワ",
2282 "は",
2283 "ワ",
2284 "和",
2285 "*",
2286 "*",
2287 "*",
2288 "*"
2289 ]
2290 );
2291 }
2292 {
2293 let token = tokens_iter.next().unwrap();
2294 assert_eq!(token.text, "とうきょうスカイツリー駅");
2295 assert_eq!(token.byte_start, 42);
2296 assert_eq!(token.byte_end, 78);
2297 assert_eq!(token.position, 5);
2298 assert_eq!(token.position_length, 1);
2299 assert_eq!(
2300 token.get_details().unwrap(),
2301 vec![
2302 "カスタム名詞",
2303 "*",
2304 "*",
2305 "*",
2306 "*",
2307 "*",
2308 "トウキョウスカイツリーエキ",
2309 "*",
2310 "*",
2311 "*",
2312 "*",
2313 "*",
2314 "*",
2315 "*",
2316 "*",
2317 "*",
2318 "*"
2319 ]
2320 );
2321 }
2322 {
2323 let token = tokens_iter.next().unwrap();
2324 assert_eq!(token.text, "です");
2325 assert_eq!(token.byte_start, 78);
2326 assert_eq!(token.byte_end, 84);
2327 assert_eq!(token.position, 6);
2328 assert_eq!(token.position_length, 1);
2329 assert_eq!(
2330 token.get_details().unwrap(),
2331 vec![
2332 "助動詞",
2333 "*",
2334 "*",
2335 "*",
2336 "助動詞-デス",
2337 "終止形-一般",
2338 "デス",
2339 "です",
2340 "です",
2341 "デス",
2342 "です",
2343 "デス",
2344 "和",
2345 "*",
2346 "*",
2347 "*",
2348 "*"
2349 ]
2350 );
2351 }
2352 {
2353 let token = tokens_iter.next().unwrap();
2354 assert_eq!(token.text, "。");
2355 assert_eq!(token.byte_start, 84);
2356 assert_eq!(token.byte_end, 87);
2357 assert_eq!(token.position, 7);
2358 assert_eq!(token.position_length, 1);
2359 assert_eq!(
2360 token.get_details().unwrap(),
2361 vec![
2362 "補助記号",
2363 "句点",
2364 "*",
2365 "*",
2366 "*",
2367 "*",
2368 "",
2369 "。",
2370 "。",
2371 "",
2372 "。",
2373 "",
2374 "記号",
2375 "*",
2376 "*",
2377 "*",
2378 "*"
2379 ]
2380 );
2381 }
2382 }
2383
2384 #[test]
2385 #[cfg(feature = "ko-dic")]
2386 fn test_tokenize_with_simple_userdic_bin_ko_dic() {
2387 let dictionary = DictionaryConfig {
2388 kind: Some(DictionaryKind::KoDic),
2389 path: None,
2390 };
2391
2392 let userdic_file = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
2393 .join("../resources")
2394 .join("ko-dic_simple_userdic.bin");
2395
2396 let user_dictionary = Some(UserDictionaryConfig {
2397 kind: Some(DictionaryKind::KoDic),
2398 path: userdic_file,
2399 });
2400
2401 let config = TokenizerConfig {
2402 dictionary,
2403 user_dictionary,
2404 mode: Mode::Normal,
2405 };
2406
2407 let tokenizer = Tokenizer::from_config(config).unwrap();
2408 let mut tokens = tokenizer.tokenize("하네다공항한정토트백.").unwrap();
2409 let mut tokens_iter = tokens.iter_mut();
2410 {
2411 let token = tokens_iter.next().unwrap();
2412 assert_eq!(token.text, "하네다공항");
2413 assert_eq!(token.byte_start, 0);
2414 assert_eq!(token.byte_end, 15);
2415 assert_eq!(token.position, 0);
2416 assert_eq!(token.position_length, 1);
2417 assert_eq!(
2418 token.get_details().unwrap(),
2419 vec!["NNP", "*", "*", "하네다공항", "*", "*", "*", "*"]
2420 );
2421 }
2422 {
2423 let token = tokens_iter.next().unwrap();
2424 assert_eq!(token.text, "한정");
2425 assert_eq!(token.byte_start, 15);
2426 assert_eq!(token.byte_end, 21);
2427 assert_eq!(token.position, 1);
2428 assert_eq!(token.position_length, 1);
2429 assert_eq!(
2430 token.get_details().unwrap(),
2431 vec!["NNG", "*", "T", "한정", "*", "*", "*", "*"]
2432 );
2433 }
2434 {
2435 let token = tokens_iter.next().unwrap();
2436 assert_eq!(token.text, "토트백");
2437 assert_eq!(token.byte_start, 21);
2438 assert_eq!(token.byte_end, 30);
2439 assert_eq!(token.position, 2);
2440 assert_eq!(token.position_length, 1);
2441 assert_eq!(
2442 token.get_details().unwrap(),
2443 vec![
2444 "NNG",
2445 "*",
2446 "T",
2447 "토트백",
2448 "Compound",
2449 "*",
2450 "*",
2451 "토트/NNP/인명+백/NNG/*"
2452 ]
2453 );
2454 }
2455 {
2456 let token = tokens_iter.next().unwrap();
2457 assert_eq!(token.text, ".");
2458 assert_eq!(token.byte_start, 30);
2459 assert_eq!(token.byte_end, 31);
2460 assert_eq!(token.position, 3);
2461 assert_eq!(token.position_length, 1);
2462 assert_eq!(
2463 token.get_details().unwrap(),
2464 vec!["SF", "*", "*", "*", "*", "*", "*", "*"]
2465 );
2466 }
2467 }
2468
2469 #[test]
2470 #[cfg(feature = "cc-cedict")]
2471 fn test_tokenize_with_simple_userdic_bin_cc_cedict() {
2472 let dictionary = DictionaryConfig {
2473 kind: Some(DictionaryKind::CcCedict),
2474 path: None,
2475 };
2476
2477 let userdic_file = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
2478 .join("../resources")
2479 .join("cc-cedict_simple_userdic.bin");
2480
2481 let user_dictionary = Some(UserDictionaryConfig {
2482 kind: Some(DictionaryKind::CcCedict),
2483 path: userdic_file,
2484 });
2485
2486 let config = TokenizerConfig {
2487 dictionary,
2488 user_dictionary,
2489 mode: Mode::Normal,
2490 };
2491
2492 let tokenizer = Tokenizer::from_config(config).unwrap();
2493 let mut tokens = tokenizer.tokenize("羽田机场限定托特包。").unwrap();
2494 let mut tokens_iter = tokens.iter_mut();
2495 {
2496 let token = tokens_iter.next().unwrap();
2497 assert_eq!(token.text, "羽田机场");
2498 assert_eq!(token.byte_start, 0);
2499 assert_eq!(token.byte_end, 12);
2500 assert_eq!(token.position, 0);
2501 assert_eq!(token.position_length, 1);
2502 assert_eq!(
2503 token.get_details().unwrap(),
2504 vec!["*", "*", "*", "*", "Yu3 tian2 ji1 chang3", "*", "*", "*"]
2505 );
2506 }
2507 {
2508 let token = tokens_iter.next().unwrap();
2509 assert_eq!(token.text, "限定");
2510 assert_eq!(token.byte_start, 12);
2511 assert_eq!(token.byte_end, 18);
2512 assert_eq!(token.position, 1);
2513 assert_eq!(token.position_length, 1);
2514 assert_eq!(
2515 token.get_details().unwrap(),
2516 vec![
2517 "*",
2518 "*",
2519 "*",
2520 "*",
2521 "xian4 ding4",
2522 "限定",
2523 "限定",
2524 "to restrict to/to limit/"
2525 ]
2526 );
2527 }
2528 {
2529 let token = tokens_iter.next().unwrap();
2530 assert_eq!(token.text, "托特");
2531 assert_eq!(token.byte_start, 18);
2532 assert_eq!(token.byte_end, 24);
2533 assert_eq!(token.position, 2);
2534 assert_eq!(token.position_length, 1);
2535 assert_eq!(
2536 token.get_details().unwrap(),
2537 vec![
2538 "*",
2539 "*",
2540 "*",
2541 "*",
2542 "tuo1 te4",
2543 "托特",
2544 "托特",
2545 "(loanword) tote (bag)/"
2546 ]
2547 );
2548 }
2549 {
2550 let token = tokens_iter.next().unwrap();
2551 assert_eq!(token.text, "包");
2552 assert_eq!(token.byte_start, 24);
2553 assert_eq!(token.byte_end, 27);
2554 assert_eq!(token.position, 3);
2555 assert_eq!(token.position_length, 1);
2556 assert_eq!(
2557 token.get_details().unwrap(),
2558 vec![
2559 "*",
2560 "*",
2561 "*",
2562 "*",
2563 "bao1",
2564 "包",
2565 "包",
2566 "to cover/to wrap/to hold/to include/to take charge of/to contract (to or for)/package/wrapper/container/bag/to hold or embrace/bundle/packet/CL:個|个[ge4]",
2567 "隻|只[zhi1]/"
2568 ]
2569 );
2570 }
2571 {
2572 let token = tokens_iter.next().unwrap();
2573 assert_eq!(token.text, "。");
2574 assert_eq!(token.byte_start, 27);
2575 assert_eq!(token.byte_end, 30);
2576 assert_eq!(token.position, 4);
2577 assert_eq!(token.position_length, 1);
2578 assert_eq!(token.get_details().unwrap(), vec!["UNK"]);
2579 }
2580 }
2581
    #[test]
    #[cfg(feature = "ipadic")]
    fn test_tokenize_with_detailed_userdic_ipadic() {
        let dictionary = DictionaryConfig {
            kind: Some(DictionaryKind::IPADIC),
            path: None,
        };

        let userdic_file = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .join("../resources")
            .join("ipadic_detailed_userdic.csv");

        let user_dictionary = Some(UserDictionaryConfig {
            kind: Some(DictionaryKind::IPADIC),
            path: userdic_file,
        });

        let config = TokenizerConfig {
            dictionary,
            user_dictionary,
            mode: Mode::Normal,
        };

        let tokenizer = Tokenizer::from_config(config).unwrap();
        let mut tokens = tokenizer
            .tokenize("東京スカイツリーの最寄り駅はとうきょうスカイツリー駅です。")
            .unwrap();
        let mut tokens_iter = tokens.iter_mut();
        {
            let token = tokens_iter.next().unwrap();
            assert_eq!(token.text, "東京スカイツリー");
            assert_eq!(token.byte_start, 0);
            assert_eq!(token.byte_end, 24);
            assert_eq!(token.position, 0);
            assert_eq!(token.position_length, 1);
            assert_eq!(
                token.get_details().unwrap(),
                vec![
                    "名詞",
                    "固有名詞",
                    "一般",
                    "カスタム名詞",
                    "*",
                    "*",
                    "東京スカイツリー",
                    "トウキョウスカイツリー",
                    "トウキョウスカイツリー"
                ]
            );
        }
        {
            let token = tokens_iter.next().unwrap();
            assert_eq!(token.text, "の");
            assert_eq!(token.byte_start, 24);
            assert_eq!(token.byte_end, 27);
            assert_eq!(token.position, 1);
            assert_eq!(token.position_length, 1);
            assert_eq!(
                token.get_details().unwrap(),
                vec!["助詞", "連体化", "*", "*", "*", "*", "の", "ノ", "ノ"]
            );
        }
        {
            let token = tokens_iter.next().unwrap();
            assert_eq!(token.text, "最寄り駅");
            assert_eq!(token.byte_start, 27);
            assert_eq!(token.byte_end, 39);
            assert_eq!(token.position, 2);
            assert_eq!(token.position_length, 1);
            assert_eq!(
                token.get_details().unwrap(),
                vec![
                    "名詞",
                    "一般",
                    "*",
                    "*",
                    "*",
                    "*",
                    "最寄り駅",
                    "モヨリエキ",
                    "モヨリエキ"
                ]
            );
        }
        {
            let token = tokens_iter.next().unwrap();
            assert_eq!(token.text, "は");
            assert_eq!(token.byte_start, 39);
            assert_eq!(token.byte_end, 42);
            assert_eq!(token.position, 3);
            assert_eq!(token.position_length, 1);
            assert_eq!(
                token.get_details().unwrap(),
                vec!["助詞", "係助詞", "*", "*", "*", "*", "は", "ハ", "ワ"]
            );
        }
        {
            let token = tokens_iter.next().unwrap();
            assert_eq!(token.text, "とうきょうスカイツリー駅");
            assert_eq!(token.byte_start, 42);
            assert_eq!(token.byte_end, 78);
            assert_eq!(token.position, 4);
            assert_eq!(token.position_length, 1);
            assert_eq!(
                token.get_details().unwrap(),
                vec![
                    "名詞",
                    "固有名詞",
                    "一般",
                    "カスタム名詞",
                    "*",
                    "*",
                    "とうきょうスカイツリー駅",
                    "トウキョウスカイツリーエキ",
                    "トウキョウスカイツリーエキ"
                ]
            );
        }
        {
            let token = tokens_iter.next().unwrap();
            assert_eq!(token.text, "です");
            assert_eq!(token.byte_start, 78);
            assert_eq!(token.byte_end, 84);
            assert_eq!(token.position, 5);
            assert_eq!(token.position_length, 1);
            assert_eq!(
                token.get_details().unwrap(),
                vec![
                    "助動詞",
                    "*",
                    "*",
                    "*",
                    "特殊・デス",
                    "基本形",
                    "です",
                    "デス",
                    "デス"
                ]
            );
        }
        {
            let token = tokens_iter.next().unwrap();
            assert_eq!(token.text, "。");
            assert_eq!(token.byte_start, 84);
            assert_eq!(token.byte_end, 87);
            assert_eq!(token.position, 6);
            assert_eq!(token.position_length, 1);
            assert_eq!(
                token.get_details().unwrap(),
                vec!["記号", "句点", "*", "*", "*", "*", "。", "。", "。"]
            );
        }
    }

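    // Same sentence, but with a user dictionary that mixes entry formats:
    // "東京スカイツリー" keeps full details, while "とうきょうスカイツリー駅"
    // is asserted in the simple-entry layout ("カスタム名詞", surface, reading, "*").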
    #[test]
    #[cfg(feature = "ipadic")]
    fn test_mixed_user_dict() {
        let dictionary = DictionaryConfig {
            kind: Some(DictionaryKind::IPADIC),
            path: None,
        };

        let userdic_file = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .join("../resources")
            .join("ipadic_mixed_userdic.csv");

        let user_dictionary = Some(UserDictionaryConfig {
            kind: Some(DictionaryKind::IPADIC),
            path: userdic_file,
        });

        let config = TokenizerConfig {
            dictionary,
            user_dictionary,
            mode: Mode::Normal,
        };

        let tokenizer = Tokenizer::from_config(config).unwrap();
        let mut tokens = tokenizer
            .tokenize("東京スカイツリーの最寄り駅はとうきょうスカイツリー駅です。")
            .unwrap();
        let mut tokens_iter = tokens.iter_mut();
        {
            let token = tokens_iter.next().unwrap();
            assert_eq!(token.text, "東京スカイツリー");
            assert_eq!(token.byte_start, 0);
            assert_eq!(token.byte_end, 24);
            assert_eq!(token.position, 0);
            assert_eq!(token.position_length, 1);
            assert_eq!(
                token.get_details().unwrap(),
                vec![
                    "名詞",
                    "固有名詞",
                    "一般",
                    "カスタム名詞",
                    "*",
                    "*",
                    "東京スカイツリー",
                    "トウキョウスカイツリー",
                    "トウキョウスカイツリー"
                ]
            );
        }
        {
            let token = tokens_iter.next().unwrap();
            assert_eq!(token.text, "の");
            assert_eq!(token.byte_start, 24);
            assert_eq!(token.byte_end, 27);
            assert_eq!(token.position, 1);
            assert_eq!(token.position_length, 1);
            assert_eq!(
                token.get_details().unwrap(),
                vec!["助詞", "連体化", "*", "*", "*", "*", "の", "ノ", "ノ"]
            );
        }
        {
            let token = tokens_iter.next().unwrap();
            assert_eq!(token.text, "最寄り駅");
            assert_eq!(token.byte_start, 27);
            assert_eq!(token.byte_end, 39);
            assert_eq!(token.position, 2);
            assert_eq!(token.position_length, 1);
            assert_eq!(
                token.get_details().unwrap(),
                vec![
                    "名詞",
                    "一般",
                    "*",
                    "*",
                    "*",
                    "*",
                    "最寄り駅",
                    "モヨリエキ",
                    "モヨリエキ"
                ]
            );
        }
        {
            let token = tokens_iter.next().unwrap();
            assert_eq!(token.text, "は");
            assert_eq!(token.byte_start, 39);
            assert_eq!(token.byte_end, 42);
            assert_eq!(token.position, 3);
            assert_eq!(token.position_length, 1);
            assert_eq!(
                token.get_details().unwrap(),
                vec!["助詞", "係助詞", "*", "*", "*", "*", "は", "ハ", "ワ"]
            );
        }
        {
            let token = tokens_iter.next().unwrap();
            assert_eq!(token.text, "とうきょうスカイツリー駅");
            assert_eq!(token.byte_start, 42);
            assert_eq!(token.byte_end, 78);
            assert_eq!(token.position, 4);
            assert_eq!(token.position_length, 1);
            assert_eq!(
                token.get_details().unwrap(),
                vec![
                    "カスタム名詞",
                    "*",
                    "*",
                    "*",
                    "*",
                    "*",
                    "とうきょうスカイツリー駅",
                    "トウキョウスカイツリーエキ",
                    "*"
                ]
            );
        }
        {
            let token = tokens_iter.next().unwrap();
            assert_eq!(token.text, "です");
            assert_eq!(token.byte_start, 78);
            assert_eq!(token.byte_end, 84);
            assert_eq!(token.position, 5);
            assert_eq!(token.position_length, 1);
            assert_eq!(
                token.get_details().unwrap(),
                vec![
                    "助動詞",
                    "*",
                    "*",
                    "*",
                    "特殊・デス",
                    "基本形",
                    "です",
                    "デス",
                    "デス"
                ]
            );
        }
        {
            let token = tokens_iter.next().unwrap();
            assert_eq!(token.text, "。");
            assert_eq!(token.byte_start, 84);
            assert_eq!(token.byte_end, 87);
            assert_eq!(token.position, 6);
            assert_eq!(token.position_length, 1);
            assert_eq!(
                token.get_details().unwrap(),
                vec!["記号", "句点", "*", "*", "*", "*", "。", "。", "。"]
            );
        }
    }

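    // A user dictionary entry with an unparsable word cost should make
    // Tokenizer::from_config fail; the unwrap then panics with
    // "failed to parse word cost".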
    #[test]
    #[cfg(feature = "ipadic")]
    #[should_panic(expected = "failed to parse word cost")]
    fn test_user_dict_invalid_word_cost() {
        let dictionary = DictionaryConfig {
            kind: Some(DictionaryKind::IPADIC),
            path: None,
        };

        let userdic_file = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .join("../resources")
            .join("ipadic_userdic_invalid_word_cost.csv");

        let user_dictionary = Some(UserDictionaryConfig {
            kind: Some(DictionaryKind::IPADIC),
            path: userdic_file,
        });

        let config = TokenizerConfig {
            dictionary,
            user_dictionary,
            mode: Mode::Normal,
        };

        Tokenizer::from_config(config).unwrap();
    }

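    // A user dictionary row with only 11 fields matches neither the simple
    // (3-field) nor the detailed (13+ field) format, so building the tokenizer
    // panics with the field-count error.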
    #[test]
    #[cfg(feature = "ipadic")]
    #[should_panic(expected = "user dictionary should be a CSV with 3 or 13+ fields")]
    fn test_user_dict_number_of_fields_is_11() {
        let dictionary = DictionaryConfig {
            kind: Some(DictionaryKind::IPADIC),
            path: None,
        };

        let userdic_file = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .join("../resources")
            .join("ipadic_userdic_insufficient_number_of_fields.csv");

        let user_dictionary = Some(UserDictionaryConfig {
            kind: Some(DictionaryKind::IPADIC),
            path: userdic_file,
        });

        let config = TokenizerConfig {
            dictionary,
            user_dictionary,
            mode: Mode::Normal,
        };

        Tokenizer::from_config(config).unwrap();
    }

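    // In normal mode the compound "羽田空港" stays a single token.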
    #[test]
    #[cfg(feature = "ipadic")]
    fn test_tokenize_with_normal_mode() {
        let dictionary = DictionaryConfig {
            kind: Some(DictionaryKind::IPADIC),
            path: None,
        };

        let config = TokenizerConfig {
            dictionary,
            user_dictionary: None,
            mode: Mode::Normal,
        };

        let tokenizer = Tokenizer::from_config(config).unwrap();
        let mut tokens = tokenizer.tokenize("羽田空港限定トートバッグ").unwrap();
        let mut tokens_iter = tokens.iter_mut();
        {
            let token = tokens_iter.next().unwrap();
            assert_eq!(token.text, "羽田空港");
            assert_eq!(token.byte_start, 0);
            assert_eq!(token.byte_end, 12);
            assert_eq!(token.position, 0);
            assert_eq!(token.position_length, 1);
            assert_eq!(
                token.get_details().unwrap(),
                vec![
                    "名詞",
                    "固有名詞",
                    "一般",
                    "*",
                    "*",
                    "*",
                    "羽田空港",
                    "ハネダクウコウ",
                    "ハネダクーコー"
                ]
            );
        }
        {
            let token = tokens_iter.next().unwrap();
            assert_eq!(token.text, "限定");
            assert_eq!(token.byte_start, 12);
            assert_eq!(token.byte_end, 18);
            assert_eq!(token.position, 1);
            assert_eq!(token.position_length, 1);
            assert_eq!(
                token.get_details().unwrap(),
                vec![
                    "名詞",
                    "サ変接続",
                    "*",
                    "*",
                    "*",
                    "*",
                    "限定",
                    "ゲンテイ",
                    "ゲンテイ"
                ]
            );
        }
        {
            let token = tokens_iter.next().unwrap();
            assert_eq!(token.text, "トートバッグ");
            assert_eq!(token.byte_start, 18);
            assert_eq!(token.byte_end, 36);
            assert_eq!(token.position, 2);
            assert_eq!(token.position_length, 1);
            assert_eq!(token.get_details().unwrap(), vec!["UNK"]);
        }
    }

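    // In decompose mode (with the default penalty) the same compound is split
    // into "羽田" + "空港"; the unknown word "トートバッグ" is still one token.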
    #[test]
    #[cfg(feature = "ipadic")]
    fn test_tokenize_with_decompose_mode() {
        let dictionary = DictionaryConfig {
            kind: Some(DictionaryKind::IPADIC),
            path: None,
        };

        let config = TokenizerConfig {
            dictionary,
            user_dictionary: None,
            mode: Mode::Decompose(Penalty::default()),
        };

        let tokenizer = Tokenizer::from_config(config).unwrap();
        let mut tokens = tokenizer.tokenize("羽田空港限定トートバッグ").unwrap();
        let mut tokens_iter = tokens.iter_mut();
        {
            let token = tokens_iter.next().unwrap();
            assert_eq!(token.text, "羽田");
            assert_eq!(token.byte_start, 0);
            assert_eq!(token.byte_end, 6);
            assert_eq!(token.position, 0);
            assert_eq!(token.position_length, 1);
            assert_eq!(
                token.get_details().unwrap(),
                vec![
                    "名詞",
                    "固有名詞",
                    "人名",
                    "姓",
                    "*",
                    "*",
                    "羽田",
                    "ハタ",
                    "ハタ"
                ]
            );
        }
        {
            let token = tokens_iter.next().unwrap();
            assert_eq!(token.text, "空港");
            assert_eq!(token.byte_start, 6);
            assert_eq!(token.byte_end, 12);
            assert_eq!(token.position, 1);
            assert_eq!(token.position_length, 1);
            assert_eq!(
                token.get_details().unwrap(),
                vec![
                    "名詞",
                    "一般",
                    "*",
                    "*",
                    "*",
                    "*",
                    "空港",
                    "クウコウ",
                    "クーコー"
                ]
            );
        }
        {
            let token = tokens_iter.next().unwrap();
            assert_eq!(token.text, "限定");
            assert_eq!(token.byte_start, 12);
            assert_eq!(token.byte_end, 18);
            assert_eq!(token.position, 2);
            assert_eq!(token.position_length, 1);
            assert_eq!(
                token.get_details().unwrap(),
                vec![
                    "名詞",
                    "サ変接続",
                    "*",
                    "*",
                    "*",
                    "*",
                    "限定",
                    "ゲンテイ",
                    "ゲンテイ"
                ]
            );
        }
        {
            let token = tokens_iter.next().unwrap();
            assert_eq!(token.text, "トートバッグ");
            assert_eq!(token.byte_start, 18);
            assert_eq!(token.byte_end, 36);
            assert_eq!(token.position, 3);
            assert_eq!(token.position_length, 1);
            assert_eq!(token.get_details().unwrap(), vec!["UNK"]);
        }
    }

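    // Smoke test on a long input: tokenizing the whole bocchan.txt resource
    // should succeed and return a non-empty token list.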
    #[test]
    #[cfg(feature = "ipadic")]
    fn test_long_text() {
        let mut large_file = BufReader::new(
            File::open(
                PathBuf::from(env!("CARGO_MANIFEST_DIR"))
                    .join("../resources")
                    .join("bocchan.txt"),
            )
            .unwrap(),
        );
        let mut large_text = String::new();
        let _size = large_file.read_to_string(&mut large_text).unwrap();

        let dictionary = DictionaryConfig {
            kind: Some(DictionaryKind::IPADIC),
            path: None,
        };

        let config = TokenizerConfig {
            dictionary,
            user_dictionary: None,
            mode: Mode::Normal,
        };

        let tokenizer = Tokenizer::from_config(config).unwrap();

        let tokens = tokenizer.tokenize(large_text.as_str()).unwrap();
        assert!(!tokens.is_empty());
    }
}