//! Korean morphological tokenizer: builds a lattice over the input from
//! dictionary and unknown-word candidates, then selects the lowest-cost
//! path with Viterbi search.

use std::borrow::Cow;
use std::path::Path;

use mecab_ko_dict::{SystemDictionary, UserDictionary};

use crate::error::Result;
use crate::lattice::{Lattice, Node, NodeBuilder, NodeType};
use crate::normalizer::{NormalizationConfig, Normalizer};
use crate::pool::{PoolManager, PoolStats};
use crate::pos_tag::PosTag;
use crate::unknown::UnknownHandler;
use crate::viterbi::{SpacePenalty, ViterbiSearcher};
/// A single morpheme produced by tokenization.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
    /// Surface form as it appears in the input text.
    pub surface: String,

    /// Part-of-speech tag (e.g. `NNG`, `JKS`).
    pub pos: String,

    /// Start position in characters.
    pub start_pos: usize,

    /// End position in characters (exclusive).
    pub end_pos: usize,

    /// Start position in bytes.
    pub start_byte: usize,

    /// End position in bytes (exclusive).
    pub end_byte: usize,

    /// Reading extracted from the feature string, if present.
    pub reading: Option<String>,

    /// Lemma (dictionary form), if present.
    pub lemma: Option<String>,

    /// Total path cost assigned by Viterbi search.
    pub cost: i32,

    /// Raw comma-separated feature string from the dictionary.
    pub features: String,

    /// Normalized form, set when normalization is enabled.
    pub normalized: Option<String>,
}

impl Token {
    /// Creates a token with the given surface, POS tag, and character/byte
    /// span; the remaining fields are left at their defaults.
    #[must_use]
    pub const fn new(
        surface: String,
        pos: String,
        start_pos: usize,
        end_pos: usize,
        start_byte: usize,
        end_byte: usize,
    ) -> Self {
        Self {
            surface,
            pos,
            start_pos,
            end_pos,
            start_byte,
            end_byte,
            reading: None,
            lemma: None,
            cost: 0,
            features: String::new(),
            normalized: None,
        }
    }

    /// Builds a token from a lattice node, extracting POS, reading, and
    /// lemma from the node's feature string.
    #[must_use]
    pub fn from_node(node: &Node) -> Self {
        let features = node.feature.to_string();
        let (pos, reading, lemma) = parse_features(&features);

        Self {
            surface: node.surface.to_string(),
            pos: pos.to_string(),
            start_pos: node.start_pos,
            end_pos: node.end_pos,
            start_byte: node.start_byte,
            end_byte: node.end_byte,
            reading,
            lemma,
            cost: node.total_cost,
            features,
            normalized: None,
        }
    }

    /// Length of the token in characters.
    #[inline]
    #[must_use]
    pub const fn char_len(&self) -> usize {
        self.end_pos - self.start_pos
    }

    /// Length of the token in bytes.
    #[inline]
    #[must_use]
    pub const fn byte_len(&self) -> usize {
        self.end_byte - self.start_byte
    }

    /// Parses the POS string into a typed [`PosTag`], if it is recognized.
    #[must_use]
    pub fn pos_tag(&self) -> Option<PosTag> {
        self.pos.parse().ok()
    }
}

/// Parses a comma-separated dictionary feature string, returning the POS tag
/// (first field) together with the reading and lemma when present.
fn parse_features(features: &str) -> (Cow<'_, str>, Option<String>, Option<String>) {
    let mut split = features.splitn(5, ',');

    let pos = split.next().unwrap_or("*");

    // The reading is the fourth field; `nth(2)` skips the two fields between
    // it and the POS tag. Empty fields and `*` placeholders count as absent.
    let reading = split
        .nth(2)
        .filter(|s| !s.is_empty() && *s != "*")
        .map(std::string::ToString::to_string);

    // This feature layout carries no separate lemma field, so the reading
    // doubles as the lemma.
    let lemma = reading.clone();

    (Cow::Borrowed(pos), reading, lemma)
}

/// A Korean morphological tokenizer combining a system dictionary, an
/// unknown-word handler, and Viterbi search over a reusable lattice.
pub struct Tokenizer {
    /// System dictionary (trie, entries, and connection matrix).
    dictionary: SystemDictionary,

    /// Produces candidate nodes for text not covered by the dictionary.
    unknown_handler: UnknownHandler,

    /// Searches the lattice for the lowest-cost path.
    viterbi_searcher: ViterbiSearcher,

    /// Reusable lattice, reset for each input.
    lattice: Lattice,

    /// Optional surface normalizer.
    normalizer: Option<Normalizer>,

    /// Whether `tokenize_with_normalization` fills `Token::normalized`.
    enable_normalization: bool,

    /// Manager for the crate's internal object pools.
    pool_manager: PoolManager,
}

impl Tokenizer {
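    /// Creates a tokenizer backed by the default system dictionary, with
    /// Korean-tuned unknown-word handling and space penalties.
    ///
    /// # Errors
    ///
    /// Returns an error if the default dictionary cannot be loaded.
    ///
    /// A minimal usage sketch (marked `ignore` because it assumes the
    /// default dictionary is installed and the crate's items are in scope):
    ///
    /// ```ignore
    /// let mut tokenizer = Tokenizer::new()?;
    /// for token in tokenizer.tokenize("아버지가방에들어가신다") {
    ///     println!("{}\t{}", token.surface, token.pos);
    /// }
    /// ```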
    pub fn new() -> Result<Self> {
        let dictionary = SystemDictionary::load_default()?;
        let unknown_handler = UnknownHandler::korean_default();
        let viterbi_searcher =
            ViterbiSearcher::new().with_space_penalty(SpacePenalty::korean_default());

        let lattice = Lattice::new("");

        Ok(Self {
            dictionary,
            unknown_handler,
            viterbi_searcher,
            lattice,
            normalizer: None,
            enable_normalization: false,
            pool_manager: PoolManager::new(),
        })
    }

    /// Creates a tokenizer whose system dictionary is loaded from
    /// `dict_path`, with the same Korean-tuned defaults as [`Tokenizer::new`].
    ///
    /// # Errors
    ///
    /// Returns an error if the dictionary cannot be loaded from `dict_path`.
    pub fn with_dict<P: AsRef<Path>>(dict_path: P) -> Result<Self> {
        let dictionary = SystemDictionary::load(dict_path)?;
        let unknown_handler = UnknownHandler::korean_default();
        let viterbi_searcher =
            ViterbiSearcher::new().with_space_penalty(SpacePenalty::korean_default());

        let lattice = Lattice::new("");

        Ok(Self {
            dictionary,
            unknown_handler,
            viterbi_searcher,
            lattice,
            normalizer: None,
            enable_normalization: false,
            pool_manager: PoolManager::new(),
        })
    }

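    /// Attaches a user dictionary, builder-style. User entries are added to
    /// the lattice alongside system entries during tokenization.
    ///
    /// A sketch of typical use (marked `ignore`; assumes crate paths and a
    /// loadable default dictionary):
    ///
    /// ```ignore
    /// let mut user_dict = UserDictionary::new();
    /// user_dict.add_entry("딥러닝", "NNG", Some(-1000), None);
    /// let mut tokenizer = Tokenizer::new()?.with_user_dict(user_dict);
    /// ```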
    #[must_use]
    pub fn with_user_dict(mut self, user_dict: UserDictionary) -> Self {
        self.dictionary.set_user_dictionary(user_dict);
        self
    }

    /// Sets or replaces the user dictionary in place.
    pub fn set_user_dict(&mut self, user_dict: UserDictionary) {
        self.dictionary.set_user_dictionary(user_dict);
    }

    /// Sets the space penalty, builder-style.
    ///
    /// Note that this rebuilds the Viterbi searcher, so any other searcher
    /// configuration is reset to its defaults.
    #[must_use]
    pub fn with_space_penalty(mut self, penalty: SpacePenalty) -> Self {
        self.viterbi_searcher = ViterbiSearcher::new().with_space_penalty(penalty);
        self
    }

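    /// Tokenizes `text` into morpheme tokens by building a lattice over the
    /// input and running Viterbi search for the lowest-cost path. Returns an
    /// empty vector for empty input.
    ///
    /// A sketch (marked `ignore`; assumes a loadable dictionary):
    ///
    /// ```ignore
    /// let mut tokenizer = Tokenizer::new()?;
    /// let tokens = tokenizer.tokenize("아버지가");
    /// assert_eq!(tokens[0].surface, "아버지");
    /// ```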
    pub fn tokenize(&mut self, text: &str) -> Vec<Token> {
        if text.is_empty() {
            return Vec::new();
        }

        self.lattice.reset(text);

        self.build_lattice();

        // Find the lowest-cost path and convert its nodes into tokens.
        let path = self
            .viterbi_searcher
            .search(&mut self.lattice, self.dictionary.matrix());

        path.iter()
            .filter_map(|&node_id| self.lattice.node(node_id))
            .map(Token::from_node)
            .collect()
    }

    /// Populates the lattice with dictionary and unknown-word nodes for
    /// every character position of the current input.
    fn build_lattice(&mut self) {
        let char_len = self.lattice.char_len();

        for pos in 0..char_len {
            let has_dict_entry = self.add_dict_nodes(pos);

            // The unknown handler is told whether a dictionary entry was
            // found so it can decide whether to add fallback nodes.
            self.unknown_handler
                .add_unknown_nodes(&mut self.lattice, pos, has_dict_entry);
        }
    }

    /// Adds dictionary nodes (system and user) for every entry matching a
    /// prefix of the text starting at `start_pos`. Returns whether any entry
    /// was added.
    fn add_dict_nodes(&mut self, start_pos: usize) -> bool {
        let char_len = self.lattice.char_len();
        let search_text: &str = self.lattice.substring(start_pos, char_len);

        if search_text.is_empty() {
            return false;
        }

        // System-dictionary entries that are prefixes of the remaining text,
        // as (entry index, matched byte length) pairs.
        let match_indices: Vec<(u32, usize)> = self
            .dictionary
            .trie()
            .common_prefix_search(search_text)
            .collect();

        // User-dictionary matches, if a user dictionary is attached.
        let user_entries: Vec<_> = self
            .dictionary
            .user_dictionary()
            .map(|ud| ud.common_prefix_search(search_text))
            .unwrap_or_default();

        let mut found = false;

        for (index, byte_len) in match_indices {
            if let Some(entry) = self.dictionary.get_entry(index) {
                let end_pos = self
                    .lattice
                    .char_pos_from_start_and_byte_len(start_pos, byte_len);

                self.lattice.add_node(
                    NodeBuilder::new(&entry.surface, start_pos, end_pos)
                        .left_id(entry.left_id)
                        .right_id(entry.right_id)
                        .word_cost(i32::from(entry.cost))
                        .node_type(NodeType::Known)
                        .feature(&entry.feature),
                );

                found = true;
            }
        }

        for user_entry in user_entries {
            let surface_char_len = user_entry.surface.chars().count();
            let end_pos = start_pos + surface_char_len;

            self.lattice.add_node(
                NodeBuilder::new(&user_entry.surface, start_pos, end_pos)
                    .left_id(user_entry.left_id)
                    .right_id(user_entry.right_id)
                    .word_cost(i32::from(user_entry.cost))
                    .node_type(NodeType::User)
                    .feature(&user_entry.feature),
            );

            found = true;
        }

        found
    }

    /// Builds the lattice for `text` without running Viterbi search and
    /// returns a reference to it, which is useful for inspecting candidate
    /// nodes. For empty input the previous lattice is returned unchanged.
    pub fn tokenize_to_lattice(&mut self, text: &str) -> &Lattice {
        if !text.is_empty() {
            self.lattice.reset(text);
            self.build_lattice();
        }
        &self.lattice
    }

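    /// Tokenizes `text` and returns only the surface forms, in the style of
    /// MeCab's wakati output.
    ///
    /// A sketch (marked `ignore`; assumes a loadable dictionary):
    ///
    /// ```ignore
    /// let mut tokenizer = Tokenizer::new()?;
    /// assert_eq!(tokenizer.wakati("아버지가"), vec!["아버지", "가"]);
    /// ```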
    pub fn wakati(&mut self, text: &str) -> Vec<String> {
        self.tokenize(text).into_iter().map(|t| t.surface).collect()
    }

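    /// Tokenizes `text` and returns only the nouns, i.e. tokens whose POS
    /// tag starts with `NN`.
    ///
    /// A sketch (marked `ignore`; assumes a loadable dictionary):
    ///
    /// ```ignore
    /// let mut tokenizer = Tokenizer::new()?;
    /// let nouns = tokenizer.nouns("아버지가방에");
    /// assert!(nouns.contains(&"아버지".to_string()));
    /// ```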
    pub fn nouns(&mut self, text: &str) -> Vec<String> {
        self.tokenize(text)
            .into_iter()
            .filter(|t| t.pos.starts_with("NN"))
            .map(|t| t.surface)
            .collect()
    }

    /// Alias for [`wakati`](Self::wakati), following the KoNLPy-style naming.
    pub fn morphs(&mut self, text: &str) -> Vec<String> {
        self.wakati(text)
    }

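    /// Tokenizes `text` and returns `(surface, POS tag)` pairs.
    ///
    /// A sketch (marked `ignore`; assumes a loadable dictionary):
    ///
    /// ```ignore
    /// let mut tokenizer = Tokenizer::new()?;
    /// let pairs = tokenizer.pos("아버지가");
    /// assert_eq!(pairs[0], ("아버지".to_string(), "NNG".to_string()));
    /// ```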
    pub fn pos(&mut self, text: &str) -> Vec<(String, String)> {
        self.tokenize(text)
            .into_iter()
            .map(|t| (t.surface, t.pos))
            .collect()
    }

    /// Returns a reference to the system dictionary.
    #[must_use]
    pub const fn dictionary(&self) -> &SystemDictionary {
        &self.dictionary
    }

    /// Returns statistics for the most recently built lattice.
    pub fn lattice_stats(&self) -> crate::lattice::LatticeStats {
        self.lattice.stats()
    }

    /// Returns statistics for the internal object pools.
    #[must_use]
    pub fn pool_stats(&self) -> PoolStats {
        self.pool_manager.stats()
    }

    /// Clears all internal object pools.
    pub fn clear_pools(&self) {
        self.pool_manager.clear_all();
    }

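    /// Enables or disables surface normalization. When enabling, a
    /// [`Normalizer`] is built from `config` (or the default configuration
    /// if `None`); when disabling, any existing normalizer is dropped.
    ///
    /// # Errors
    ///
    /// Returns an error if the normalizer cannot be constructed.
    ///
    /// A sketch (marked `ignore`; assumes a loadable dictionary):
    ///
    /// ```ignore
    /// let mut tokenizer = Tokenizer::new()?;
    /// tokenizer.set_normalization(true, None)?;
    /// assert!(tokenizer.is_normalization_enabled());
    /// ```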
    pub fn set_normalization(
        &mut self,
        enable: bool,
        config: Option<NormalizationConfig>,
    ) -> Result<()> {
        self.enable_normalization = enable;

        if enable {
            let normalizer_config = config.unwrap_or_default();
            self.normalizer = Some(Normalizer::new(normalizer_config)?);
        } else {
            self.normalizer = None;
        }

        Ok(())
    }

    /// Returns the active normalizer, if one is configured.
    #[must_use]
    pub const fn normalizer(&self) -> Option<&Normalizer> {
        self.normalizer.as_ref()
    }

    /// Returns whether normalization is enabled.
    #[must_use]
    pub const fn is_normalization_enabled(&self) -> bool {
        self.enable_normalization
    }

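    /// Tokenizes `text` and, if a normalizer is active, fills each token's
    /// `normalized` field with the normalized surface form.
    ///
    /// A sketch (marked `ignore`; assumes a loadable dictionary):
    ///
    /// ```ignore
    /// let mut tokenizer = Tokenizer::new()?;
    /// tokenizer.set_normalization(true, None)?;
    /// let tokens = tokenizer.tokenize_with_normalization("아버지가");
    /// assert!(tokens[0].normalized.is_some());
    /// ```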
    pub fn tokenize_with_normalization(&mut self, text: &str) -> Vec<Token> {
        let mut tokens = self.tokenize(text);

        if let Some(normalizer) = &self.normalizer {
            for token in &mut tokens {
                token.normalized = Some(normalizer.normalize(&token.surface));
            }
        }

        tokens
    }

    /// Returns the standard form of `word` together with its known variants.
    /// Without an active normalizer, returns the word unchanged and no
    /// variants.
    #[must_use]
    pub fn get_word_variants(&self, word: &str) -> (String, Vec<String>) {
        self.normalizer.as_ref().map_or_else(
            || (word.to_string(), Vec::new()),
            |normalizer| {
                let standard = normalizer.normalize(word);
                let variants = normalizer.get_variants(&standard);
                (standard, variants)
            },
        )
    }
}

#[cfg(test)]
#[allow(clippy::expect_used, clippy::vec_init_then_push)]
mod tests {
    use super::*;
    use mecab_ko_dict::{matrix::DenseMatrix, trie::TrieBuilder, DictEntry};

    /// Builds a tokenizer over a small in-memory dictionary covering the
    /// classic test sentence "아버지가방에들어가신다".
    fn create_test_tokenizer() -> Tokenizer {
        let mut trie_entries = vec![
            ("아버지", 0u32),
            ("가", 1),
            ("방", 2),
            ("에", 3),
            ("들어가", 4),
            ("신다", 5),
        ];
        let trie_bytes = TrieBuilder::build_unsorted(&mut trie_entries).expect("should build trie");
        let trie = mecab_ko_dict::Trie::from_vec(trie_bytes);

        let matrix = DenseMatrix::new(10, 10, 100);
        let matrix = mecab_ko_dict::matrix::ConnectionMatrix::Dense(matrix);

        let mut entries = Vec::new();
        entries.push(DictEntry::new(
            "아버지",
            1,
            1,
            1000,
            "NNG,*,T,아버지,*,*,*,*",
        ));
        entries.push(DictEntry::new("가", 5, 5, 500, "JKS,*,F,가,*,*,*,*"));
        entries.push(DictEntry::new("방", 2, 2, 2000, "NNG,*,T,방,*,*,*,*"));
        entries.push(DictEntry::new("에", 6, 6, 400, "JKB,*,F,에,*,*,*,*"));
        entries.push(DictEntry::new(
            "들어가",
            3,
            3,
            1500,
            "VV,*,F,들어가다,*,*,*,*",
        ));
        entries.push(DictEntry::new("신다", 4, 4, 1800, "VV+EP,*,F,신다,*,*,*,*"));

        let dictionary = SystemDictionary::new_test(
            std::path::PathBuf::from("./test_dic"),
            trie,
            matrix,
            entries,
        );

        let unknown_handler = UnknownHandler::korean_default();
        let viterbi_searcher =
            ViterbiSearcher::new().with_space_penalty(SpacePenalty::korean_default());
        let lattice = Lattice::new("");

        Tokenizer {
            dictionary,
            unknown_handler,
            viterbi_searcher,
            lattice,
            normalizer: None,
            enable_normalization: false,
            pool_manager: PoolManager::new(),
        }
    }

    #[test]
    fn test_token_creation() {
        let token = Token::new("안녕".to_string(), "NNG".to_string(), 0, 2, 0, 6);

        assert_eq!(token.surface, "안녕");
        assert_eq!(token.pos, "NNG");
        assert_eq!(token.start_pos, 0);
        assert_eq!(token.end_pos, 2);
        assert_eq!(token.char_len(), 2);
        assert_eq!(token.byte_len(), 6);
    }

    #[test]
    fn test_parse_features() {
        let features = "NNG,*,T,안녕,*,*,*,*";
        let (pos, reading, lemma) = parse_features(features);

        assert_eq!(pos, "NNG");
        assert_eq!(reading, Some("안녕".to_string()));
        assert_eq!(lemma, Some("안녕".to_string()));
    }

    #[test]
    fn test_parse_features_no_reading() {
        let features = "JKS,*,F,*,*,*,*,*";
        let (pos, reading, _lemma) = parse_features(features);

        assert_eq!(pos, "JKS");
        assert_eq!(reading, None);
    }

    #[test]
    fn test_tokenize_simple() {
        let mut tokenizer = create_test_tokenizer();
        let tokens = tokenizer.tokenize("아버지");

        assert!(!tokens.is_empty());
        assert_eq!(tokens[0].surface, "아버지");
        assert_eq!(tokens[0].pos, "NNG");
    }

    #[test]
    fn test_tokenize_with_particle() {
        let mut tokenizer = create_test_tokenizer();
        let tokens = tokenizer.tokenize("아버지가");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].surface, "아버지");
        assert_eq!(tokens[0].pos, "NNG");
        assert_eq!(tokens[1].surface, "가");
        assert_eq!(tokens[1].pos, "JKS");
    }

    #[test]
    fn test_tokenize_complex() {
        let mut tokenizer = create_test_tokenizer();
        let tokens = tokenizer.tokenize("아버지가방에들어가신다");

        assert!(!tokens.is_empty());

        assert_eq!(tokens[0].surface, "아버지");
    }

    #[test]
    fn test_tokenize_empty() {
        let mut tokenizer = create_test_tokenizer();
        let tokens = tokenizer.tokenize("");

        assert!(tokens.is_empty());
    }

    #[test]
    fn test_tokenize_with_spaces() {
        let mut tokenizer = create_test_tokenizer();
        let tokens = tokenizer.tokenize("아버지 가방");

        assert!(!tokens.is_empty());
    }

    #[test]
    fn test_wakati() {
        let mut tokenizer = create_test_tokenizer();
        let surfaces = tokenizer.wakati("아버지가");

        assert_eq!(surfaces.len(), 2);
        assert_eq!(surfaces[0], "아버지");
        assert_eq!(surfaces[1], "가");
    }

    #[test]
    fn test_nouns() {
        let mut tokenizer = create_test_tokenizer();
        let nouns = tokenizer.nouns("아버지가방에");

        assert!(nouns.contains(&"아버지".to_string()));
        assert!(nouns.contains(&"방".to_string()));
        assert!(!nouns.contains(&"가".to_string()));
    }

    #[test]
    fn test_pos() {
        let mut tokenizer = create_test_tokenizer();
        let pos_tags = tokenizer.pos("아버지가");

        assert_eq!(pos_tags.len(), 2);
        assert_eq!(pos_tags[0], ("아버지".to_string(), "NNG".to_string()));
        assert_eq!(pos_tags[1], ("가".to_string(), "JKS".to_string()));
    }

    #[test]
    fn test_tokenize_to_lattice() {
        let mut tokenizer = create_test_tokenizer();
        let lattice = tokenizer.tokenize_to_lattice("아버지가");

        assert!(lattice.node_count() > 2);

        let stats = lattice.stats();
        assert!(stats.total_nodes > 2);
    }

    #[test]
    fn test_lattice_stats() {
        let mut tokenizer = create_test_tokenizer();
        tokenizer.tokenize("아버지가");

        let stats = tokenizer.lattice_stats();
        assert!(stats.total_nodes > 0);
        assert!(stats.char_length > 0);
    }

    #[test]
    fn test_token_positions() {
        let mut tokenizer = create_test_tokenizer();
        let tokens = tokenizer.tokenize("아버지가");

        assert_eq!(tokens[0].start_pos, 0);
        assert_eq!(tokens[0].end_pos, 3);

        assert_eq!(tokens[1].start_pos, 3);
        assert_eq!(tokens[1].end_pos, 4);
    }

    #[test]
    fn test_multiple_tokenize_calls() {
        let mut tokenizer = create_test_tokenizer();

        let tokens1 = tokenizer.tokenize("아버지");
        assert!(!tokens1.is_empty());

        let tokens2 = tokenizer.tokenize("가방");
        assert!(!tokens2.is_empty());

        assert_ne!(tokens1[0].surface, tokens2[0].surface);
    }

    #[test]
    fn test_token_from_node() {
        use crate::lattice::Node;
        use std::borrow::Cow;

        let node = Node {
            id: 1,
            surface: Cow::Borrowed("테스트"),
            start_pos: 0,
            end_pos: 3,
            start_byte: 0,
            end_byte: 9,
            left_id: 1,
            right_id: 1,
            word_cost: 1000,
            total_cost: 1500,
            prev_node_id: 0,
            node_type: NodeType::Known,
            feature: Cow::Borrowed("NNG,*,T,테스트,*,*,*,*"),
            has_space_before: false,
        };

        let token = Token::from_node(&node);

        assert_eq!(token.surface, "테스트");
        assert_eq!(token.pos, "NNG");
        assert_eq!(token.start_pos, 0);
        assert_eq!(token.end_pos, 3);
        assert_eq!(token.reading, Some("테스트".to_string()));
        assert_eq!(token.cost, 1500);
    }

    #[test]
    fn test_with_user_dict() {
        let mut tokenizer = create_test_tokenizer();

        let mut user_dict = UserDictionary::new();
        user_dict.add_entry("딥러닝", "NNG", Some(-1000), None);

        tokenizer.set_user_dict(user_dict);

        assert!(tokenizer.dictionary().user_dictionary().is_some());
    }
}