use std::borrow::Cow;
use std::path::Path;

use mecab_ko_dict::{SystemDictionary, UserDictionary};

use crate::error::Result;
use crate::lattice::{Lattice, Node, NodeBuilder, NodeType};
use crate::normalizer::{NormalizationConfig, Normalizer};
use crate::pool::{PoolManager, PoolStats};
use crate::pos_tag::PosTag;
use crate::unknown::UnknownHandler;
use crate::viterbi::{SpacePenalty, ViterbiSearcher};

/// A single morpheme produced by tokenization.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
    /// Surface form as it appears in the input text.
    pub surface: String,

    /// Part-of-speech tag (e.g. `NNG` for a common noun).
    pub pos: String,

    /// Start offset in characters.
    pub start_pos: usize,

    /// End offset in characters (exclusive).
    pub end_pos: usize,

    /// Start offset in bytes.
    pub start_byte: usize,

    /// End offset in bytes (exclusive).
    pub end_byte: usize,

    /// Reading from the dictionary features, if present.
    pub reading: Option<String>,

    /// Lemma from the dictionary features, if present.
    pub lemma: Option<String>,

    /// Total path cost assigned by the Viterbi search.
    pub cost: i32,

    /// Raw comma-separated feature string from the dictionary.
    pub features: String,

    /// Normalized surface form, set when normalization is enabled.
    pub normalized: Option<String>,
}

impl Token {
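    /// Creates a token with the given surface, POS tag, and offsets;
    /// the remaining fields start out empty.
    ///
    /// # Example
    ///
    /// A minimal sketch mirroring the unit tests below. Offsets are
    /// counted in characters and bytes respectively; a Hangul syllable
    /// occupies three bytes in UTF-8.
    ///
    /// ```ignore
    /// let token = Token::new("안녕".to_string(), "NNG".to_string(), 0, 2, 0, 6);
    /// assert_eq!(token.char_len(), 2);
    /// assert_eq!(token.byte_len(), 6);
    /// ```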
    #[must_use]
    pub const fn new(
        surface: String,
        pos: String,
        start_pos: usize,
        end_pos: usize,
        start_byte: usize,
        end_byte: usize,
    ) -> Self {
        Self {
            surface,
            pos,
            start_pos,
            end_pos,
            start_byte,
            end_byte,
            reading: None,
            lemma: None,
            cost: 0,
            features: String::new(),
            normalized: None,
        }
    }

    /// Builds a token from a lattice node, parsing its feature string
    /// into POS tag, reading, and lemma.
    #[must_use]
    pub fn from_node(node: &Node) -> Self {
        let features = node.feature.to_string();
        let (pos, reading, lemma) = parse_features(&features);

        Self {
            surface: node.surface.to_string(),
            pos: pos.to_string(),
            start_pos: node.start_pos,
            end_pos: node.end_pos,
            start_byte: node.start_byte,
            end_byte: node.end_byte,
            reading,
            lemma,
            cost: node.total_cost,
            features,
            normalized: None,
        }
    }

    /// Length of the token in characters.
    #[inline]
    #[must_use]
    pub const fn char_len(&self) -> usize {
        self.end_pos - self.start_pos
    }

    /// Length of the token in bytes.
    #[inline]
    #[must_use]
    pub const fn byte_len(&self) -> usize {
        self.end_byte - self.start_byte
    }

    /// Parses the POS string into a [`PosTag`], if it is a known tag.
    #[must_use]
    pub fn pos_tag(&self) -> Option<PosTag> {
        self.pos.parse().ok()
    }
}

/// Splits a dictionary feature string (e.g. `NNG,*,T,안녕,*,*,*,*`) into
/// its POS tag, reading, and lemma. The lemma currently mirrors the
/// reading.
fn parse_features(features: &str) -> (Cow<'_, str>, Option<String>, Option<String>) {
    let mut split = features.splitn(5, ',');

    let pos = split.next().unwrap_or("*");

    // After `next()`, `nth(2)` skips two fields and yields the fourth
    // (the reading); `*` marks an empty field in the dictionary.
    let reading = split
        .nth(2)
        .filter(|s| !s.is_empty() && *s != "*")
        .map(std::string::ToString::to_string);

    let lemma = reading.clone();

    (Cow::Borrowed(pos), reading, lemma)
}

/// Korean morphological analyzer combining dictionary lookup,
/// unknown-word handling, and a Viterbi search over a lattice.
pub struct Tokenizer {
    /// System dictionary providing the trie, entries, and connection matrix.
    dictionary: SystemDictionary,

    /// Generates candidate nodes for text not covered by the dictionary.
    unknown_handler: UnknownHandler,

    /// Finds the lowest-cost path through the lattice.
    viterbi_searcher: ViterbiSearcher,

    /// Reusable lattice, reset for each input.
    lattice: Lattice,

    /// Optional normalizer for token surfaces.
    normalizer: Option<Normalizer>,

    /// Whether normalized forms are filled in during tokenization.
    enable_normalization: bool,

    /// Object pools reused across tokenization calls.
    pool_manager: PoolManager,
}

impl Tokenizer {
    /// Creates a tokenizer with the default system dictionary.
    ///
    /// # Errors
    ///
    /// Returns an error if the default dictionary cannot be loaded.
    pub fn new() -> Result<Self> {
        let dictionary = SystemDictionary::load_default()?;
        let unknown_handler = UnknownHandler::korean_default();
        let viterbi_searcher =
            ViterbiSearcher::new().with_space_penalty(SpacePenalty::korean_default());

        let lattice = Lattice::new("");

        Ok(Self {
            dictionary,
            unknown_handler,
            viterbi_searcher,
            lattice,
            normalizer: None,
            enable_normalization: false,
            pool_manager: PoolManager::new(),
        })
    }

    /// Creates a tokenizer with a system dictionary loaded from `dict_path`.
    ///
    /// # Errors
    ///
    /// Returns an error if the dictionary cannot be loaded from the path.
    pub fn with_dict<P: AsRef<Path>>(dict_path: P) -> Result<Self> {
        let dictionary = SystemDictionary::load(dict_path)?;
        let unknown_handler = UnknownHandler::korean_default();
        let viterbi_searcher =
            ViterbiSearcher::new().with_space_penalty(SpacePenalty::korean_default());

        let lattice = Lattice::new("");

        Ok(Self {
            dictionary,
            unknown_handler,
            viterbi_searcher,
            lattice,
            normalizer: None,
            enable_normalization: false,
            pool_manager: PoolManager::new(),
        })
    }

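    /// Attaches a user dictionary, builder-style.
    ///
    /// # Example
    ///
    /// A minimal sketch mirroring the unit tests below; the entry's cost
    /// and tag are illustrative:
    ///
    /// ```ignore
    /// let mut user_dict = UserDictionary::new();
    /// user_dict.add_entry("딥러닝", "NNG", Some(-1000), None);
    /// let tokenizer = Tokenizer::new()?.with_user_dict(user_dict);
    /// ```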
    #[must_use]
    pub fn with_user_dict(mut self, user_dict: UserDictionary) -> Self {
        self.dictionary.set_user_dictionary(user_dict);
        self
    }

    /// Replaces the user dictionary on an existing tokenizer.
    pub fn set_user_dict(&mut self, user_dict: UserDictionary) {
        self.dictionary.set_user_dictionary(user_dict);
    }

    /// Sets the space penalty, builder-style. Note that this rebuilds the
    /// Viterbi searcher, discarding any other searcher configuration.
    #[must_use]
    pub fn with_space_penalty(mut self, penalty: SpacePenalty) -> Self {
        self.viterbi_searcher = ViterbiSearcher::new().with_space_penalty(penalty);
        self
    }

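    /// Tokenizes `text` into morphemes by building a lattice of dictionary
    /// and unknown-word candidates, then taking the lowest-cost Viterbi path.
    ///
    /// # Example
    ///
    /// A minimal sketch; assumes a tokenizer was constructed successfully:
    ///
    /// ```ignore
    /// let mut tokenizer = Tokenizer::new()?;
    /// for token in tokenizer.tokenize("아버지가방에들어가신다") {
    ///     println!("{}/{}", token.surface, token.pos);
    /// }
    /// ```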
    pub fn tokenize(&mut self, text: &str) -> Vec<Token> {
        if text.is_empty() {
            return Vec::new();
        }

        self.lattice.reset(text);

        self.build_lattice();

        let path = self
            .viterbi_searcher
            .search(&mut self.lattice, self.dictionary.matrix());

        path.iter()
            .filter_map(|&node_id| self.lattice.node(node_id))
            .map(Token::from_node)
            .collect()
    }

    /// Populates the lattice with dictionary and unknown-word nodes at
    /// every character position.
    fn build_lattice(&mut self) {
        let char_len = self.lattice.char_len();

        for pos in 0..char_len {
            let has_dict_entry = self.add_dict_nodes(pos);

            self.unknown_handler
                .add_unknown_nodes(&mut self.lattice, pos, has_dict_entry);
        }
    }

    /// Adds all dictionary entries (system and user) whose surface is a
    /// prefix of the text starting at `start_pos`. Returns `true` if at
    /// least one node was added.
    fn add_dict_nodes(&mut self, start_pos: usize) -> bool {
        let char_len = self.lattice.char_len();
        let search_text: &str = self.lattice.substring(start_pos, char_len);

        if search_text.is_empty() {
            return false;
        }

        let match_indices: Vec<(u32, usize)> = self
            .dictionary
            .trie()
            .common_prefix_search(search_text)
            .collect();

        let user_entries: Vec<_> = self
            .dictionary
            .user_dictionary()
            .map(|ud| ud.common_prefix_search(search_text))
            .unwrap_or_default();

        let mut found = false;

        for (index, byte_len) in match_indices {
            if let Some(entry) = self.dictionary.get_entry(index) {
                let end_pos = self
                    .lattice
                    .char_pos_from_start_and_byte_len(start_pos, byte_len);

                self.lattice.add_node(
                    NodeBuilder::new(&entry.surface, start_pos, end_pos)
                        .left_id(entry.left_id)
                        .right_id(entry.right_id)
                        .word_cost(i32::from(entry.cost))
                        .node_type(NodeType::Known)
                        .feature(&entry.feature),
                );

                found = true;
            }
        }

        for user_entry in user_entries {
            let surface_char_len = user_entry.surface.chars().count();
            let end_pos = start_pos + surface_char_len;

            self.lattice.add_node(
                NodeBuilder::new(&user_entry.surface, start_pos, end_pos)
                    .left_id(user_entry.left_id)
                    .right_id(user_entry.right_id)
                    .word_cost(i32::from(user_entry.cost))
                    .node_type(NodeType::User)
                    .feature(&user_entry.feature),
            );

            found = true;
        }

        found
    }

    /// Builds the lattice for `text` without running the Viterbi search,
    /// and returns a reference to it. Empty input leaves the previous
    /// lattice untouched.
    pub fn tokenize_to_lattice(&mut self, text: &str) -> &Lattice {
        if !text.is_empty() {
            self.lattice.reset(text);
            self.build_lattice();
        }
        &self.lattice
    }

    /// Returns just the surface strings, in order (wakati-style output).
    pub fn wakati(&mut self, text: &str) -> Vec<String> {
        self.tokenize(text).into_iter().map(|t| t.surface).collect()
    }

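    /// Extracts only the nouns: tokens whose POS tag starts with `NN`
    /// (e.g. `NNG`, `NNP`).
    ///
    /// # Example
    ///
    /// A minimal sketch mirroring the unit tests below:
    ///
    /// ```ignore
    /// let mut tokenizer = Tokenizer::new()?;
    /// let nouns = tokenizer.nouns("아버지가방에");
    /// assert!(nouns.contains(&"아버지".to_string()));
    /// ```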
    pub fn nouns(&mut self, text: &str) -> Vec<String> {
        self.tokenize(text)
            .into_iter()
            .filter(|t| t.pos.starts_with("NN"))
            .map(|t| t.surface)
            .collect()
    }

    /// Returns the morpheme surfaces; an alias for [`wakati`](Self::wakati).
    pub fn morphs(&mut self, text: &str) -> Vec<String> {
        self.wakati(text)
    }

    /// Returns `(surface, pos)` pairs for each token.
    pub fn pos(&mut self, text: &str) -> Vec<(String, String)> {
        self.tokenize(text)
            .into_iter()
            .map(|t| (t.surface, t.pos))
            .collect()
    }

    /// Returns the system dictionary.
    #[must_use]
    pub const fn dictionary(&self) -> &SystemDictionary {
        &self.dictionary
    }

    /// Returns statistics for the current lattice.
    #[must_use]
    pub fn lattice_stats(&self) -> crate::lattice::LatticeStats {
        self.lattice.stats()
    }

    /// Returns statistics for the object pools.
    #[must_use]
    pub fn pool_stats(&self) -> PoolStats {
        self.pool_manager.stats()
    }

    /// Returns a snapshot of memory usage. Only the lattice and pool
    /// figures are currently tracked; the remaining fields are zero.
    #[must_use]
    pub fn memory_stats(&self) -> crate::memory::MemoryStats {
        crate::memory::MemoryStats {
            dictionary_bytes: 0,
            lattice_bytes: self.lattice.memory_usage(),
            pool_bytes: self.pool_manager.total_memory_usage(),
            cache_bytes: 0,
            interner_bytes: 0,
            token_bytes: 0,
        }
    }

    /// Clears all object pools.
    pub fn clear_pools(&self) {
        self.pool_manager.clear_all();
    }

    /// Enables or disables normalization. When enabling, a normalizer is
    /// built from `config` (or the default configuration if `None`).
    ///
    /// # Errors
    ///
    /// Returns an error if the normalizer cannot be constructed.
    pub fn set_normalization(
        &mut self,
        enable: bool,
        config: Option<NormalizationConfig>,
    ) -> Result<()> {
        self.enable_normalization = enable;

        if enable {
            let normalizer_config = config.unwrap_or_default();
            self.normalizer = Some(Normalizer::new(normalizer_config)?);
        } else {
            self.normalizer = None;
        }

        Ok(())
    }

    /// Returns the normalizer, if one is configured.
    #[must_use]
    pub const fn normalizer(&self) -> Option<&Normalizer> {
        self.normalizer.as_ref()
    }

    /// Returns whether normalization is enabled.
    #[must_use]
    pub const fn is_normalization_enabled(&self) -> bool {
        self.enable_normalization
    }

    /// Tokenizes `text` and, if a normalizer is configured, fills in each
    /// token's `normalized` field.
    pub fn tokenize_with_normalization(&mut self, text: &str) -> Vec<Token> {
        let mut tokens = self.tokenize(text);

        if let Some(normalizer) = &self.normalizer {
            for token in &mut tokens {
                token.normalized = Some(normalizer.normalize(&token.surface));
            }
        }

        tokens
    }

    /// Returns the normalized (standard) form of `word` together with its
    /// known variants. Without a normalizer, the word is returned unchanged
    /// with no variants.
    #[must_use]
    pub fn get_word_variants(&self, word: &str) -> (String, Vec<String>) {
        self.normalizer.as_ref().map_or_else(
            || (word.to_string(), Vec::new()),
            |normalizer| {
                let standard = normalizer.normalize(word);
                let variants = normalizer.get_variants(&standard);
                (standard, variants)
            },
        )
    }
}

#[cfg(test)]
#[allow(clippy::expect_used, clippy::vec_init_then_push)]
mod tests {
    use super::*;
    use mecab_ko_dict::{matrix::DenseMatrix, trie::TrieBuilder, DictEntry};
    /// Builds a tokenizer over a tiny in-memory dictionary covering the
    /// phrase used throughout these tests.
    fn create_test_tokenizer() -> Tokenizer {
        let mut trie_entries = vec![
            ("아버지", 0u32),
            ("가", 1),
            ("방", 2),
            ("에", 3),
            ("들어가", 4),
            ("신다", 5),
        ];
        let trie_bytes = TrieBuilder::build_unsorted(&mut trie_entries).expect("should build trie");
        let trie = mecab_ko_dict::Trie::from_vec(trie_bytes);

        let matrix = DenseMatrix::new(10, 10, 100);
        let matrix = mecab_ko_dict::matrix::ConnectionMatrix::Dense(matrix);

        let mut entries = Vec::new();
        entries.push(DictEntry::new(
            "아버지",
            1,
            1,
            1000,
            "NNG,*,T,아버지,*,*,*,*",
        ));
        entries.push(DictEntry::new("가", 5, 5, 500, "JKS,*,F,가,*,*,*,*"));
        entries.push(DictEntry::new("방", 2, 2, 2000, "NNG,*,T,방,*,*,*,*"));
        entries.push(DictEntry::new("에", 6, 6, 400, "JKB,*,F,에,*,*,*,*"));
        entries.push(DictEntry::new(
            "들어가",
            3,
            3,
            1500,
            "VV,*,F,들어가다,*,*,*,*",
        ));
        entries.push(DictEntry::new("신다", 4, 4, 1800, "VV+EP,*,F,신다,*,*,*,*"));

        let dictionary = SystemDictionary::new_test(
            std::path::PathBuf::from("./test_dic"),
            trie,
            matrix,
            entries,
        );

        let unknown_handler = UnknownHandler::korean_default();
        let viterbi_searcher =
            ViterbiSearcher::new().with_space_penalty(SpacePenalty::korean_default());
        let lattice = Lattice::new("");

        Tokenizer {
            dictionary,
            unknown_handler,
            viterbi_searcher,
            lattice,
            normalizer: None,
            enable_normalization: false,
            pool_manager: PoolManager::new(),
        }
    }

    #[test]
    fn test_token_creation() {
        let token = Token::new("안녕".to_string(), "NNG".to_string(), 0, 2, 0, 6);

        assert_eq!(token.surface, "안녕");
        assert_eq!(token.pos, "NNG");
        assert_eq!(token.start_pos, 0);
        assert_eq!(token.end_pos, 2);
        assert_eq!(token.char_len(), 2);
        assert_eq!(token.byte_len(), 6);
    }

    #[test]
    fn test_parse_features() {
        let features = "NNG,*,T,안녕,*,*,*,*";
        let (pos, reading, lemma) = parse_features(features);

        assert_eq!(pos, "NNG");
        assert_eq!(reading, Some("안녕".to_string()));
        assert_eq!(lemma, Some("안녕".to_string()));
    }

    #[test]
    fn test_parse_features_no_reading() {
        let features = "JKS,*,F,*,*,*,*,*";
        let (pos, reading, _lemma) = parse_features(features);

        assert_eq!(pos, "JKS");
        assert_eq!(reading, None);
    }

    #[test]
    fn test_tokenize_simple() {
        let mut tokenizer = create_test_tokenizer();
        let tokens = tokenizer.tokenize("아버지");

        assert!(!tokens.is_empty());
        assert_eq!(tokens[0].surface, "아버지");
        assert_eq!(tokens[0].pos, "NNG");
    }

    #[test]
    fn test_tokenize_with_particle() {
        let mut tokenizer = create_test_tokenizer();
        let tokens = tokenizer.tokenize("아버지가");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].surface, "아버지");
        assert_eq!(tokens[0].pos, "NNG");
        assert_eq!(tokens[1].surface, "가");
        assert_eq!(tokens[1].pos, "JKS");
    }

    #[test]
    fn test_tokenize_complex() {
        let mut tokenizer = create_test_tokenizer();
        let tokens = tokenizer.tokenize("아버지가방에들어가신다");

        assert!(!tokens.is_empty());

        assert_eq!(tokens[0].surface, "아버지");
    }

    #[test]
    fn test_tokenize_empty() {
        let mut tokenizer = create_test_tokenizer();
        let tokens = tokenizer.tokenize("");

        assert!(tokens.is_empty());
    }

    #[test]
    fn test_tokenize_with_spaces() {
        let mut tokenizer = create_test_tokenizer();
        let tokens = tokenizer.tokenize("아버지 가방");

        assert!(!tokens.is_empty());
    }

    #[test]
    fn test_wakati() {
        let mut tokenizer = create_test_tokenizer();
        let surfaces = tokenizer.wakati("아버지가");

        assert_eq!(surfaces.len(), 2);
        assert_eq!(surfaces[0], "아버지");
        assert_eq!(surfaces[1], "가");
    }

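    // morphs() is documented above as an alias for wakati(); verify that
    // the two return identical output.
    #[test]
    fn test_morphs_matches_wakati() {
        let mut tokenizer = create_test_tokenizer();
        assert_eq!(tokenizer.morphs("아버지가"), tokenizer.wakati("아버지가"));
    }
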
    #[test]
    fn test_nouns() {
        let mut tokenizer = create_test_tokenizer();
        let nouns = tokenizer.nouns("아버지가방에");

        assert!(nouns.contains(&"아버지".to_string()));
        assert!(nouns.contains(&"방".to_string()));
        // "가" is a particle (JKS), so it must not be reported as a noun.
        assert!(!nouns.contains(&"가".to_string()));
    }

    #[test]
    fn test_pos() {
        let mut tokenizer = create_test_tokenizer();
        let pos_tags = tokenizer.pos("아버지가");

        assert_eq!(pos_tags.len(), 2);
        assert_eq!(pos_tags[0], ("아버지".to_string(), "NNG".to_string()));
        assert_eq!(pos_tags[1], ("가".to_string(), "JKS".to_string()));
    }

    #[test]
    fn test_tokenize_to_lattice() {
        let mut tokenizer = create_test_tokenizer();
        let lattice = tokenizer.tokenize_to_lattice("아버지가");

        // More nodes than the final token count: the lattice holds every
        // candidate, not just the best path.
        assert!(lattice.node_count() > 2);

        let stats = lattice.stats();
        assert!(stats.total_nodes > 2);
    }

    #[test]
    fn test_lattice_stats() {
        let mut tokenizer = create_test_tokenizer();
        tokenizer.tokenize("아버지가");

        let stats = tokenizer.lattice_stats();
        assert!(stats.total_nodes > 0);
        assert!(stats.char_length > 0);
    }

    #[test]
    fn test_token_positions() {
        let mut tokenizer = create_test_tokenizer();
        let tokens = tokenizer.tokenize("아버지가");

        // "아버지" spans characters 0..3, "가" spans 3..4.
        assert_eq!(tokens[0].start_pos, 0);
        assert_eq!(tokens[0].end_pos, 3);

        assert_eq!(tokens[1].start_pos, 3);
        assert_eq!(tokens[1].end_pos, 4);
    }

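    // Complements the character-position test above: Hangul syllables are
    // three bytes each in UTF-8, so the byte spans should run 0..9 and 9..12.
    #[test]
    fn test_token_byte_positions() {
        let mut tokenizer = create_test_tokenizer();
        let tokens = tokenizer.tokenize("아버지가");

        assert_eq!(tokens[0].start_byte, 0);
        assert_eq!(tokens[0].end_byte, 9);
        assert_eq!(tokens[1].start_byte, 9);
        assert_eq!(tokens[1].end_byte, 12);
    }
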
    #[test]
    fn test_multiple_tokenize_calls() {
        let mut tokenizer = create_test_tokenizer();

        let tokens1 = tokenizer.tokenize("아버지");
        assert!(!tokens1.is_empty());

        let tokens2 = tokenizer.tokenize("가방");
        assert!(!tokens2.is_empty());

        // The lattice is reset between calls, so results do not leak across.
        assert_ne!(tokens1[0].surface, tokens2[0].surface);
    }

    #[test]
    fn test_token_from_node() {
        use crate::lattice::Node;
        use std::borrow::Cow;

        let node = Node {
            id: 1,
            surface: Cow::Borrowed("테스트"),
            start_pos: 0,
            end_pos: 3,
            start_byte: 0,
            end_byte: 9,
            left_id: 1,
            right_id: 1,
            word_cost: 1000,
            total_cost: 1500,
            prev_node_id: 0,
            node_type: NodeType::Known,
            feature: Cow::Borrowed("NNG,*,T,테스트,*,*,*,*"),
            has_space_before: false,
        };

        let token = Token::from_node(&node);

        assert_eq!(token.surface, "테스트");
        assert_eq!(token.pos, "NNG");
        assert_eq!(token.start_pos, 0);
        assert_eq!(token.end_pos, 3);
        assert_eq!(token.reading, Some("테스트".to_string()));
        assert_eq!(token.cost, 1500);
    }

    #[test]
    fn test_with_user_dict() {
        let mut tokenizer = create_test_tokenizer();

        let mut user_dict = UserDictionary::new();
        user_dict.add_entry("딥러닝", "NNG", Some(-1000), None);

        tokenizer.set_user_dict(user_dict);

        assert!(tokenizer.dictionary().user_dictionary().is_some());
    }
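
    // The following checks are grounded in the default state produced by
    // create_test_tokenizer(): no normalizer is configured.
    #[test]
    fn test_normalization_disabled_by_default() {
        let tokenizer = create_test_tokenizer();
        assert!(!tokenizer.is_normalization_enabled());
        assert!(tokenizer.normalizer().is_none());
    }

    #[test]
    fn test_tokenize_with_normalization_disabled() {
        let mut tokenizer = create_test_tokenizer();
        // Without a normalizer, tokenize_with_normalization leaves the
        // normalized field unset.
        let tokens = tokenizer.tokenize_with_normalization("아버지가");
        assert!(!tokens.is_empty());
        assert!(tokens.iter().all(|t| t.normalized.is_none()));
    }

    #[test]
    fn test_get_word_variants_without_normalizer() {
        let tokenizer = create_test_tokenizer();
        // With no normalizer, the word comes back unchanged with no variants.
        let (standard, variants) = tokenizer.get_word_variants("아버지");
        assert_eq!(standard, "아버지");
        assert!(variants.is_empty());
    }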
}