use std::borrow::Cow;
use std::path::Path;

use mecab_ko_dict::{SystemDictionary, UserDictionary};

use crate::error::Result;
use crate::lattice::{Lattice, Node, NodeBuilder, NodeType};
use crate::normalizer::{NormalizationConfig, Normalizer};
use crate::pool::{PoolManager, PoolStats};
use crate::pos_tag::PosTag;
use crate::unknown::UnknownHandler;
use crate::viterbi::{SpacePenalty, ViterbiSearcher};

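/// A single token (morpheme) produced by [`Tokenizer`].
///
/// Positions are tracked both in characters (`start_pos`/`end_pos`) and in
/// bytes (`start_byte`/`end_byte`). `reading` and `lemma` are parsed from the
/// dictionary feature string when present; `normalized` is populated only by
/// [`Tokenizer::tokenize_with_normalization`].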
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
    pub surface: String,
    pub pos: String,
    pub start_pos: usize,
    pub end_pos: usize,
    pub start_byte: usize,
    pub end_byte: usize,
    pub reading: Option<String>,
    pub lemma: Option<String>,
    pub cost: i32,
    pub features: String,
    pub normalized: Option<String>,
}

impl Token {
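    /// Creates a token from its surface form, POS tag, and character/byte
    /// spans. `reading`, `lemma`, and `normalized` start out empty, and
    /// `cost` starts at zero.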
    #[must_use]
    pub const fn new(
        surface: String,
        pos: String,
        start_pos: usize,
        end_pos: usize,
        start_byte: usize,
        end_byte: usize,
    ) -> Self {
        Self {
            surface,
            pos,
            start_pos,
            end_pos,
            start_byte,
            end_byte,
            reading: None,
            lemma: None,
            cost: 0,
            features: String::new(),
            normalized: None,
        }
    }

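    /// Builds a token from a lattice [`Node`], parsing the POS tag, reading,
    /// and lemma out of the node's feature string.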
    #[must_use]
    pub fn from_node(node: &Node) -> Self {
        let features = node.feature.to_string();
        let (pos, reading, lemma) = parse_features(&features);

        Self {
            surface: node.surface.to_string(),
            pos: pos.to_string(),
            start_pos: node.start_pos,
            end_pos: node.end_pos,
            start_byte: node.start_byte,
            end_byte: node.end_byte,
            reading,
            lemma,
            cost: node.total_cost,
            features,
            normalized: None,
        }
    }

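    /// Length of the token in characters.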
    #[inline]
    #[must_use]
    pub const fn char_len(&self) -> usize {
        self.end_pos - self.start_pos
    }

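    /// Length of the token in bytes.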
    #[inline]
    #[must_use]
    pub const fn byte_len(&self) -> usize {
        self.end_byte - self.start_byte
    }

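    /// Parses the POS string into a typed [`PosTag`], if it is recognized.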
    #[must_use]
    pub fn pos_tag(&self) -> Option<PosTag> {
        self.pos.parse().ok()
    }
}

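/// Extracts `(pos, reading, lemma)` from a comma-separated dictionary feature
/// string, e.g. `"NNG,*,T,안녕,*,*,*,*"`: the first field is the POS tag and
/// the fourth is the reading, with `*` or an empty field meaning "none". The
/// lemma is currently taken to be identical to the reading.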
fn parse_features(features: &str) -> (Cow<'_, str>, Option<String>, Option<String>) {
    let mut split = features.splitn(5, ',');

    let pos = split.next().unwrap_or("*");

    let reading = split
        .nth(2)
        .filter(|s| !s.is_empty() && *s != "*")
        .map(std::string::ToString::to_string);

    let lemma = reading.clone();

    (Cow::Borrowed(pos), reading, lemma)
}

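/// A Korean morphological analyzer.
///
/// Combines a system dictionary (plus an optional user dictionary), an
/// unknown-word handler, and a Viterbi searcher over a reusable lattice. The
/// lattice and pools are reused across calls, which is why the tokenization
/// methods take `&mut self`.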
pub struct Tokenizer {
    dictionary: SystemDictionary,
    unknown_handler: UnknownHandler,
    viterbi_searcher: ViterbiSearcher,
    lattice: Lattice,
    normalizer: Option<Normalizer>,
    enable_normalization: bool,
    pool_manager: PoolManager,
}

impl Tokenizer {
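    /// Creates a tokenizer backed by the default system dictionary.
    ///
    /// # Errors
    ///
    /// Returns an error if the default dictionary cannot be loaded.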
    pub fn new() -> Result<Self> {
        let dictionary = SystemDictionary::load_default()?;
        let unknown_handler = UnknownHandler::korean_default();
        let viterbi_searcher = ViterbiSearcher::new();

        let lattice = Lattice::new("");

        Ok(Self {
            dictionary,
            unknown_handler,
            viterbi_searcher,
            lattice,
            normalizer: None,
            enable_normalization: false,
            pool_manager: PoolManager::new(),
        })
    }

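    /// Creates a tokenizer backed by the dictionary at `dict_path`.
    ///
    /// # Errors
    ///
    /// Returns an error if the dictionary cannot be loaded from `dict_path`.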
    pub fn with_dict<P: AsRef<Path>>(dict_path: P) -> Result<Self> {
        let dictionary = SystemDictionary::load(dict_path)?;
        let unknown_handler = UnknownHandler::korean_default();
        let viterbi_searcher = ViterbiSearcher::new();

        let lattice = Lattice::new("");

        Ok(Self {
            dictionary,
            unknown_handler,
            viterbi_searcher,
            lattice,
            normalizer: None,
            enable_normalization: false,
            pool_manager: PoolManager::new(),
        })
    }

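    /// Attaches a user dictionary, builder-style. User entries are searched
    /// alongside the system dictionary during lattice construction.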
    #[must_use]
    pub fn with_user_dict(mut self, user_dict: UserDictionary) -> Self {
        self.dictionary.set_user_dictionary(user_dict);
        self
    }

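    /// Replaces the user dictionary on an existing tokenizer.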
    pub fn set_user_dict(&mut self, user_dict: UserDictionary) {
        self.dictionary.set_user_dictionary(user_dict);
    }

    #[cfg(feature = "hot-reload-v2")]
    pub fn set_hot_reload(
        &mut self,
        hr: std::sync::Arc<mecab_ko_dict::hot_reload_v2::HotReloadDictV2>,
    ) {
        self.dictionary.set_hot_reload(hr);
    }

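    /// Sets the space penalty used during Viterbi search, builder-style. Note
    /// that this replaces the searcher with a freshly configured one.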
    #[must_use]
    pub fn with_space_penalty(mut self, penalty: SpacePenalty) -> Self {
        self.viterbi_searcher = ViterbiSearcher::new().with_space_penalty(penalty);
        self
    }

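    /// Tokenizes `text` into morpheme [`Token`]s.
    ///
    /// Builds a lattice over the input, runs Viterbi search against the
    /// dictionary's connection matrix, and converts the best path into
    /// tokens. Empty input yields an empty vector.
    ///
    /// Illustrative usage (marked `ignore`; assumes a default dictionary is
    /// available):
    ///
    /// ```ignore
    /// let mut tokenizer = Tokenizer::new()?;
    /// let tokens = tokenizer.tokenize("아버지가");
    /// assert_eq!(tokens[0].surface, "아버지");
    /// ```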
    pub fn tokenize(&mut self, text: &str) -> Vec<Token> {
        if text.is_empty() {
            return Vec::new();
        }

        self.lattice.reset(text);

        self.build_lattice();

        let path = self
            .viterbi_searcher
            .search(&mut self.lattice, self.dictionary.matrix());

        path.iter()
            .filter_map(|&node_id| self.lattice.node(node_id))
            .map(Token::from_node)
            .collect()
    }

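    /// Populates the lattice: at each character position, add every
    /// dictionary match, then let the unknown-word handler add fallback
    /// nodes (it is told whether a dictionary entry already starts there).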
    fn build_lattice(&mut self) {
        let char_len = self.lattice.char_len();

        for pos in 0..char_len {
            let has_dict_entry = self.add_dict_nodes(pos);

            self.unknown_handler
                .add_unknown_nodes(&mut self.lattice, pos, has_dict_entry);
        }
    }

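    /// Runs a common-prefix search from `start_pos` against the system
    /// dictionary and, if present, the user dictionary, adding a lattice
    /// node per match. Returns `true` if at least one node was added.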
    fn add_dict_nodes(&mut self, start_pos: usize) -> bool {
        let char_len = self.lattice.char_len();
        let search_text: &str = self.lattice.substring(start_pos, char_len);

        if search_text.is_empty() {
            return false;
        }

        let dict_entries: Vec<_> = self
            .dictionary
            .common_prefix_search(search_text)
            .unwrap_or_default();

        let user_entries: Vec<_> = self
            .dictionary
            .user_dictionary()
            .map(|ud| ud.common_prefix_search(search_text))
            .unwrap_or_default();

        let mut found = false;

        for (entry, byte_len) in dict_entries {
            let end_pos = self
                .lattice
                .char_pos_from_start_and_byte_len(start_pos, byte_len);

            self.lattice.add_node(
                NodeBuilder::new(&entry.surface, start_pos, end_pos)
                    .left_id(entry.left_id)
                    .right_id(entry.right_id)
                    .word_cost(i32::from(entry.cost))
                    .node_type(NodeType::Known)
                    .feature(&entry.feature),
            );

            found = true;
        }

        for user_entry in user_entries {
            let surface_char_len = user_entry.surface.chars().count();
            let end_pos = start_pos + surface_char_len;

            self.lattice.add_node(
                NodeBuilder::new(&user_entry.surface, start_pos, end_pos)
                    .left_id(user_entry.left_id)
                    .right_id(user_entry.right_id)
                    .word_cost(i32::from(user_entry.cost))
                    .node_type(NodeType::User)
                    .feature(&user_entry.feature),
            );

            found = true;
        }

        found
    }

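    /// Builds the lattice for `text` without running Viterbi search and
    /// returns a reference to it, which is useful for inspecting candidate
    /// nodes. For empty input the previous lattice is returned unchanged.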
    pub fn tokenize_to_lattice(&mut self, text: &str) -> &Lattice {
        if !text.is_empty() {
            self.lattice.reset(text);
            self.build_lattice();
        }
        &self.lattice
    }

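    /// Returns only the surface forms, in order (named after MeCab's
    /// `-Owakati` output mode).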
    pub fn wakati(&mut self, text: &str) -> Vec<String> {
        self.tokenize(text).into_iter().map(|t| t.surface).collect()
    }

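    /// Returns the surfaces of noun tokens, i.e. tokens whose POS tag
    /// starts with `NN` (NNG, NNP, NNB, ...).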
    pub fn nouns(&mut self, text: &str) -> Vec<String> {
        self.tokenize(text)
            .into_iter()
            .filter(|t| t.pos.starts_with("NN"))
            .map(|t| t.surface)
            .collect()
    }

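    /// Alias for [`wakati`](Self::wakati), mirroring the KoNLPy-style
    /// `morphs` API.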
    pub fn morphs(&mut self, text: &str) -> Vec<String> {
        self.wakati(text)
    }

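    /// Returns `(surface, pos)` pairs for each token, mirroring the
    /// KoNLPy-style `pos` API.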
    pub fn pos(&mut self, text: &str) -> Vec<(String, String)> {
        self.tokenize(text)
            .into_iter()
            .map(|t| (t.surface, t.pos))
            .collect()
    }

    #[must_use]
    pub const fn dictionary(&self) -> &SystemDictionary {
        &self.dictionary
    }

    #[must_use]
    pub fn lattice_stats(&self) -> crate::lattice::LatticeStats {
        self.lattice.stats()
    }

    #[must_use]
    pub fn pool_stats(&self) -> PoolStats {
        self.pool_manager.stats()
    }

    #[must_use]
    pub fn memory_stats(&self) -> crate::memory::MemoryStats {
        crate::memory::MemoryStats {
            dictionary_bytes: 0,
            lattice_bytes: self.lattice.memory_usage(),
            pool_bytes: self.pool_manager.total_memory_usage(),
            cache_bytes: 0,
            interner_bytes: 0,
            token_bytes: 0,
        }
    }

    pub fn clear_pools(&self) {
        self.pool_manager.clear_all();
    }

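    /// Enables or disables normalization. When enabling, builds a normalizer
    /// from `config` (or the default configuration if `None`); when
    /// disabling, drops any existing normalizer.
    ///
    /// # Errors
    ///
    /// Returns an error if the normalizer cannot be constructed.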
    pub fn set_normalization(
        &mut self,
        enable: bool,
        config: Option<NormalizationConfig>,
    ) -> Result<()> {
        self.enable_normalization = enable;

        if enable {
            let normalizer_config = config.unwrap_or_default();
            self.normalizer = Some(Normalizer::new(normalizer_config)?);
        } else {
            self.normalizer = None;
        }

        Ok(())
    }

    #[must_use]
    pub const fn normalizer(&self) -> Option<&Normalizer> {
        self.normalizer.as_ref()
    }

    #[must_use]
    pub const fn is_normalization_enabled(&self) -> bool {
        self.enable_normalization
    }

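    /// Tokenizes `text` and, if a normalizer is configured, fills each
    /// token's `normalized` field with the normalized surface form.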
    pub fn tokenize_with_normalization(&mut self, text: &str) -> Vec<Token> {
        let mut tokens = self.tokenize(text);

        if let Some(normalizer) = &self.normalizer {
            for token in &mut tokens {
                token.normalized = Some(normalizer.normalize(&token.surface));
            }
        }

        tokens
    }

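    /// Returns `(standard_form, variants)` for `word` using the configured
    /// normalizer; without one, returns the word unchanged with no variants.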
    #[must_use]
    pub fn get_word_variants(&self, word: &str) -> (String, Vec<String>) {
        self.normalizer.as_ref().map_or_else(
            || (word.to_string(), Vec::new()),
            |normalizer| {
                let standard = normalizer.normalize(word);
                let variants = normalizer.get_variants(&standard);
                (standard, variants)
            },
        )
    }
}

#[cfg(test)]
#[allow(clippy::expect_used, clippy::vec_init_then_push)]
mod tests {
    use super::*;
    use mecab_ko_dict::{matrix::DenseMatrix, trie::TrieBuilder, DictEntry};

    fn create_test_tokenizer() -> Tokenizer {
        let mut trie_entries = vec![
            ("아버지", 0u32),
            ("가", 1),
            ("방", 2),
            ("에", 3),
            ("들어가", 4),
            ("신다", 5),
        ];
        let trie_bytes = TrieBuilder::build_unsorted(&mut trie_entries).expect("should build trie");
        let trie = mecab_ko_dict::Trie::from_vec(trie_bytes);

        let matrix = DenseMatrix::new(10, 10, 100);
        let matrix = mecab_ko_dict::matrix::ConnectionMatrix::Dense(matrix);

        let mut entries = Vec::new();
        entries.push(DictEntry::new(
            "아버지",
            1,
            1,
            1000,
            "NNG,*,T,아버지,*,*,*,*",
        ));
        entries.push(DictEntry::new("가", 5, 5, 500, "JKS,*,F,가,*,*,*,*"));
        entries.push(DictEntry::new("방", 2, 2, 2000, "NNG,*,T,방,*,*,*,*"));
        entries.push(DictEntry::new("에", 6, 6, 400, "JKB,*,F,에,*,*,*,*"));
        entries.push(DictEntry::new(
            "들어가",
            3,
            3,
            1500,
            "VV,*,F,들어가다,*,*,*,*",
        ));
        entries.push(DictEntry::new("신다", 4, 4, 1800, "VV+EP,*,F,신다,*,*,*,*"));

        let dictionary = SystemDictionary::new_test(
            std::path::PathBuf::from("./test_dic"),
            trie,
            matrix,
            entries,
        );

        let unknown_handler = UnknownHandler::korean_default();
        let viterbi_searcher = ViterbiSearcher::new();
        let lattice = Lattice::new("");

        Tokenizer {
            dictionary,
            unknown_handler,
            viterbi_searcher,
            lattice,
            normalizer: None,
            enable_normalization: false,
            pool_manager: PoolManager::new(),
        }
    }

    #[test]
    fn test_token_creation() {
        let token = Token::new("안녕".to_string(), "NNG".to_string(), 0, 2, 0, 6);

        assert_eq!(token.surface, "안녕");
        assert_eq!(token.pos, "NNG");
        assert_eq!(token.start_pos, 0);
        assert_eq!(token.end_pos, 2);
        assert_eq!(token.char_len(), 2);
        assert_eq!(token.byte_len(), 6);
    }

    #[test]
    fn test_parse_features() {
        let features = "NNG,*,T,안녕,*,*,*,*";
        let (pos, reading, lemma) = parse_features(features);

        assert_eq!(pos, "NNG");
        assert_eq!(reading, Some("안녕".to_string()));
        assert_eq!(lemma, Some("안녕".to_string()));
    }

    #[test]
    fn test_parse_features_no_reading() {
        let features = "JKS,*,F,*,*,*,*,*";
        let (pos, reading, _lemma) = parse_features(features);

        assert_eq!(pos, "JKS");
        assert_eq!(reading, None);
    }

    #[test]
    fn test_tokenize_simple() {
        let mut tokenizer = create_test_tokenizer();
        let tokens = tokenizer.tokenize("아버지");

        assert!(!tokens.is_empty());
        assert_eq!(tokens[0].surface, "아버지");
        assert_eq!(tokens[0].pos, "NNG");
    }

    #[test]
    fn test_tokenize_with_particle() {
        let mut tokenizer = create_test_tokenizer();
        let tokens = tokenizer.tokenize("아버지가");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].surface, "아버지");
        assert_eq!(tokens[0].pos, "NNG");
        assert_eq!(tokens[1].surface, "가");
        assert_eq!(tokens[1].pos, "JKS");
    }

    #[test]
    fn test_tokenize_complex() {
        let mut tokenizer = create_test_tokenizer();
        let tokens = tokenizer.tokenize("아버지가방에들어가신다");

        assert!(!tokens.is_empty());

        assert_eq!(tokens[0].surface, "아버지");
    }

    #[test]
    fn test_tokenize_empty() {
        let mut tokenizer = create_test_tokenizer();
        let tokens = tokenizer.tokenize("");

        assert!(tokens.is_empty());
    }

    #[test]
    fn test_tokenize_with_spaces() {
        let mut tokenizer = create_test_tokenizer();
        let tokens = tokenizer.tokenize("아버지 가방");

        assert!(!tokens.is_empty());
    }

    #[test]
    fn test_wakati() {
        let mut tokenizer = create_test_tokenizer();
        let surfaces = tokenizer.wakati("아버지가");

        assert_eq!(surfaces.len(), 2);
        assert_eq!(surfaces[0], "아버지");
        assert_eq!(surfaces[1], "가");
    }

    #[test]
    fn test_nouns() {
        let mut tokenizer = create_test_tokenizer();
        let nouns = tokenizer.nouns("아버지가방에");

        assert!(nouns.contains(&"아버지".to_string()));
        assert!(nouns.contains(&"방".to_string()));
        assert!(!nouns.contains(&"가".to_string()));
    }

    #[test]
    fn test_pos() {
        let mut tokenizer = create_test_tokenizer();
        let pos_tags = tokenizer.pos("아버지가");

        assert_eq!(pos_tags.len(), 2);
        assert_eq!(pos_tags[0], ("아버지".to_string(), "NNG".to_string()));
        assert_eq!(pos_tags[1], ("가".to_string(), "JKS".to_string()));
    }

    #[test]
    fn test_tokenize_to_lattice() {
        let mut tokenizer = create_test_tokenizer();
        let lattice = tokenizer.tokenize_to_lattice("아버지가");

        assert!(lattice.node_count() > 2);

        let stats = lattice.stats();
        assert!(stats.total_nodes > 2);
    }

    #[test]
    fn test_lattice_stats() {
        let mut tokenizer = create_test_tokenizer();
        tokenizer.tokenize("아버지가");

        let stats = tokenizer.lattice_stats();
        assert!(stats.total_nodes > 0);
        assert!(stats.char_length > 0);
    }

    #[test]
    fn test_token_positions() {
        let mut tokenizer = create_test_tokenizer();
        let tokens = tokenizer.tokenize("아버지가");

        assert_eq!(tokens[0].start_pos, 0);
        assert_eq!(tokens[0].end_pos, 3);

        assert_eq!(tokens[1].start_pos, 3);
        assert_eq!(tokens[1].end_pos, 4);
    }

    #[test]
    fn test_multiple_tokenize_calls() {
        let mut tokenizer = create_test_tokenizer();

        let tokens1 = tokenizer.tokenize("아버지");
        assert!(!tokens1.is_empty());

        let tokens2 = tokenizer.tokenize("가방");
        assert!(!tokens2.is_empty());

        assert_ne!(tokens1[0].surface, tokens2[0].surface);
    }

    #[test]
    fn test_token_from_node() {
        use crate::lattice::Node;
        use std::borrow::Cow;

        let node = Node {
            id: 1,
            surface: Cow::Borrowed("테스트"),
            start_pos: 0,
            end_pos: 3,
            start_byte: 0,
            end_byte: 9,
            left_id: 1,
            right_id: 1,
            word_cost: 1000,
            total_cost: 1500,
            prev_node_id: 0,
            node_type: NodeType::Known,
            feature: Cow::Borrowed("NNG,*,T,테스트,*,*,*,*"),
            has_space_before: false,
        };

        let token = Token::from_node(&node);

        assert_eq!(token.surface, "테스트");
        assert_eq!(token.pos, "NNG");
        assert_eq!(token.start_pos, 0);
        assert_eq!(token.end_pos, 3);
        assert_eq!(token.reading, Some("테스트".to_string()));
        assert_eq!(token.cost, 1500);
    }

    #[test]
    fn test_with_user_dict() {
        let mut tokenizer = create_test_tokenizer();

        let mut user_dict = UserDictionary::new();
        user_dict.add_entry("딥러닝", "NNG", Some(-1000), None);

        tokenizer.set_user_dict(user_dict);

        assert!(tokenizer.dictionary().user_dictionary().is_some());
    }
}