//! Korean morphological tokenizer: dictionary lookup, lattice
//! construction, and Viterbi search over a MeCab-ko style dictionary.

use std::borrow::Cow;
use std::path::Path;

use mecab_ko_dict::{SystemDictionary, UserDictionary};

use crate::error::Result;
use crate::lattice::{Lattice, Node, NodeBuilder, NodeType};
use crate::normalizer::{NormalizationConfig, Normalizer};
use crate::pool::{PoolManager, PoolStats};
use crate::pos_tag::PosTag;
use crate::unknown::UnknownHandler;
use crate::viterbi::{SpacePenalty, ViterbiSearcher};

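/// A single analyzed morpheme, with character and byte spans into the
/// original input.
///
/// A usage sketch mirroring `test_token_creation` below (doctest marked
/// `ignore` since the crate's public path is not known from this file):
///
/// ```ignore
/// let token = Token::new("안녕".to_string(), "NNG".to_string(), 0, 2, 0, 6);
/// assert_eq!(token.char_len(), 2);
/// assert_eq!(token.byte_len(), 6);
/// ```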
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
    /// Surface form exactly as it appears in the input.
    pub surface: String,

    /// Part-of-speech tag (e.g. `NNG`, `JKS`).
    pub pos: String,

    /// Start offset in characters.
    pub start_pos: usize,

    /// End offset in characters (exclusive).
    pub end_pos: usize,

    /// Start offset in bytes.
    pub start_byte: usize,

    /// End offset in bytes (exclusive).
    pub end_byte: usize,

    /// Reading taken from the dictionary feature string, if any.
    pub reading: Option<String>,

    /// Lemma (dictionary form), if any.
    pub lemma: Option<String>,

    /// Total Viterbi path cost at this token's node.
    pub cost: i32,

    /// Raw comma-separated feature string from the dictionary.
    pub features: String,

    /// Normalized surface, filled by [`Tokenizer::tokenize_with_normalization`].
    pub normalized: Option<String>,
}

impl Token {
    /// Creates a token with the given surface, POS tag, and spans; all
    /// other fields start empty.
    #[must_use]
    pub const fn new(
        surface: String,
        pos: String,
        start_pos: usize,
        end_pos: usize,
        start_byte: usize,
        end_byte: usize,
    ) -> Self {
        Self {
            surface,
            pos,
            start_pos,
            end_pos,
            start_byte,
            end_byte,
            reading: None,
            lemma: None,
            cost: 0,
            features: String::new(),
            normalized: None,
        }
    }

    /// Builds a token from a lattice node, parsing its feature string for
    /// the POS tag, reading, and lemma.
    #[must_use]
    pub fn from_node(node: &Node) -> Self {
        let features = node.feature.to_string();
        let (pos, reading, lemma) = parse_features(&features);

        Self {
            surface: node.surface.to_string(),
            pos: pos.to_string(),
            start_pos: node.start_pos,
            end_pos: node.end_pos,
            start_byte: node.start_byte,
            end_byte: node.end_byte,
            reading,
            lemma,
            cost: node.total_cost,
            features,
            normalized: None,
        }
    }

    /// Length of the token in characters.
    #[inline]
    #[must_use]
    pub const fn char_len(&self) -> usize {
        self.end_pos - self.start_pos
    }

    /// Length of the token in bytes.
    #[inline]
    #[must_use]
    pub const fn byte_len(&self) -> usize {
        self.end_byte - self.start_byte
    }

    /// Parses the POS string into a [`PosTag`], if it is a known tag.
    #[must_use]
    pub fn pos_tag(&self) -> Option<PosTag> {
        self.pos.parse().ok()
    }
}

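/// Splits a MeCab-ko style feature string into `(pos, reading, lemma)`.
///
/// The layout assumed here matches the fixtures in the tests below:
/// the first field is the POS tag and the fourth is the reading, with
/// `*` or an empty field meaning "absent". For example,
/// `"NNG,*,T,안녕,*,*,*,*"` yields `("NNG", Some("안녕"), Some("안녕"))`.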
fn parse_features(features: &str) -> (Cow<'_, str>, Option<String>, Option<String>) {
    let mut split = features.splitn(5, ',');

    // First field: POS tag.
    let pos = split.next().unwrap_or("*");

    // Fourth field: reading ("*" or empty means none).
    let reading = split
        .nth(2)
        .filter(|s| !s.is_empty() && *s != "*")
        .map(std::string::ToString::to_string);

    // No separate lemma field is carried here, so fall back to the reading.
    let lemma = reading.clone();

    (Cow::Borrowed(pos), reading, lemma)
}

/// Korean morphological tokenizer combining dictionary lookup, lattice
/// construction, and Viterbi search.
pub struct Tokenizer {
    /// System dictionary (optionally carrying a user dictionary).
    dictionary: SystemDictionary,

    /// Generates candidate nodes for out-of-vocabulary spans.
    unknown_handler: UnknownHandler,

    /// Finds the lowest-cost path through the lattice.
    viterbi_searcher: ViterbiSearcher,

    /// Reused lattice buffer; reset for every input.
    lattice: Lattice,

    /// Optional surface normalizer.
    normalizer: Option<Normalizer>,

    /// Whether normalization is currently enabled.
    enable_normalization: bool,

    /// Object pools for allocation reuse.
    pool_manager: PoolManager,
}

impl Tokenizer {
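    /// Creates a tokenizer backed by the default system dictionary.
    ///
    /// A minimal usage sketch (`ignore`d doctest: it assumes the default
    /// dictionary is installed and reachable from the crate root):
    ///
    /// ```ignore
    /// let mut tokenizer = Tokenizer::new().expect("default dictionary");
    /// for token in tokenizer.tokenize("아버지가 방에 들어가신다") {
    ///     println!("{}/{}", token.surface, token.pos);
    /// }
    /// ```
    ///
    /// # Errors
    ///
    /// Returns an error if the default dictionary cannot be loaded.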
    pub fn new() -> Result<Self> {
        let dictionary = SystemDictionary::load_default()?;
        let unknown_handler = UnknownHandler::korean_default();
        let viterbi_searcher = ViterbiSearcher::new();

        let lattice = Lattice::new("");

        Ok(Self {
            dictionary,
            unknown_handler,
            viterbi_searcher,
            lattice,
            normalizer: None,
            enable_normalization: false,
            pool_manager: PoolManager::new(),
        })
    }

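    /// Creates a tokenizer from a dictionary at the given path.
    ///
    /// Sketch with a hypothetical on-disk dictionary directory:
    ///
    /// ```ignore
    /// let mut tokenizer = Tokenizer::with_dict("/path/to/mecab-ko-dic")
    ///     .expect("dictionary should load");
    /// assert!(!tokenizer.tokenize("아버지").is_empty());
    /// ```
    ///
    /// # Errors
    ///
    /// Returns an error if the dictionary cannot be loaded from `dict_path`.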
    pub fn with_dict<P: AsRef<Path>>(dict_path: P) -> Result<Self> {
        let dictionary = SystemDictionary::load(dict_path)?;
        let unknown_handler = UnknownHandler::korean_default();
        let viterbi_searcher = ViterbiSearcher::new();

        let lattice = Lattice::new("");

        Ok(Self {
            dictionary,
            unknown_handler,
            viterbi_searcher,
            lattice,
            normalizer: None,
            enable_normalization: false,
            pool_manager: PoolManager::new(),
        })
    }

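    /// Attaches a user dictionary (builder style), replacing any existing
    /// one.
    ///
    /// Sketch using the `UserDictionary` calls exercised in the tests
    /// below:
    ///
    /// ```ignore
    /// let mut user_dict = UserDictionary::new();
    /// user_dict.add_entry("딥러닝", "NNG", Some(-1000), None);
    /// let tokenizer = Tokenizer::new()
    ///     .expect("default dictionary")
    ///     .with_user_dict(user_dict);
    /// ```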
    #[must_use]
    pub fn with_user_dict(mut self, user_dict: UserDictionary) -> Self {
        self.dictionary.set_user_dictionary(user_dict);
        self
    }

    /// Attaches a user dictionary in place, replacing any existing one.
    pub fn set_user_dict(&mut self, user_dict: UserDictionary) {
        self.dictionary.set_user_dictionary(user_dict);
    }

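    /// Sets the space-transition penalty used during Viterbi search
    /// (builder style). Note that this recreates the searcher, so any
    /// other searcher configuration is reset.
    ///
    /// Sketch; `SpacePenalty`'s constructors are defined elsewhere, so the
    /// value below is a placeholder:
    ///
    /// ```ignore
    /// let tokenizer = Tokenizer::new()
    ///     .expect("default dictionary")
    ///     .with_space_penalty(penalty); // `penalty: SpacePenalty`
    /// ```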
    #[must_use]
    pub fn with_space_penalty(mut self, penalty: SpacePenalty) -> Self {
        self.viterbi_searcher = ViterbiSearcher::new().with_space_penalty(penalty);
        self
    }

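    /// Tokenizes `text` into morphemes: resets the lattice, adds
    /// dictionary and unknown-word candidates at every position, runs
    /// Viterbi search, and converts the best path into [`Token`]s.
    ///
    /// Mirrors `test_tokenize_with_particle` below:
    ///
    /// ```ignore
    /// let mut tokenizer = Tokenizer::new().expect("default dictionary");
    /// let tokens = tokenizer.tokenize("아버지가");
    /// assert_eq!(tokens[0].surface, "아버지");
    /// assert_eq!(tokens[1].pos, "JKS");
    /// ```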
    pub fn tokenize(&mut self, text: &str) -> Vec<Token> {
        if text.is_empty() {
            return Vec::new();
        }

        self.lattice.reset(text);

        self.build_lattice();

        let path = self
            .viterbi_searcher
            .search(&mut self.lattice, self.dictionary.matrix());

        path.iter()
            .filter_map(|&node_id| self.lattice.node(node_id))
            .map(Token::from_node)
            .collect()
    }

    /// Populates the lattice with candidate nodes at every character
    /// position, falling back to unknown-word handling where needed.
    fn build_lattice(&mut self) {
        let char_len = self.lattice.char_len();

        for pos in 0..char_len {
            let has_dict_entry = self.add_dict_nodes(pos);

            self.unknown_handler
                .add_unknown_nodes(&mut self.lattice, pos, has_dict_entry);
        }
    }

    /// Adds all dictionary entries (system and user) that match a prefix
    /// of the text starting at `start_pos`. Returns `true` if at least one
    /// entry was found.
    fn add_dict_nodes(&mut self, start_pos: usize) -> bool {
        let char_len = self.lattice.char_len();
        let search_text: &str = self.lattice.substring(start_pos, char_len);

        if search_text.is_empty() {
            return false;
        }

        let dict_entries: Vec<_> = self.dictionary.common_prefix_search(search_text);

        let user_entries: Vec<_> = self
            .dictionary
            .user_dictionary()
            .map(|ud| ud.common_prefix_search(search_text))
            .unwrap_or_default();

        let mut found = false;

        for (entry, byte_len) in dict_entries {
            // System entries report a match length in bytes; convert it to
            // a character position.
            let end_pos = self
                .lattice
                .char_pos_from_start_and_byte_len(start_pos, byte_len);

            self.lattice.add_node(
                NodeBuilder::new(&entry.surface, start_pos, end_pos)
                    .left_id(entry.left_id)
                    .right_id(entry.right_id)
                    .word_cost(i32::from(entry.cost))
                    .node_type(NodeType::Known)
                    .feature(&entry.feature),
            );

            found = true;
        }

        for user_entry in user_entries {
            // User entries carry the surface directly; count its characters.
            let surface_char_len = user_entry.surface.chars().count();
            let end_pos = start_pos + surface_char_len;

            self.lattice.add_node(
                NodeBuilder::new(&user_entry.surface, start_pos, end_pos)
                    .left_id(user_entry.left_id)
                    .right_id(user_entry.right_id)
                    .word_cost(i32::from(user_entry.cost))
                    .node_type(NodeType::User)
                    .feature(&user_entry.feature),
            );

            found = true;
        }

        found
    }

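    /// Builds the lattice for `text` without running Viterbi search and
    /// returns a reference to it. For empty input, the previous lattice is
    /// returned unchanged.
    ///
    /// ```ignore
    /// let mut tokenizer = Tokenizer::new().expect("default dictionary");
    /// let lattice = tokenizer.tokenize_to_lattice("아버지가");
    /// assert!(lattice.node_count() > 2);
    /// ```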
    pub fn tokenize_to_lattice(&mut self, text: &str) -> &Lattice {
        if !text.is_empty() {
            self.lattice.reset(text);
            self.build_lattice();
        }
        &self.lattice
    }

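    /// Returns only the surface forms, in order (analogous to MeCab's
    /// wakati output):
    ///
    /// ```ignore
    /// let mut tokenizer = Tokenizer::new().expect("default dictionary");
    /// assert_eq!(tokenizer.wakati("아버지가"), vec!["아버지", "가"]);
    /// ```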
    pub fn wakati(&mut self, text: &str) -> Vec<String> {
        self.tokenize(text).into_iter().map(|t| t.surface).collect()
    }

    /// Returns the surfaces of noun tokens (POS tags starting with `NN`).
    pub fn nouns(&mut self, text: &str) -> Vec<String> {
        self.tokenize(text)
            .into_iter()
            .filter(|t| t.pos.starts_with("NN"))
            .map(|t| t.surface)
            .collect()
    }

    /// Alias for [`wakati`](Self::wakati).
    pub fn morphs(&mut self, text: &str) -> Vec<String> {
        self.wakati(text)
    }

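    /// Returns `(surface, pos)` pairs:
    ///
    /// ```ignore
    /// let mut tokenizer = Tokenizer::new().expect("default dictionary");
    /// let pairs = tokenizer.pos("아버지가");
    /// assert_eq!(pairs[0], ("아버지".to_string(), "NNG".to_string()));
    /// ```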
    pub fn pos(&mut self, text: &str) -> Vec<(String, String)> {
        self.tokenize(text)
            .into_iter()
            .map(|t| (t.surface, t.pos))
            .collect()
    }

    /// Returns the underlying system dictionary.
    #[must_use]
    pub const fn dictionary(&self) -> &SystemDictionary {
        &self.dictionary
    }

    /// Returns statistics for the most recently built lattice.
    #[must_use]
    pub fn lattice_stats(&self) -> crate::lattice::LatticeStats {
        self.lattice.stats()
    }

    /// Returns statistics for the internal object pools.
    #[must_use]
    pub fn pool_stats(&self) -> PoolStats {
        self.pool_manager.stats()
    }

    /// Returns a memory-usage snapshot. Only the lattice and pool figures
    /// are tracked here; the remaining fields are reported as zero.
    #[must_use]
    pub fn memory_stats(&self) -> crate::memory::MemoryStats {
        crate::memory::MemoryStats {
            dictionary_bytes: 0,
            lattice_bytes: self.lattice.memory_usage(),
            pool_bytes: self.pool_manager.total_memory_usage(),
            cache_bytes: 0,
            interner_bytes: 0,
            token_bytes: 0,
        }
    }

    /// Clears all object pools.
    pub fn clear_pools(&self) {
        self.pool_manager.clear_all();
    }

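    /// Enables or disables surface normalization. Passing `None` with
    /// `enable = true` builds a normalizer from the default
    /// [`NormalizationConfig`].
    ///
    /// ```ignore
    /// let mut tokenizer = Tokenizer::new().expect("default dictionary");
    /// tokenizer.set_normalization(true, None).expect("normalizer");
    /// assert!(tokenizer.is_normalization_enabled());
    /// ```
    ///
    /// # Errors
    ///
    /// Returns an error if the normalizer cannot be constructed.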
    pub fn set_normalization(
        &mut self,
        enable: bool,
        config: Option<NormalizationConfig>,
    ) -> Result<()> {
        self.enable_normalization = enable;

        if enable {
            let normalizer_config = config.unwrap_or_default();
            self.normalizer = Some(Normalizer::new(normalizer_config)?);
        } else {
            self.normalizer = None;
        }

        Ok(())
    }

    /// Returns the active normalizer, if any.
    #[must_use]
    pub const fn normalizer(&self) -> Option<&Normalizer> {
        self.normalizer.as_ref()
    }

    /// Returns whether normalization is enabled.
    #[must_use]
    pub const fn is_normalization_enabled(&self) -> bool {
        self.enable_normalization
    }

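    /// Tokenizes `text` and, when a normalizer is configured, fills each
    /// token's `normalized` field with the normalized surface.
    ///
    /// ```ignore
    /// let mut tokenizer = Tokenizer::new().expect("default dictionary");
    /// tokenizer.set_normalization(true, None).expect("normalizer");
    /// let tokens = tokenizer.tokenize_with_normalization("아버지가");
    /// assert!(tokens.iter().all(|t| t.normalized.is_some()));
    /// ```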
    pub fn tokenize_with_normalization(&mut self, text: &str) -> Vec<Token> {
        let mut tokens = self.tokenize(text);

        if let Some(normalizer) = &self.normalizer {
            for token in &mut tokens {
                token.normalized = Some(normalizer.normalize(&token.surface));
            }
        }

        tokens
    }

    /// Returns a word's standard form and its known variants. Without a
    /// normalizer, the word itself is returned with no variants.
    #[must_use]
    pub fn get_word_variants(&self, word: &str) -> (String, Vec<String>) {
        self.normalizer.as_ref().map_or_else(
            || (word.to_string(), Vec::new()),
            |normalizer| {
                let standard = normalizer.normalize(word);
                let variants = normalizer.get_variants(&standard);
                (standard, variants)
            },
        )
    }
}

#[cfg(test)]
#[allow(clippy::expect_used, clippy::vec_init_then_push)]
mod tests {
    use super::*;
    use mecab_ko_dict::{matrix::DenseMatrix, trie::TrieBuilder, DictEntry};

    /// Builds a tokenizer over a small in-memory dictionary covering the
    /// classic "아버지가방에들어가신다" example.
    fn create_test_tokenizer() -> Tokenizer {
        let mut trie_entries = vec![
            ("아버지", 0u32),
            ("가", 1),
            ("방", 2),
            ("에", 3),
            ("들어가", 4),
            ("신다", 5),
        ];
        let trie_bytes = TrieBuilder::build_unsorted(&mut trie_entries).expect("should build trie");
        let trie = mecab_ko_dict::Trie::from_vec(trie_bytes);

        let matrix = DenseMatrix::new(10, 10, 100);
        let matrix = mecab_ko_dict::matrix::ConnectionMatrix::Dense(matrix);

        let mut entries = Vec::new();
        entries.push(DictEntry::new(
            "아버지",
            1,
            1,
            1000,
            "NNG,*,T,아버지,*,*,*,*",
        ));
        entries.push(DictEntry::new("가", 5, 5, 500, "JKS,*,F,가,*,*,*,*"));
        entries.push(DictEntry::new("방", 2, 2, 2000, "NNG,*,T,방,*,*,*,*"));
        entries.push(DictEntry::new("에", 6, 6, 400, "JKB,*,F,에,*,*,*,*"));
        entries.push(DictEntry::new(
            "들어가",
            3,
            3,
            1500,
            "VV,*,F,들어가다,*,*,*,*",
        ));
        entries.push(DictEntry::new("신다", 4, 4, 1800, "VV+EP,*,F,신다,*,*,*,*"));

        let dictionary = SystemDictionary::new_test(
            std::path::PathBuf::from("./test_dic"),
            trie,
            matrix,
            entries,
        );

        let unknown_handler = UnknownHandler::korean_default();
        let viterbi_searcher = ViterbiSearcher::new();
        let lattice = Lattice::new("");

        Tokenizer {
            dictionary,
            unknown_handler,
            viterbi_searcher,
            lattice,
            normalizer: None,
            enable_normalization: false,
            pool_manager: PoolManager::new(),
        }
    }

    #[test]
    fn test_token_creation() {
        let token = Token::new("안녕".to_string(), "NNG".to_string(), 0, 2, 0, 6);

        assert_eq!(token.surface, "안녕");
        assert_eq!(token.pos, "NNG");
        assert_eq!(token.start_pos, 0);
        assert_eq!(token.end_pos, 2);
        assert_eq!(token.char_len(), 2);
        assert_eq!(token.byte_len(), 6);
    }

    #[test]
    fn test_parse_features() {
        let features = "NNG,*,T,안녕,*,*,*,*";
        let (pos, reading, lemma) = parse_features(features);

        assert_eq!(pos, "NNG");
        assert_eq!(reading, Some("안녕".to_string()));
        assert_eq!(lemma, Some("안녕".to_string()));
    }

    #[test]
    fn test_parse_features_no_reading() {
        let features = "JKS,*,F,*,*,*,*,*";
        let (pos, reading, _lemma) = parse_features(features);

        assert_eq!(pos, "JKS");
        assert_eq!(reading, None);
    }

    #[test]
    fn test_tokenize_simple() {
        let mut tokenizer = create_test_tokenizer();
        let tokens = tokenizer.tokenize("아버지");

        assert!(!tokens.is_empty());
        assert_eq!(tokens[0].surface, "아버지");
        assert_eq!(tokens[0].pos, "NNG");
    }

    #[test]
    fn test_tokenize_with_particle() {
        let mut tokenizer = create_test_tokenizer();
        let tokens = tokenizer.tokenize("아버지가");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].surface, "아버지");
        assert_eq!(tokens[0].pos, "NNG");
        assert_eq!(tokens[1].surface, "가");
        assert_eq!(tokens[1].pos, "JKS");
    }

    #[test]
    fn test_tokenize_complex() {
        let mut tokenizer = create_test_tokenizer();
        let tokens = tokenizer.tokenize("아버지가방에들어가신다");

        assert!(!tokens.is_empty());

        assert_eq!(tokens[0].surface, "아버지");
    }

    #[test]
    fn test_tokenize_empty() {
        let mut tokenizer = create_test_tokenizer();
        let tokens = tokenizer.tokenize("");

        assert!(tokens.is_empty());
    }

    #[test]
    fn test_tokenize_with_spaces() {
        let mut tokenizer = create_test_tokenizer();
        let tokens = tokenizer.tokenize("아버지 가방");

        assert!(!tokens.is_empty());
    }

    #[test]
    fn test_wakati() {
        let mut tokenizer = create_test_tokenizer();
        let surfaces = tokenizer.wakati("아버지가");

        assert_eq!(surfaces.len(), 2);
        assert_eq!(surfaces[0], "아버지");
        assert_eq!(surfaces[1], "가");
    }

    #[test]
    fn test_nouns() {
        let mut tokenizer = create_test_tokenizer();
        let nouns = tokenizer.nouns("아버지가방에");

        assert!(nouns.contains(&"아버지".to_string()));
        assert!(nouns.contains(&"방".to_string()));
        assert!(!nouns.contains(&"가".to_string()));
    }

    #[test]
    fn test_pos() {
        let mut tokenizer = create_test_tokenizer();
        let pos_tags = tokenizer.pos("아버지가");

        assert_eq!(pos_tags.len(), 2);
        assert_eq!(pos_tags[0], ("아버지".to_string(), "NNG".to_string()));
        assert_eq!(pos_tags[1], ("가".to_string(), "JKS".to_string()));
    }

    #[test]
    fn test_tokenize_to_lattice() {
        let mut tokenizer = create_test_tokenizer();
        let lattice = tokenizer.tokenize_to_lattice("아버지가");

        assert!(lattice.node_count() > 2);

        let stats = lattice.stats();
        assert!(stats.total_nodes > 2);
    }

    #[test]
    fn test_lattice_stats() {
        let mut tokenizer = create_test_tokenizer();
        tokenizer.tokenize("아버지가");

        let stats = tokenizer.lattice_stats();
        assert!(stats.total_nodes > 0);
        assert!(stats.char_length > 0);
    }

    #[test]
    fn test_token_positions() {
        let mut tokenizer = create_test_tokenizer();
        let tokens = tokenizer.tokenize("아버지가");

        assert_eq!(tokens[0].start_pos, 0);
        assert_eq!(tokens[0].end_pos, 3);

        assert_eq!(tokens[1].start_pos, 3);
        assert_eq!(tokens[1].end_pos, 4);
    }

    #[test]
    fn test_multiple_tokenize_calls() {
        let mut tokenizer = create_test_tokenizer();

        let tokens1 = tokenizer.tokenize("아버지");
        assert!(!tokens1.is_empty());

        let tokens2 = tokenizer.tokenize("가방");
        assert!(!tokens2.is_empty());

        assert_ne!(tokens1[0].surface, tokens2[0].surface);
    }

    #[test]
    fn test_token_from_node() {
        use crate::lattice::Node;
        use std::borrow::Cow;

        let node = Node {
            id: 1,
            surface: Cow::Borrowed("테스트"),
            start_pos: 0,
            end_pos: 3,
            start_byte: 0,
            end_byte: 9,
            left_id: 1,
            right_id: 1,
            word_cost: 1000,
            total_cost: 1500,
            prev_node_id: 0,
            node_type: NodeType::Known,
            feature: Cow::Borrowed("NNG,*,T,테스트,*,*,*,*"),
            has_space_before: false,
        };

        let token = Token::from_node(&node);

        assert_eq!(token.surface, "테스트");
        assert_eq!(token.pos, "NNG");
        assert_eq!(token.start_pos, 0);
        assert_eq!(token.end_pos, 3);
        assert_eq!(token.reading, Some("테스트".to_string()));
        assert_eq!(token.cost, 1500);
    }

    #[test]
    fn test_with_user_dict() {
        let mut tokenizer = create_test_tokenizer();

        let mut user_dict = UserDictionary::new();
        user_dict.add_entry("딥러닝", "NNG", Some(-1000), None);

        tokenizer.set_user_dict(user_dict);

        assert!(tokenizer.dictionary().user_dictionary().is_some());
    }
}