1use std::collections::HashMap;
30use std::io::BufRead;
31
32use mecab_ko_hangul::{classify_char, CharType};
33
34use crate::error::{Error, Result};
35use crate::lattice::{Lattice, NodeBuilder, NodeType};
36
/// Numeric identifier of a character category.
pub type CategoryId = u8;

/// ID of the `DEFAULT` category; also the fallback returned when a character
/// has no other mapping.
pub const DEFAULT_CATEGORY: CategoryId = 0;
/// ID of the `SPACE` category (whitespace in the built-in Korean defaults).
pub const SPACE_CATEGORY: CategoryId = 1;
/// ID of the `HANGUL` category (Hangul syllables and jamo in the built-in defaults).
pub const HANGUL_CATEGORY: CategoryId = 2;
/// ID of the `HANJA` category.
pub const HANJA_CATEGORY: CategoryId = 3;
/// ID of the `ALPHA` category (alphabet, katakana and hiragana in the built-in defaults).
pub const ALPHA_CATEGORY: CategoryId = 4;
/// ID of the `NUMERIC` category (digits in the built-in defaults).
pub const NUMERIC_CATEGORY: CategoryId = 5;
/// ID of the `SYMBOL` category (punctuation in the built-in defaults).
pub const SYMBOL_CATEGORY: CategoryId = 6;
54
/// Definition of one character category and the rules that control how
/// unknown-word candidates are generated for it.
#[derive(Debug, Clone)]
pub struct CharCategoryDef {
    /// Category name, e.g. `"HANGUL"`.
    pub name: String,
    /// Numeric ID of the category.
    pub id: CategoryId,
    /// When `false`, no unknown candidates are generated at a position that
    /// already has a dictionary entry.
    pub invoke: bool,
    /// When `true`, candidates only extend over consecutive characters of this
    /// same category.
    pub group: bool,
    /// Maximum candidate length in characters. `0` means "the whole run" for
    /// grouping categories and "a single character" for non-grouping ones.
    pub length: usize,
}
71
72impl CharCategoryDef {
73 #[must_use]
75 pub fn new(name: &str, id: CategoryId, invoke: bool, group: bool, length: usize) -> Self {
76 Self {
77 name: name.to_string(),
78 id,
79 invoke,
80 group,
81 length,
82 }
83 }
84}
85
/// One unknown-word template: the IDs, cost and part of speech given to
/// candidates generated for a character category.
#[derive(Debug, Clone)]
pub struct UnknownDef {
    /// Category this template applies to.
    pub category_id: CategoryId,
    /// Left ID copied onto every candidate generated from this template.
    pub left_id: u16,
    /// Right ID copied onto every candidate generated from this template.
    pub right_id: u16,
    /// Base cost, before any pattern-based adjustment.
    pub cost: i16,
    /// Part-of-speech tag, e.g. `"SL"`.
    pub pos: String,
    /// Comma-separated feature string.
    pub feature: String,
}
104
/// Surface shape of an unknown-word candidate, used to adjust its cost and
/// part-of-speech guess.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum WordPattern {
    /// None of the other patterns matched.
    Plain,
    /// ALPHA-category text longer than one character whose first character is
    /// uppercase and that has no later uppercase character.
    ProperNoun,
    /// ALPHA-category text with an uppercase character after the first position.
    CamelCase,
    /// Contains both HANGUL-category and ALPHA-category characters.
    HangulAlphaMix,
    /// Contains a NUMERIC-category character together with HANGUL or ALPHA characters.
    NumberUnit,
    /// Contains at least one character from the recognised emoji ranges.
    Emoji,
}
123
124impl UnknownDef {
125 #[must_use]
127 pub fn new(
128 category_id: CategoryId,
129 left_id: u16,
130 right_id: u16,
131 cost: i16,
132 pos: &str,
133 feature: &str,
134 ) -> Self {
135 Self {
136 category_id,
137 left_id,
138 right_id,
139 cost,
140 pos: pos.to_string(),
141 feature: feature.to_string(),
142 }
143 }
144}
145
/// Maps characters to categories and stores the definition of each category.
#[derive(Debug, Clone)]
pub struct CharCategoryMap {
    /// Registered category definitions, in insertion order.
    categories: Vec<CharCategoryDef>,
    /// Category name to ID lookup.
    name_to_id: HashMap<String, CategoryId>,
    /// Category assigned to each character type reported by `classify_char`.
    type_to_category: HashMap<CharType, CategoryId>,
    /// Inclusive `(start, end, category)` code-point ranges, checked before
    /// `type_to_category`; the first matching range wins.
    range_overrides: Vec<(u32, u32, CategoryId)>,
}
160
161impl Default for CharCategoryMap {
162 fn default() -> Self {
163 Self::korean_default()
164 }
165}
166
167impl CharCategoryMap {
168 #[must_use]
170 pub fn new() -> Self {
171 Self {
172 categories: Vec::new(),
173 name_to_id: HashMap::new(),
174 type_to_category: HashMap::new(),
175 range_overrides: Vec::new(),
176 }
177 }
178
179 #[must_use]
183 pub fn korean_default() -> Self {
184 let mut map = Self::new();
185
186 let defaults = [
189 ("DEFAULT", DEFAULT_CATEGORY, false, true, 0),
190 ("SPACE", SPACE_CATEGORY, false, true, 0),
191 ("HANGUL", HANGUL_CATEGORY, false, true, 2), ("HANJA", HANJA_CATEGORY, false, false, 1),
193 ("ALPHA", ALPHA_CATEGORY, true, true, 0), ("NUMERIC", NUMERIC_CATEGORY, true, true, 0), ("SYMBOL", SYMBOL_CATEGORY, true, true, 0),
196 ];
197
198 for (name, id, invoke, group, length) in defaults {
199 map.add_category(CharCategoryDef::new(name, id, invoke, group, length));
200 }
201
202 map.type_to_category
204 .insert(CharType::HangulSyllable, HANGUL_CATEGORY);
205 map.type_to_category
206 .insert(CharType::HangulJamo, HANGUL_CATEGORY);
207 map.type_to_category.insert(CharType::Hanja, HANJA_CATEGORY);
208 map.type_to_category
209 .insert(CharType::Katakana, ALPHA_CATEGORY);
210 map.type_to_category
211 .insert(CharType::Hiragana, ALPHA_CATEGORY);
212 map.type_to_category
213 .insert(CharType::Alphabet, ALPHA_CATEGORY);
214 map.type_to_category
215 .insert(CharType::Digit, NUMERIC_CATEGORY);
216 map.type_to_category
217 .insert(CharType::Whitespace, SPACE_CATEGORY);
218 map.type_to_category
219 .insert(CharType::Punctuation, SYMBOL_CATEGORY);
220 map.type_to_category
221 .insert(CharType::Other, DEFAULT_CATEGORY);
222
223 map
224 }
225
226 pub fn add_category(&mut self, def: CharCategoryDef) {
228 self.name_to_id.insert(def.name.clone(), def.id);
229 self.categories.push(def);
230 }
231
232 pub fn add_range(&mut self, start: u32, end: u32, category_id: CategoryId) {
234 self.range_overrides.push((start, end, category_id));
235 }
236
237 #[must_use]
239 pub fn get_category(&self, c: char) -> CategoryId {
240 let code = c as u32;
241
242 for &(start, end, cat_id) in &self.range_overrides {
244 if code >= start && code <= end {
245 return cat_id;
246 }
247 }
248
249 let char_type = classify_char(c);
251 self.type_to_category
252 .get(&char_type)
253 .copied()
254 .unwrap_or(DEFAULT_CATEGORY)
255 }
256
257 #[must_use]
259 pub fn get_category_def(&self, id: CategoryId) -> Option<&CharCategoryDef> {
260 self.categories.iter().find(|c| c.id == id)
261 }
262
263 #[must_use]
265 pub fn get_id_by_name(&self, name: &str) -> Option<CategoryId> {
266 self.name_to_id.get(name).copied()
267 }
268
269 pub fn from_char_def<R: BufRead>(reader: R) -> Result<Self> {
282 let mut map = Self::new();
283 let mut next_id: CategoryId = 0;
284
285 for line in reader.lines() {
286 let line = line.map_err(|e| Error::Init(e.to_string()))?;
287 let line = line.trim();
288
289 if line.is_empty() || line.starts_with('#') {
291 continue;
292 }
293
294 if !line.starts_with("0x") && !line.chars().next().is_some_and(|c| c.is_ascii_digit()) {
296 let parts: Vec<&str> = line.split_whitespace().collect();
297 if parts.len() >= 4 {
298 let name = parts[0];
299 let invoke = parts[1] == "1";
300 let group = parts[2] == "1";
301 let length: usize = parts[3].parse().unwrap_or(0);
302
303 map.add_category(CharCategoryDef::new(name, next_id, invoke, group, length));
304 next_id += 1;
305 }
306 }
307 else if line.starts_with("0x") {
309 let parts: Vec<&str> = line.split_whitespace().collect();
310 if parts.len() >= 2 {
311 let range_part = parts[0];
312 let category_name = parts[1];
313
314 if let Some(cat_id) = map.get_id_by_name(category_name) {
315 if let Some((start, end)) = parse_unicode_range(range_part) {
317 map.add_range(start, end, cat_id);
318 }
319 }
320 }
321 }
322 }
323
324 Ok(map)
325 }
326}
327
328fn parse_unicode_range(s: &str) -> Option<(u32, u32)> {
333 if let Some((start_str, end_str)) = s.split_once("..") {
334 let start = parse_hex(start_str)?;
335 let end = parse_hex(end_str)?;
336 Some((start, end))
337 } else {
338 let value = parse_hex(s)?;
339 Some((value, value))
340 }
341}
342
/// Parses a hexadecimal number with an optional single `0x`/`0X` prefix.
///
/// Returns `None` when, after removing at most one prefix, the text is empty,
/// contains anything other than ASCII hex digits (a sign or a second prefix
/// included), or does not fit in a `u32`.
fn parse_hex(s: &str) -> Option<u32> {
    let digits = s
        .strip_prefix("0x")
        .or_else(|| s.strip_prefix("0X"))
        .unwrap_or(s);

    // `from_str_radix` alone would accept a leading `+`, so validate first.
    if digits.is_empty() || !digits.bytes().all(|b| b.is_ascii_hexdigit()) {
        return None;
    }

    u32::from_str_radix(digits, 16).ok()
}
348
/// Reports whether `c` lies in one of the recognised emoji ranges:
/// U+1F300–U+1F9FF or U+2600–U+27BF (both inclusive).
#[must_use]
const fn is_emoji(c: char) -> bool {
    matches!(c, '\u{1F300}'..='\u{1F9FF}' | '\u{2600}'..='\u{27BF}')
}
361
/// Unknown-word templates grouped by character category.
#[derive(Debug, Clone, Default)]
pub struct UnknownDictionary {
    /// Templates for each category, in insertion order.
    entries: HashMap<CategoryId, Vec<UnknownDef>>,
}
370
371impl UnknownDictionary {
372 #[must_use]
374 pub fn new() -> Self {
375 Self {
376 entries: HashMap::new(),
377 }
378 }
379
380 #[must_use]
382 pub fn korean_default() -> Self {
383 let mut dict = Self::new();
384
385 let defaults = [
388 (DEFAULT_CATEGORY, 1800, 3562, 7000, "SY", "SY,*,*,*,*,*,*,*"),
389 (SPACE_CATEGORY, 1799, 3559, 0, "SP", "SP,*,*,*,*,*,*,*"),
390 (
391 HANGUL_CATEGORY,
392 1800,
393 3565,
394 5000,
395 "UNKNOWN",
396 "UNKNOWN,*,*,*,*,*,*,*",
397 ),
398 (HANJA_CATEGORY, 1800, 3560, 6000, "SH", "SH,*,*,*,*,*,*,*"),
399 (ALPHA_CATEGORY, 1800, 3558, 4000, "SL", "SL,*,*,*,*,*,*,*"),
400 (NUMERIC_CATEGORY, 1800, 3561, 3000, "SN", "SN,*,*,*,*,*,*,*"),
401 (SYMBOL_CATEGORY, 1800, 3562, 7000, "SY", "SY,*,*,*,*,*,*,*"),
402 ];
403
404 for (cat_id, left_id, right_id, cost, pos, feature) in defaults {
405 dict.add_entry(UnknownDef::new(
406 cat_id, left_id, right_id, cost, pos, feature,
407 ));
408 }
409
410 dict
411 }
412
413 pub fn add_entry(&mut self, def: UnknownDef) {
415 self.entries.entry(def.category_id).or_default().push(def);
416 }
417
418 #[must_use]
420 pub fn get_entries(&self, category_id: CategoryId) -> &[UnknownDef] {
421 self.entries
422 .get(&category_id)
423 .map_or(&[], std::vec::Vec::as_slice)
424 }
425
426 pub fn from_unk_def<R: BufRead>(reader: R, category_map: &CharCategoryMap) -> Result<Self> {
438 let mut dict = Self::new();
439
440 for line in reader.lines() {
441 let line = line.map_err(|e| Error::Init(e.to_string()))?;
442 let line = line.trim();
443
444 if line.is_empty() || line.starts_with('#') {
445 continue;
446 }
447
448 let parts: Vec<&str> = line.split(',').collect();
449 if parts.len() >= 5 {
450 let category_name = parts[0];
451 let left_id: u16 = parts[1].parse().unwrap_or(0);
452 let right_id: u16 = parts[2].parse().unwrap_or(0);
453 let cost: i16 = parts[3].parse().unwrap_or(0);
454 let pos = parts[4];
455 let feature = line; if let Some(cat_id) = category_map.get_id_by_name(category_name) {
458 dict.add_entry(UnknownDef::new(
459 cat_id, left_id, right_id, cost, pos, feature,
460 ));
461 }
462 }
463 }
464
465 Ok(dict)
466 }
467}
468
/// One unknown-word candidate produced for a position in the text.
#[derive(Debug, Clone)]
pub struct UnknownCandidate {
    /// Candidate text.
    pub surface: String,
    /// Start position, counted in characters.
    pub start_pos: usize,
    /// End position in characters (start position plus candidate length).
    pub end_pos: usize,
    /// Left ID copied from the template.
    pub left_id: u16,
    /// Right ID copied from the template.
    pub right_id: u16,
    /// Template cost after the pattern-based adjustment.
    pub cost: i16,
    /// Part-of-speech tag, possibly refined from the template's tag.
    pub pos: String,
    /// Category of the candidate's first character.
    pub category_id: CategoryId,
    /// Surface pattern detected for `surface`.
    pub pattern: WordPattern,
}
491
/// Generates unknown-word candidates from a character category map and an
/// unknown-word dictionary.
#[derive(Debug, Clone)]
pub struct UnknownHandler {
    /// Classifies characters and holds the per-category generation rules.
    pub category_map: CharCategoryMap,
    /// Templates (IDs, cost, part of speech) for each category.
    pub unknown_dict: UnknownDictionary,
}
502
503impl Default for UnknownHandler {
504 fn default() -> Self {
505 Self::korean_default()
506 }
507}
508
509impl UnknownHandler {
510 #[must_use]
512 pub const fn new(category_map: CharCategoryMap, unknown_dict: UnknownDictionary) -> Self {
513 Self {
514 category_map,
515 unknown_dict,
516 }
517 }
518
519 #[must_use]
521 pub fn korean_default() -> Self {
522 Self::new(
523 CharCategoryMap::korean_default(),
524 UnknownDictionary::korean_default(),
525 )
526 }
527
528 #[must_use]
532 fn detect_pattern(&self, surface: &str) -> WordPattern {
533 let chars: Vec<char> = surface.chars().collect();
534 if chars.is_empty() {
535 return WordPattern::Plain;
536 }
537
538 if chars.iter().any(|&c| is_emoji(c)) {
540 return WordPattern::Emoji;
541 }
542
543 let has_hangul = chars.iter().any(|&c| {
545 let cat = self.category_map.get_category(c);
546 cat == HANGUL_CATEGORY
547 });
548 let has_alpha = chars.iter().any(|&c| {
549 let cat = self.category_map.get_category(c);
550 cat == ALPHA_CATEGORY
551 });
552
553 if has_hangul && has_alpha {
554 return WordPattern::HangulAlphaMix;
555 }
556
557 let has_digit = chars.iter().any(|&c| {
559 let cat = self.category_map.get_category(c);
560 cat == NUMERIC_CATEGORY
561 });
562
563 if has_digit && (has_hangul || has_alpha) {
564 return WordPattern::NumberUnit;
565 }
566
567 if has_alpha && !has_hangul {
569 if chars.len() > 1 {
571 let mut has_internal_uppercase = false;
572 for (i, &c) in chars.iter().enumerate() {
573 if i > 0 && c.is_uppercase() {
574 has_internal_uppercase = true;
575 break;
576 }
577 }
578 if has_internal_uppercase {
579 return WordPattern::CamelCase;
580 }
581 }
582
583 if chars[0].is_uppercase() && chars.len() > 1 {
585 return WordPattern::ProperNoun;
586 }
587 }
588
589 WordPattern::Plain
590 }
591
592 #[must_use]
596 #[allow(clippy::unused_self)]
597 fn adjust_cost_by_pattern(&self, base_cost: i16, pattern: WordPattern, length: usize) -> i16 {
598 let mut cost = i32::from(base_cost);
599
600 match pattern {
602 WordPattern::Plain => {
603 if length > 6 {
606 #[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)]
607 let penalty = ((length - 6) * 80) as i32; cost += penalty;
609 }
610 }
611 WordPattern::ProperNoun => {
612 cost -= 600; }
615 WordPattern::CamelCase => {
616 cost -= 400; }
619 WordPattern::HangulAlphaMix => {
620 cost -= 100; }
624 WordPattern::NumberUnit => {
625 cost -= 300; }
628 WordPattern::Emoji => {
629 cost += 1500; }
632 }
633
634 #[allow(clippy::cast_possible_truncation)]
636 {
637 cost.clamp(i32::from(i16::MIN), i32::from(i16::MAX)) as i16
638 }
639 }
640
641 #[must_use]
645 #[allow(clippy::unused_self)]
646 fn estimate_pos(
647 &self,
648 pattern: WordPattern,
649 category_id: CategoryId,
650 base_pos: &str,
651 ) -> String {
652 match pattern {
653 WordPattern::ProperNoun | WordPattern::CamelCase => {
654 if category_id == ALPHA_CATEGORY {
656 return "NNP".to_string();
657 }
658 }
659 WordPattern::HangulAlphaMix => {
660 if category_id == HANGUL_CATEGORY {
662 return "NNG".to_string();
663 }
664 }
665 _ => {}
666 }
667
668 base_pos.to_string()
669 }
670
671 #[must_use]
683 pub fn generate_candidates(
684 &self,
685 text: &str,
686 start_pos: usize,
687 has_dict_entry: bool,
688 ) -> Vec<UnknownCandidate> {
689 let start_byte = text
698 .char_indices()
699 .nth(start_pos)
700 .map_or(text.len(), |(b, _)| b);
701
702 let suffix = &text[start_byte..];
703 let Some(first_char) = suffix.chars().next() else {
704 return Vec::new();
705 };
706 let category_id = self.category_map.get_category(first_char);
707 let Some(category_def) = self.category_map.get_category_def(category_id) else {
708 return Vec::new();
709 };
710
711 if !category_def.invoke && has_dict_entry {
713 return Vec::new();
714 }
715
716 let unknown_defs = self.unknown_dict.get_entries(category_id);
717 if unknown_defs.is_empty() {
718 return Vec::new();
719 }
720
721 let mut candidates = Vec::new();
722
723 if category_def.group {
724 let mut char_count = 0usize;
727 let mut byte_end = 0usize;
728
729 for c in suffix.chars() {
730 if self.category_map.get_category(c) != category_id {
731 break;
732 }
733 byte_end += c.len_utf8();
734 char_count += 1;
735 }
736
737 let group_char_count = char_count; let max_len = if category_def.length > 0 {
739 category_def.length.min(group_char_count)
740 } else {
741 group_char_count
742 };
743
744 let mut byte_offset = 0usize;
746 let mut char_iter = suffix.chars();
747 for len in 1..=max_len {
748 if let Some(c) = char_iter.next() {
749 byte_offset += c.len_utf8();
750 } else {
751 break;
752 }
753 let end_pos = start_pos + len;
754 let surface = &suffix[..byte_offset];
755
756 let pattern = self.detect_pattern(surface);
758
759 for def in unknown_defs {
760 let adjusted_cost = self.adjust_cost_by_pattern(def.cost, pattern, len);
762
763 let estimated_pos = self.estimate_pos(pattern, category_id, &def.pos);
765
766 candidates.push(UnknownCandidate {
767 surface: surface.to_string(),
768 start_pos,
769 end_pos,
770 left_id: def.left_id,
771 right_id: def.right_id,
772 cost: adjusted_cost,
773 pos: estimated_pos,
774 category_id,
775 pattern,
776 });
777 }
778 }
779 let _ = byte_end; } else {
781 let char_total = suffix.chars().count();
783 let max_len = if category_def.length > 0 {
784 category_def.length.min(char_total)
785 } else {
786 1
787 };
788
789 let mut byte_offset = 0usize;
790 let mut char_iter = suffix.chars();
791 for len in 1..=max_len {
792 if let Some(c) = char_iter.next() {
793 byte_offset += c.len_utf8();
794 } else {
795 break;
796 }
797 let end_pos = start_pos + len;
798 let surface = &suffix[..byte_offset];
799
800 let pattern = self.detect_pattern(surface);
802
803 for def in unknown_defs {
804 let adjusted_cost = self.adjust_cost_by_pattern(def.cost, pattern, len);
806
807 let estimated_pos = self.estimate_pos(pattern, category_id, &def.pos);
809
810 candidates.push(UnknownCandidate {
811 surface: surface.to_string(),
812 start_pos,
813 end_pos,
814 left_id: def.left_id,
815 right_id: def.right_id,
816 cost: adjusted_cost,
817 pos: estimated_pos,
818 category_id,
819 pattern,
820 });
821 }
822 }
823 }
824
825 candidates
826 }
827
828 #[cfg(test)]
833 fn find_group_end(&self, chars: &[char], start_pos: usize, category_id: CategoryId) -> usize {
834 let mut pos = start_pos;
835 while pos < chars.len() {
836 if self.category_map.get_category(chars[pos]) != category_id {
837 break;
838 }
839 pos += 1;
840 }
841 pos
842 }
843
844 pub fn add_unknown_nodes(
856 &self,
857 lattice: &mut Lattice,
858 start_pos: usize,
859 has_dict_entry: bool,
860 ) -> usize {
861 let text = lattice.text();
862 let candidates = self.generate_candidates(text, start_pos, has_dict_entry);
863 let mut count = 0;
864
865 for candidate in candidates {
866 lattice.add_node(
867 NodeBuilder::new(&candidate.surface, candidate.start_pos, candidate.end_pos)
868 .left_id(candidate.left_id)
869 .right_id(candidate.right_id)
870 .word_cost(i32::from(candidate.cost))
871 .node_type(NodeType::Unknown),
872 );
873 count += 1;
874 }
875
876 count
877 }
878}
879
#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::needless_collect)]
mod tests {
    use super::*;

    /// The built-in map classifies representative characters as expected.
    #[test]
    fn test_category_map_default() {
        let map = CharCategoryMap::korean_default();

        assert_eq!(map.get_category('가'), HANGUL_CATEGORY);
        assert_eq!(map.get_category('A'), ALPHA_CATEGORY);
        assert_eq!(map.get_category('1'), NUMERIC_CATEGORY);
        assert_eq!(map.get_category(' '), SPACE_CATEGORY);
        assert_eq!(map.get_category('.'), SYMBOL_CATEGORY);
        assert_eq!(map.get_category('韓'), HANJA_CATEGORY);
    }

    /// Built-in category definitions carry the expected flags.
    #[test]
    fn test_category_def() {
        let map = CharCategoryMap::korean_default();

        let hangul_def = map.get_category_def(HANGUL_CATEGORY).unwrap();
        assert_eq!(hangul_def.name, "HANGUL");
        assert!(!hangul_def.invoke);
        assert!(hangul_def.group);
        assert_eq!(hangul_def.length, 2);

        let alpha_def = map.get_category_def(ALPHA_CATEGORY).unwrap();
        assert!(alpha_def.invoke);
    }

    /// The built-in dictionary has templates with the expected POS tags.
    #[test]
    fn test_unknown_dict_default() {
        let dict = UnknownDictionary::korean_default();

        let hangul_entries = dict.get_entries(HANGUL_CATEGORY);
        assert!(!hangul_entries.is_empty());
        assert_eq!(hangul_entries[0].pos, "UNKNOWN");

        let alpha_entries = dict.get_entries(ALPHA_CATEGORY);
        assert!(!alpha_entries.is_empty());
        assert_eq!(alpha_entries[0].pos, "SL");
    }

    /// Hangul candidates include the one- and two-character prefixes.
    #[test]
    fn test_generate_candidates_hangul() {
        let handler = UnknownHandler::korean_default();

        let candidates = handler.generate_candidates("가나다라", 0, false);

        assert!(!candidates.is_empty());
        let surfaces: Vec<_> = candidates.iter().map(|c| c.surface.as_str()).collect();
        assert!(surfaces.contains(&"가"));
        assert!(surfaces.contains(&"가나"));
    }

    /// Alphabetic candidates include every prefix of the run.
    #[test]
    fn test_generate_candidates_alpha() {
        let handler = UnknownHandler::korean_default();

        let candidates = handler.generate_candidates("ABC", 0, false);

        let surfaces: Vec<_> = candidates.iter().map(|c| c.surface.as_str()).collect();
        assert!(surfaces.contains(&"A"));
        assert!(surfaces.contains(&"AB"));
        assert!(surfaces.contains(&"ABC"));
    }

    /// With a dictionary entry present, only `invoke` categories still generate.
    #[test]
    fn test_generate_candidates_with_dict_entry() {
        let handler = UnknownHandler::korean_default();

        // HANGUL has invoke = false, so nothing is generated.
        let candidates = handler.generate_candidates("가나다", 0, true);
        assert!(candidates.is_empty());

        // ALPHA has invoke = true, so candidates are still generated.
        let candidates = handler.generate_candidates("ABC", 0, true);
        assert!(!candidates.is_empty());
    }

    /// The category of the character at `start_pos` decides the candidates.
    #[test]
    fn test_generate_candidates_mixed() {
        let handler = UnknownHandler::korean_default();

        let text = "가ABC";

        let candidates = handler.generate_candidates(text, 0, false);
        assert!(candidates.iter().all(|c| c.category_id == HANGUL_CATEGORY));

        let candidates = handler.generate_candidates(text, 1, false);
        assert!(candidates.iter().all(|c| c.category_id == ALPHA_CATEGORY));
    }

    /// `find_group_end` stops at the first character of another category.
    #[test]
    fn test_find_group_end() {
        let handler = UnknownHandler::korean_default();
        let chars: Vec<char> = "가나다ABC".chars().collect();

        let end = handler.find_group_end(&chars, 0, HANGUL_CATEGORY);
        assert_eq!(end, 3);

        let end = handler.find_group_end(&chars, 3, ALPHA_CATEGORY);
        assert_eq!(end, 6);
    }

    /// Unknown nodes are added to the lattice at the requested position.
    #[test]
    fn test_add_unknown_nodes() {
        let handler = UnknownHandler::korean_default();
        let mut lattice = Lattice::new("테스트ABC");

        let count = handler.add_unknown_nodes(&mut lattice, 0, false);
        assert!(count > 0);

        let nodes_at_0: Vec<_> = lattice.nodes_starting_at(0).collect();
        assert!(!nodes_at_0.is_empty());
    }

    #[test]
    fn test_pattern_detection_proper_noun() {
        let handler = UnknownHandler::korean_default();

        let pattern = handler.detect_pattern("Apple");
        assert_eq!(pattern, WordPattern::ProperNoun);

        let pattern = handler.detect_pattern("Google");
        assert_eq!(pattern, WordPattern::ProperNoun);
    }

    #[test]
    fn test_pattern_detection_camel_case() {
        let handler = UnknownHandler::korean_default();

        let pattern = handler.detect_pattern("iPhone");
        assert_eq!(pattern, WordPattern::CamelCase);

        let pattern = handler.detect_pattern("HelloWorld");
        assert_eq!(pattern, WordPattern::CamelCase);

        let pattern = handler.detect_pattern("iPad");
        assert_eq!(pattern, WordPattern::CamelCase);
    }

    #[test]
    fn test_pattern_detection_hangul_alpha_mix() {
        let handler = UnknownHandler::korean_default();

        // Hangul only: not a mix.
        let pattern = handler.detect_pattern("카카오톡");
        assert_eq!(pattern, WordPattern::Plain);

        let pattern = handler.detect_pattern("API키");
        assert_eq!(pattern, WordPattern::HangulAlphaMix);
    }

    #[test]
    fn test_pattern_detection_number_unit() {
        let handler = UnknownHandler::korean_default();

        let pattern = handler.detect_pattern("15kg");
        assert_eq!(pattern, WordPattern::NumberUnit);

        let pattern = handler.detect_pattern("3개");
        assert_eq!(pattern, WordPattern::NumberUnit);

        let pattern = handler.detect_pattern("100원");
        assert_eq!(pattern, WordPattern::NumberUnit);
    }

    #[test]
    fn test_pattern_detection_emoji() {
        let handler = UnknownHandler::korean_default();

        let pattern = handler.detect_pattern("😀");
        assert_eq!(pattern, WordPattern::Emoji);

        // A single emoji is enough, even next to Hangul.
        let pattern = handler.detect_pattern("안녕😊");
        assert_eq!(pattern, WordPattern::Emoji);
    }

    #[test]
    fn test_pattern_detection_plain() {
        let handler = UnknownHandler::korean_default();

        let pattern = handler.detect_pattern("hello");
        assert_eq!(pattern, WordPattern::Plain);

        // Letters followed by digits are not plain: they match the
        // number-plus-letters rule.
        let pattern = handler.detect_pattern("test123");
        assert_eq!(pattern, WordPattern::NumberUnit);
    }

    #[test]
    fn test_cost_adjustment_by_pattern() {
        let handler = UnknownHandler::korean_default();

        let base_cost = 4000i16;
        let adjusted = handler.adjust_cost_by_pattern(base_cost, WordPattern::ProperNoun, 5);
        assert!(adjusted < base_cost);

        let adjusted = handler.adjust_cost_by_pattern(base_cost, WordPattern::CamelCase, 5);
        assert!(adjusted < base_cost);

        let adjusted = handler.adjust_cost_by_pattern(base_cost, WordPattern::Emoji, 1);
        assert!(adjusted > base_cost);
    }

    /// Plain candidates longer than six characters cost more.
    #[test]
    fn test_cost_adjustment_by_length() {
        let handler = UnknownHandler::korean_default();
        let base_cost = 5000i16;

        let cost_short = handler.adjust_cost_by_pattern(base_cost, WordPattern::Plain, 3);

        let cost_long = handler.adjust_cost_by_pattern(base_cost, WordPattern::Plain, 10);

        assert!(cost_long > cost_short);
    }

    #[test]
    fn test_pos_estimation_proper_noun() {
        let handler = UnknownHandler::korean_default();

        let pos = handler.estimate_pos(WordPattern::ProperNoun, ALPHA_CATEGORY, "SL");
        assert_eq!(pos, "NNP");

        let pos = handler.estimate_pos(WordPattern::CamelCase, ALPHA_CATEGORY, "SL");
        assert_eq!(pos, "NNP");
    }

    #[test]
    fn test_pos_estimation_hangul_alpha_mix() {
        let handler = UnknownHandler::korean_default();

        let pos = handler.estimate_pos(WordPattern::HangulAlphaMix, HANGUL_CATEGORY, "UNKNOWN");
        assert_eq!(pos, "NNG");
    }

    #[test]
    fn test_generate_candidates_with_patterns() {
        let handler = UnknownHandler::korean_default();

        let candidates = handler.generate_candidates("Apple", 0, false);
        assert!(!candidates.is_empty());

        let has_proper_noun = candidates
            .iter()
            .any(|c| c.pattern == WordPattern::ProperNoun);
        assert!(has_proper_noun);

        let proper_noun_candidates: Vec<_> = candidates
            .iter()
            .filter(|c| c.pattern == WordPattern::ProperNoun)
            .collect();
        assert!(proper_noun_candidates.iter().any(|c| c.pos == "NNP"));
    }

    /// An all-caps abbreviation yields both its first letter and the full word.
    #[test]
    fn test_generate_candidates_abbreviation() {
        let handler = UnknownHandler::korean_default();

        let candidates = handler.generate_candidates("API", 0, false);
        assert!(!candidates.is_empty());

        let surfaces: Vec<_> = candidates.iter().map(|c| c.surface.as_str()).collect();
        assert!(surfaces.contains(&"A"));
        assert!(surfaces.contains(&"API"));
    }

    #[test]
    fn test_generate_candidates_camel_case() {
        let handler = UnknownHandler::korean_default();

        let candidates = handler.generate_candidates("iPhone", 0, false);
        assert!(!candidates.is_empty());

        let has_camel = candidates
            .iter()
            .any(|c| c.pattern == WordPattern::CamelCase);
        assert!(has_camel);
    }

    #[test]
    fn test_unknown_korean_word() {
        let handler = UnknownHandler::korean_default();

        let candidates = handler.generate_candidates("테스트", 0, false);
        assert!(!candidates.is_empty());

        assert!(candidates.iter().all(|c| c.category_id == HANGUL_CATEGORY));
    }

    #[test]
    fn test_is_emoji() {
        assert!(is_emoji('😀'));
        assert!(is_emoji('😊'));
        assert!(is_emoji('🚀'));
        assert!(is_emoji('❤'));

        assert!(!is_emoji('a'));
        assert!(!is_emoji('가'));
        assert!(!is_emoji('1'));
    }

    #[test]
    fn test_parse_unicode_range() {
        assert_eq!(
            parse_unicode_range("0xAC00..0xD7A3"),
            Some((0xAC00, 0xD7A3))
        );
        assert_eq!(parse_unicode_range("0xAC00"), Some((0xAC00, 0xAC00)));
        assert_eq!(parse_unicode_range("0x0020"), Some((0x0020, 0x0020)));
    }

    /// Categories and code-point ranges are read from `char.def` text.
    #[test]
    fn test_char_def_parsing() {
        let char_def = r"
# Comment line
DEFAULT 0 1 0
SPACE 0 1 0
HANGUL 0 1 2
ALPHA 1 1 0

0xAC00..0xD7A3 HANGUL
0x0041..0x005A ALPHA
";

        let map = CharCategoryMap::from_char_def(char_def.as_bytes()).unwrap();

        assert!(map.get_id_by_name("DEFAULT").is_some());
        assert!(map.get_id_by_name("HANGUL").is_some());
        assert!(map.get_id_by_name("ALPHA").is_some());

        assert_eq!(
            map.get_category('가'),
            map.get_id_by_name("HANGUL").unwrap()
        );
        assert_eq!(map.get_category('A'), map.get_id_by_name("ALPHA").unwrap());
    }

    /// Templates are read from `unk.def` text and resolved by category name.
    #[test]
    fn test_unk_def_parsing() {
        let char_def = "DEFAULT 0 1 0\nHANGUL 0 1 2\n";
        let map = CharCategoryMap::from_char_def(char_def.as_bytes()).unwrap();

        let unk_def = r"
DEFAULT,1800,3562,7000,SY,*,*,*,*,*,*,*
HANGUL,1800,3565,5000,UNKNOWN,*,*,*,*,*,*,*
";

        let dict = UnknownDictionary::from_unk_def(unk_def.as_bytes(), &map).unwrap();

        let hangul_id = map.get_id_by_name("HANGUL").unwrap();
        let entries = dict.get_entries(hangul_id);
        assert!(!entries.is_empty());
        assert_eq!(entries[0].pos, "UNKNOWN");
        assert_eq!(entries[0].cost, 5000);
    }
}