1use crate::Result;
33use mecab_ko_hangul::{compose, decompose, is_hangul_syllable};
34use std::collections::{HashMap, HashSet};
35use std::path::Path;
36use std::sync::Arc;
37
/// Category of a normalization rule (see the `*_rules` tables on `Normalizer`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum RuleType {
    /// Syllable alternations such as 오↔어 and 우↔유 (see `vowel_length_rules`).
    VowelLength,
    /// Plain↔aspirated/tense consonant alternations such as ㅂ↔ㅍ, ㅅ↔ㅆ.
    ConsonantVariation,
    /// Final-consonant (jongseong) addition/removal/substitution (e.g. ㄹ↔none).
    JongseongVariation,
    /// Vowel-sequence alternations such as 에이↔에.
    VowelVariation,
    /// Matching based on phonetic similarity scoring.
    PhoneticSimilarity,
}
52
/// A single normalization rule mapping one string pattern to another.
#[derive(Debug, Clone, PartialEq)]
pub struct NormalizationRule {
    /// Category this rule belongs to.
    pub rule_type: RuleType,
    /// Source pattern.
    pub from: String,
    /// Replacement text.
    pub to: String,
    // Rule confidence in [0, 1]; presumably meant to be compared against
    // `NormalizationConfig::min_confidence` — no visible code does so yet.
    pub confidence: f32,
}
65
impl NormalizationRule {
    /// Creates a rule with the given type, source pattern, replacement,
    /// and confidence score.
    #[must_use]
    pub const fn new(rule_type: RuleType, from: String, to: String, confidence: f32) -> Self {
        Self {
            rule_type,
            from,
            to,
            confidence,
        }
    }
}
78
/// Feature flags controlling which rule families a [`Normalizer`] applies.
#[derive(Debug, Clone)]
#[allow(clippy::struct_excessive_bools)]
pub struct NormalizationConfig {
    /// Enable vowel-length rules and ㅓ↔ㅗ variant generation.
    pub vowel_length: bool,
    /// Enable consonant-variation rules.
    pub consonant_variation: bool,
    /// Enable jongseong (final consonant) rules and variant generation.
    pub jongseong_variation: bool,
    /// Enable vowel-sequence variation rules.
    pub vowel_variation: bool,
    /// Enable phonetic-similarity matching.
    pub phonetic_similarity: bool,
    // Minimum rule confidence to accept; presumably a filter threshold,
    // though nothing in this file reads it yet — TODO confirm.
    pub min_confidence: f32,
}
96
impl Default for NormalizationConfig {
    /// Enables every rule family and sets the confidence floor to 0.7.
    fn default() -> Self {
        Self {
            vowel_length: true,
            consonant_variation: true,
            jongseong_variation: true,
            vowel_variation: true,
            phonetic_similarity: true,
            min_confidence: 0.7,
        }
    }
}
109
/// Korean spelling-variant normalizer backed by lookup tables plus
/// hangul jamo decomposition/composition rules.
pub struct Normalizer {
    /// Active rule configuration.
    config: NormalizationConfig,
    /// Standard form -> set of known variant spellings.
    standard_to_variants: Arc<HashMap<String, HashSet<String>>>,
    /// Variant spelling -> its standard form.
    variant_to_standard: Arc<HashMap<String, String>>,
}
119
120impl Normalizer {
121 pub fn new(config: NormalizationConfig) -> Result<Self> {
135 let rules = Self::load_rules(&config);
136 let (standard_to_variants, variant_to_standard) = Self::build_variant_maps(&rules);
137
138 Ok(Self {
139 config,
140 standard_to_variants: Arc::new(standard_to_variants),
141 variant_to_standard: Arc::new(variant_to_standard),
142 })
143 }
144
    /// Creates a normalizer whose variant table combines the built-in
    /// pairs with additional pairs loaded from `variant_csv_path`.
    ///
    /// NOTE(review): a CSV that fails to load is silently ignored and
    /// only the built-in pairs are used — confirm this best-effort
    /// behavior is intended rather than propagating the error.
    ///
    /// # Errors
    /// Currently only fails if rule/map construction fails; a bad CSV
    /// path does not produce an error (see note above).
    pub fn with_data_file(config: NormalizationConfig, variant_csv_path: &Path) -> Result<Self> {
        let rules = Self::load_rules(&config);
        let mut variant_pairs = Self::builtin_variant_pairs();

        // Merge external pairs on top of the built-in ones, best-effort.
        if let Ok(external_pairs) = Self::load_variant_csv(variant_csv_path) {
            variant_pairs.extend(external_pairs);
        }

        let (standard_to_variants, variant_to_standard) =
            Self::build_variant_maps_with_pairs(&rules, &variant_pairs);

        Ok(Self {
            config,
            standard_to_variants: Arc::new(standard_to_variants),
            variant_to_standard: Arc::new(variant_to_standard),
        })
    }
177
178 fn load_variant_csv(path: &Path) -> Result<Vec<(String, String)>> {
180 use std::fs::File;
181 use std::io::{BufRead, BufReader};
182
183 let file = File::open(path)
184 .map_err(|e| crate::error::Error::Init(format!("Failed to open variant CSV: {e}")))?;
185
186 let reader = BufReader::new(file);
187 let mut pairs = Vec::new();
188
189 for (line_num, line) in reader.lines().enumerate() {
190 let line = line.map_err(|e| {
191 crate::error::Error::Init(format!("Failed to read line {line_num}: {e}"))
192 })?;
193
194 if line_num == 0 || line.trim().is_empty() || line.starts_with('#') {
196 continue;
197 }
198
199 let parts: Vec<&str> = line.split(',').collect();
200 if parts.len() >= 2 {
201 let standard = parts[0].trim().to_string();
202 let variant = parts[1].trim().to_string();
203
204 if standard != variant {
206 pairs.push((standard, variant));
207 }
208 }
209 }
210
211 Ok(pairs)
212 }
213
    /// Convenience constructor using `NormalizationConfig::default()`.
    ///
    /// Named `default` but fallible, so it cannot implement the
    /// `Default` trait — hence the clippy allow.
    ///
    /// # Errors
    /// Propagates any error from `Self::new`.
    #[allow(clippy::should_implement_trait)]
    pub fn default() -> Result<Self> {
        Self::new(NormalizationConfig::default())
    }
223
224 #[must_use]
244 pub fn normalize(&self, text: &str) -> String {
245 if let Some(standard) = self.variant_to_standard.get(text) {
247 return standard.clone();
248 }
249
250 Self::apply_rules(text)
252 }
253
254 #[must_use]
274 pub fn get_variants(&self, standard: &str) -> Vec<String> {
275 let mut variants = self
277 .standard_to_variants
278 .get(standard)
279 .map(|set| set.iter().cloned().collect::<Vec<_>>())
280 .unwrap_or_default();
281
282 let generated = self.generate_variants(standard);
284 variants.extend(generated);
285
286 variants.sort();
287 variants.dedup();
288 variants
289 }
290
291 #[must_use]
312 pub fn is_variant(&self, word1: &str, word2: &str) -> bool {
313 if word1 == word2 {
314 return true;
315 }
316
317 let norm1 = self.normalize(word1);
318 let norm2 = self.normalize(word2);
319 norm1 == norm2
320 }
321
322 #[must_use]
333 pub fn phonetic_similarity(&self, word1: &str, word2: &str) -> f32 {
334 if word1 == word2 {
335 return 1.0;
336 }
337
338 let jamo1 = Self::to_phonetic_jamo(word1);
339 let jamo2 = Self::to_phonetic_jamo(word2);
340
341 Self::string_similarity(&jamo1, &jamo2)
342 }
343
344 fn load_rules(config: &NormalizationConfig) -> Vec<NormalizationRule> {
348 let mut rules = Vec::new();
349
350 if config.vowel_length {
352 rules.extend(Self::vowel_length_rules());
353 }
354
355 if config.consonant_variation {
357 rules.extend(Self::consonant_variation_rules());
358 }
359
360 if config.jongseong_variation {
362 rules.extend(Self::jongseong_variation_rules());
363 }
364
365 if config.vowel_variation {
367 rules.extend(Self::vowel_variation_rules());
368 }
369
370 rules
371 }
372
    /// Bidirectional syllable alternations 오↔어 (0.9) and 우↔유 (0.85).
    fn vowel_length_rules() -> Vec<NormalizationRule> {
        vec![
            NormalizationRule::new(RuleType::VowelLength, "오".into(), "어".into(), 0.9),
            NormalizationRule::new(RuleType::VowelLength, "어".into(), "오".into(), 0.9),
            NormalizationRule::new(RuleType::VowelLength, "우".into(), "유".into(), 0.85),
            NormalizationRule::new(RuleType::VowelLength, "유".into(), "우".into(), 0.85),
        ]
    }
382
    /// Bidirectional consonant alternations: plain↔aspirated pairs
    /// (ㅂ↔ㅍ, ㄷ↔ㅌ, ㄱ↔ㅋ, ㅈ↔ㅊ at 0.9) and plain↔tense ㅅ↔ㅆ (0.85).
    fn consonant_variation_rules() -> Vec<NormalizationRule> {
        vec![
            NormalizationRule::new(RuleType::ConsonantVariation, "ㅂ".into(), "ㅍ".into(), 0.9),
            NormalizationRule::new(RuleType::ConsonantVariation, "ㅍ".into(), "ㅂ".into(), 0.9),
            NormalizationRule::new(RuleType::ConsonantVariation, "ㄷ".into(), "ㅌ".into(), 0.9),
            NormalizationRule::new(RuleType::ConsonantVariation, "ㅌ".into(), "ㄷ".into(), 0.9),
            NormalizationRule::new(RuleType::ConsonantVariation, "ㄱ".into(), "ㅋ".into(), 0.9),
            NormalizationRule::new(RuleType::ConsonantVariation, "ㅋ".into(), "ㄱ".into(), 0.9),
            NormalizationRule::new(RuleType::ConsonantVariation, "ㅈ".into(), "ㅊ".into(), 0.9),
            NormalizationRule::new(RuleType::ConsonantVariation, "ㅊ".into(), "ㅈ".into(), 0.9),
            NormalizationRule::new(RuleType::ConsonantVariation, "ㅅ".into(), "ㅆ".into(), 0.85),
            NormalizationRule::new(RuleType::ConsonantVariation, "ㅆ".into(), "ㅅ".into(), 0.85),
        ]
    }
398
    /// Jongseong (final consonant) rules: add/drop ㄹ (empty string
    /// means "no jongseong", 0.85) and swap ㅁ↔ㅂ (0.8), both directions.
    fn jongseong_variation_rules() -> Vec<NormalizationRule> {
        vec![
            NormalizationRule::new(
                RuleType::JongseongVariation,
                "ㄹ".into(),
                String::new(),
                0.85,
            ),
            NormalizationRule::new(
                RuleType::JongseongVariation,
                String::new(),
                "ㄹ".into(),
                0.85,
            ),
            NormalizationRule::new(RuleType::JongseongVariation, "ㅁ".into(), "ㅂ".into(), 0.8),
            NormalizationRule::new(RuleType::JongseongVariation, "ㅂ".into(), "ㅁ".into(), 0.8),
        ]
    }
418
    /// Vowel-sequence alternations 에이↔에 (0.9) and 이↔익 (0.85),
    /// both directions.
    fn vowel_variation_rules() -> Vec<NormalizationRule> {
        vec![
            NormalizationRule::new(RuleType::VowelVariation, "에이".into(), "에".into(), 0.9),
            NormalizationRule::new(RuleType::VowelVariation, "에".into(), "에이".into(), 0.9),
            NormalizationRule::new(RuleType::VowelVariation, "이".into(), "익".into(), 0.85),
            NormalizationRule::new(RuleType::VowelVariation, "익".into(), "이".into(), 0.85),
        ]
    }
428
    /// Builds both lookup maps from the built-in variant pairs.
    fn build_variant_maps(
        rules: &[NormalizationRule],
    ) -> (HashMap<String, HashSet<String>>, HashMap<String, String>) {
        let builtin_variants = Self::builtin_variant_pairs();
        Self::build_variant_maps_with_pairs(rules, &builtin_variants)
    }
436
437 fn build_variant_maps_with_pairs(
439 _rules: &[NormalizationRule],
440 variant_pairs: &[(String, String)],
441 ) -> (HashMap<String, HashSet<String>>, HashMap<String, String>) {
442 let mut standard_to_variants = HashMap::new();
443 let mut variant_to_standard = HashMap::new();
444
445 for (standard, variant) in variant_pairs {
446 standard_to_variants
447 .entry(standard.clone())
448 .or_insert_with(HashSet::new)
449 .insert(variant.clone());
450
451 variant_to_standard.insert(variant.clone(), standard.clone());
452 }
453
454 (standard_to_variants, variant_to_standard)
455 }
456
457 fn builtin_variant_pairs() -> Vec<(String, String)> {
459 vec![
460 ("커피".into(), "코피".into()),
462 ("쿠버네티스".into(), "쿠베르네테스".into()),
463 ("쿠버네티스".into(), "쿠베르네티즈".into()),
464 ("소프트웨어".into(), "소프트웨아".into()),
465 ("라이브러리".into(), "라이브러이".into()),
466 ("디렉토리".into(), "디렉터리".into()),
467 ("디렉터리".into(), "디렉토리".into()),
468 ("서버".into(), "서버".into()),
469 ("클라이언트".into(), "클라이언트".into()),
470 ("인터페이스".into(), "인터페이스".into()),
471 ("알고리즘".into(), "알고리듬".into()),
472 ("컴퓨터".into(), "컴퓨타".into()),
473 ("프로그램".into(), "프로그래밍".into()),
474 ("데이터베이스".into(), "데이타베이스".into()),
475 ("케이크".into(), "케익".into()),
477 ("스테이크".into(), "스테익".into()),
478 ("메이크업".into(), "메이컵".into()),
479 ("샴푸".into(), "샴프".into()),
480 ("컵".into(), "컵".into()),
481 ("버스".into(), "버스".into()),
482 ("택시".into(), "택시".into()),
483 ("카메라".into(), "카메라".into()),
484 ("비디오".into(), "비데오".into()),
485 ("라디오".into(), "라지오".into()),
486 ]
487 }
488
489 fn apply_rules(text: &str) -> String {
491 let chars: Vec<char> = text.chars().collect();
492 let mut result = String::with_capacity(text.len());
493
494 for &ch in &chars {
495 result.push(ch);
496 }
497
498 result
499 }
500
501 fn generate_variants(&self, text: &str) -> Vec<String> {
503 let mut variants = HashSet::new();
504
505 if self.config.vowel_length {
507 variants.extend(Self::generate_vowel_length_variants(text));
508 }
509
510 if self.config.jongseong_variation {
512 variants.extend(Self::generate_jongseong_variants(text));
513 }
514
515 variants.into_iter().collect()
516 }
517
518 fn generate_vowel_length_variants(text: &str) -> Vec<String> {
520 let mut variants = Vec::new();
521
522 for i in 0..text.chars().count() {
523 let chars: Vec<char> = text.chars().collect();
524 let ch = chars[i];
525
526 if !is_hangul_syllable(ch) {
527 continue;
528 }
529
530 if let Some((cho, jung, jong)) = decompose(ch) {
531 if jung == 'ㅓ' {
533 if let Some(variant_char) = compose(cho, 'ㅗ', jong) {
534 let mut variant: Vec<char> = chars.clone();
535 variant[i] = variant_char;
536 variants.push(variant.into_iter().collect());
537 }
538 } else if jung == 'ㅗ' {
539 if let Some(variant_char) = compose(cho, 'ㅓ', jong) {
540 let mut variant: Vec<char> = chars.clone();
541 variant[i] = variant_char;
542 variants.push(variant.into_iter().collect());
543 }
544 }
545 }
546 }
547
548 variants
549 }
550
551 fn generate_jongseong_variants(text: &str) -> Vec<String> {
553 let mut variants = Vec::new();
554
555 for i in 0..text.chars().count() {
556 let chars: Vec<char> = text.chars().collect();
557 let ch = chars[i];
558
559 if !is_hangul_syllable(ch) {
560 continue;
561 }
562
563 if let Some((cho, jung, jong)) = decompose(ch) {
564 if jong.is_none() {
566 for &new_jong in &['ㄹ', 'ㅁ', 'ㅂ'] {
568 if let Some(variant_char) = compose(cho, jung, Some(new_jong)) {
569 let mut variant: Vec<char> = chars.clone();
570 variant[i] = variant_char;
571 variants.push(variant.into_iter().collect());
572 }
573 }
574 } else {
575 if let Some(variant_char) = compose(cho, jung, None) {
577 let mut variant: Vec<char> = chars.clone();
578 variant[i] = variant_char;
579 variants.push(variant.into_iter().collect());
580 }
581 }
582 }
583 }
584
585 variants
586 }
587
588 fn to_phonetic_jamo(text: &str) -> String {
590 let mut result = String::new();
591
592 for ch in text.chars() {
593 if let Some((cho, jung, jong)) = decompose(ch) {
594 result.push(cho);
595 result.push(jung);
596 if let Some(j) = jong {
597 result.push(j);
598 }
599 } else {
600 result.push(ch);
601 }
602 }
603
604 result
605 }
606
607 fn string_similarity(s1: &str, s2: &str) -> f32 {
609 let len1 = s1.chars().count();
610 let len2 = s2.chars().count();
611
612 if len1 == 0 && len2 == 0 {
613 return 1.0;
614 }
615
616 let max_len = len1.max(len2);
617 let distance = Self::levenshtein_distance(s1, s2);
618
619 #[allow(clippy::cast_precision_loss)]
620 let result = 1.0 - (distance as f32 / max_len as f32);
621 result
622 }
623
624 fn levenshtein_distance(s1: &str, s2: &str) -> usize {
626 let chars1: Vec<char> = s1.chars().collect();
627 let chars2: Vec<char> = s2.chars().collect();
628 let len1 = chars1.len();
629 let len2 = chars2.len();
630
631 if len1 == 0 {
632 return len2;
633 }
634 if len2 == 0 {
635 return len1;
636 }
637
638 let mut matrix = vec![vec![0; len2 + 1]; len1 + 1];
639
640 for (i, row) in matrix.iter_mut().enumerate().take(len1 + 1) {
641 row[0] = i;
642 }
643 for (j, val) in matrix[0].iter_mut().enumerate().take(len2 + 1) {
644 *val = j;
645 }
646
647 for i in 1..=len1 {
648 for j in 1..=len2 {
649 let cost = usize::from(chars1[i - 1] != chars2[j - 1]);
650 matrix[i][j] = (matrix[i - 1][j] + 1)
651 .min(matrix[i][j - 1] + 1)
652 .min(matrix[i - 1][j - 1] + cost);
653 }
654 }
655
656 matrix[len1][len2]
657 }
658}
659
#[cfg(test)]
#[allow(
    clippy::unwrap_used,
    clippy::float_cmp,
    clippy::field_reassign_with_default
)]
mod tests {
    use super::*;

    // Construction with the default config must succeed.
    #[test]
    fn test_normalizer_creation() {
        let result = Normalizer::default();
        assert!(result.is_ok());
    }

    // Built-in variants map to their standard form; standard forms and
    // unknown words pass through unchanged.
    #[test]
    fn test_normalize_builtin() {
        let normalizer = Normalizer::default().unwrap();

        assert_eq!(normalizer.normalize("코피"), "커피");
        assert_eq!(normalizer.normalize("커피"), "커피");
        assert_eq!(normalizer.normalize("소프트웨아"), "소프트웨어");
        assert_eq!(normalizer.normalize("케익"), "케이크");
    }

    // get_variants returns the known table variants for a standard form.
    #[test]
    fn test_get_variants() {
        let normalizer = Normalizer::default().unwrap();

        let variants = normalizer.get_variants("커피");
        assert!(variants.contains(&"코피".to_string()));

        let variants = normalizer.get_variants("케이크");
        assert!(variants.contains(&"케익".to_string()));
    }

    // is_variant is symmetric, reflexive, and rejects unrelated words.
    #[test]
    fn test_is_variant() {
        let normalizer = Normalizer::default().unwrap();

        assert!(normalizer.is_variant("커피", "코피"));
        assert!(normalizer.is_variant("코피", "커피"));
        assert!(normalizer.is_variant("커피", "커피"));
        assert!(!normalizer.is_variant("커피", "라면"));
    }

    // Identical words score 1.0; near-variants score high, unrelated low.
    #[test]
    fn test_phonetic_similarity() {
        let normalizer = Normalizer::default().unwrap();

        assert_eq!(normalizer.phonetic_similarity("커피", "커피"), 1.0);
        assert!(normalizer.phonetic_similarity("커피", "코피") > 0.6);
        assert!(normalizer.phonetic_similarity("커피", "라면") < 0.5);
    }

    // Edit distance base cases and simple substitutions.
    #[test]
    fn test_levenshtein_distance() {
        assert_eq!(Normalizer::levenshtein_distance("", ""), 0);
        assert_eq!(Normalizer::levenshtein_distance("a", ""), 1);
        assert_eq!(Normalizer::levenshtein_distance("", "a"), 1);
        assert_eq!(Normalizer::levenshtein_distance("abc", "abc"), 0);
        assert_eq!(Normalizer::levenshtein_distance("abc", "abd"), 1);
        assert_eq!(Normalizer::levenshtein_distance("abc", "def"), 3);
    }

    // "커피" contains an ㅓ medial, so at least one swap variant exists.
    #[test]
    fn test_vowel_length_variants() {
        let variants = Normalizer::generate_vowel_length_variants("커피");
        assert!(!variants.is_empty());
    }

    // Open syllables in the input produce jongseong-added variants.
    #[test]
    fn test_jongseong_variants() {
        let variants = Normalizer::generate_jongseong_variants("소프트웨어");
        assert!(!variants.is_empty());
    }

    // IT loanword spellings resolve to their standard forms.
    #[test]
    fn test_it_terms() {
        let normalizer = Normalizer::default().unwrap();

        assert_eq!(normalizer.normalize("쿠베르네테스"), "쿠버네티스");
        assert_eq!(normalizer.normalize("라이브러이"), "라이브러리");
        assert_eq!(normalizer.normalize("디렉터리"), "디렉토리");
    }

    // A customized config is accepted by the constructor.
    #[test]
    fn test_config() {
        let mut config = NormalizationConfig::default();
        config.vowel_length = false;
        config.min_confidence = 0.9;

        let normalizer = Normalizer::new(config);
        assert!(normalizer.is_ok());
    }
}