Skip to main content

mecab_ko_core/
normalizer.rs

1//! # Foreign Word Normalization Module
2//!
3//! 외래어 표기 정규화 모듈 - 국립국어원 외래어 표기법 기반
4//!
5//! ## Features
6//!
7//! - 외래어 변이형 정규화 (커피/코피, 쿠버네티스/쿠베르네테스)
8//! - 장단음 정규화
9//! - 자음/모음 변이 처리
10//! - 받침 변이 처리
11//! - 발음 유사성 기반 fuzzy matching
12//!
13//! ## Example
14//!
15//! ```rust
16//! use mecab_ko_core::normalizer::{Normalizer, NormalizationConfig};
17//!
18//! let normalizer = Normalizer::new(NormalizationConfig::default()).unwrap();
19//!
20//! // 표준형으로 정규화
21//! let normalized = normalizer.normalize("코피");
22//! assert_eq!(normalized, "커피");
23//!
24//! // 변이형 목록 조회
25//! let variants = normalizer.get_variants("커피");
26//! assert!(variants.contains(&"코피".to_string()));
27//!
28//! // 변이형 여부 확인
29//! assert!(normalizer.is_variant("커피", "코피"));
30//! ```
31
32use crate::Result;
33use mecab_ko_hangul::{compose, decompose, is_hangul_syllable};
34use std::collections::{HashMap, HashSet};
35use std::path::Path;
36use std::sync::Arc;
37
/// Category of a normalization rule.
///
/// Each variant names one kind of spelling variation that can occur when a
/// foreign word is transcribed into Hangul.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum RuleType {
    /// Vowel-length variation (e.g. 커피 ↔ 코피).
    VowelLength,
    /// Consonant variation (e.g. 쿠버네티스 ↔ 쿠베르네테스).
    ConsonantVariation,
    /// Final-consonant (jongseong) variation (e.g. 소프트웨어 ↔ 소프트웨아).
    JongseongVariation,
    /// Vowel variation (e.g. 케이크 ↔ 케익).
    VowelVariation,
    /// Phonetic similarity (e.g. 라이브러리 ↔ 라이브러이).
    PhoneticSimilarity,
}
52
/// A single normalization rule: rewrite `from` into `to` with a given confidence.
#[derive(Debug, Clone, PartialEq)]
pub struct NormalizationRule {
    /// Category this rule belongs to.
    pub rule_type: RuleType,
    /// Source pattern (may be empty, e.g. for "add a final consonant" rules).
    pub from: String,
    /// Target pattern (may be empty, e.g. for "drop a final consonant" rules).
    pub to: String,
    /// Confidence of the rule, documented as lying in 0.0 ~ 1.0.
    pub confidence: f32,
}
65
impl NormalizationRule {
    /// Creates a new rule from its four components.
    ///
    /// The values are stored as given; `confidence` is not range-checked.
    #[must_use]
    pub const fn new(rule_type: RuleType, from: String, to: String, confidence: f32) -> Self {
        Self {
            rule_type,
            from,
            to,
            confidence,
        }
    }
}
78
/// Normalization settings: one on/off switch per rule category plus a
/// confidence threshold.
#[derive(Debug, Clone)]
#[allow(clippy::struct_excessive_bools)]
pub struct NormalizationConfig {
    /// Enables vowel-length normalization.
    pub vowel_length: bool,
    /// Enables consonant-variation normalization.
    pub consonant_variation: bool,
    /// Enables final-consonant (jongseong) variation normalization.
    pub jongseong_variation: bool,
    /// Enables vowel-variation normalization.
    pub vowel_variation: bool,
    /// Enables phonetic-similarity based normalization.
    ///
    /// NOTE(review): no code in the visible part of this file reads this flag.
    pub phonetic_similarity: bool,
    /// Minimum confidence threshold.
    ///
    /// NOTE(review): no code in the visible part of this file reads this value.
    pub min_confidence: f32,
}
96
impl Default for NormalizationConfig {
    /// Returns a configuration with every rule category enabled and a
    /// minimum confidence of `0.7`.
    fn default() -> Self {
        Self {
            vowel_length: true,
            consonant_variation: true,
            jongseong_variation: true,
            vowel_variation: true,
            phonetic_similarity: true,
            min_confidence: 0.7,
        }
    }
}
109
110/// 외래어 정규화기
111pub struct Normalizer {
112    /// 설정
113    config: NormalizationConfig,
114    /// 표준형 → 변이형 맵
115    standard_to_variants: Arc<HashMap<String, HashSet<String>>>,
116    /// 변이형 → 표준형 맵
117    variant_to_standard: Arc<HashMap<String, String>>,
118}
119
120impl Normalizer {
121    /// 새 정규화기 생성
122    ///
123    /// # Arguments
124    ///
125    /// * `config` - 정규화 설정
126    ///
127    /// # Returns
128    ///
129    /// `Result<Self>` - 생성된 정규화기 또는 에러
130    ///
131    /// # Errors
132    ///
133    /// 데이터 로딩 실패 시 에러 반환
134    pub fn new(config: NormalizationConfig) -> Result<Self> {
135        let rules = Self::load_rules(&config);
136        let (standard_to_variants, variant_to_standard) = Self::build_variant_maps(&rules);
137
138        Ok(Self {
139            config,
140            standard_to_variants: Arc::new(standard_to_variants),
141            variant_to_standard: Arc::new(variant_to_standard),
142        })
143    }
144
145    /// 외부 데이터 파일로 정규화기 생성
146    ///
147    /// # Arguments
148    ///
149    /// * `config` - 정규화 설정
150    /// * `variant_csv_path` - 변이형 CSV 파일 경로
151    ///
152    /// # Returns
153    ///
154    /// `Result<Self>` - 생성된 정규화기 또는 에러
155    ///
156    /// # Errors
157    ///
158    /// 파일 로딩 또는 파싱 실패 시 에러 반환
159    pub fn with_data_file(config: NormalizationConfig, variant_csv_path: &Path) -> Result<Self> {
160        let rules = Self::load_rules(&config);
161        let mut variant_pairs = Self::builtin_variant_pairs();
162
163        // CSV 파일에서 추가 변이형 로드
164        if let Ok(external_pairs) = Self::load_variant_csv(variant_csv_path) {
165            variant_pairs.extend(external_pairs);
166        }
167
168        let (standard_to_variants, variant_to_standard) =
169            Self::build_variant_maps_with_pairs(&rules, &variant_pairs);
170
171        Ok(Self {
172            config,
173            standard_to_variants: Arc::new(standard_to_variants),
174            variant_to_standard: Arc::new(variant_to_standard),
175        })
176    }
177
178    /// CSV 파일에서 변이형 로드
179    fn load_variant_csv(path: &Path) -> Result<Vec<(String, String)>> {
180        use std::fs::File;
181        use std::io::{BufRead, BufReader};
182
183        let file = File::open(path)
184            .map_err(|e| crate::error::Error::Init(format!("Failed to open variant CSV: {e}")))?;
185
186        let reader = BufReader::new(file);
187        let mut pairs = Vec::new();
188
189        for (line_num, line) in reader.lines().enumerate() {
190            let line = line.map_err(|e| {
191                crate::error::Error::Init(format!("Failed to read line {line_num}: {e}"))
192            })?;
193
194            // 헤더 또는 빈 줄 스킵
195            if line_num == 0 || line.trim().is_empty() || line.starts_with('#') {
196                continue;
197            }
198
199            let parts: Vec<&str> = line.split(',').collect();
200            if parts.len() >= 2 {
201                let standard = parts[0].trim().to_string();
202                let variant = parts[1].trim().to_string();
203
204                // 표준형과 변이형이 다를 때만 추가
205                if standard != variant {
206                    pairs.push((standard, variant));
207                }
208            }
209        }
210
211        Ok(pairs)
212    }
213
214    /// 기본 설정으로 생성
215    ///
216    /// # Errors
217    ///
218    /// 데이터 로딩 실패 시 에러 반환
219    #[allow(clippy::should_implement_trait)]
220    pub fn default() -> Result<Self> {
221        Self::new(NormalizationConfig::default())
222    }
223
224    /// 외래어를 표준형으로 정규화
225    ///
226    /// # Arguments
227    ///
228    /// * `text` - 정규화할 텍스트
229    ///
230    /// # Returns
231    ///
232    /// 정규화된 텍스트
233    ///
234    /// # Example
235    ///
236    /// ```rust
237    /// use mecab_ko_core::normalizer::{Normalizer, NormalizationConfig};
238    ///
239    /// let normalizer = Normalizer::new(NormalizationConfig::default()).unwrap();
240    /// assert_eq!(normalizer.normalize("코피"), "커피");
241    /// assert_eq!(normalizer.normalize("소프트웨아"), "소프트웨어");
242    /// ```
243    #[must_use]
244    pub fn normalize(&self, text: &str) -> String {
245        // 직접 매핑 확인
246        if let Some(standard) = self.variant_to_standard.get(text) {
247            return standard.clone();
248        }
249
250        // 규칙 기반 정규화 시도
251        Self::apply_rules(text)
252    }
253
254    /// 표준형의 모든 변이형 조회
255    ///
256    /// # Arguments
257    ///
258    /// * `standard` - 표준형 단어
259    ///
260    /// # Returns
261    ///
262    /// 변이형 목록
263    ///
264    /// # Example
265    ///
266    /// ```rust
267    /// use mecab_ko_core::normalizer::{Normalizer, NormalizationConfig};
268    ///
269    /// let normalizer = Normalizer::new(NormalizationConfig::default()).unwrap();
270    /// let variants = normalizer.get_variants("커피");
271    /// assert!(variants.contains(&"코피".to_string()));
272    /// ```
273    #[must_use]
274    pub fn get_variants(&self, standard: &str) -> Vec<String> {
275        // 직접 매핑된 변이형
276        let mut variants = self
277            .standard_to_variants
278            .get(standard)
279            .map(|set| set.iter().cloned().collect::<Vec<_>>())
280            .unwrap_or_default();
281
282        // 규칙 기반 변이형 생성
283        let generated = self.generate_variants(standard);
284        variants.extend(generated);
285
286        variants.sort();
287        variants.dedup();
288        variants
289    }
290
291    /// 두 단어가 변이형 관계인지 확인
292    ///
293    /// # Arguments
294    ///
295    /// * `word1` - 첫 번째 단어
296    /// * `word2` - 두 번째 단어
297    ///
298    /// # Returns
299    ///
300    /// 변이형 관계이면 `true`, 아니면 `false`
301    ///
302    /// # Example
303    ///
304    /// ```rust
305    /// use mecab_ko_core::normalizer::{Normalizer, NormalizationConfig};
306    ///
307    /// let normalizer = Normalizer::new(NormalizationConfig::default()).unwrap();
308    /// assert!(normalizer.is_variant("커피", "코피"));
309    /// assert!(!normalizer.is_variant("커피", "라면"));
310    /// ```
311    #[must_use]
312    pub fn is_variant(&self, word1: &str, word2: &str) -> bool {
313        if word1 == word2 {
314            return true;
315        }
316
317        let norm1 = self.normalize(word1);
318        let norm2 = self.normalize(word2);
319        norm1 == norm2
320    }
321
322    /// 발음 유사도 계산 (0.0 ~ 1.0)
323    ///
324    /// # Arguments
325    ///
326    /// * `word1` - 첫 번째 단어
327    /// * `word2` - 두 번째 단어
328    ///
329    /// # Returns
330    ///
331    /// 발음 유사도 (0.0 ~ 1.0)
332    #[must_use]
333    pub fn phonetic_similarity(&self, word1: &str, word2: &str) -> f32 {
334        if word1 == word2 {
335            return 1.0;
336        }
337
338        let jamo1 = Self::to_phonetic_jamo(word1);
339        let jamo2 = Self::to_phonetic_jamo(word2);
340
341        Self::string_similarity(&jamo1, &jamo2)
342    }
343
344    // 내부 헬퍼 메서드들
345
346    /// 규칙 로딩 (내장 규칙 + 외부 파일)
347    fn load_rules(config: &NormalizationConfig) -> Vec<NormalizationRule> {
348        let mut rules = Vec::new();
349
350        // 장단음 규칙
351        if config.vowel_length {
352            rules.extend(Self::vowel_length_rules());
353        }
354
355        // 자음 변이 규칙
356        if config.consonant_variation {
357            rules.extend(Self::consonant_variation_rules());
358        }
359
360        // 받침 변이 규칙
361        if config.jongseong_variation {
362            rules.extend(Self::jongseong_variation_rules());
363        }
364
365        // 모음 변이 규칙
366        if config.vowel_variation {
367            rules.extend(Self::vowel_variation_rules());
368        }
369
370        rules
371    }
372
373    /// 장단음 규칙
374    fn vowel_length_rules() -> Vec<NormalizationRule> {
375        vec![
376            NormalizationRule::new(RuleType::VowelLength, "오".into(), "어".into(), 0.9),
377            NormalizationRule::new(RuleType::VowelLength, "어".into(), "오".into(), 0.9),
378            NormalizationRule::new(RuleType::VowelLength, "우".into(), "유".into(), 0.85),
379            NormalizationRule::new(RuleType::VowelLength, "유".into(), "우".into(), 0.85),
380        ]
381    }
382
383    /// 자음 변이 규칙
384    fn consonant_variation_rules() -> Vec<NormalizationRule> {
385        vec![
386            NormalizationRule::new(RuleType::ConsonantVariation, "ㅂ".into(), "ㅍ".into(), 0.9),
387            NormalizationRule::new(RuleType::ConsonantVariation, "ㅍ".into(), "ㅂ".into(), 0.9),
388            NormalizationRule::new(RuleType::ConsonantVariation, "ㄷ".into(), "ㅌ".into(), 0.9),
389            NormalizationRule::new(RuleType::ConsonantVariation, "ㅌ".into(), "ㄷ".into(), 0.9),
390            NormalizationRule::new(RuleType::ConsonantVariation, "ㄱ".into(), "ㅋ".into(), 0.9),
391            NormalizationRule::new(RuleType::ConsonantVariation, "ㅋ".into(), "ㄱ".into(), 0.9),
392            NormalizationRule::new(RuleType::ConsonantVariation, "ㅈ".into(), "ㅊ".into(), 0.9),
393            NormalizationRule::new(RuleType::ConsonantVariation, "ㅊ".into(), "ㅈ".into(), 0.9),
394            NormalizationRule::new(RuleType::ConsonantVariation, "ㅅ".into(), "ㅆ".into(), 0.85),
395            NormalizationRule::new(RuleType::ConsonantVariation, "ㅆ".into(), "ㅅ".into(), 0.85),
396        ]
397    }
398
399    /// 받침 변이 규칙
400    fn jongseong_variation_rules() -> Vec<NormalizationRule> {
401        vec![
402            NormalizationRule::new(
403                RuleType::JongseongVariation,
404                "ㄹ".into(),
405                String::new(),
406                0.85,
407            ),
408            NormalizationRule::new(
409                RuleType::JongseongVariation,
410                String::new(),
411                "ㄹ".into(),
412                0.85,
413            ),
414            NormalizationRule::new(RuleType::JongseongVariation, "ㅁ".into(), "ㅂ".into(), 0.8),
415            NormalizationRule::new(RuleType::JongseongVariation, "ㅂ".into(), "ㅁ".into(), 0.8),
416        ]
417    }
418
419    /// 모음 변이 규칙
420    fn vowel_variation_rules() -> Vec<NormalizationRule> {
421        vec![
422            NormalizationRule::new(RuleType::VowelVariation, "에이".into(), "에".into(), 0.9),
423            NormalizationRule::new(RuleType::VowelVariation, "에".into(), "에이".into(), 0.9),
424            NormalizationRule::new(RuleType::VowelVariation, "이".into(), "익".into(), 0.85),
425            NormalizationRule::new(RuleType::VowelVariation, "익".into(), "이".into(), 0.85),
426        ]
427    }
428
429    /// 변이형 맵 구축
430    fn build_variant_maps(
431        rules: &[NormalizationRule],
432    ) -> (HashMap<String, HashSet<String>>, HashMap<String, String>) {
433        let builtin_variants = Self::builtin_variant_pairs();
434        Self::build_variant_maps_with_pairs(rules, &builtin_variants)
435    }
436
437    /// 변이형 쌍으로 맵 구축
438    fn build_variant_maps_with_pairs(
439        _rules: &[NormalizationRule],
440        variant_pairs: &[(String, String)],
441    ) -> (HashMap<String, HashSet<String>>, HashMap<String, String>) {
442        let mut standard_to_variants = HashMap::new();
443        let mut variant_to_standard = HashMap::new();
444
445        for (standard, variant) in variant_pairs {
446            standard_to_variants
447                .entry(standard.clone())
448                .or_insert_with(HashSet::new)
449                .insert(variant.clone());
450
451            variant_to_standard.insert(variant.clone(), standard.clone());
452        }
453
454        (standard_to_variants, variant_to_standard)
455    }
456
457    /// 내장 변이형 쌍
458    fn builtin_variant_pairs() -> Vec<(String, String)> {
459        vec![
460            // IT 용어
461            ("커피".into(), "코피".into()),
462            ("쿠버네티스".into(), "쿠베르네테스".into()),
463            ("쿠버네티스".into(), "쿠베르네티즈".into()),
464            ("소프트웨어".into(), "소프트웨아".into()),
465            ("라이브러리".into(), "라이브러이".into()),
466            ("디렉토리".into(), "디렉터리".into()),
467            ("디렉터리".into(), "디렉토리".into()),
468            ("서버".into(), "서버".into()),
469            ("클라이언트".into(), "클라이언트".into()),
470            ("인터페이스".into(), "인터페이스".into()),
471            ("알고리즘".into(), "알고리듬".into()),
472            ("컴퓨터".into(), "컴퓨타".into()),
473            ("프로그램".into(), "프로그래밍".into()),
474            ("데이터베이스".into(), "데이타베이스".into()),
475            // 일반 외래어
476            ("케이크".into(), "케익".into()),
477            ("스테이크".into(), "스테익".into()),
478            ("메이크업".into(), "메이컵".into()),
479            ("샴푸".into(), "샴프".into()),
480            ("컵".into(), "컵".into()),
481            ("버스".into(), "버스".into()),
482            ("택시".into(), "택시".into()),
483            ("카메라".into(), "카메라".into()),
484            ("비디오".into(), "비데오".into()),
485            ("라디오".into(), "라지오".into()),
486        ]
487    }
488
489    /// 규칙 기반 정규화 적용
490    fn apply_rules(text: &str) -> String {
491        let chars: Vec<char> = text.chars().collect();
492        let mut result = String::with_capacity(text.len());
493
494        for &ch in &chars {
495            result.push(ch);
496        }
497
498        result
499    }
500
501    /// 규칙 기반 변이형 생성
502    fn generate_variants(&self, text: &str) -> Vec<String> {
503        let mut variants = HashSet::new();
504
505        // 장단음 변이형 생성
506        if self.config.vowel_length {
507            variants.extend(Self::generate_vowel_length_variants(text));
508        }
509
510        // 받침 변이형 생성
511        if self.config.jongseong_variation {
512            variants.extend(Self::generate_jongseong_variants(text));
513        }
514
515        variants.into_iter().collect()
516    }
517
518    /// 장단음 변이형 생성
519    fn generate_vowel_length_variants(text: &str) -> Vec<String> {
520        let mut variants = Vec::new();
521
522        for i in 0..text.chars().count() {
523            let chars: Vec<char> = text.chars().collect();
524            let ch = chars[i];
525
526            if !is_hangul_syllable(ch) {
527                continue;
528            }
529
530            if let Some((cho, jung, jong)) = decompose(ch) {
531                // 'ㅓ' ↔ 'ㅗ' 변이
532                if jung == 'ㅓ' {
533                    if let Some(variant_char) = compose(cho, 'ㅗ', jong) {
534                        let mut variant: Vec<char> = chars.clone();
535                        variant[i] = variant_char;
536                        variants.push(variant.into_iter().collect());
537                    }
538                } else if jung == 'ㅗ' {
539                    if let Some(variant_char) = compose(cho, 'ㅓ', jong) {
540                        let mut variant: Vec<char> = chars.clone();
541                        variant[i] = variant_char;
542                        variants.push(variant.into_iter().collect());
543                    }
544                }
545            }
546        }
547
548        variants
549    }
550
551    /// 받침 변이형 생성
552    fn generate_jongseong_variants(text: &str) -> Vec<String> {
553        let mut variants = Vec::new();
554
555        for i in 0..text.chars().count() {
556            let chars: Vec<char> = text.chars().collect();
557            let ch = chars[i];
558
559            if !is_hangul_syllable(ch) {
560                continue;
561            }
562
563            if let Some((cho, jung, jong)) = decompose(ch) {
564                // 받침 추가/제거
565                if jong.is_none() {
566                    // 받침 추가 (ㄹ, ㅁ, ㅂ)
567                    for &new_jong in &['ㄹ', 'ㅁ', 'ㅂ'] {
568                        if let Some(variant_char) = compose(cho, jung, Some(new_jong)) {
569                            let mut variant: Vec<char> = chars.clone();
570                            variant[i] = variant_char;
571                            variants.push(variant.into_iter().collect());
572                        }
573                    }
574                } else {
575                    // 받침 제거
576                    if let Some(variant_char) = compose(cho, jung, None) {
577                        let mut variant: Vec<char> = chars.clone();
578                        variant[i] = variant_char;
579                        variants.push(variant.into_iter().collect());
580                    }
581                }
582            }
583        }
584
585        variants
586    }
587
588    /// 발음 기반 자모 변환 (유사도 계산용)
589    fn to_phonetic_jamo(text: &str) -> String {
590        let mut result = String::new();
591
592        for ch in text.chars() {
593            if let Some((cho, jung, jong)) = decompose(ch) {
594                result.push(cho);
595                result.push(jung);
596                if let Some(j) = jong {
597                    result.push(j);
598                }
599            } else {
600                result.push(ch);
601            }
602        }
603
604        result
605    }
606
607    /// 문자열 유사도 계산 (Levenshtein distance 기반)
608    fn string_similarity(s1: &str, s2: &str) -> f32 {
609        let len1 = s1.chars().count();
610        let len2 = s2.chars().count();
611
612        if len1 == 0 && len2 == 0 {
613            return 1.0;
614        }
615
616        let max_len = len1.max(len2);
617        let distance = Self::levenshtein_distance(s1, s2);
618
619        #[allow(clippy::cast_precision_loss)]
620        let result = 1.0 - (distance as f32 / max_len as f32);
621        result
622    }
623
624    /// Levenshtein distance 계산
625    fn levenshtein_distance(s1: &str, s2: &str) -> usize {
626        let chars1: Vec<char> = s1.chars().collect();
627        let chars2: Vec<char> = s2.chars().collect();
628        let len1 = chars1.len();
629        let len2 = chars2.len();
630
631        if len1 == 0 {
632            return len2;
633        }
634        if len2 == 0 {
635            return len1;
636        }
637
638        let mut matrix = vec![vec![0; len2 + 1]; len1 + 1];
639
640        for (i, row) in matrix.iter_mut().enumerate().take(len1 + 1) {
641            row[0] = i;
642        }
643        for (j, val) in matrix[0].iter_mut().enumerate().take(len2 + 1) {
644            *val = j;
645        }
646
647        for i in 1..=len1 {
648            for j in 1..=len2 {
649                let cost = usize::from(chars1[i - 1] != chars2[j - 1]);
650                matrix[i][j] = (matrix[i - 1][j] + 1)
651                    .min(matrix[i][j - 1] + 1)
652                    .min(matrix[i - 1][j - 1] + cost);
653            }
654        }
655
656        matrix[len1][len2]
657    }
658}
659
#[cfg(test)]
#[allow(
    clippy::unwrap_used,
    clippy::float_cmp,
    clippy::field_reassign_with_default
)]
mod tests {
    use super::*;

    /// The default constructor succeeds.
    #[test]
    fn test_normalizer_creation() {
        let result = Normalizer::default();
        assert!(result.is_ok());
    }

    /// Built-in variants map to their standard form; a standard form maps to itself.
    #[test]
    fn test_normalize_builtin() {
        let normalizer = Normalizer::default().unwrap();

        assert_eq!(normalizer.normalize("코피"), "커피");
        assert_eq!(normalizer.normalize("커피"), "커피");
        assert_eq!(normalizer.normalize("소프트웨아"), "소프트웨어");
        assert_eq!(normalizer.normalize("케익"), "케이크");
    }

    /// Registered variants are reported for their standard form.
    #[test]
    fn test_get_variants() {
        let normalizer = Normalizer::default().unwrap();

        let variants = normalizer.get_variants("커피");
        assert!(variants.contains(&"코피".to_string()));

        let variants = normalizer.get_variants("케이크");
        assert!(variants.contains(&"케익".to_string()));
    }

    /// The variant relation holds in both argument orders and for equal words,
    /// and does not hold for unrelated words.
    #[test]
    fn test_is_variant() {
        let normalizer = Normalizer::default().unwrap();

        assert!(normalizer.is_variant("커피", "코피"));
        assert!(normalizer.is_variant("코피", "커피"));
        assert!(normalizer.is_variant("커피", "커피"));
        assert!(!normalizer.is_variant("커피", "라면"));
    }

    /// Identical words score 1.0, near spellings score high, unrelated words score low.
    #[test]
    fn test_phonetic_similarity() {
        let normalizer = Normalizer::default().unwrap();

        assert_eq!(normalizer.phonetic_similarity("커피", "커피"), 1.0);
        // "커피" vs "코피": assuming `decompose` yields no final consonant for
        // these syllables, each word expands to 4 jamo and exactly one differs
        // (ㅓ vs ㅗ), so the similarity is 1 - 1/4 = 0.75.
        assert!(normalizer.phonetic_similarity("커피", "코피") > 0.6);
        assert!(normalizer.phonetic_similarity("커피", "라면") < 0.5);
    }

    /// Edit distance on empty, equal, one-off and fully different ASCII strings.
    #[test]
    fn test_levenshtein_distance() {
        assert_eq!(Normalizer::levenshtein_distance("", ""), 0);
        assert_eq!(Normalizer::levenshtein_distance("a", ""), 1);
        assert_eq!(Normalizer::levenshtein_distance("", "a"), 1);
        assert_eq!(Normalizer::levenshtein_distance("abc", "abc"), 0);
        assert_eq!(Normalizer::levenshtein_distance("abc", "abd"), 1);
        assert_eq!(Normalizer::levenshtein_distance("abc", "def"), 3);
    }

    #[test]
    fn test_vowel_length_variants() {
        // Vowel-length variant generation yields at least one variant.
        let variants = Normalizer::generate_vowel_length_variants("커피");
        assert!(!variants.is_empty());
    }

    #[test]
    fn test_jongseong_variants() {
        // Final-consonant variant generation yields at least one variant.
        let variants = Normalizer::generate_jongseong_variants("소프트웨어");
        assert!(!variants.is_empty());
    }

    #[test]
    fn test_it_terms() {
        let normalizer = Normalizer::default().unwrap();

        // IT terminology
        assert_eq!(normalizer.normalize("쿠베르네테스"), "쿠버네티스");
        assert_eq!(normalizer.normalize("라이브러이"), "라이브러리");
        assert_eq!(normalizer.normalize("디렉터리"), "디렉토리");
    }

    /// A customized configuration is accepted by the constructor.
    #[test]
    fn test_config() {
        let mut config = NormalizationConfig::default();
        config.vowel_length = false;
        config.min_confidence = 0.9;

        let normalizer = Normalizer::new(config);
        assert!(normalizer.is_ok());
    }
}