use crate::Result;
use mecab_ko_hangul::{compose, decompose, is_hangul_syllable};
use std::collections::{HashMap, HashSet};
use std::path::Path;
use std::sync::Arc;
/// Category of a phonetic normalization rule.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum RuleType {
    /// Vowel substitutions grouped as length-related variation (see `vowel_length_rules`).
    VowelLength,
    /// Consonant swaps, e.g. plain vs. aspirated/tense (see `consonant_variation_rules`).
    ConsonantVariation,
    /// Final consonant (jongseong) added, dropped, or swapped.
    JongseongVariation,
    /// Medial-vowel spelling variation (see `vowel_variation_rules`).
    VowelVariation,
    /// Rules based on overall phonetic similarity.
    /// NOTE(review): no rule table currently produces this variant — confirm intent.
    PhoneticSimilarity,
}
/// A single substitution rule: replace `from` with `to`.
#[derive(Debug, Clone, PartialEq)]
pub struct NormalizationRule {
    /// Which rule family this substitution belongs to.
    pub rule_type: RuleType,
    /// Source fragment; empty in jongseong-insertion rules.
    pub from: String,
    /// Replacement fragment; empty in jongseong-deletion rules.
    pub to: String,
    /// Reliability score; built-in rules use values between 0.8 and 0.9.
    pub confidence: f32,
}
impl NormalizationRule {
    /// Builds a substitution rule mapping `from` to `to`, tagged with its
    /// rule family and a confidence score (higher = more reliable).
    #[must_use]
    pub const fn new(rule_type: RuleType, from: String, to: String, confidence: f32) -> Self {
        Self { rule_type, from, to, confidence }
    }
}
/// Per-rule-family toggles plus a confidence floor.
#[derive(Debug, Clone)]
#[allow(clippy::struct_excessive_bools)]
pub struct NormalizationConfig {
    /// Enable vowel-length rules and vowel-length variant generation.
    pub vowel_length: bool,
    /// Enable consonant-variation rules.
    pub consonant_variation: bool,
    /// Enable jongseong rules and jongseong variant generation.
    pub jongseong_variation: bool,
    /// Enable vowel-variation rules.
    pub vowel_variation: bool,
    /// Enable phonetic-similarity handling.
    /// NOTE(review): not consulted anywhere in this file — confirm intent.
    pub phonetic_similarity: bool,
    /// Minimum confidence a rule must have to be applied.
    /// NOTE(review): not consulted by `load_rules` — confirm intent.
    pub min_confidence: f32,
}
impl Default for NormalizationConfig {
    /// All rule families enabled, with a 0.7 confidence floor.
    fn default() -> Self {
        Self {
            vowel_length: true,
            consonant_variation: true,
            jongseong_variation: true,
            vowel_variation: true,
            phonetic_similarity: true,
            min_confidence: 0.7,
        }
    }
}
/// Korean spelling-variant normalizer backed by bidirectional lookup maps.
pub struct Normalizer {
    /// Active rule-family configuration.
    config: NormalizationConfig,
    /// standard form -> set of known variant spellings.
    standard_to_variants: Arc<HashMap<String, HashSet<String>>>,
    /// variant spelling -> its standard form.
    variant_to_standard: Arc<HashMap<String, String>>,
}
impl Normalizer {
/// Creates a normalizer from `config`, using only the built-in variant table.
///
/// # Errors
/// Currently never fails; the `Result` return is kept for API stability.
pub fn new(config: NormalizationConfig) -> Result<Self> {
    let rules = Self::load_rules(&config);
    let (forward, reverse) = Self::build_variant_maps(&rules);
    Ok(Self {
        config,
        standard_to_variants: Arc::new(forward),
        variant_to_standard: Arc::new(reverse),
    })
}
/// Creates a normalizer whose variant table is the built-in pairs plus any
/// pairs read from `variant_csv_path`.
///
/// A CSV that fails to load is silently ignored (best effort) and only the
/// built-in pairs are used.
///
/// # Errors
/// Currently never fails; the `Result` return is kept for API stability.
pub fn with_data_file(config: NormalizationConfig, variant_csv_path: &Path) -> Result<Self> {
    let rules = Self::load_rules(&config);
    // Start from the built-in table; external rows are appended on top,
    // so later (external) entries win in the variant -> standard map.
    let mut pairs = Self::builtin_variant_pairs();
    if let Ok(extra) = Self::load_variant_csv(variant_csv_path) {
        pairs.extend(extra);
    }
    let (forward, reverse) = Self::build_variant_maps_with_pairs(&rules, &pairs);
    Ok(Self {
        config,
        standard_to_variants: Arc::new(forward),
        variant_to_standard: Arc::new(reverse),
    })
}
fn load_variant_csv(path: &Path) -> Result<Vec<(String, String)>> {
use std::fs::File;
use std::io::{BufRead, BufReader};
let file = File::open(path)
.map_err(|e| crate::error::Error::Init(format!("Failed to open variant CSV: {e}")))?;
let reader = BufReader::new(file);
let mut pairs = Vec::new();
for (line_num, line) in reader.lines().enumerate() {
let line = line.map_err(|e| {
crate::error::Error::Init(format!("Failed to read line {line_num}: {e}"))
})?;
if line_num == 0 || line.trim().is_empty() || line.starts_with('#') {
continue;
}
let parts: Vec<&str> = line.split(',').collect();
if parts.len() >= 2 {
let standard = parts[0].trim().to_string();
let variant = parts[1].trim().to_string();
if standard != variant {
pairs.push((standard, variant));
}
}
}
Ok(pairs)
}
/// Convenience constructor using [`NormalizationConfig::default`].
///
/// # Errors
/// Propagates any error from [`Normalizer::new`].
#[allow(clippy::should_implement_trait)]
pub fn default() -> Result<Self> {
    let config = NormalizationConfig::default();
    Self::new(config)
}
#[must_use]
pub fn normalize(&self, text: &str) -> String {
if let Some(standard) = self.variant_to_standard.get(text) {
return standard.clone();
}
Self::apply_rules(text)
}
#[must_use]
pub fn get_variants(&self, standard: &str) -> Vec<String> {
let mut variants = self
.standard_to_variants
.get(standard)
.map(|set| set.iter().cloned().collect::<Vec<_>>())
.unwrap_or_default();
let generated = self.generate_variants(standard);
variants.extend(generated);
variants.sort();
variants.dedup();
variants
}
#[must_use]
pub fn is_variant(&self, word1: &str, word2: &str) -> bool {
if word1 == word2 {
return true;
}
let norm1 = self.normalize(word1);
let norm2 = self.normalize(word2);
norm1 == norm2
}
#[must_use]
pub fn phonetic_similarity(&self, word1: &str, word2: &str) -> f32 {
if word1 == word2 {
return 1.0;
}
let jamo1 = Self::to_phonetic_jamo(word1);
let jamo2 = Self::to_phonetic_jamo(word2);
Self::string_similarity(&jamo1, &jamo2)
}
/// Collects the rule families enabled by `config` into a single list.
///
/// NOTE(review): `config.phonetic_similarity` and `config.min_confidence`
/// are not consulted here, matching the original behavior — confirm intent.
fn load_rules(config: &NormalizationConfig) -> Vec<NormalizationRule> {
    // Each rule family is gated by its config flag.
    let sections: [(bool, fn() -> Vec<NormalizationRule>); 4] = [
        (config.vowel_length, Self::vowel_length_rules),
        (config.consonant_variation, Self::consonant_variation_rules),
        (config.jongseong_variation, Self::jongseong_variation_rules),
        (config.vowel_variation, Self::vowel_variation_rules),
    ];
    sections
        .into_iter()
        .filter(|&(enabled, _)| enabled)
        .flat_map(|(_, rules)| rules())
        .collect()
}
/// Bidirectional vowel substitution pairs classified as length variation.
fn vowel_length_rules() -> Vec<NormalizationRule> {
    // (from, to, confidence) triples; each direction is listed explicitly.
    const PAIRS: [(&str, &str, f32); 4] = [
        ("오", "어", 0.9),
        ("어", "오", 0.9),
        ("우", "유", 0.85),
        ("유", "우", 0.85),
    ];
    PAIRS
        .iter()
        .map(|&(from, to, conf)| {
            NormalizationRule::new(RuleType::VowelLength, from.into(), to.into(), conf)
        })
        .collect()
}
/// Bidirectional consonant confusion pairs (plain vs. aspirated/tense).
fn consonant_variation_rules() -> Vec<NormalizationRule> {
    // (from, to, confidence) triples; each direction is listed explicitly.
    const PAIRS: [(&str, &str, f32); 10] = [
        ("ㅂ", "ㅍ", 0.9),
        ("ㅍ", "ㅂ", 0.9),
        ("ㄷ", "ㅌ", 0.9),
        ("ㅌ", "ㄷ", 0.9),
        ("ㄱ", "ㅋ", 0.9),
        ("ㅋ", "ㄱ", 0.9),
        ("ㅈ", "ㅊ", 0.9),
        ("ㅊ", "ㅈ", 0.9),
        ("ㅅ", "ㅆ", 0.85),
        ("ㅆ", "ㅅ", 0.85),
    ];
    PAIRS
        .iter()
        .map(|&(from, to, conf)| {
            NormalizationRule::new(RuleType::ConsonantVariation, from.into(), to.into(), conf)
        })
        .collect()
}
/// Final-consonant (jongseong) insertion/deletion and swap rules.
/// An empty string means "no jongseong".
fn jongseong_variation_rules() -> Vec<NormalizationRule> {
    // (from, to, confidence) triples; "" encodes absence of a jongseong.
    const PAIRS: [(&str, &str, f32); 4] = [
        ("ㄹ", "", 0.85),
        ("", "ㄹ", 0.85),
        ("ㅁ", "ㅂ", 0.8),
        ("ㅂ", "ㅁ", 0.8),
    ];
    PAIRS
        .iter()
        .map(|&(from, to, conf)| {
            NormalizationRule::new(RuleType::JongseongVariation, from.into(), to.into(), conf)
        })
        .collect()
}
/// Medial-vowel spelling variation pairs (e.g. 에이 vs. 에 in loanwords).
fn vowel_variation_rules() -> Vec<NormalizationRule> {
    // (from, to, confidence) triples; each direction is listed explicitly.
    const PAIRS: [(&str, &str, f32); 4] = [
        ("에이", "에", 0.9),
        ("에", "에이", 0.9),
        ("이", "익", 0.85),
        ("익", "이", 0.85),
    ];
    PAIRS
        .iter()
        .map(|&(from, to, conf)| {
            NormalizationRule::new(RuleType::VowelVariation, from.into(), to.into(), conf)
        })
        .collect()
}
/// Builds both lookup maps from the built-in variant table.
fn build_variant_maps(
    rules: &[NormalizationRule],
) -> (HashMap<String, HashSet<String>>, HashMap<String, String>) {
    // Delegate to the pair-based builder, seeded with the built-in pairs.
    Self::build_variant_maps_with_pairs(rules, &Self::builtin_variant_pairs())
}
/// Builds the forward (standard -> variants) and reverse (variant -> standard)
/// lookup maps from explicit `(standard, variant)` pairs.
///
/// `_rules` is currently unused: rule-based variants are generated lazily by
/// `generate_variants` rather than materialized here.
///
/// When the same variant maps to several standards, the last pair wins in the
/// reverse map (unchanged behavior).
fn build_variant_maps_with_pairs(
    _rules: &[NormalizationRule],
    variant_pairs: &[(String, String)],
) -> (HashMap<String, HashSet<String>>, HashMap<String, String>) {
    let mut standard_to_variants: HashMap<String, HashSet<String>> = HashMap::new();
    let mut variant_to_standard = HashMap::new();
    for (standard, variant) in variant_pairs {
        // `or_default` is the idiomatic (clippy-clean) form of
        // `or_insert_with(HashSet::new)`.
        standard_to_variants
            .entry(standard.clone())
            .or_default()
            .insert(variant.clone());
        variant_to_standard.insert(variant.clone(), standard.clone());
    }
    (standard_to_variants, variant_to_standard)
}
/// Built-in `(standard, variant)` spelling pairs, mostly loanwords.
///
/// Fix: identity pairs (standard == variant, e.g. "서버"/"서버") were
/// removed — they carried no information, polluted both lookup maps, and
/// violated the invariant that `load_variant_csv` explicitly enforces
/// (`standard != variant`).
fn builtin_variant_pairs() -> Vec<(String, String)> {
    vec![
        ("커피".into(), "코피".into()),
        ("쿠버네티스".into(), "쿠베르네테스".into()),
        ("쿠버네티스".into(), "쿠베르네티즈".into()),
        ("소프트웨어".into(), "소프트웨아".into()),
        ("라이브러리".into(), "라이브러이".into()),
        // Both directions are listed so either spelling normalizes to the other.
        ("디렉토리".into(), "디렉터리".into()),
        ("디렉터리".into(), "디렉토리".into()),
        ("알고리즘".into(), "알고리듬".into()),
        ("컴퓨터".into(), "컴퓨타".into()),
        // NOTE(review): this pairs two distinct words (program vs. programming);
        // kept to preserve existing behavior — confirm intent.
        ("프로그램".into(), "프로그래밍".into()),
        ("데이터베이스".into(), "데이타베이스".into()),
        ("케이크".into(), "케익".into()),
        ("스테이크".into(), "스테익".into()),
        ("메이크업".into(), "메이컵".into()),
        ("샴푸".into(), "샴프".into()),
        ("비디오".into(), "비데오".into()),
        ("라디오".into(), "라지오".into()),
    ]
}
/// Applies character-level normalization rules to `text`.
///
/// Rule application is not implemented yet (note the function receives no
/// rules), so this is an identity transform. The original copied the string
/// char by char into a new buffer, which is just an obfuscated clone.
fn apply_rules(text: &str) -> String {
    text.to_owned()
}
/// Generates rule-based variant spellings of `text`, gated by the config
/// flags; results are deduplicated across generators.
fn generate_variants(&self, text: &str) -> Vec<String> {
    // Accumulate into a set so overlapping generators cannot duplicate.
    let mut unique = HashSet::new();
    if self.config.vowel_length {
        for candidate in Self::generate_vowel_length_variants(text) {
            unique.insert(candidate);
        }
    }
    if self.config.jongseong_variation {
        for candidate in Self::generate_jongseong_variants(text) {
            unique.insert(candidate);
        }
    }
    unique.into_iter().collect()
}
/// Generates spelling variants by swapping the medial vowel ㅓ <-> ㅗ in one
/// Hangul syllable at a time.
///
/// Fix: the original re-collected `text.chars()` into a fresh `Vec<char>` on
/// every loop iteration (accidentally quadratic); the char vector is now
/// built once and only cloned when a variant is actually produced.
fn generate_vowel_length_variants(text: &str) -> Vec<String> {
    let chars: Vec<char> = text.chars().collect();
    let mut variants = Vec::new();
    for (i, &ch) in chars.iter().enumerate() {
        if !is_hangul_syllable(ch) {
            continue;
        }
        if let Some((cho, jung, jong)) = decompose(ch) {
            // Only the ㅓ/ㅗ confusion pair is modeled here.
            let swapped = match jung {
                'ㅓ' => 'ㅗ',
                'ㅗ' => 'ㅓ',
                _ => continue,
            };
            if let Some(variant_char) = compose(cho, swapped, jong) {
                let mut variant = chars.clone();
                variant[i] = variant_char;
                variants.push(variant.into_iter().collect());
            }
        }
    }
    variants
}
/// Generates variants by attaching a common final consonant (ㄹ/ㅁ/ㅂ) to
/// open syllables, or dropping the final consonant from closed ones — one
/// syllable position at a time.
///
/// Fixes: the original re-collected `text.chars()` into a fresh `Vec<char>`
/// on every loop iteration (accidentally quadratic), and used a collapsible
/// `else { if .. }`.
fn generate_jongseong_variants(text: &str) -> Vec<String> {
    let chars: Vec<char> = text.chars().collect();
    let mut variants = Vec::new();
    for (i, &ch) in chars.iter().enumerate() {
        if !is_hangul_syllable(ch) {
            continue;
        }
        if let Some((cho, jung, jong)) = decompose(ch) {
            if jong.is_none() {
                // Open syllable: try each candidate jongseong in turn.
                for &new_jong in &['ㄹ', 'ㅁ', 'ㅂ'] {
                    if let Some(variant_char) = compose(cho, jung, Some(new_jong)) {
                        let mut variant = chars.clone();
                        variant[i] = variant_char;
                        variants.push(variant.into_iter().collect());
                    }
                }
            } else if let Some(variant_char) = compose(cho, jung, None) {
                // Closed syllable: the only modeled variant drops the final.
                let mut variant = chars.clone();
                variant[i] = variant_char;
                variants.push(variant.into_iter().collect());
            }
        }
    }
    variants
}
/// Flattens `text` into a jamo sequence (choseong, jungseong, optional
/// jongseong per syllable) for phonetic comparison; characters that do not
/// decompose (non-Hangul) are passed through unchanged.
fn to_phonetic_jamo(text: &str) -> String {
    text.chars().fold(String::new(), |mut acc, ch| {
        match decompose(ch) {
            Some((cho, jung, jong)) => {
                acc.push(cho);
                acc.push(jung);
                // `extend` on an Option<char> appends the jongseong if present.
                acc.extend(jong);
            }
            None => acc.push(ch),
        }
        acc
    })
}
/// Normalized similarity in [0, 1]: 1 minus the Levenshtein distance divided
/// by the longer length. Two empty strings are defined as identical (1.0).
fn string_similarity(s1: &str, s2: &str) -> f32 {
    let max_len = s1.chars().count().max(s2.chars().count());
    if max_len == 0 {
        // Both strings empty.
        return 1.0;
    }
    let distance = Self::levenshtein_distance(s1, s2);
    #[allow(clippy::cast_precision_loss)]
    let similarity = 1.0 - (distance as f32 / max_len as f32);
    similarity
}
/// Computes the Levenshtein edit distance between `s1` and `s2`, counted in
/// Unicode scalar values (so multi-byte Hangul counts one per char).
///
/// Improvement: uses the classic two-row dynamic program, reducing memory
/// from the original full O(len1 * len2) matrix to O(len2) while producing
/// identical results.
fn levenshtein_distance(s1: &str, s2: &str) -> usize {
    let chars1: Vec<char> = s1.chars().collect();
    let chars2: Vec<char> = s2.chars().collect();
    if chars1.is_empty() {
        return chars2.len();
    }
    if chars2.is_empty() {
        return chars1.len();
    }
    // prev[j] = distance between chars1[..i] and chars2[..j].
    let mut prev: Vec<usize> = (0..=chars2.len()).collect();
    let mut curr = vec![0; chars2.len() + 1];
    for (i, &c1) in chars1.iter().enumerate() {
        curr[0] = i + 1;
        for (j, &c2) in chars2.iter().enumerate() {
            let cost = usize::from(c1 != c2);
            curr[j + 1] = (prev[j + 1] + 1) // deletion
                .min(curr[j] + 1) // insertion
                .min(prev[j] + cost); // substitution
        }
        std::mem::swap(&mut prev, &mut curr);
    }
    prev[chars2.len()]
}
}
#[cfg(test)]
#[allow(
    clippy::unwrap_used,
    clippy::float_cmp,
    clippy::field_reassign_with_default
)]
mod tests {
    use super::*;
    // Construction with the default config must succeed.
    #[test]
    fn test_normalizer_creation() {
        let result = Normalizer::default();
        assert!(result.is_ok());
    }
    // Known variants map to their standard spelling; standard or unknown
    // input passes through unchanged.
    #[test]
    fn test_normalize_builtin() {
        let normalizer = Normalizer::default().unwrap();
        assert_eq!(normalizer.normalize("코피"), "커피");
        assert_eq!(normalizer.normalize("커피"), "커피");
        assert_eq!(normalizer.normalize("소프트웨아"), "소프트웨어");
        assert_eq!(normalizer.normalize("케익"), "케이크");
    }
    // The curated variant table is reachable through get_variants.
    #[test]
    fn test_get_variants() {
        let normalizer = Normalizer::default().unwrap();
        let variants = normalizer.get_variants("커피");
        assert!(variants.contains(&"코피".to_string()));
        let variants = normalizer.get_variants("케이크");
        assert!(variants.contains(&"케익".to_string()));
    }
    // is_variant is symmetric, reflexive, and rejects unrelated words.
    #[test]
    fn test_is_variant() {
        let normalizer = Normalizer::default().unwrap();
        assert!(normalizer.is_variant("커피", "코피"));
        assert!(normalizer.is_variant("코피", "커피"));
        assert!(normalizer.is_variant("커피", "커피"));
        assert!(!normalizer.is_variant("커피", "라면"));
    }
    // Identical words score 1.0; close words score higher than unrelated ones.
    #[test]
    fn test_phonetic_similarity() {
        let normalizer = Normalizer::default().unwrap();
        assert_eq!(normalizer.phonetic_similarity("커피", "커피"), 1.0);
        assert!(normalizer.phonetic_similarity("커피", "코피") > 0.6);
        assert!(normalizer.phonetic_similarity("커피", "라면") < 0.5);
    }
    // Spot checks covering empty strings, equality, and substitutions.
    #[test]
    fn test_levenshtein_distance() {
        assert_eq!(Normalizer::levenshtein_distance("", ""), 0);
        assert_eq!(Normalizer::levenshtein_distance("a", ""), 1);
        assert_eq!(Normalizer::levenshtein_distance("", "a"), 1);
        assert_eq!(Normalizer::levenshtein_distance("abc", "abc"), 0);
        assert_eq!(Normalizer::levenshtein_distance("abc", "abd"), 1);
        assert_eq!(Normalizer::levenshtein_distance("abc", "def"), 3);
    }
    // "커피" contains ㅓ, so at least one ㅓ<->ㅗ variant must be generated.
    #[test]
    fn test_vowel_length_variants() {
        let variants = Normalizer::generate_vowel_length_variants("커피");
        assert!(!variants.is_empty());
    }
    // Open syllables must yield jongseong-insertion variants.
    #[test]
    fn test_jongseong_variants() {
        let variants = Normalizer::generate_jongseong_variants("소프트웨어");
        assert!(!variants.is_empty());
    }
    // IT loanword variants normalize to their standard spellings.
    #[test]
    fn test_it_terms() {
        let normalizer = Normalizer::default().unwrap();
        assert_eq!(normalizer.normalize("쿠베르네테스"), "쿠버네티스");
        assert_eq!(normalizer.normalize("라이브러이"), "라이브러리");
        assert_eq!(normalizer.normalize("디렉터리"), "디렉토리");
    }
    // A customized config is accepted by the constructor.
    #[test]
    fn test_config() {
        let mut config = NormalizationConfig::default();
        config.vowel_length = false;
        config.min_confidence = 0.9;
        let normalizer = Normalizer::new(config);
        assert!(normalizer.is_ok());
    }
}