use crate::error::G2pError;
use crate::phonemizer::{Phonemizer, ProsodyInfo};
/// First code point of the precomposed Hangul Syllables block (U+AC00).
const HANGUL_START: u32 = 0xAC00;
/// Last code point of the precomposed Hangul Syllables block (U+D7A3).
const HANGUL_END: u32 = 0xD7A3;

/// Number of initial-consonant slots in a precomposed syllable.
const N_INITIALS: usize = 19;
/// Number of medial-vowel slots in a precomposed syllable.
const N_MEDIALS: usize = 21;
/// Number of final-consonant slots, counting index 0 = "no final".
const N_FINALS: usize = 28;

// Private Use Area code points used as single-`char` phoneme tokens.
// Which jamo each one stands for is fixed by its position in the tables below.

/// Token for initial ㅍ.
const PUA_PH: char = '\u{E020}';
/// Token for initial ㅌ.
const PUA_TH: char = '\u{E021}';
/// Token for initial ㅋ.
const PUA_KH: char = '\u{E022}';
/// Token for initial ㅈ.
const PUA_TC: char = '\u{E023}';
/// Token for initial ㅊ.
const PUA_TCH: char = '\u{E024}';
/// Token for initial ㅃ.
const PUA_PP: char = '\u{E04B}';
/// Token for initial ㄸ.
const PUA_TT: char = '\u{E04C}';
/// Token for initial ㄲ.
const PUA_KK: char = '\u{E04D}';
/// Token for initial ㅆ.
const PUA_SS: char = '\u{E04E}';
/// Token for initial ㅉ.
const PUA_TTCH: char = '\u{E04F}';
/// Token for a syllable-final unreleased k-type stop.
const PUA_K_UNREL: char = '\u{E050}';
/// Token for a syllable-final unreleased t-type stop.
const PUA_T_UNREL: char = '\u{E051}';
/// Token for a syllable-final unreleased p-type stop.
const PUA_P_UNREL: char = '\u{E052}';

/// `ɾ` (U+027E), used for initial ㄹ.
const IPA_FLAP: char = '\u{027E}';
/// `ŋ` (U+014B), used for final ㅇ.
const IPA_ENG: char = '\u{014B}';
/// `ɛ` (U+025B).
const IPA_OPEN_E: char = '\u{025B}';
/// `ʌ` (U+028C).
const IPA_OPEN_MID_BACK: char = '\u{028C}';
/// `ɯ` (U+026F).
const IPA_CLOSE_BACK_UNR: char = '\u{026F}';
/// `ɰ` (U+0270).
const IPA_VELAR_APPROX: char = '\u{0270}';

/// Phoneme emitted for each initial index; `None` means nothing is emitted.
const INITIAL_TABLE: [Option<char>; N_INITIALS] = [
    Some('k'),      //  0 ㄱ
    Some(PUA_KK),   //  1 ㄲ
    Some('n'),      //  2 ㄴ
    Some('t'),      //  3 ㄷ
    Some(PUA_TT),   //  4 ㄸ
    Some(IPA_FLAP), //  5 ㄹ
    Some('m'),      //  6 ㅁ
    Some('p'),      //  7 ㅂ
    Some(PUA_PP),   //  8 ㅃ
    Some('s'),      //  9 ㅅ
    Some(PUA_SS),   // 10 ㅆ
    None,           // 11 ㅇ (silent as an initial)
    Some(PUA_TC),   // 12 ㅈ
    Some(PUA_TTCH), // 13 ㅉ
    Some(PUA_TCH),  // 14 ㅊ
    Some(PUA_KH),   // 15 ㅋ
    Some(PUA_TH),   // 16 ㅌ
    Some(PUA_PH),   // 17 ㅍ
    Some('h'),      // 18 ㅎ
];

/// Phonemes emitted for each medial index: a first phoneme plus an optional
/// second one (glide + vowel for the two-part medials).
const MEDIAL_TABLE: [(char, Option<char>); N_MEDIALS] = [
    ('a', None),                        //  0 ㅏ
    (IPA_OPEN_E, None),                 //  1 ㅐ
    ('j', Some('a')),                   //  2 ㅑ
    ('j', Some(IPA_OPEN_E)),            //  3 ㅒ
    (IPA_OPEN_MID_BACK, None),          //  4 ㅓ
    ('e', None),                        //  5 ㅔ
    ('j', Some(IPA_OPEN_MID_BACK)),     //  6 ㅕ
    ('j', Some('e')),                   //  7 ㅖ
    ('o', None),                        //  8 ㅗ
    ('w', Some('a')),                   //  9 ㅘ
    ('w', Some(IPA_OPEN_E)),            // 10 ㅙ
    ('w', Some('e')),                   // 11 ㅚ
    ('j', Some('o')),                   // 12 ㅛ
    ('u', None),                        // 13 ㅜ
    ('w', Some(IPA_OPEN_MID_BACK)),     // 14 ㅝ
    ('w', Some('e')),                   // 15 ㅞ
    ('w', Some('i')),                   // 16 ㅟ
    ('j', Some('u')),                   // 17 ㅠ
    (IPA_CLOSE_BACK_UNR, None),         // 18 ㅡ
    (IPA_VELAR_APPROX, Some('i')),      // 19 ㅢ
    ('i', None),                        // 20 ㅣ
];
/// One row of `FINAL_TABLE`: how a final consonant is pronounced and how it
/// behaves when the next syllable starts with the silent initial.
struct FinalEntry {
    /// Phoneme emitted when the final stays in its own syllable
    /// (`None` when nothing is emitted).
    ph: Option<char>,
    /// Index into `INITIAL_TABLE` that replaces the following syllable's
    /// initial during liaison; a negative value means "no liaison".
    liaison_initial: i32,
    /// Final index left on this syllable after liaison (0 = no final).
    residual_final: usize,
}
/// Per-final pronunciation and liaison data, indexed by the final index
/// produced by `decompose` (0 = no final).
///
/// For compound finals the first jamo stays behind (`residual_final`) and the
/// second one moves into the next syllable (`liaison_initial`).
// NOTE(review): index 14 (ㄿ) emits 'l' and indices 15 (ㅀ) / 27 (ㅎ) never
// take part in liaison. Kept exactly as found; confirm this is intentional.
#[rustfmt::skip]
const FINAL_TABLE: [FinalEntry; N_FINALS] = [
    FinalEntry { ph: None,              liaison_initial: -1, residual_final: 0 },  //  0 (none)
    FinalEntry { ph: Some(PUA_K_UNREL), liaison_initial: 0,  residual_final: 0 },  //  1 ㄱ
    FinalEntry { ph: Some(PUA_K_UNREL), liaison_initial: 1,  residual_final: 0 },  //  2 ㄲ
    FinalEntry { ph: Some(PUA_K_UNREL), liaison_initial: 9,  residual_final: 1 },  //  3 ㄳ
    FinalEntry { ph: Some('n'),         liaison_initial: -1, residual_final: 0 },  //  4 ㄴ
    FinalEntry { ph: Some('n'),         liaison_initial: 12, residual_final: 4 },  //  5 ㄵ
    FinalEntry { ph: Some('n'),         liaison_initial: -1, residual_final: 0 },  //  6 ㄶ
    FinalEntry { ph: Some(PUA_T_UNREL), liaison_initial: 3,  residual_final: 0 },  //  7 ㄷ
    FinalEntry { ph: Some('l'),         liaison_initial: 5,  residual_final: 0 },  //  8 ㄹ
    FinalEntry { ph: Some(PUA_K_UNREL), liaison_initial: 0,  residual_final: 8 },  //  9 ㄺ
    FinalEntry { ph: Some('m'),         liaison_initial: 6,  residual_final: 8 },  // 10 ㄻ
    FinalEntry { ph: Some('l'),         liaison_initial: 7,  residual_final: 8 },  // 11 ㄼ
    FinalEntry { ph: Some('l'),         liaison_initial: 9,  residual_final: 8 },  // 12 ㄽ
    FinalEntry { ph: Some('l'),         liaison_initial: 16, residual_final: 8 },  // 13 ㄾ
    FinalEntry { ph: Some('l'),         liaison_initial: 17, residual_final: 8 },  // 14 ㄿ
    FinalEntry { ph: Some('l'),         liaison_initial: -1, residual_final: 0 },  // 15 ㅀ
    FinalEntry { ph: Some('m'),         liaison_initial: -1, residual_final: 0 },  // 16 ㅁ
    FinalEntry { ph: Some(PUA_P_UNREL), liaison_initial: 7,  residual_final: 0 },  // 17 ㅂ
    FinalEntry { ph: Some(PUA_P_UNREL), liaison_initial: 9,  residual_final: 17 }, // 18 ㅄ
    FinalEntry { ph: Some(PUA_T_UNREL), liaison_initial: 9,  residual_final: 0 },  // 19 ㅅ
    FinalEntry { ph: Some(PUA_T_UNREL), liaison_initial: 10, residual_final: 0 },  // 20 ㅆ
    FinalEntry { ph: Some(IPA_ENG),     liaison_initial: -1, residual_final: 0 },  // 21 ㅇ
    FinalEntry { ph: Some(PUA_T_UNREL), liaison_initial: 12, residual_final: 0 },  // 22 ㅈ
    FinalEntry { ph: Some(PUA_T_UNREL), liaison_initial: 14, residual_final: 0 },  // 23 ㅊ
    FinalEntry { ph: Some(PUA_K_UNREL), liaison_initial: 15, residual_final: 0 },  // 24 ㅋ
    FinalEntry { ph: Some(PUA_T_UNREL), liaison_initial: 16, residual_final: 0 },  // 25 ㅌ
    FinalEntry { ph: Some(PUA_P_UNREL), liaison_initial: 17, residual_final: 0 },  // 26 ㅍ
    FinalEntry { ph: Some(PUA_T_UNREL), liaison_initial: -1, residual_final: 0 },  // 27 ㅎ
];
/// Returns `true` when `ch` lies in the precomposed Hangul Syllables range
/// `HANGUL_START..=HANGUL_END`.
fn is_hangul_syllable(ch: char) -> bool {
    matches!(ch as u32, HANGUL_START..=HANGUL_END)
}
/// Splits a precomposed Hangul syllable into `(initial, medial, final)` indices.
///
/// The caller must pass a character for which `is_hangul_syllable` is true.
/// For a code point below `HANGUL_START` the subtraction underflows (a panic
/// when overflow checks are enabled).
fn decompose(ch: char) -> (usize, usize, usize) {
    // Number of syllables that share one initial consonant.
    let per_initial = N_MEDIALS * N_FINALS;
    let offset = (ch as u32 - HANGUL_START) as usize;
    (
        offset / per_initial,
        offset % per_initial / N_FINALS,
        offset % N_FINALS,
    )
}
/// Returns `true` for the conjoining leading-consonant jamo U+1100..=U+1112.
fn is_leading_jamo(ch: char) -> bool {
    matches!(ch, '\u{1100}'..='\u{1112}')
}
/// Returns `true` for the conjoining vowel jamo U+1161..=U+1175.
fn is_vowel_jamo(ch: char) -> bool {
    matches!(ch, '\u{1161}'..='\u{1175}')
}
/// Returns `true` for the conjoining trailing-consonant jamo U+11A8..=U+11C2.
fn is_trailing_jamo(ch: char) -> bool {
    matches!(ch, '\u{11A8}'..='\u{11C2}')
}
/// Combines decomposed jamo sequences (leading + vowel, optionally followed by
/// a trailing consonant) into precomposed Hangul syllables.
///
/// Every other character, including a leading jamo that is not followed by a
/// vowel jamo, is copied through unchanged.
fn compose_hangul_jamo(cps: &[char]) -> Vec<char> {
    let mut out = Vec::with_capacity(cps.len());
    let mut rest = cps;
    loop {
        rest = match rest {
            [lead, vowel, tail @ ..] if is_leading_jamo(*lead) && is_vowel_jamo(*vowel) => {
                let lead_idx = *lead as u32 - 0x1100;
                let vowel_idx = *vowel as u32 - 0x1161;
                // Trailing index 0 means "no final", so real finals start at 1.
                let (trail_idx, tail) = match tail {
                    [coda, after @ ..] if is_trailing_jamo(*coda) => {
                        (*coda as u32 - 0x11A8 + 1, after)
                    }
                    _ => (0, tail),
                };
                let code = (lead_idx * 21 + vowel_idx) * 28 + trail_idx + 0xAC00;
                if let Some(syllable) = char::from_u32(code) {
                    out.push(syllable);
                }
                tail
            }
            [other, tail @ ..] => {
                out.push(*other);
                tail
            }
            [] => break,
        };
    }
    out
}
/// Returns `true` for the punctuation marks that are passed through to the
/// phoneme stream: ASCII `, . ; : ! ?` and their CJK / full-width forms
/// U+3002, U+FF0C, U+FF01, U+FF1F and U+3001.
fn is_punctuation(ch: char) -> bool {
    const MARKS: [char; 11] = [
        ',', '.', ';', ':', '!', '?', '\u{3002}', '\u{FF0C}', '\u{FF01}', '\u{FF1F}', '\u{3001}',
    ];
    MARKS.contains(&ch)
}
/// One Hangul syllable as table indices; `process_hangul_run` edits these in
/// place before the syllable is emitted.
struct KoSyllable {
    /// Index into `INITIAL_TABLE`.
    initial: usize,
    /// Index into `MEDIAL_TABLE`.
    medial: usize,
    /// Index into `FINAL_TABLE` (0 = no final).
    final_: usize,
}
/// Appends the phonemes of one syllable to `out`: initial, then medial (one or
/// two phonemes), then final.
///
/// Indices outside a table are skipped silently, as are table slots that hold
/// no phoneme.
fn emit_syllable(syl: &KoSyllable, out: &mut Vec<char>) {
    if let Some(onset) = INITIAL_TABLE.get(syl.initial).copied().flatten() {
        out.push(onset);
    }

    if let Some(&(first, second)) = MEDIAL_TABLE.get(syl.medial) {
        out.push(first);
        out.extend(second);
    }

    // Final index 0 means "no final consonant".
    if syl.final_ > 0 {
        if let Some(coda) = FINAL_TABLE.get(syl.final_).and_then(|entry| entry.ph) {
            out.push(coda);
        }
    }
}
/// Converts a run of consecutive precomposed Hangul syllables to phonemes and
/// appends them to `out`.
///
/// Liaison is applied first: when a syllable has a final and the next syllable
/// starts with the silent initial, the final (or the second half of a compound
/// final) becomes the next syllable's initial, per `FINAL_TABLE`.
fn process_hangul_run(cps: &[char], out: &mut Vec<char>) {
    // Initial index of ㅇ, the slot `INITIAL_TABLE` maps to `None`.
    const SILENT_INITIAL: usize = 11;

    let mut syllables: Vec<KoSyllable> = cps
        .iter()
        .map(|&ch| {
            let (initial, medial, final_) = decompose(ch);
            KoSyllable {
                initial,
                medial,
                final_,
            }
        })
        .collect();

    // Walk adjacent pairs left to right. Only the follower's initial and the
    // current syllable's final are rewritten, so one liaison cannot trigger
    // another further down the run.
    for next in 1..syllables.len() {
        let (before, after) = syllables.split_at_mut(next);
        let current = &mut before[next - 1];
        let follower = &mut after[0];

        let coda = current.final_;
        let eligible = coda != 0 && coda < N_FINALS && follower.initial == SILENT_INITIAL;
        if !eligible {
            continue;
        }

        let moved = FINAL_TABLE[coda].liaison_initial;
        if moved >= 0 {
            follower.initial = moved as usize;
            current.final_ = FINAL_TABLE[coda].residual_final;
        }
    }

    syllables.iter().for_each(|syl| emit_syllable(syl, out));
}
/// Converts `text` into a flat sequence of single-`char` phoneme tokens.
///
/// * Decomposed jamo are first recombined into precomposed syllables.
/// * Space, tab, LF and CR only request a separator before the next word.
/// * Punctuation is copied through and cancels any pending separator.
/// * Each run of Hangul syllables is phonemized as one word.
/// * ASCII letters are lower-cased and emitted as individual words.
/// * Every other character is dropped.
///
/// A `' '` token is inserted before a word only when a separator is pending
/// and something has already been emitted.
fn text_to_phoneme_chars(text: &str) -> Vec<char> {
    let raw: Vec<char> = text.chars().collect();
    let cps = compose_hangul_jamo(&raw);

    let mut sentence: Vec<char> = Vec::new();
    let mut pending_space = false;
    let mut pos = 0;

    while let Some(&ch) = cps.get(pos) {
        match ch {
            ' ' | '\t' | '\n' | '\r' => {
                pending_space = true;
                pos += 1;
            }
            _ if is_punctuation(ch) => {
                sentence.push(ch);
                pending_space = false;
                pos += 1;
            }
            _ if is_hangul_syllable(ch) => {
                if pending_space && !sentence.is_empty() {
                    sentence.push(' ');
                }
                let run_len = cps[pos..]
                    .iter()
                    .take_while(|&&c| is_hangul_syllable(c))
                    .count();
                process_hangul_run(&cps[pos..pos + run_len], &mut sentence);
                pos += run_len;
                pending_space = true;
            }
            _ if ch.is_ascii_alphabetic() => {
                if pending_space && !sentence.is_empty() {
                    sentence.push(' ');
                }
                sentence.push(ch.to_ascii_lowercase());
                pending_space = true;
                pos += 1;
            }
            // Digits, symbols and other scripts are not phonemized.
            _ => pos += 1,
        }
    }

    sentence
}
/// Stateless, rule-based Korean grapheme-to-phoneme converter.
///
/// The type carries no data, so constructing or cloning it costs nothing.
#[derive(Debug, Clone, Default)]
pub struct KoreanPhonemizer;

impl KoreanPhonemizer {
    /// Creates a new phonemizer; equivalent to [`KoreanPhonemizer::default`].
    #[must_use]
    pub const fn new() -> Self {
        Self
    }
}
impl Phonemizer for KoreanPhonemizer {
    /// Phonemizes `text` into one single-character string per phoneme token,
    /// paired with a prosody entry per token.
    ///
    /// Every prosody entry is `Some` with `a1`, `a2` and `a3` set to zero.
    /// This implementation always returns `Ok`.
    fn phonemize_with_prosody(
        &self,
        text: &str,
    ) -> Result<(Vec<String>, Vec<Option<ProsodyInfo>>), G2pError> {
        let tokens: Vec<String> = text_to_phoneme_chars(text)
            .into_iter()
            .map(String::from)
            .collect();

        let prosody: Vec<Option<ProsodyInfo>> = std::iter::repeat_with(|| {
            Some(ProsodyInfo {
                a1: 0,
                a2: 0,
                a3: 0,
            })
        })
        .take(tokens.len())
        .collect();

        Ok((tokens, prosody))
    }

    /// Returns the language code handled by this phonemizer, `"ko"`.
    fn language_code(&self) -> &str {
        "ko"
    }
}
// Unit tests for syllable decomposition, jamo composition, phoneme emission,
// liaison, spacing rules and the `Phonemizer` trait implementation.
#[cfg(test)]
mod tests {
use super::*;
// --- decompose / is_hangul_syllable ---
// U+AC00 is the first syllable of the block, so all three indices are zero.
#[test]
fn test_decompose_ga() {
let (i, m, f) = decompose('\u{AC00}');
assert_eq!((i, m, f), (0, 0, 0));
}
// 한 = initial 18 (ㅎ), medial 0 (ㅏ), final 4 (ㄴ).
#[test]
fn test_decompose_han() {
let (i, m, f) = decompose('한');
assert_eq!((i, m, f), (18, 0, 4));
}
// 글 = initial 0 (ㄱ), medial 18 (ㅡ), final 8 (ㄹ).
#[test]
fn test_decompose_gul() {
let (i, m, f) = decompose('글');
assert_eq!((i, m, f), (0, 18, 8));
}
// Both ends of the syllable range are inclusive; Latin, kana and space are not Hangul.
#[test]
fn test_is_hangul_syllable() {
assert!(is_hangul_syllable('\u{AC00}')); assert!(is_hangul_syllable('\u{D7A3}')); assert!(!is_hangul_syllable('A'));
assert!(!is_hangul_syllable('あ'));
assert!(!is_hangul_syllable(' '));
}
// --- compose_hangul_jamo ---
// Leading + vowel + trailing jamo collapse into one precomposed syllable.
#[test]
fn test_compose_hangul_jamo_with_trailing() {
let nfd = vec!['\u{1112}', '\u{1161}', '\u{11AB}'];
let composed = compose_hangul_jamo(&nfd);
assert_eq!(composed, vec!['한']);
}
// Leading + vowel with no trailing jamo composes with final index 0.
#[test]
fn test_compose_hangul_jamo_no_trailing() {
let nfd = vec!['\u{1100}', '\u{1161}'];
let composed = compose_hangul_jamo(&nfd);
assert_eq!(composed, vec!['\u{AC00}']); }
// --- single syllables and simple words ---
#[test]
fn test_single_syllable_ga() {
let chars = text_to_phoneme_chars("가");
assert_eq!(chars, vec!['k', 'a']);
}
#[test]
fn test_single_syllable_han() {
let chars = text_to_phoneme_chars("한");
assert_eq!(chars, vec!['h', 'a', 'n']);
}
// Initial ㅇ is silent; final ㅇ is emitted as IPA_ENG.
#[test]
fn test_single_syllable_eung() {
let chars = text_to_phoneme_chars("앙");
assert_eq!(chars, vec!['a', IPA_ENG]);
}
#[test]
fn test_word_hangul() {
let chars = text_to_phoneme_chars("한글");
assert_eq!(chars, vec!['h', 'a', 'n', 'k', IPA_CLOSE_BACK_UNR, 'l']);
}
// --- liaison ---
// The final ㄱ of 국 moves into the silent initial of 어, so it is emitted as
// a plain 'k' rather than the unreleased final token.
#[test]
fn test_liaison_guk_eo() {
let chars = text_to_phoneme_chars("국어");
assert_eq!(chars, vec!['k', 'u', 'k', IPA_OPEN_MID_BACK]);
}
// Compound final ㄺ splits: ㄹ stays as the final ('l'), ㄱ moves to the next initial.
#[test]
fn test_liaison_complex_final() {
let chars = text_to_phoneme_chars("읽어");
assert_eq!(chars, vec!['i', 'l', 'k', IPA_OPEN_MID_BACK]);
}
// --- initial consonant classes and two-part medials ---
#[test]
fn test_tense_initial_kk() {
let chars = text_to_phoneme_chars("까");
assert_eq!(chars, vec![PUA_KK, 'a']);
}
#[test]
fn test_aspirated_initial_kh() {
let chars = text_to_phoneme_chars("카");
assert_eq!(chars, vec![PUA_KH, 'a']);
}
#[test]
fn test_diphthong_wa() {
let chars = text_to_phoneme_chars("와");
assert_eq!(chars, vec!['w', 'a']);
}
// --- unreleased finals (no liaison because nothing follows) ---
#[test]
fn test_unreleased_final_k() {
let chars = text_to_phoneme_chars("박");
assert_eq!(chars, vec!['p', 'a', PUA_K_UNREL]);
}
#[test]
fn test_unreleased_final_t() {
let chars = text_to_phoneme_chars("맛");
assert_eq!(chars, vec!['m', 'a', PUA_T_UNREL]);
}
#[test]
fn test_unreleased_final_p() {
let chars = text_to_phoneme_chars("밥");
assert_eq!(chars, vec!['p', 'a', PUA_P_UNREL]);
}
// --- punctuation, Latin letters and mixed input ---
// Punctuation is appended directly, with no separator before it.
#[test]
fn test_punctuation_passthrough() {
let chars = text_to_phoneme_chars("가.");
assert_eq!(chars, vec!['k', 'a', '.']);
}
// Each ASCII letter is lower-cased and treated as its own word, so letters
// are separated by ' ' tokens.
#[test]
fn test_latin_passthrough() {
let chars = text_to_phoneme_chars("Hello");
assert_eq!(chars, vec!['h', ' ', 'e', ' ', 'l', ' ', 'l', ' ', 'o']);
}
#[test]
fn test_mixed_hangul_latin() {
let chars = text_to_phoneme_chars("가 OK");
assert_eq!(chars, vec!['k', 'a', ' ', 'o', ' ', 'k']);
}
// --- Phonemizer trait implementation ---
#[test]
fn test_phonemizer_language_code() {
let p = KoreanPhonemizer::new();
assert_eq!(p.language_code(), "ko");
}
// One prosody entry per token, each with all three fields zero.
#[test]
fn test_phonemizer_prosody_all_zero() {
let p = KoreanPhonemizer::new();
let (tokens, prosody) = p.phonemize_with_prosody("가").unwrap();
assert!(!tokens.is_empty());
assert_eq!(tokens.len(), prosody.len());
for pi in &prosody {
let info = pi.unwrap();
assert_eq!((info.a1, info.a2, info.a3), (0, 0, 0));
}
}
// Every token string holds exactly one `char`.
#[test]
fn test_phonemizer_returns_single_char_tokens() {
let p = KoreanPhonemizer::new();
let (tokens, _) = p.phonemize_with_prosody("한글").unwrap();
for t in &tokens {
assert_eq!(
t.chars().count(),
1,
"Expected single-char token, got: {:?}",
t
);
}
}
#[test]
fn test_phonemizer_empty_input() {
let p = KoreanPhonemizer::new();
let (tokens, prosody) = p.phonemize_with_prosody("").unwrap();
assert!(tokens.is_empty());
assert!(prosody.is_empty());
}
// --- remaining initials ---
#[test]
fn test_affricate_j() {
let chars = text_to_phoneme_chars("자");
assert_eq!(chars, vec![PUA_TC, 'a']);
}
#[test]
fn test_affricate_ch() {
let chars = text_to_phoneme_chars("차");
assert_eq!(chars, vec![PUA_TCH, 'a']);
}
#[test]
fn test_initial_rieul() {
let chars = text_to_phoneme_chars("라");
assert_eq!(chars, vec![IPA_FLAP, 'a']);
}
// --- spacing ---
// Whitespace between words yields a single ' ' token.
#[test]
fn test_word_boundary_space() {
let chars = text_to_phoneme_chars("가 나");
assert_eq!(chars, vec!['k', 'a', ' ', 'n', 'a']);
}
// Leading whitespace never produces a ' ' token at the start of the output.
#[test]
fn test_no_leading_space() {
let chars = text_to_phoneme_chars(" 가");
assert_eq!(chars, vec!['k', 'a']);
}
#[test]
fn test_medial_ui() {
let chars = text_to_phoneme_chars("의");
assert_eq!(chars, vec![IPA_VELAR_APPROX, 'i']);
}
// --- liaison edge cases ---
// The next initial is ㅁ, not the silent ㅇ, so the final stays unreleased.
#[test]
fn test_no_liaison_non_ieung_initial() {
let chars = text_to_phoneme_chars("국민");
assert_eq!(chars, vec!['k', 'u', PUA_K_UNREL, 'm', 'i', 'n']);
}
#[test]
fn test_tense_affricate_jj() {
let chars = text_to_phoneme_chars("짜");
assert_eq!(chars, vec![PUA_TTCH, 'a']);
}
// 어 receives the ㄱ from 먹 but has no final of its own, so 요 is unaffected.
#[test]
fn test_liaison_does_not_cascade() {
let chars = text_to_phoneme_chars("먹어요");
assert_eq!(
chars,
vec!['m', IPA_OPEN_MID_BACK, 'k', IPA_OPEN_MID_BACK, 'j', 'o']
);
}
}