use std::{collections::HashMap, error::Error, fmt, sync::LazyLock};
/// Untoned finals that `convert_zero_consonant` accepts as the result of
/// rewriting a `y`/`w`-initial spelling; any rewrite that does not land on one
/// of these is discarded and the original spelling is kept.
///
/// Laid out one rhyme family per line (plain form first, then the forms with a
/// leading `i`/`u`/`ü`).
const VALID_FINALS: [&str; 37] = [
    "i", "u", "ü",
    "a", "ia", "ua",
    "o", "uo",
    "e", "ie", "üe",
    "ai", "uai",
    "ei", "uei",
    "ao", "iao",
    "ou", "iou",
    "an", "ian", "uan", "üan",
    "en", "in", "uen", "ün",
    "ang", "iang", "uang",
    "eng", "ing", "ueng",
    "ong", "iong",
    "er", "ê",
];
/// Pinyin initials recognised by `split_initial`.
///
/// Order matters: `split_initial` returns the first entry that is a prefix of
/// the syllable, so the two-letter initials must precede `z`, `c` and `s`.
const INITIALS: [&str; 21] = [
    // Two-letter initials.
    "zh", "ch", "sh",
    // One-letter initials.
    "b", "c", "d", "f", "g", "h", "j", "k", "l",
    "m", "n", "p", "q", "r", "s", "t", "x", "z",
];
/// Errors produced while converting a pinyin syllable to IPA.
#[derive(Debug)]
pub enum PinyinError {
/// No IPA mapping was found for the final part of the syllable; the payload
/// is the final that failed the lookup.
FinalNotFound(String),
}
impl fmt::Display for PinyinError {
    /// Renders the error as a short human-readable message that includes the
    /// final that could not be mapped.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // The enum has a single variant, so this pattern is irrefutable; adding
        // a variant will turn this line into a compile error on purpose.
        let PinyinError::FinalNotFound(missing) = self;
        f.write_fmt(format_args!("Final not found: {}", missing))
    }
}
// Empty impl: relies on the trait's default methods together with the `Debug`
// and `Display` impls of `PinyinError`.
impl Error for PinyinError {}
/// IPA transcriptions for each pinyin initial.
///
/// Every value is a list of alternative transcriptions, and each alternative
/// is a sequence of IPA segments. `pinyin_to_ipa` emits one result per
/// alternative, so initials with two entries (`h`, `r`) double the output.
static INITIAL_MAPPING: LazyLock<HashMap<&'static str, Vec<Vec<&'static str>>>> =
    LazyLock::new(|| {
        HashMap::from([
            ("b", vec![vec!["p"]]),
            ("c", vec![vec!["ʦʰ"]]),
            ("ch", vec![vec!["ꭧʰ"]]),
            ("d", vec![vec!["t"]]),
            ("f", vec![vec!["f"]]),
            ("g", vec![vec!["k"]]),
            ("h", vec![vec!["x"], vec!["h"]]),
            ("j", vec![vec!["ʨ"]]),
            ("k", vec![vec!["kʰ"]]),
            ("l", vec![vec!["l"]]),
            ("m", vec![vec!["m"]]),
            ("n", vec![vec!["n"]]),
            ("p", vec![vec!["pʰ"]]),
            ("q", vec![vec!["ʨʰ"]]),
            ("r", vec![vec!["ɻ"], vec!["ʐ"]]),
            ("s", vec![vec!["s"]]),
            ("sh", vec![vec!["ʂ"]]),
            ("t", vec![vec!["tʰ"]]),
            ("x", vec![vec!["ɕ"]]),
            ("z", vec![vec!["ʦ"]]),
            ("zh", vec![vec!["ꭧ"]]),
        ])
    });
/// Whole-syllable table consulted by `pinyin_to_ipa` before any initial/final
/// split, for syllables that consist only of consonant letters.
///
/// The `0` inside a segment is the placeholder that `apply_tone` replaces with
/// the tone string.
static SYLLABIC_CONSONANT_MAPPINGS: LazyLock<HashMap<&'static str, Vec<Vec<&'static str>>>> =
    LazyLock::new(|| {
        HashMap::from([
            ("hm", vec![vec!["h", "m0"]]),
            ("hng", vec![vec!["h", "ŋ0"]]),
            ("m", vec![vec!["m0"]]),
            ("n", vec![vec!["n0"]]),
            ("ng", vec![vec!["ŋ0"]]),
        ])
    });
/// Whole-syllable table consulted by `pinyin_to_ipa` after the syllabic
/// consonants and before the initial/final split.
///
/// A key here takes precedence over the same key in `FINAL_MAPPING` when it is
/// the entire syllable (for example a bare `o`).
static INTERJECTION_MAPPINGS: LazyLock<HashMap<&'static str, Vec<Vec<&'static str>>>> =
    LazyLock::new(|| {
        HashMap::from([
            ("io", vec![vec!["j", "ɔ0"]]),
            ("ê", vec![vec!["ɛ0"]]),
            ("er", vec![vec!["ɚ0"], vec!["aɚ̯0"]]),
            ("o", vec![vec!["ɔ0"]]),
        ])
    });
/// Default IPA transcriptions for pinyin finals.
///
/// Each value is a list of alternatives, each alternative a sequence of IPA
/// segments. The segment that carries the tone contains a `0`, which
/// `apply_tone` replaces with the tone string. `pinyin_to_ipa` falls back to
/// this table when the initial-specific tables have no entry for the final.
static FINAL_MAPPING: LazyLock<HashMap<&'static str, Vec<Vec<&'static str>>>> =
    LazyLock::new(|| {
        HashMap::from([
            ("a", vec![vec!["a0"]]),
            ("ai", vec![vec!["ai0"]]),
            ("an", vec![vec!["a0", "n"]]),
            ("ang", vec![vec!["a0", "ŋ"]]),
            ("ao", vec![vec!["au0"]]),
            ("e", vec![vec!["ɤ0"]]),
            ("ei", vec![vec!["ei0"]]),
            ("en", vec![vec!["ə0", "n"]]),
            ("eng", vec![vec!["ə0", "ŋ"]]),
            ("i", vec![vec!["i0"]]),
            ("ia", vec![vec!["j", "a0"]]),
            ("ian", vec![vec!["j", "ɛ0", "n"]]),
            ("iang", vec![vec!["j", "a0", "ŋ"]]),
            ("iao", vec![vec!["j", "au0"]]),
            ("ie", vec![vec!["j", "e0"]]),
            ("in", vec![vec!["i0", "n"]]),
            ("iou", vec![vec!["j", "ou0"]]),
            ("ing", vec![vec!["i0", "ŋ"]]),
            ("iong", vec![vec!["j", "ʊ0", "ŋ"]]),
            ("ong", vec![vec!["ʊ0", "ŋ"]]),
            ("ou", vec![vec!["ou0"]]),
            ("u", vec![vec!["u0"]]),
            ("uei", vec![vec!["w", "ei0"]]),
            ("ua", vec![vec!["w", "a0"]]),
            ("uai", vec![vec!["w", "ai0"]]),
            ("uan", vec![vec!["w", "a0", "n"]]),
            ("uen", vec![vec!["w", "ə0", "n"]]),
            ("uang", vec![vec!["w", "a0", "ŋ"]]),
            ("ueng", vec![vec!["w", "ə0", "ŋ"]]),
            ("ui", vec![vec!["w", "ei0"]]),
            ("un", vec![vec!["w", "ə0", "n"]]),
            ("uo", vec![vec!["w", "o0"]]),
            ("o", vec![vec!["w", "o0"]]),
            ("ü", vec![vec!["y0"]]),
            ("üe", vec![vec!["ɥ", "e0"]]),
            ("üan", vec![vec!["ɥ", "ɛ0", "n"]]),
            ("ün", vec![vec!["y0", "n"]]),
        ])
    });
/// Overrides for finals that follow the initials `zh`, `ch`, `sh` or `r`.
///
/// `pinyin_to_ipa` consults this table first for those initials and uses
/// `FINAL_MAPPING` only when the final is not listed here.
static FINAL_MAPPING_AFTER_ZH_CH_SH_R: LazyLock<HashMap<&'static str, Vec<Vec<&'static str>>>> =
    LazyLock::new(|| HashMap::from([("i", vec![vec!["ɻ0"], vec!["ʐ0"]])]));
/// Overrides for finals that follow the initials `z`, `c` or `s`.
///
/// `pinyin_to_ipa` consults this table first for those initials and uses
/// `FINAL_MAPPING` only when the final is not listed here.
static FINAL_MAPPING_AFTER_Z_C_S: LazyLock<HashMap<&'static str, Vec<Vec<&'static str>>>> =
    LazyLock::new(|| HashMap::from([("i", vec![vec!["ɹ0"], vec!["z0"]])]));
/// Tone number → tone-letter string substituted for the `0` placeholder by
/// `apply_tone`.
///
/// Tone 5 maps to the empty string (presumably the neutral tone — confirm with
/// the transcription convention in use). Numbers outside 1–5 have no entry.
static TONE_MAPPING: LazyLock<HashMap<u8, &'static str>> = LazyLock::new(|| {
    HashMap::from([
        (1u8, "˥"),
        (2u8, "˧˥"),
        (3u8, "˧˩˧"),
        (4u8, "˥˩"),
        (5u8, ""),
    ])
});
/// Splits a trailing tone digit off a pinyin syllable.
///
/// Returns `(syllable_without_digit, digit)` when the last character is an
/// ASCII digit (any of `0`–`9`, not only valid tone numbers), and
/// `(pinyin, 5)` when it is not or when the input is empty.
pub(crate) fn split_tone(pinyin: &str) -> (&str, u8) {
    let Some(last) = pinyin.chars().next_back() else {
        return (pinyin, 5);
    };
    match last.to_digit(10) {
        // `to_digit(10)` only accepts ASCII digits, so exactly one byte is cut.
        Some(digit) => (&pinyin[..pinyin.len() - 1], digit as u8),
        None => (pinyin, 5),
    }
}
/// Expands the abbreviated final `un` to `uen`.
///
/// If `s` ends in `n` and the character before it is `u` (plain or with a tone
/// mark), an `e` is inserted between them; otherwise `s` is returned unchanged.
/// `ü` is deliberately not in the list, so `ün` is left alone.
fn convert_uen(s: &str) -> String {
    const U_FORMS: [char; 5] = ['u', 'ū', 'ú', 'ǔ', 'ù'];

    if let Some(stem) = s.strip_suffix('n') {
        if stem.ends_with(U_FORMS) {
            let mut expanded = String::with_capacity(s.len() + 1);
            expanded.push_str(stem);
            expanded.push_str("en");
            return expanded;
        }
    }
    s.to_string()
}
/// Restores the `ü` that pinyin spelling writes as `u` after `j`, `q` and `x`.
///
/// When the syllable starts with one of those initials followed by `u` (plain
/// or tone-marked), that vowel is replaced by the matching `ü` form and the
/// rest is copied through. Any other input is returned unchanged.
fn convert_uv(pinyin: &str) -> String {
    let mut letters = pinyin.chars();
    let (Some(initial), Some(vowel)) = (letters.next(), letters.next()) else {
        return pinyin.to_string();
    };
    if !matches!(initial, 'j' | 'q' | 'x') {
        return pinyin.to_string();
    }
    let umlauted = match vowel {
        'u' => 'ü',
        'ū' => 'ǖ',
        'ú' => 'ǘ',
        'ǔ' => 'ǚ',
        'ù' => 'ǜ',
        _ => return pinyin.to_string(),
    };

    let mut converted = String::with_capacity(pinyin.len() + 1);
    converted.push(initial);
    converted.push(umlauted);
    // Whatever the iterator has not consumed yet is the untouched tail.
    converted.push_str(letters.as_str());
    converted
}
/// Expands the abbreviated final `iu` to `iou`.
///
/// If the syllable ends in `i` followed by `u` (plain or tone-marked), an `o`
/// is inserted between them, keeping the tone mark on the `u`. Any other input
/// is returned unchanged.
fn convert_iou(pinyin: &str) -> String {
    let mut tail = pinyin.chars().rev();
    match (tail.next(), tail.next()) {
        (Some(u @ ('u' | 'ū' | 'ú' | 'ǔ' | 'ù')), Some('i')) => {
            // Tone-marked vowels are two bytes in UTF-8, so the cut must use the
            // character's encoded width; cutting a fixed single byte would land
            // inside the character and panic.
            let stem = &pinyin[..pinyin.len() - u.len_utf8()];
            format!("{}o{}", stem, u)
        }
        _ => pinyin.to_string(),
    }
}
/// Expands the abbreviated final `ui` to `uei`.
///
/// If the syllable ends in `u` followed by `i` (plain or tone-marked), an `e`
/// is inserted between them, keeping the tone mark on the `i`. Any other input
/// is returned unchanged.
fn convert_uei(pinyin: &str) -> String {
    let mut tail = pinyin.chars().rev();
    match (tail.next(), tail.next()) {
        (Some(i @ ('i' | 'ī' | 'í' | 'ǐ' | 'ì')), Some('u')) => {
            // Tone-marked vowels are two bytes in UTF-8, so the cut must use the
            // character's encoded width; cutting a fixed single byte would land
            // inside the character and panic.
            let stem = &pinyin[..pinyin.len() - i.len_utf8()];
            format!("{}e{}", stem, i)
        }
        _ => pinyin.to_string(),
    }
}
/// Rewrites a syllable spelled with a leading `y` or `w` into its bare final.
///
/// The leading letter is dropped or replaced as follows:
/// * `yu…` → `ü…` (a tone-marked `u` becomes the matching tone-marked `ü`),
/// * `yi…` → `i…`, any other `y…` → `i…`,
/// * `wu…` → `u…`, any other `w…` → `u…`.
///
/// The rewrite is kept only if the result is listed in `VALID_FINALS`;
/// otherwise, and for inputs that do not start with `y`/`w`, the input is
/// returned unchanged. `VALID_FINALS` holds untoned spellings only, so a
/// rewrite that still contains a tone-marked vowel is never accepted.
pub(crate) fn convert_zero_consonant(pinyin: &str) -> String {
    let letters: Vec<char> = pinyin.chars().collect();

    // Work out the first character of the candidate final and the tail that is
    // copied through unchanged.
    let (head, tail): (char, &[char]) = match letters.as_slice() {
        ['y', 'u', tail @ ..] => ('ü', tail),
        ['y', 'ū', tail @ ..] => ('ǖ', tail),
        ['y', 'ú', tail @ ..] => ('ǘ', tail),
        ['y', 'ǔ', tail @ ..] => ('ǚ', tail),
        ['y', 'ù', tail @ ..] => ('ǜ', tail),
        ['y', first @ ('i' | 'ī' | 'í' | 'ǐ' | 'ì'), tail @ ..] => (*first, tail),
        ['y', tail @ ..] => ('i', tail),
        ['w', first @ ('u' | 'ū' | 'ú' | 'ǔ' | 'ù'), tail @ ..] => (*first, tail),
        ['w', tail @ ..] => ('u', tail),
        _ => return pinyin.to_string(),
    };

    let mut candidate = String::with_capacity(pinyin.len() + 2);
    candidate.push(head);
    candidate.extend(tail);

    if VALID_FINALS.contains(&candidate.as_str()) {
        candidate
    } else {
        pinyin.to_string()
    }
}
/// Splits a syllable into `(initial, remainder)`.
///
/// The initial is the first entry of `INITIALS`, in table order, that is a
/// prefix of `pinyin`. When none matches, the initial is the empty string and
/// the remainder is the whole input.
pub(crate) fn split_initial(pinyin: &str) -> (&'static str, &str) {
    INITIALS
        .iter()
        .find_map(|&candidate| {
            pinyin
                .strip_prefix(candidate)
                .map(|remainder| (candidate, remainder))
        })
        .unwrap_or(("", pinyin))
}
/// Substitutes the tone string for the `0` placeholder in every segment of
/// every variant.
///
/// The tone string is looked up in `TONE_MAPPING`; a tone number with no entry
/// yields the empty string, which simply removes the placeholder.
fn apply_tone(variants: &[Vec<&str>], tone: u8) -> Vec<Vec<String>> {
    let contour: &str = TONE_MAPPING.get(&tone).copied().unwrap_or("");

    let mut toned = Vec::with_capacity(variants.len());
    for segments in variants {
        let rendered: Vec<String> = segments
            .iter()
            .map(|segment| segment.replace("0", contour))
            .collect();
        toned.push(rendered);
    }
    toned
}
/// Converts one pinyin syllable (optionally ending in a tone digit) to IPA.
///
/// Steps:
/// 1. Split off the trailing tone digit (`split_tone`).
/// 2. Normalise the spelling: `y`/`w` forms, `u` → `ü` after `j`/`q`/`x`, and
///    the abbreviated finals `iu`, `ui`, `un`.
/// 3. If the whole syllable is a syllabic consonant or an interjection, return
///    that table entry with the tone applied.
/// 4. Otherwise split into initial + final, pick the final's transcription
///    (initial-specific override first, then `FINAL_MAPPING`), and combine
///    every initial variant with every final variant.
///
/// # Returns
///
/// A list of alternative transcriptions, each a sequence of IPA segments. For
/// the initial/final path with no recognised initial, each alternative starts
/// with an empty string in the initial position.
///
/// # Errors
///
/// Returns [`PinyinError::FinalNotFound`] when the final is not present in any
/// final table.
pub fn pinyin_to_ipa(pinyin: &str) -> Result<Vec<Vec<String>>, PinyinError> {
    let (toneless, tone) = split_tone(pinyin);
    let normalized = convert_zero_consonant(toneless);
    let normalized = convert_uv(&normalized);
    let normalized = convert_iou(&normalized);
    let normalized = convert_uei(&normalized);
    let normalized = convert_uen(&normalized);
    let syllable = normalized.as_str();

    // Whole-syllable tables take precedence; syllabic consonants are checked
    // before interjections.
    let whole_syllable = SYLLABIC_CONSONANT_MAPPINGS
        .get(syllable)
        .or_else(|| INTERJECTION_MAPPINGS.get(syllable));
    if let Some(ipa) = whole_syllable {
        return Ok(apply_tone(ipa, tone));
    }

    let (initial_part, final_part) = split_initial(syllable);

    // Initial-specific override first, then the general table (one lookup each).
    let override_ipa = match initial_part {
        "zh" | "ch" | "sh" | "r" => FINAL_MAPPING_AFTER_ZH_CH_SH_R.get(final_part),
        "z" | "c" | "s" => FINAL_MAPPING_AFTER_Z_C_S.get(final_part),
        _ => None,
    };
    let final_ipa = override_ipa
        .or_else(|| FINAL_MAPPING.get(final_part))
        // Build the error lazily so the success path does not allocate it.
        .ok_or_else(|| PinyinError::FinalNotFound(final_part.to_owned()))?;

    let initials: Vec<Vec<String>> = match INITIAL_MAPPING.get(initial_part) {
        Some(variants) => variants
            .iter()
            .map(|variant| variant.iter().map(|segment| segment.to_string()).collect())
            .collect(),
        // No initial: keep a single empty-string segment in the initial slot.
        None => vec![vec![String::new()]],
    };

    // The toned finals do not depend on the initial, so compute them once.
    let finals = apply_tone(final_ipa, tone);

    let mut result: Vec<Vec<String>> = Vec::with_capacity(initials.len() * finals.len());
    for initial in &initials {
        for toned_final in &finals {
            result.push(initial.iter().chain(toned_final).cloned().collect());
        }
    }
    Ok(result)
}